1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/bio.h>
32#include <sys/eventhandler.h>
33#include <sys/kernel.h>
34#include <sys/kthread.h>
35#include <sys/limits.h>
36#include <sys/lock.h>
37#include <sys/malloc.h>
38#include <sys/module.h>
39#include <sys/mutex.h>
40#include <sys/proc.h>
41#include <sys/reboot.h>
42#include <sys/sbuf.h>
43#include <sys/sched.h>
44#include <sys/sysctl.h>
45
46#include <vm/uma.h>
47
48#include <geom/geom.h>
49#include <geom/geom_dbg.h>
50#include <geom/raid/g_raid.h>
51#include "g_raid_md_if.h"
52#include "g_raid_tr_if.h"
53
/* Malloc type used for allocations made in this file (e.g. event records). */
static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");

/*
 * Tunables exported under the kern.geom.raid sysctl node.  Each variable
 * below is paired with the sysctl that exposes it; the description string
 * of the sysctl states its purpose.
 */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_RAID stuff");
int g_raid_enable = 1;
SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN,
    &g_raid_enable, 0, "Enable on-disk metadata taste");
u_int g_raid_aggressive_spare = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN,
    &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
u_int g_raid_debug = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0,
    "Debug level");
/*
 * NOTE(review): this variable is a signed int but is exported through
 * SYSCTL_UINT; confirm whether the type or the macro should change.
 */
int g_raid_read_err_thresh = 10;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN,
    &g_raid_read_err_thresh, 0,
    "Number of read errors equated to disk failure");
u_int g_raid_start_timeout = 30;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN,
    &g_raid_start_timeout, 0,
    "Time to wait for all array components");
/* Compared against seconds of uptime since the last write in g_raid_clean(). */
static u_int g_raid_clean_time = 5;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN,
    &g_raid_clean_time, 0, "Mark volume as clean when idling");
static u_int g_raid_disconnect_on_failure = 1;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
    &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid_name_format = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN,
    &g_raid_name_format, 0, "Providers name format.");
static u_int g_raid_idle_threshold = 1000000;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN,
    &g_raid_idle_threshold, 1000000,
    "Time in microseconds to consider a volume idle.");
89
/*
 * msleep() wrapper that emits level-4 debug messages naming the calling
 * function and the sleep channel before sleeping and after waking.
 * The msleep() result is stored in rv.
 */
#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
} while (0)
95
/* Global list of g_raid_md_class objects. */
LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
    LIST_HEAD_INITIALIZER(g_raid_md_classes);

/* Global list of g_raid_tr_class objects. */
LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
    LIST_HEAD_INITIALIZER(g_raid_tr_classes);

/* Global list of g_raid_volume objects. */
LIST_HEAD(, g_raid_volume) g_raid_volumes =
    LIST_HEAD_INITIALIZER(g_raid_volumes);

/* File-local state; g_raid_shutdown is consulted by g_raid_clean(). */
static eventhandler_tag g_raid_post_sync = NULL;
static int g_raid_started = 0;
static int g_raid_shutdown = 0;
108
/* Forward declarations of the GEOM class methods referenced below. */
static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid_taste;
static void g_raid_init(struct g_class *mp);
static void g_raid_fini(struct g_class *mp);

/* GEOM class descriptor wiring the class name to its method handlers. */
struct g_class g_raid_class = {
	.name = G_RAID_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid_ctl,
	.taste = g_raid_taste,
	.destroy_geom = g_raid_destroy_geom,
	.init = g_raid_init,
	.fini = g_raid_fini
};

/* Forward declarations of file-local helpers defined later in this file. */
static void g_raid_destroy_provider(struct g_raid_volume *vol);
static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid_start(struct bio *bp);
static void g_raid_start_request(struct bio *bp);
static void g_raid_disk_done(struct bio *bp);
static void g_raid_poll(struct g_raid_softc *sc);
136
137static const char *
138g_raid_node_event2str(int event)
139{
140
141	switch (event) {
142	case G_RAID_NODE_E_WAKE:
143		return ("WAKE");
144	case G_RAID_NODE_E_START:
145		return ("START");
146	default:
147		return ("INVALID");
148	}
149}
150
151const char *
152g_raid_disk_state2str(int state)
153{
154
155	switch (state) {
156	case G_RAID_DISK_S_NONE:
157		return ("NONE");
158	case G_RAID_DISK_S_OFFLINE:
159		return ("OFFLINE");
160	case G_RAID_DISK_S_DISABLED:
161		return ("DISABLED");
162	case G_RAID_DISK_S_FAILED:
163		return ("FAILED");
164	case G_RAID_DISK_S_STALE_FAILED:
165		return ("STALE_FAILED");
166	case G_RAID_DISK_S_SPARE:
167		return ("SPARE");
168	case G_RAID_DISK_S_STALE:
169		return ("STALE");
170	case G_RAID_DISK_S_ACTIVE:
171		return ("ACTIVE");
172	default:
173		return ("INVALID");
174	}
175}
176
177static const char *
178g_raid_disk_event2str(int event)
179{
180
181	switch (event) {
182	case G_RAID_DISK_E_DISCONNECTED:
183		return ("DISCONNECTED");
184	default:
185		return ("INVALID");
186	}
187}
188
189const char *
190g_raid_subdisk_state2str(int state)
191{
192
193	switch (state) {
194	case G_RAID_SUBDISK_S_NONE:
195		return ("NONE");
196	case G_RAID_SUBDISK_S_FAILED:
197		return ("FAILED");
198	case G_RAID_SUBDISK_S_NEW:
199		return ("NEW");
200	case G_RAID_SUBDISK_S_REBUILD:
201		return ("REBUILD");
202	case G_RAID_SUBDISK_S_UNINITIALIZED:
203		return ("UNINITIALIZED");
204	case G_RAID_SUBDISK_S_STALE:
205		return ("STALE");
206	case G_RAID_SUBDISK_S_RESYNC:
207		return ("RESYNC");
208	case G_RAID_SUBDISK_S_ACTIVE:
209		return ("ACTIVE");
210	default:
211		return ("INVALID");
212	}
213}
214
215static const char *
216g_raid_subdisk_event2str(int event)
217{
218
219	switch (event) {
220	case G_RAID_SUBDISK_E_NEW:
221		return ("NEW");
222	case G_RAID_SUBDISK_E_FAILED:
223		return ("FAILED");
224	case G_RAID_SUBDISK_E_DISCONNECTED:
225		return ("DISCONNECTED");
226	default:
227		return ("INVALID");
228	}
229}
230
231const char *
232g_raid_volume_state2str(int state)
233{
234
235	switch (state) {
236	case G_RAID_VOLUME_S_STARTING:
237		return ("STARTING");
238	case G_RAID_VOLUME_S_BROKEN:
239		return ("BROKEN");
240	case G_RAID_VOLUME_S_DEGRADED:
241		return ("DEGRADED");
242	case G_RAID_VOLUME_S_SUBOPTIMAL:
243		return ("SUBOPTIMAL");
244	case G_RAID_VOLUME_S_OPTIMAL:
245		return ("OPTIMAL");
246	case G_RAID_VOLUME_S_UNSUPPORTED:
247		return ("UNSUPPORTED");
248	case G_RAID_VOLUME_S_STOPPED:
249		return ("STOPPED");
250	default:
251		return ("INVALID");
252	}
253}
254
255static const char *
256g_raid_volume_event2str(int event)
257{
258
259	switch (event) {
260	case G_RAID_VOLUME_E_UP:
261		return ("UP");
262	case G_RAID_VOLUME_E_DOWN:
263		return ("DOWN");
264	case G_RAID_VOLUME_E_START:
265		return ("START");
266	case G_RAID_VOLUME_E_STARTMD:
267		return ("STARTMD");
268	default:
269		return ("INVALID");
270	}
271}
272
273const char *
274g_raid_volume_level2str(int level, int qual)
275{
276
277	switch (level) {
278	case G_RAID_VOLUME_RL_RAID0:
279		return ("RAID0");
280	case G_RAID_VOLUME_RL_RAID1:
281		return ("RAID1");
282	case G_RAID_VOLUME_RL_RAID3:
283		if (qual == G_RAID_VOLUME_RLQ_R3P0)
284			return ("RAID3-P0");
285		if (qual == G_RAID_VOLUME_RLQ_R3PN)
286			return ("RAID3-PN");
287		return ("RAID3");
288	case G_RAID_VOLUME_RL_RAID4:
289		if (qual == G_RAID_VOLUME_RLQ_R4P0)
290			return ("RAID4-P0");
291		if (qual == G_RAID_VOLUME_RLQ_R4PN)
292			return ("RAID4-PN");
293		return ("RAID4");
294	case G_RAID_VOLUME_RL_RAID5:
295		if (qual == G_RAID_VOLUME_RLQ_R5RA)
296			return ("RAID5-RA");
297		if (qual == G_RAID_VOLUME_RLQ_R5RS)
298			return ("RAID5-RS");
299		if (qual == G_RAID_VOLUME_RLQ_R5LA)
300			return ("RAID5-LA");
301		if (qual == G_RAID_VOLUME_RLQ_R5LS)
302			return ("RAID5-LS");
303		return ("RAID5");
304	case G_RAID_VOLUME_RL_RAID6:
305		if (qual == G_RAID_VOLUME_RLQ_R6RA)
306			return ("RAID6-RA");
307		if (qual == G_RAID_VOLUME_RLQ_R6RS)
308			return ("RAID6-RS");
309		if (qual == G_RAID_VOLUME_RLQ_R6LA)
310			return ("RAID6-LA");
311		if (qual == G_RAID_VOLUME_RLQ_R6LS)
312			return ("RAID6-LS");
313		return ("RAID6");
314	case G_RAID_VOLUME_RL_RAIDMDF:
315		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
316			return ("RAIDMDF-RA");
317		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
318			return ("RAIDMDF-RS");
319		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
320			return ("RAIDMDF-LA");
321		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
322			return ("RAIDMDF-LS");
323		return ("RAIDMDF");
324	case G_RAID_VOLUME_RL_RAID1E:
325		if (qual == G_RAID_VOLUME_RLQ_R1EA)
326			return ("RAID1E-A");
327		if (qual == G_RAID_VOLUME_RLQ_R1EO)
328			return ("RAID1E-O");
329		return ("RAID1E");
330	case G_RAID_VOLUME_RL_SINGLE:
331		return ("SINGLE");
332	case G_RAID_VOLUME_RL_CONCAT:
333		return ("CONCAT");
334	case G_RAID_VOLUME_RL_RAID5E:
335		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
336			return ("RAID5E-RA");
337		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
338			return ("RAID5E-RS");
339		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
340			return ("RAID5E-LA");
341		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
342			return ("RAID5E-LS");
343		return ("RAID5E");
344	case G_RAID_VOLUME_RL_RAID5EE:
345		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
346			return ("RAID5EE-RA");
347		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
348			return ("RAID5EE-RS");
349		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
350			return ("RAID5EE-LA");
351		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
352			return ("RAID5EE-LS");
353		return ("RAID5EE");
354	case G_RAID_VOLUME_RL_RAID5R:
355		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
356			return ("RAID5R-RA");
357		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
358			return ("RAID5R-RS");
359		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
360			return ("RAID5R-LA");
361		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
362			return ("RAID5R-LS");
363		return ("RAID5E");
364	default:
365		return ("UNKNOWN");
366	}
367}
368
369int
370g_raid_volume_str2level(const char *str, int *level, int *qual)
371{
372
373	*level = G_RAID_VOLUME_RL_UNKNOWN;
374	*qual = G_RAID_VOLUME_RLQ_NONE;
375	if (strcasecmp(str, "RAID0") == 0)
376		*level = G_RAID_VOLUME_RL_RAID0;
377	else if (strcasecmp(str, "RAID1") == 0)
378		*level = G_RAID_VOLUME_RL_RAID1;
379	else if (strcasecmp(str, "RAID3-P0") == 0) {
380		*level = G_RAID_VOLUME_RL_RAID3;
381		*qual = G_RAID_VOLUME_RLQ_R3P0;
382	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
383		   strcasecmp(str, "RAID3") == 0) {
384		*level = G_RAID_VOLUME_RL_RAID3;
385		*qual = G_RAID_VOLUME_RLQ_R3PN;
386	} else if (strcasecmp(str, "RAID4-P0") == 0) {
387		*level = G_RAID_VOLUME_RL_RAID4;
388		*qual = G_RAID_VOLUME_RLQ_R4P0;
389	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
390		   strcasecmp(str, "RAID4") == 0) {
391		*level = G_RAID_VOLUME_RL_RAID4;
392		*qual = G_RAID_VOLUME_RLQ_R4PN;
393	} else if (strcasecmp(str, "RAID5-RA") == 0) {
394		*level = G_RAID_VOLUME_RL_RAID5;
395		*qual = G_RAID_VOLUME_RLQ_R5RA;
396	} else if (strcasecmp(str, "RAID5-RS") == 0) {
397		*level = G_RAID_VOLUME_RL_RAID5;
398		*qual = G_RAID_VOLUME_RLQ_R5RS;
399	} else if (strcasecmp(str, "RAID5") == 0 ||
400		   strcasecmp(str, "RAID5-LA") == 0) {
401		*level = G_RAID_VOLUME_RL_RAID5;
402		*qual = G_RAID_VOLUME_RLQ_R5LA;
403	} else if (strcasecmp(str, "RAID5-LS") == 0) {
404		*level = G_RAID_VOLUME_RL_RAID5;
405		*qual = G_RAID_VOLUME_RLQ_R5LS;
406	} else if (strcasecmp(str, "RAID6-RA") == 0) {
407		*level = G_RAID_VOLUME_RL_RAID6;
408		*qual = G_RAID_VOLUME_RLQ_R6RA;
409	} else if (strcasecmp(str, "RAID6-RS") == 0) {
410		*level = G_RAID_VOLUME_RL_RAID6;
411		*qual = G_RAID_VOLUME_RLQ_R6RS;
412	} else if (strcasecmp(str, "RAID6") == 0 ||
413		   strcasecmp(str, "RAID6-LA") == 0) {
414		*level = G_RAID_VOLUME_RL_RAID6;
415		*qual = G_RAID_VOLUME_RLQ_R6LA;
416	} else if (strcasecmp(str, "RAID6-LS") == 0) {
417		*level = G_RAID_VOLUME_RL_RAID6;
418		*qual = G_RAID_VOLUME_RLQ_R6LS;
419	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
420		*level = G_RAID_VOLUME_RL_RAIDMDF;
421		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
422	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
423		*level = G_RAID_VOLUME_RL_RAIDMDF;
424		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
425	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
426		   strcasecmp(str, "RAIDMDF-LA") == 0) {
427		*level = G_RAID_VOLUME_RL_RAIDMDF;
428		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
429	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
430		*level = G_RAID_VOLUME_RL_RAIDMDF;
431		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
432	} else if (strcasecmp(str, "RAID10") == 0 ||
433		   strcasecmp(str, "RAID1E") == 0 ||
434		   strcasecmp(str, "RAID1E-A") == 0) {
435		*level = G_RAID_VOLUME_RL_RAID1E;
436		*qual = G_RAID_VOLUME_RLQ_R1EA;
437	} else if (strcasecmp(str, "RAID1E-O") == 0) {
438		*level = G_RAID_VOLUME_RL_RAID1E;
439		*qual = G_RAID_VOLUME_RLQ_R1EO;
440	} else if (strcasecmp(str, "SINGLE") == 0)
441		*level = G_RAID_VOLUME_RL_SINGLE;
442	else if (strcasecmp(str, "CONCAT") == 0)
443		*level = G_RAID_VOLUME_RL_CONCAT;
444	else if (strcasecmp(str, "RAID5E-RA") == 0) {
445		*level = G_RAID_VOLUME_RL_RAID5E;
446		*qual = G_RAID_VOLUME_RLQ_R5ERA;
447	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
448		*level = G_RAID_VOLUME_RL_RAID5E;
449		*qual = G_RAID_VOLUME_RLQ_R5ERS;
450	} else if (strcasecmp(str, "RAID5E") == 0 ||
451		   strcasecmp(str, "RAID5E-LA") == 0) {
452		*level = G_RAID_VOLUME_RL_RAID5E;
453		*qual = G_RAID_VOLUME_RLQ_R5ELA;
454	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
455		*level = G_RAID_VOLUME_RL_RAID5E;
456		*qual = G_RAID_VOLUME_RLQ_R5ELS;
457	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
458		*level = G_RAID_VOLUME_RL_RAID5EE;
459		*qual = G_RAID_VOLUME_RLQ_R5EERA;
460	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
461		*level = G_RAID_VOLUME_RL_RAID5EE;
462		*qual = G_RAID_VOLUME_RLQ_R5EERS;
463	} else if (strcasecmp(str, "RAID5EE") == 0 ||
464		   strcasecmp(str, "RAID5EE-LA") == 0) {
465		*level = G_RAID_VOLUME_RL_RAID5EE;
466		*qual = G_RAID_VOLUME_RLQ_R5EELA;
467	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
468		*level = G_RAID_VOLUME_RL_RAID5EE;
469		*qual = G_RAID_VOLUME_RLQ_R5EELS;
470	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
471		*level = G_RAID_VOLUME_RL_RAID5R;
472		*qual = G_RAID_VOLUME_RLQ_R5RRA;
473	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
474		*level = G_RAID_VOLUME_RL_RAID5R;
475		*qual = G_RAID_VOLUME_RLQ_R5RRS;
476	} else if (strcasecmp(str, "RAID5R") == 0 ||
477		   strcasecmp(str, "RAID5R-LA") == 0) {
478		*level = G_RAID_VOLUME_RL_RAID5R;
479		*qual = G_RAID_VOLUME_RLQ_R5RLA;
480	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
481		*level = G_RAID_VOLUME_RL_RAID5R;
482		*qual = G_RAID_VOLUME_RLQ_R5RLS;
483	} else
484		return (-1);
485	return (0);
486}
487
488const char *
489g_raid_get_diskname(struct g_raid_disk *disk)
490{
491
492	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
493		return ("[unknown]");
494	return (disk->d_consumer->provider->name);
495}
496
497void
498g_raid_get_disk_info(struct g_raid_disk *disk)
499{
500	struct g_consumer *cp = disk->d_consumer;
501	int error, len;
502
503	/* Read kernel dumping information. */
504	disk->d_kd.offset = 0;
505	disk->d_kd.length = OFF_MAX;
506	len = sizeof(disk->d_kd);
507	error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
508	if (error)
509		disk->d_kd.di.dumper = NULL;
510	if (disk->d_kd.di.dumper == NULL)
511		G_RAID_DEBUG1(2, disk->d_softc,
512		    "Dumping not supported by %s: %d.",
513		    cp->provider->name, error);
514
515	/* Read BIO_DELETE support. */
516	error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
517	if (error)
518		disk->d_candelete = 0;
519	if (!disk->d_candelete)
520		G_RAID_DEBUG1(2, disk->d_softc,
521		    "BIO_DELETE not supported by %s: %d.",
522		    cp->provider->name, error);
523}
524
525void
526g_raid_report_disk_state(struct g_raid_disk *disk)
527{
528	struct g_raid_subdisk *sd;
529	int len, state;
530	uint32_t s;
531
532	if (disk->d_consumer == NULL)
533		return;
534	if (disk->d_state == G_RAID_DISK_S_DISABLED) {
535		s = G_STATE_ACTIVE; /* XXX */
536	} else if (disk->d_state == G_RAID_DISK_S_FAILED ||
537	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
538		s = G_STATE_FAILED;
539	} else {
540		state = G_RAID_SUBDISK_S_ACTIVE;
541		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
542			if (sd->sd_state < state)
543				state = sd->sd_state;
544		}
545		if (state == G_RAID_SUBDISK_S_FAILED)
546			s = G_STATE_FAILED;
547		else if (state == G_RAID_SUBDISK_S_NEW ||
548		    state == G_RAID_SUBDISK_S_REBUILD)
549			s = G_STATE_REBUILD;
550		else if (state == G_RAID_SUBDISK_S_STALE ||
551		    state == G_RAID_SUBDISK_S_RESYNC)
552			s = G_STATE_RESYNC;
553		else
554			s = G_STATE_ACTIVE;
555	}
556	len = sizeof(s);
557	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
558	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
559	    g_raid_get_diskname(disk), s);
560}
561
562void
563g_raid_change_disk_state(struct g_raid_disk *disk, int state)
564{
565
566	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
567	    g_raid_get_diskname(disk),
568	    g_raid_disk_state2str(disk->d_state),
569	    g_raid_disk_state2str(state));
570	disk->d_state = state;
571	g_raid_report_disk_state(disk);
572}
573
574void
575g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
576{
577
578	G_RAID_DEBUG1(0, sd->sd_softc,
579	    "Subdisk %s:%d-%s state changed from %s to %s.",
580	    sd->sd_volume->v_name, sd->sd_pos,
581	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
582	    g_raid_subdisk_state2str(sd->sd_state),
583	    g_raid_subdisk_state2str(state));
584	sd->sd_state = state;
585	if (sd->sd_disk)
586		g_raid_report_disk_state(sd->sd_disk);
587}
588
589void
590g_raid_change_volume_state(struct g_raid_volume *vol, int state)
591{
592
593	G_RAID_DEBUG1(0, vol->v_softc,
594	    "Volume %s state changed from %s to %s.",
595	    vol->v_name,
596	    g_raid_volume_state2str(vol->v_state),
597	    g_raid_volume_state2str(state));
598	vol->v_state = state;
599}
600
601/*
602 * --- Events handling functions ---
603 * Events in geom_raid are used to maintain subdisks and volumes status
604 * from one thread to simplify locking.
605 */
606static void
607g_raid_event_free(struct g_raid_event *ep)
608{
609
610	free(ep, M_RAID);
611}
612
/*
 * Queue an event for the node that owns the target and wake the node up.
 *
 * arg is the event target; the G_RAID_EVENT_VOLUME/DISK/SUBDISK bits in
 * flags say how to interpret it, and with none of them set arg is the
 * softc itself.
 *
 * Without G_RAID_EVENT_WAIT the function returns 0 right after queueing.
 * With it, the caller must hold sc_lock exclusively; the lock is dropped
 * while sleeping until the event is marked G_RAID_EVENT_DONE, then
 * re-taken, and the event's e_error is returned.  In that mode the event
 * record is freed here.
 *
 * Returns ENOMEM when the event record cannot be allocated.
 */
int
g_raid_event_send(void *arg, int event, int flags)
{
	struct g_raid_softc *sc;
	struct g_raid_event *ep;
	int error;

	/* Find the softc that owns the target object. */
	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
		sc = ((struct g_raid_volume *)arg)->v_softc;
	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
		sc = ((struct g_raid_disk *)arg)->d_softc;
	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
	} else {
		sc = arg;
	}
	/* Use a sleeping allocation only when sc_lock is exclusively held. */
	ep = malloc(sizeof(*ep), M_RAID,
	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->e_tgt = arg;
	ep->e_event = event;
	ep->e_flags = flags;
	ep->e_error = 0;
	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);

	/* Fire-and-forget: the event record is no longer ours to touch. */
	if ((flags & G_RAID_EVENT_WAIT) == 0)
		return (0);

	sx_assert(&sc->sc_lock, SX_XLOCKED);
	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
	sx_xunlock(&sc->sc_lock);
	/*
	 * The DONE flag is tested before sc_queue_mtx is taken; the sleep
	 * uses a timeout of hz * 5 and re-tests the flag after each wakeup
	 * or timeout.  PDROP leaves the mutex released when msleep returns.
	 */
	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_queue_mtx);
		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
		    hz * 5);
	}
	/* The msleep result is discarded; the event's own status is returned. */
	error = ep->e_error;
	g_raid_event_free(ep);
	sx_xlock(&sc->sc_lock);
	return (error);
}
659
660static void
661g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
662{
663	struct g_raid_event *ep, *tmpep;
664
665	sx_assert(&sc->sc_lock, SX_XLOCKED);
666
667	mtx_lock(&sc->sc_queue_mtx);
668	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
669		if (ep->e_tgt != tgt)
670			continue;
671		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
672		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
673			g_raid_event_free(ep);
674		else {
675			ep->e_error = ECANCELED;
676			wakeup(ep);
677		}
678	}
679	mtx_unlock(&sc->sc_queue_mtx);
680}
681
682static int
683g_raid_event_check(struct g_raid_softc *sc, void *tgt)
684{
685	struct g_raid_event *ep;
686	int	res = 0;
687
688	sx_assert(&sc->sc_lock, SX_XLOCKED);
689
690	mtx_lock(&sc->sc_queue_mtx);
691	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
692		if (ep->e_tgt != tgt)
693			continue;
694		res = 1;
695		break;
696	}
697	mtx_unlock(&sc->sc_queue_mtx);
698	return (res);
699}
700
701/*
702 * Return the number of disks in given state.
703 * If state is equal to -1, count all connected disks.
704 */
705u_int
706g_raid_ndisks(struct g_raid_softc *sc, int state)
707{
708	struct g_raid_disk *disk;
709	u_int n;
710
711	sx_assert(&sc->sc_lock, SX_LOCKED);
712
713	n = 0;
714	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
715		if (disk->d_state == state || state == -1)
716			n++;
717	}
718	return (n);
719}
720
721/*
722 * Return the number of subdisks in given state.
723 * If state is equal to -1, count all connected disks.
724 */
725u_int
726g_raid_nsubdisks(struct g_raid_volume *vol, int state)
727{
728	struct g_raid_subdisk *subdisk;
729	struct g_raid_softc *sc __diagused;
730	u_int i, n ;
731
732	sc = vol->v_softc;
733	sx_assert(&sc->sc_lock, SX_LOCKED);
734
735	n = 0;
736	for (i = 0; i < vol->v_disks_count; i++) {
737		subdisk = &vol->v_subdisks[i];
738		if ((state == -1 &&
739		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
740		    subdisk->sd_state == state)
741			n++;
742	}
743	return (n);
744}
745
746/*
747 * Return the first subdisk in given state.
748 * If state is equal to -1, then the first connected disks.
749 */
750struct g_raid_subdisk *
751g_raid_get_subdisk(struct g_raid_volume *vol, int state)
752{
753	struct g_raid_subdisk *sd;
754	struct g_raid_softc *sc __diagused;
755	u_int i;
756
757	sc = vol->v_softc;
758	sx_assert(&sc->sc_lock, SX_LOCKED);
759
760	for (i = 0; i < vol->v_disks_count; i++) {
761		sd = &vol->v_subdisks[i];
762		if ((state == -1 &&
763		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
764		    sd->sd_state == state)
765			return (sd);
766	}
767	return (NULL);
768}
769
770struct g_consumer *
771g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
772{
773	struct g_consumer *cp;
774	struct g_provider *pp;
775
776	g_topology_assert();
777
778	if (strncmp(name, _PATH_DEV, 5) == 0)
779		name += 5;
780	pp = g_provider_by_name(name);
781	if (pp == NULL)
782		return (NULL);
783	cp = g_new_consumer(sc->sc_geom);
784	cp->flags |= G_CF_DIRECT_RECEIVE;
785	if (g_attach(cp, pp) != 0) {
786		g_destroy_consumer(cp);
787		return (NULL);
788	}
789	if (g_access(cp, 1, 1, 1) != 0) {
790		g_detach(cp);
791		g_destroy_consumer(cp);
792		return (NULL);
793	}
794	return (cp);
795}
796
797static u_int
798g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
799{
800	struct bio *bp;
801	u_int nreqs = 0;
802
803	mtx_lock(&sc->sc_queue_mtx);
804	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
805		if (bp->bio_from == cp)
806			nreqs++;
807	}
808	mtx_unlock(&sc->sc_queue_mtx);
809	return (nreqs);
810}
811
812u_int
813g_raid_nopens(struct g_raid_softc *sc)
814{
815	struct g_raid_volume *vol;
816	u_int opens;
817
818	opens = 0;
819	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
820		if (vol->v_provider_open != 0)
821			opens++;
822	}
823	return (opens);
824}
825
826static int
827g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
828{
829
830	if (cp->index > 0) {
831		G_RAID_DEBUG1(2, sc,
832		    "I/O requests for %s exist, can't destroy it now.",
833		    cp->provider->name);
834		return (1);
835	}
836	if (g_raid_nrequests(sc, cp) > 0) {
837		G_RAID_DEBUG1(2, sc,
838		    "I/O requests for %s in queue, can't destroy it now.",
839		    cp->provider->name);
840		return (1);
841	}
842	return (0);
843}
844
845static void
846g_raid_destroy_consumer(void *arg, int flags __unused)
847{
848	struct g_consumer *cp;
849
850	g_topology_assert();
851
852	cp = arg;
853	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
854	g_detach(cp);
855	g_destroy_consumer(cp);
856}
857
858void
859g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
860{
861	struct g_provider *pp;
862	int retaste_wait;
863
864	g_topology_assert_not();
865
866	g_topology_lock();
867	cp->private = NULL;
868	if (g_raid_consumer_is_busy(sc, cp))
869		goto out;
870	pp = cp->provider;
871	retaste_wait = 0;
872	if (cp->acw == 1) {
873		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
874			retaste_wait = 1;
875	}
876	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
877		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
878	if (retaste_wait) {
879		/*
880		 * After retaste event was send (inside g_access()), we can send
881		 * event to detach and destroy consumer.
882		 * A class, which has consumer to the given provider connected
883		 * will not receive retaste event for the provider.
884		 * This is the way how I ignore retaste events when I close
885		 * consumers opened for write: I detach and destroy consumer
886		 * after retaste event is sent.
887		 */
888		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
889		goto out;
890	}
891	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
892	g_detach(cp);
893	g_destroy_consumer(cp);
894out:
895	g_topology_unlock();
896}
897
898static void
899g_raid_orphan(struct g_consumer *cp)
900{
901	struct g_raid_disk *disk;
902
903	g_topology_assert();
904
905	disk = cp->private;
906	if (disk == NULL)
907		return;
908	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
909	    G_RAID_EVENT_DISK);
910}
911
912static void
913g_raid_clean(struct g_raid_volume *vol, int acw)
914{
915	struct g_raid_softc *sc;
916	int timeout;
917
918	sc = vol->v_softc;
919	g_topology_assert_not();
920	sx_assert(&sc->sc_lock, SX_XLOCKED);
921
922//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
923//		return;
924	if (!vol->v_dirty)
925		return;
926	if (vol->v_writes > 0)
927		return;
928	if (acw > 0 || (acw == -1 &&
929	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
930		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
931		if (!g_raid_shutdown && timeout > 0)
932			return;
933	}
934	vol->v_dirty = 0;
935	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
936	    vol->v_name);
937	g_raid_write_metadata(sc, vol, NULL, NULL);
938}
939
940static void
941g_raid_dirty(struct g_raid_volume *vol)
942{
943	struct g_raid_softc *sc;
944
945	sc = vol->v_softc;
946	g_topology_assert_not();
947	sx_assert(&sc->sc_lock, SX_XLOCKED);
948
949//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
950//		return;
951	vol->v_dirty = 1;
952	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
953	    vol->v_name);
954	g_raid_write_metadata(sc, vol, NULL, NULL);
955}
956
957void
958g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
959{
960	struct g_raid_volume *vol;
961	struct g_raid_subdisk *sd;
962	struct bio_queue_head queue;
963	struct bio *cbp;
964	int i;
965
966	vol = tr->tro_volume;
967
968	/*
969	 * Allocate all bios before sending any request, so we can return
970	 * ENOMEM in nice and clean way.
971	 */
972	bioq_init(&queue);
973	for (i = 0; i < vol->v_disks_count; i++) {
974		sd = &vol->v_subdisks[i];
975		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
976		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
977			continue;
978		cbp = g_clone_bio(bp);
979		if (cbp == NULL)
980			goto failure;
981		cbp->bio_caller1 = sd;
982		bioq_insert_tail(&queue, cbp);
983	}
984	while ((cbp = bioq_takefirst(&queue)) != NULL) {
985		sd = cbp->bio_caller1;
986		cbp->bio_caller1 = NULL;
987		g_raid_subdisk_iostart(sd, cbp);
988	}
989	return;
990failure:
991	while ((cbp = bioq_takefirst(&queue)) != NULL)
992		g_destroy_bio(cbp);
993	if (bp->bio_error == 0)
994		bp->bio_error = ENOMEM;
995	g_raid_iodone(bp, bp->bio_error);
996}
997
998static void
999g_raid_tr_kerneldump_common_done(struct bio *bp)
1000{
1001
1002	bp->bio_flags |= BIO_DONE;
1003}
1004
1005int
1006g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
1007    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1008{
1009	struct g_raid_softc *sc;
1010	struct g_raid_volume *vol;
1011	struct bio bp;
1012
1013	vol = tr->tro_volume;
1014	sc = vol->v_softc;
1015
1016	g_reset_bio(&bp);
1017	bp.bio_cmd = BIO_WRITE;
1018	bp.bio_done = g_raid_tr_kerneldump_common_done;
1019	bp.bio_attribute = NULL;
1020	bp.bio_offset = offset;
1021	bp.bio_length = length;
1022	bp.bio_data = virtual;
1023	bp.bio_to = vol->v_provider;
1024
1025	g_raid_start(&bp);
1026	while (!(bp.bio_flags & BIO_DONE)) {
1027		G_RAID_DEBUG1(4, sc, "Poll...");
1028		g_raid_poll(sc);
1029		DELAY(10);
1030	}
1031
1032	return (bp.bio_error != 0 ? EIO : 0);
1033}
1034
1035static int
1036g_raid_dump(void *arg, void *virtual, off_t offset, size_t length)
1037{
1038	struct g_raid_volume *vol;
1039	int error;
1040
1041	vol = (struct g_raid_volume *)arg;
1042	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
1043	    (long long unsigned)offset, (long long unsigned)length);
1044
1045	error = G_RAID_TR_KERNELDUMP(vol->v_tr, virtual, offset, length);
1046	return (error);
1047}
1048
1049static void
1050g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
1051{
1052	struct g_kerneldump *gkd;
1053	struct g_provider *pp;
1054	struct g_raid_volume *vol;
1055
1056	gkd = (struct g_kerneldump*)bp->bio_data;
1057	pp = bp->bio_to;
1058	vol = pp->private;
1059	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
1060		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
1061	gkd->di.dumper = g_raid_dump;
1062	gkd->di.priv = vol;
1063	gkd->di.blocksize = vol->v_sectorsize;
1064	gkd->di.maxiosize = DFLTPHYS;
1065	gkd->di.mediaoffset = gkd->offset;
1066	if ((gkd->offset + gkd->length) > vol->v_mediasize)
1067		gkd->length = vol->v_mediasize - gkd->offset;
1068	gkd->di.mediasize = gkd->length;
1069	g_io_deliver(bp, 0);
1070}
1071
1072static void
1073g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
1074{
1075	struct g_provider *pp;
1076	struct g_raid_volume *vol;
1077	struct g_raid_subdisk *sd;
1078	int i, val;
1079
1080	pp = bp->bio_to;
1081	vol = pp->private;
1082	for (i = 0; i < vol->v_disks_count; i++) {
1083		sd = &vol->v_subdisks[i];
1084		if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1085			continue;
1086		if (sd->sd_disk->d_candelete)
1087			break;
1088	}
1089	val = i < vol->v_disks_count;
1090	g_handleattr(bp, "GEOM::candelete", &val, sizeof(val));
1091}
1092
1093static void
1094g_raid_start(struct bio *bp)
1095{
1096	struct g_raid_softc *sc;
1097
1098	sc = bp->bio_to->geom->softc;
1099	/*
1100	 * If sc == NULL or there are no valid disks, provider's error
1101	 * should be set and g_raid_start() should not be called at all.
1102	 */
1103//	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
1104//	    ("Provider's error should be set (error=%d)(mirror=%s).",
1105//	    bp->bio_to->error, bp->bio_to->name));
1106	G_RAID_LOGREQ(3, bp, "Request received.");
1107
1108	switch (bp->bio_cmd) {
1109	case BIO_READ:
1110	case BIO_WRITE:
1111	case BIO_DELETE:
1112	case BIO_FLUSH:
1113	case BIO_SPEEDUP:
1114		break;
1115	case BIO_GETATTR:
1116		if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
1117			g_raid_candelete(sc, bp);
1118		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
1119			g_raid_kerneldump(sc, bp);
1120		else
1121			g_io_deliver(bp, EOPNOTSUPP);
1122		return;
1123	default:
1124		g_io_deliver(bp, EOPNOTSUPP);
1125		return;
1126	}
1127	mtx_lock(&sc->sc_queue_mtx);
1128	bioq_insert_tail(&sc->sc_queue, bp);
1129	mtx_unlock(&sc->sc_queue_mtx);
1130	if (!dumping) {
1131		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
1132		wakeup(sc);
1133	}
1134}
1135
1136static int
1137g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
1138{
1139	/*
1140	 * 5 cases:
1141	 * (1) bp entirely below NO
1142	 * (2) bp entirely above NO
1143	 * (3) bp start below, but end in range YES
1144	 * (4) bp entirely within YES
1145	 * (5) bp starts within, ends above YES
1146	 *
1147	 * lock range 10-19 (offset 10 length 10)
1148	 * (1) 1-5: first if kicks it out
1149	 * (2) 30-35: second if kicks it out
1150	 * (3) 5-15: passes both ifs
1151	 * (4) 12-14: passes both ifs
1152	 * (5) 19-20: passes both
1153	 */
1154	off_t lend = lstart + len - 1;
1155	off_t bstart = bp->bio_offset;
1156	off_t bend = bp->bio_offset + bp->bio_length - 1;
1157
1158	if (bend < lstart)
1159		return (0);
1160	if (lend < bstart)
1161		return (0);
1162	return (1);
1163}
1164
1165static int
1166g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
1167{
1168	struct g_raid_lock *lp;
1169
1170	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
1171
1172	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1173		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
1174			return (1);
1175	}
1176	return (0);
1177}
1178
1179static void
1180g_raid_start_request(struct bio *bp)
1181{
1182	struct g_raid_softc *sc __diagused;
1183	struct g_raid_volume *vol;
1184
1185	sc = bp->bio_to->geom->softc;
1186	sx_assert(&sc->sc_lock, SX_LOCKED);
1187	vol = bp->bio_to->private;
1188
1189	/*
1190	 * Check to see if this item is in a locked range.  If so,
1191	 * queue it to our locked queue and return.  We'll requeue
1192	 * it when the range is unlocked.  Internal I/O for the
1193	 * rebuild/rescan/recovery process is excluded from this
1194	 * check so we can actually do the recovery.
1195	 */
1196	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
1197	    g_raid_is_in_locked_range(vol, bp)) {
1198		G_RAID_LOGREQ(3, bp, "Defer request.");
1199		bioq_insert_tail(&vol->v_locked, bp);
1200		return;
1201	}
1202
1203	/*
1204	 * If we're actually going to do the write/delete, then
1205	 * update the idle stats for the volume.
1206	 */
1207	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1208		if (!vol->v_dirty)
1209			g_raid_dirty(vol);
1210		vol->v_writes++;
1211	}
1212
1213	/*
1214	 * Put request onto inflight queue, so we can check if new
1215	 * synchronization requests don't collide with it.  Then tell
1216	 * the transformation layer to start the I/O.
1217	 */
1218	bioq_insert_tail(&vol->v_inflight, bp);
1219	G_RAID_LOGREQ(4, bp, "Request started");
1220	G_RAID_TR_IOSTART(vol->v_tr, bp);
1221}
1222
1223static void
1224g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
1225{
1226	off_t off, len;
1227	struct bio *nbp;
1228	struct g_raid_lock *lp;
1229
1230	vol->v_pending_lock = 0;
1231	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1232		if (lp->l_pending) {
1233			off = lp->l_offset;
1234			len = lp->l_length;
1235			lp->l_pending = 0;
1236			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
1237				if (g_raid_bio_overlaps(nbp, off, len))
1238					lp->l_pending++;
1239			}
1240			if (lp->l_pending) {
1241				vol->v_pending_lock = 1;
1242				G_RAID_DEBUG1(4, vol->v_softc,
1243				    "Deferred lock(%jd, %jd) has %d pending",
1244				    (intmax_t)off, (intmax_t)(off + len),
1245				    lp->l_pending);
1246				continue;
1247			}
1248			G_RAID_DEBUG1(4, vol->v_softc,
1249			    "Deferred lock of %jd to %jd completed",
1250			    (intmax_t)off, (intmax_t)(off + len));
1251			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1252		}
1253	}
1254}
1255
1256void
1257g_raid_iodone(struct bio *bp, int error)
1258{
1259	struct g_raid_softc *sc __diagused;
1260	struct g_raid_volume *vol;
1261
1262	sc = bp->bio_to->geom->softc;
1263	sx_assert(&sc->sc_lock, SX_LOCKED);
1264	vol = bp->bio_to->private;
1265	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
1266
1267	/* Update stats if we done write/delete. */
1268	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1269		vol->v_writes--;
1270		vol->v_last_write = time_uptime;
1271	}
1272
1273	bioq_remove(&vol->v_inflight, bp);
1274	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
1275		g_raid_finish_with_locked_ranges(vol, bp);
1276	getmicrouptime(&vol->v_last_done);
1277	g_io_deliver(bp, error);
1278}
1279
1280int
1281g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
1282    struct bio *ignore, void *argp)
1283{
1284	struct g_raid_softc *sc;
1285	struct g_raid_lock *lp;
1286	struct bio *bp;
1287
1288	sc = vol->v_softc;
1289	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
1290	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
1291	lp->l_offset = off;
1292	lp->l_length = len;
1293	lp->l_callback_arg = argp;
1294
1295	lp->l_pending = 0;
1296	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
1297		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
1298			lp->l_pending++;
1299	}
1300
1301	/*
1302	 * If there are any writes that are pending, we return EBUSY.  All
1303	 * callers will have to wait until all pending writes clear.
1304	 */
1305	if (lp->l_pending > 0) {
1306		vol->v_pending_lock = 1;
1307		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
1308		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
1309		return (EBUSY);
1310	}
1311	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
1312	    (intmax_t)off, (intmax_t)(off+len));
1313	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1314	return (0);
1315}
1316
1317int
1318g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
1319{
1320	struct g_raid_lock *lp;
1321	struct g_raid_softc *sc;
1322	struct bio *bp;
1323
1324	sc = vol->v_softc;
1325	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1326		if (lp->l_offset == off && lp->l_length == len) {
1327			LIST_REMOVE(lp, l_next);
1328			/* XXX
1329			 * Right now we just put them all back on the queue
1330			 * and hope for the best.  We hope this because any
1331			 * locked ranges will go right back on this list
1332			 * when the worker thread runs.
1333			 * XXX
1334			 */
1335			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
1336			    (intmax_t)lp->l_offset,
1337			    (intmax_t)(lp->l_offset+lp->l_length));
1338			mtx_lock(&sc->sc_queue_mtx);
1339			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
1340				bioq_insert_tail(&sc->sc_queue, bp);
1341			mtx_unlock(&sc->sc_queue_mtx);
1342			free(lp, M_RAID);
1343			return (0);
1344		}
1345	}
1346	return (EINVAL);
1347}
1348
1349void
1350g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
1351{
1352	struct g_consumer *cp;
1353	struct g_raid_disk *disk, *tdisk;
1354
1355	bp->bio_caller1 = sd;
1356
1357	/*
1358	 * Make sure that the disk is present. Generally it is a task of
1359	 * transformation layers to not send requests to absent disks, but
1360	 * it is better to be safe and report situation then sorry.
1361	 */
1362	if (sd->sd_disk == NULL) {
1363		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
1364nodisk:
1365		bp->bio_from = NULL;
1366		bp->bio_to = NULL;
1367		bp->bio_error = ENXIO;
1368		g_raid_disk_done(bp);
1369		return;
1370	}
1371	disk = sd->sd_disk;
1372	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
1373	    disk->d_state != G_RAID_DISK_S_FAILED) {
1374		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
1375		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
1376		goto nodisk;
1377	}
1378
1379	cp = disk->d_consumer;
1380	bp->bio_from = cp;
1381	bp->bio_to = cp->provider;
1382	cp->index++;
1383
1384	/* Update average disks load. */
1385	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
1386		if (tdisk->d_consumer == NULL)
1387			tdisk->d_load = 0;
1388		else
1389			tdisk->d_load = (tdisk->d_consumer->index *
1390			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
1391	}
1392
1393	disk->d_last_offset = bp->bio_offset + bp->bio_length;
1394	if (dumping) {
1395		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
1396		if (bp->bio_cmd == BIO_WRITE) {
1397			bp->bio_error = g_raid_subdisk_kerneldump(sd,
1398			    bp->bio_data, bp->bio_offset, bp->bio_length);
1399		} else
1400			bp->bio_error = EOPNOTSUPP;
1401		g_raid_disk_done(bp);
1402	} else {
1403		bp->bio_done = g_raid_disk_done;
1404		bp->bio_offset += sd->sd_offset;
1405		G_RAID_LOGREQ(3, bp, "Sending request.");
1406		g_io_request(bp, cp);
1407	}
1408}
1409
1410int
1411g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual,
1412    off_t offset, size_t length)
1413{
1414
1415	if (sd->sd_disk == NULL)
1416		return (ENXIO);
1417	if (sd->sd_disk->d_kd.di.dumper == NULL)
1418		return (EOPNOTSUPP);
1419	return (dump_write(&sd->sd_disk->d_kd.di, virtual,
1420	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, length));
1421}
1422
1423static void
1424g_raid_disk_done(struct bio *bp)
1425{
1426	struct g_raid_softc *sc;
1427	struct g_raid_subdisk *sd;
1428
1429	sd = bp->bio_caller1;
1430	sc = sd->sd_softc;
1431	mtx_lock(&sc->sc_queue_mtx);
1432	bioq_insert_tail(&sc->sc_queue, bp);
1433	mtx_unlock(&sc->sc_queue_mtx);
1434	if (!dumping)
1435		wakeup(sc);
1436}
1437
1438static void
1439g_raid_disk_done_request(struct bio *bp)
1440{
1441	struct g_raid_softc *sc;
1442	struct g_raid_disk *disk;
1443	struct g_raid_subdisk *sd;
1444	struct g_raid_volume *vol;
1445
1446	g_topology_assert_not();
1447
1448	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
1449	sd = bp->bio_caller1;
1450	sc = sd->sd_softc;
1451	vol = sd->sd_volume;
1452	if (bp->bio_from != NULL) {
1453		bp->bio_from->index--;
1454		disk = bp->bio_from->private;
1455		if (disk == NULL)
1456			g_raid_kill_consumer(sc, bp->bio_from);
1457	}
1458	bp->bio_offset -= sd->sd_offset;
1459
1460	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
1461}
1462
1463static void
1464g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
1465{
1466
1467	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
1468		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
1469	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
1470		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
1471	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
1472		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
1473	else
1474		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
1475	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
1476		KASSERT(ep->e_error == 0,
1477		    ("Error cannot be handled."));
1478		g_raid_event_free(ep);
1479	} else {
1480		ep->e_flags |= G_RAID_EVENT_DONE;
1481		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
1482		mtx_lock(&sc->sc_queue_mtx);
1483		wakeup(ep);
1484		mtx_unlock(&sc->sc_queue_mtx);
1485	}
1486}
1487
1488/*
1489 * Worker thread.
1490 */
1491static void
1492g_raid_worker(void *arg)
1493{
1494	struct g_raid_softc *sc;
1495	struct g_raid_event *ep;
1496	struct g_raid_volume *vol;
1497	struct bio *bp;
1498	struct timeval now, t;
1499	int timeout, rv;
1500
1501	sc = arg;
1502	thread_lock(curthread);
1503	sched_prio(curthread, PRIBIO);
1504	thread_unlock(curthread);
1505
1506	sx_xlock(&sc->sc_lock);
1507	for (;;) {
1508		mtx_lock(&sc->sc_queue_mtx);
1509		/*
1510		 * First take a look at events.
1511		 * This is important to handle events before any I/O requests.
1512		 */
1513		bp = NULL;
1514		vol = NULL;
1515		rv = 0;
1516		ep = TAILQ_FIRST(&sc->sc_events);
1517		if (ep != NULL)
1518			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1519		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
1520			;
1521		else {
1522			getmicrouptime(&now);
1523			t = now;
1524			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1525				if (bioq_first(&vol->v_inflight) == NULL &&
1526				    vol->v_tr &&
1527				    timevalcmp(&vol->v_last_done, &t, < ))
1528					t = vol->v_last_done;
1529			}
1530			timevalsub(&t, &now);
1531			timeout = g_raid_idle_threshold +
1532			    t.tv_sec * 1000000 + t.tv_usec;
1533			if (timeout > 0) {
1534				/*
1535				 * Two steps to avoid overflows at HZ=1000
1536				 * and idle timeouts > 2.1s.  Some rounding
1537				 * errors can occur, but they are < 1tick,
1538				 * which is deemed to be close enough for
1539				 * this purpose.
1540				 */
1541				int micpertic = 1000000 / hz;
1542				timeout = (timeout + micpertic - 1) / micpertic;
1543				sx_xunlock(&sc->sc_lock);
1544				MSLEEP(rv, sc, &sc->sc_queue_mtx,
1545				    PRIBIO | PDROP, "-", timeout);
1546				sx_xlock(&sc->sc_lock);
1547				goto process;
1548			} else
1549				rv = EWOULDBLOCK;
1550		}
1551		mtx_unlock(&sc->sc_queue_mtx);
1552process:
1553		if (ep != NULL) {
1554			g_raid_handle_event(sc, ep);
1555		} else if (bp != NULL) {
1556			if (bp->bio_to != NULL &&
1557			    bp->bio_to->geom == sc->sc_geom)
1558				g_raid_start_request(bp);
1559			else
1560				g_raid_disk_done_request(bp);
1561		} else if (rv == EWOULDBLOCK) {
1562			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1563				g_raid_clean(vol, -1);
1564				if (bioq_first(&vol->v_inflight) == NULL &&
1565				    vol->v_tr) {
1566					t.tv_sec = g_raid_idle_threshold / 1000000;
1567					t.tv_usec = g_raid_idle_threshold % 1000000;
1568					timevaladd(&t, &vol->v_last_done);
1569					getmicrouptime(&now);
1570					if (timevalcmp(&t, &now, <= )) {
1571						G_RAID_TR_IDLE(vol->v_tr);
1572						vol->v_last_done = now;
1573					}
1574				}
1575			}
1576		}
1577		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1578			g_raid_destroy_node(sc, 1);	/* May not return. */
1579	}
1580}
1581
1582static void
1583g_raid_poll(struct g_raid_softc *sc)
1584{
1585	struct g_raid_event *ep;
1586	struct bio *bp;
1587
1588	sx_xlock(&sc->sc_lock);
1589	mtx_lock(&sc->sc_queue_mtx);
1590	/*
1591	 * First take a look at events.
1592	 * This is important to handle events before any I/O requests.
1593	 */
1594	ep = TAILQ_FIRST(&sc->sc_events);
1595	if (ep != NULL) {
1596		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1597		mtx_unlock(&sc->sc_queue_mtx);
1598		g_raid_handle_event(sc, ep);
1599		goto out;
1600	}
1601	bp = bioq_takefirst(&sc->sc_queue);
1602	if (bp != NULL) {
1603		mtx_unlock(&sc->sc_queue_mtx);
1604		if (bp->bio_from == NULL ||
1605		    bp->bio_from->geom != sc->sc_geom)
1606			g_raid_start_request(bp);
1607		else
1608			g_raid_disk_done_request(bp);
1609	}
1610out:
1611	sx_xunlock(&sc->sc_lock);
1612}
1613
1614static void
1615g_raid_launch_provider(struct g_raid_volume *vol)
1616{
1617	struct g_raid_disk *disk;
1618	struct g_raid_subdisk *sd;
1619	struct g_raid_softc *sc;
1620	struct g_provider *pp;
1621	char name[G_RAID_MAX_VOLUMENAME];
1622	off_t off;
1623	int i;
1624
1625	sc = vol->v_softc;
1626	sx_assert(&sc->sc_lock, SX_LOCKED);
1627
1628	g_topology_lock();
1629	/* Try to name provider with volume name. */
1630	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
1631	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
1632	    g_provider_by_name(name) != NULL) {
1633		/* Otherwise use sequential volume number. */
1634		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
1635	}
1636
1637	pp = g_new_providerf(sc->sc_geom, "%s", name);
1638	pp->flags |= G_PF_DIRECT_RECEIVE;
1639	if (vol->v_tr->tro_class->trc_accept_unmapped) {
1640		pp->flags |= G_PF_ACCEPT_UNMAPPED;
1641		for (i = 0; i < vol->v_disks_count; i++) {
1642			sd = &vol->v_subdisks[i];
1643			if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1644				continue;
1645			if ((sd->sd_disk->d_consumer->provider->flags &
1646			    G_PF_ACCEPT_UNMAPPED) == 0)
1647				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
1648		}
1649	}
1650	pp->private = vol;
1651	pp->mediasize = vol->v_mediasize;
1652	pp->sectorsize = vol->v_sectorsize;
1653	pp->stripesize = 0;
1654	pp->stripeoffset = 0;
1655	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1656	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1657	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
1658	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
1659		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
1660		    disk->d_consumer != NULL &&
1661		    disk->d_consumer->provider != NULL) {
1662			pp->stripesize = disk->d_consumer->provider->stripesize;
1663			off = disk->d_consumer->provider->stripeoffset;
1664			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
1665			if (off > 0)
1666				pp->stripeoffset %= off;
1667		}
1668		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
1669			pp->stripesize *= (vol->v_disks_count - 1);
1670			pp->stripeoffset *= (vol->v_disks_count - 1);
1671		}
1672	} else
1673		pp->stripesize = vol->v_strip_size;
1674	vol->v_provider = pp;
1675	g_error_provider(pp, 0);
1676	g_topology_unlock();
1677	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
1678	    pp->name, vol->v_name);
1679}
1680
1681static void
1682g_raid_destroy_provider(struct g_raid_volume *vol)
1683{
1684	struct g_raid_softc *sc;
1685	struct g_provider *pp;
1686	struct bio *bp, *tmp;
1687
1688	g_topology_assert_not();
1689	sc = vol->v_softc;
1690	pp = vol->v_provider;
1691	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
1692
1693	g_topology_lock();
1694	g_error_provider(pp, ENXIO);
1695	mtx_lock(&sc->sc_queue_mtx);
1696	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
1697		if (bp->bio_to != pp)
1698			continue;
1699		bioq_remove(&sc->sc_queue, bp);
1700		g_io_deliver(bp, ENXIO);
1701	}
1702	mtx_unlock(&sc->sc_queue_mtx);
1703	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
1704	    pp->name, vol->v_name);
1705	g_wither_provider(pp, ENXIO);
1706	g_topology_unlock();
1707	vol->v_provider = NULL;
1708}
1709
1710/*
1711 * Update device state.
1712 */
1713static int
1714g_raid_update_volume(struct g_raid_volume *vol, u_int event)
1715{
1716	struct g_raid_softc *sc;
1717
1718	sc = vol->v_softc;
1719	sx_assert(&sc->sc_lock, SX_XLOCKED);
1720
1721	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
1722	    g_raid_volume_event2str(event),
1723	    vol->v_name);
1724	switch (event) {
1725	case G_RAID_VOLUME_E_DOWN:
1726		if (vol->v_provider != NULL)
1727			g_raid_destroy_provider(vol);
1728		break;
1729	case G_RAID_VOLUME_E_UP:
1730		if (vol->v_provider == NULL)
1731			g_raid_launch_provider(vol);
1732		break;
1733	case G_RAID_VOLUME_E_START:
1734		if (vol->v_tr)
1735			G_RAID_TR_START(vol->v_tr);
1736		return (0);
1737	default:
1738		if (sc->sc_md)
1739			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
1740		return (0);
1741	}
1742
1743	/* Manage root mount release. */
1744	if (vol->v_starting) {
1745		vol->v_starting = 0;
1746		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
1747		root_mount_rel(vol->v_rootmount);
1748		vol->v_rootmount = NULL;
1749	}
1750	if (vol->v_stopping && vol->v_provider_open == 0)
1751		g_raid_destroy_volume(vol);
1752	return (0);
1753}
1754
1755/*
1756 * Update subdisk state.
1757 */
1758static int
1759g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
1760{
1761	struct g_raid_softc *sc;
1762	struct g_raid_volume *vol;
1763
1764	sc = sd->sd_softc;
1765	vol = sd->sd_volume;
1766	sx_assert(&sc->sc_lock, SX_XLOCKED);
1767
1768	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
1769	    g_raid_subdisk_event2str(event),
1770	    vol->v_name, sd->sd_pos,
1771	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
1772	if (vol->v_tr)
1773		G_RAID_TR_EVENT(vol->v_tr, sd, event);
1774
1775	return (0);
1776}
1777
1778/*
1779 * Update disk state.
1780 */
1781static int
1782g_raid_update_disk(struct g_raid_disk *disk, u_int event)
1783{
1784	struct g_raid_softc *sc;
1785
1786	sc = disk->d_softc;
1787	sx_assert(&sc->sc_lock, SX_XLOCKED);
1788
1789	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
1790	    g_raid_disk_event2str(event),
1791	    g_raid_get_diskname(disk));
1792
1793	if (sc->sc_md)
1794		G_RAID_MD_EVENT(sc->sc_md, disk, event);
1795	return (0);
1796}
1797
1798/*
1799 * Node event.
1800 */
1801static int
1802g_raid_update_node(struct g_raid_softc *sc, u_int event)
1803{
1804	sx_assert(&sc->sc_lock, SX_XLOCKED);
1805
1806	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
1807	    g_raid_node_event2str(event));
1808
1809	if (event == G_RAID_NODE_E_WAKE)
1810		return (0);
1811	if (sc->sc_md)
1812		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
1813	return (0);
1814}
1815
1816static int
1817g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
1818{
1819	struct g_raid_volume *vol;
1820	struct g_raid_softc *sc;
1821	int dcw, opens, error = 0;
1822
1823	g_topology_assert();
1824	sc = pp->geom->softc;
1825	vol = pp->private;
1826	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
1827	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
1828
1829	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
1830	    acr, acw, ace);
1831	dcw = pp->acw + acw;
1832
1833	g_topology_unlock();
1834	sx_xlock(&sc->sc_lock);
1835	/* Deny new opens while dying. */
1836	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
1837		error = ENXIO;
1838		goto out;
1839	}
1840	/* Deny write opens for read-only volumes. */
1841	if (vol->v_read_only && acw > 0) {
1842		error = EROFS;
1843		goto out;
1844	}
1845	if (dcw == 0)
1846		g_raid_clean(vol, dcw);
1847	vol->v_provider_open += acr + acw + ace;
1848	/* Handle delayed node destruction. */
1849	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
1850	    vol->v_provider_open == 0) {
1851		/* Count open volumes. */
1852		opens = g_raid_nopens(sc);
1853		if (opens == 0) {
1854			sc->sc_stopping = G_RAID_DESTROY_HARD;
1855			/* Wake up worker to make it selfdestruct. */
1856			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1857		}
1858	}
1859	/* Handle open volume destruction. */
1860	if (vol->v_stopping && vol->v_provider_open == 0)
1861		g_raid_destroy_volume(vol);
1862out:
1863	sx_xunlock(&sc->sc_lock);
1864	g_topology_lock();
1865	return (error);
1866}
1867
1868struct g_raid_softc *
1869g_raid_create_node(struct g_class *mp,
1870    const char *name, struct g_raid_md_object *md)
1871{
1872	struct g_raid_softc *sc;
1873	struct g_geom *gp;
1874	int error;
1875
1876	g_topology_assert();
1877	G_RAID_DEBUG(1, "Creating array %s.", name);
1878
1879	gp = g_new_geomf(mp, "%s", name);
1880	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
1881	gp->start = g_raid_start;
1882	gp->orphan = g_raid_orphan;
1883	gp->access = g_raid_access;
1884	gp->dumpconf = g_raid_dumpconf;
1885
1886	sc->sc_md = md;
1887	sc->sc_geom = gp;
1888	sc->sc_flags = 0;
1889	TAILQ_INIT(&sc->sc_volumes);
1890	TAILQ_INIT(&sc->sc_disks);
1891	sx_init(&sc->sc_lock, "graid:lock");
1892	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
1893	TAILQ_INIT(&sc->sc_events);
1894	bioq_init(&sc->sc_queue);
1895	gp->softc = sc;
1896	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
1897	    "g_raid %s", name);
1898	if (error != 0) {
1899		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
1900		mtx_destroy(&sc->sc_queue_mtx);
1901		sx_destroy(&sc->sc_lock);
1902		g_destroy_geom(sc->sc_geom);
1903		free(sc, M_RAID);
1904		return (NULL);
1905	}
1906
1907	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
1908	return (sc);
1909}
1910
1911struct g_raid_volume *
1912g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
1913{
1914	struct g_raid_volume	*vol, *vol1;
1915	int i;
1916
1917	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
1918	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
1919	vol->v_softc = sc;
1920	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
1921	vol->v_state = G_RAID_VOLUME_S_STARTING;
1922	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1923	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
1924	vol->v_rotate_parity = 1;
1925	bioq_init(&vol->v_inflight);
1926	bioq_init(&vol->v_locked);
1927	LIST_INIT(&vol->v_locks);
1928	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
1929		vol->v_subdisks[i].sd_softc = sc;
1930		vol->v_subdisks[i].sd_volume = vol;
1931		vol->v_subdisks[i].sd_pos = i;
1932		vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
1933	}
1934
1935	/* Find free ID for this volume. */
1936	g_topology_lock();
1937	vol1 = vol;
1938	if (id >= 0) {
1939		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1940			if (vol1->v_global_id == id)
1941				break;
1942		}
1943	}
1944	if (vol1 != NULL) {
1945		for (id = 0; ; id++) {
1946			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1947				if (vol1->v_global_id == id)
1948					break;
1949			}
1950			if (vol1 == NULL)
1951				break;
1952		}
1953	}
1954	vol->v_global_id = id;
1955	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
1956	g_topology_unlock();
1957
1958	/* Delay root mounting. */
1959	vol->v_rootmount = root_mount_hold("GRAID");
1960	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
1961	vol->v_starting = 1;
1962	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
1963	return (vol);
1964}
1965
1966struct g_raid_disk *
1967g_raid_create_disk(struct g_raid_softc *sc)
1968{
1969	struct g_raid_disk	*disk;
1970
1971	G_RAID_DEBUG1(1, sc, "Creating disk.");
1972	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
1973	disk->d_softc = sc;
1974	disk->d_state = G_RAID_DISK_S_NONE;
1975	TAILQ_INIT(&disk->d_subdisks);
1976	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
1977	return (disk);
1978}
1979
1980int g_raid_start_volume(struct g_raid_volume *vol)
1981{
1982	struct g_raid_tr_class *class;
1983	struct g_raid_tr_object *obj;
1984	int status;
1985
1986	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
1987	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
1988		if (!class->trc_enable)
1989			continue;
1990		G_RAID_DEBUG1(2, vol->v_softc,
1991		    "Tasting volume %s for %s transformation.",
1992		    vol->v_name, class->name);
1993		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
1994		    M_WAITOK);
1995		obj->tro_class = class;
1996		obj->tro_volume = vol;
1997		status = G_RAID_TR_TASTE(obj, vol);
1998		if (status != G_RAID_TR_TASTE_FAIL)
1999			break;
2000		kobj_delete((kobj_t)obj, M_RAID);
2001	}
2002	if (class == NULL) {
2003		G_RAID_DEBUG1(0, vol->v_softc,
2004		    "No transformation module found for %s.",
2005		    vol->v_name);
2006		vol->v_tr = NULL;
2007		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
2008		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
2009		    G_RAID_EVENT_VOLUME);
2010		return (-1);
2011	}
2012	G_RAID_DEBUG1(2, vol->v_softc,
2013	    "Transformation module %s chosen for %s.",
2014	    class->name, vol->v_name);
2015	vol->v_tr = obj;
2016	return (0);
2017}
2018
2019int
2020g_raid_destroy_node(struct g_raid_softc *sc, int worker)
2021{
2022	struct g_raid_volume *vol, *tmpv;
2023	struct g_raid_disk *disk, *tmpd;
2024	int error = 0;
2025
2026	sc->sc_stopping = G_RAID_DESTROY_HARD;
2027	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
2028		if (g_raid_destroy_volume(vol))
2029			error = EBUSY;
2030	}
2031	if (error)
2032		return (error);
2033	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
2034		if (g_raid_destroy_disk(disk))
2035			error = EBUSY;
2036	}
2037	if (error)
2038		return (error);
2039	if (sc->sc_md) {
2040		G_RAID_MD_FREE(sc->sc_md);
2041		kobj_delete((kobj_t)sc->sc_md, M_RAID);
2042		sc->sc_md = NULL;
2043	}
2044	if (sc->sc_geom != NULL) {
2045		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
2046		g_topology_lock();
2047		sc->sc_geom->softc = NULL;
2048		g_wither_geom(sc->sc_geom, ENXIO);
2049		g_topology_unlock();
2050		sc->sc_geom = NULL;
2051	} else
2052		G_RAID_DEBUG(1, "Array destroyed.");
2053	if (worker) {
2054		g_raid_event_cancel(sc, sc);
2055		mtx_destroy(&sc->sc_queue_mtx);
2056		sx_xunlock(&sc->sc_lock);
2057		sx_destroy(&sc->sc_lock);
2058		wakeup(&sc->sc_stopping);
2059		free(sc, M_RAID);
2060		curthread->td_pflags &= ~TDP_GEOM;
2061		G_RAID_DEBUG(1, "Thread exiting.");
2062		kproc_exit(0);
2063	} else {
2064		/* Wake up worker to make it selfdestruct. */
2065		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2066	}
2067	return (0);
2068}
2069
2070int
2071g_raid_destroy_volume(struct g_raid_volume *vol)
2072{
2073	struct g_raid_softc *sc;
2074	struct g_raid_disk *disk;
2075	int i;
2076
2077	sc = vol->v_softc;
2078	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
2079	vol->v_stopping = 1;
2080	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
2081		if (vol->v_tr) {
2082			G_RAID_TR_STOP(vol->v_tr);
2083			return (EBUSY);
2084		} else
2085			vol->v_state = G_RAID_VOLUME_S_STOPPED;
2086	}
2087	if (g_raid_event_check(sc, vol) != 0)
2088		return (EBUSY);
2089	if (vol->v_provider != NULL)
2090		return (EBUSY);
2091	if (vol->v_provider_open != 0)
2092		return (EBUSY);
2093	if (vol->v_tr) {
2094		G_RAID_TR_FREE(vol->v_tr);
2095		kobj_delete((kobj_t)vol->v_tr, M_RAID);
2096		vol->v_tr = NULL;
2097	}
2098	if (vol->v_rootmount)
2099		root_mount_rel(vol->v_rootmount);
2100	g_topology_lock();
2101	LIST_REMOVE(vol, v_global_next);
2102	g_topology_unlock();
2103	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
2104	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
2105		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
2106		disk = vol->v_subdisks[i].sd_disk;
2107		if (disk == NULL)
2108			continue;
2109		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
2110	}
2111	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
2112	if (sc->sc_md)
2113		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
2114	g_raid_event_cancel(sc, vol);
2115	free(vol, M_RAID);
2116	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
2117		/* Wake up worker to let it selfdestruct. */
2118		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2119	}
2120	return (0);
2121}
2122
2123int
2124g_raid_destroy_disk(struct g_raid_disk *disk)
2125{
2126	struct g_raid_softc *sc;
2127	struct g_raid_subdisk *sd, *tmp;
2128
2129	sc = disk->d_softc;
2130	G_RAID_DEBUG1(2, sc, "Destroying disk.");
2131	if (disk->d_consumer) {
2132		g_raid_kill_consumer(sc, disk->d_consumer);
2133		disk->d_consumer = NULL;
2134	}
2135	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
2136		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
2137		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2138		    G_RAID_EVENT_SUBDISK);
2139		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
2140		sd->sd_disk = NULL;
2141	}
2142	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
2143	if (sc->sc_md)
2144		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
2145	g_raid_event_cancel(sc, disk);
2146	free(disk, M_RAID);
2147	return (0);
2148}
2149
2150int
2151g_raid_destroy(struct g_raid_softc *sc, int how)
2152{
2153	int error, opens;
2154
2155	g_topology_assert_not();
2156	if (sc == NULL)
2157		return (ENXIO);
2158	sx_assert(&sc->sc_lock, SX_XLOCKED);
2159
2160	/* Count open volumes. */
2161	opens = g_raid_nopens(sc);
2162
2163	/* React on some opened volumes. */
2164	if (opens > 0) {
2165		switch (how) {
2166		case G_RAID_DESTROY_SOFT:
2167			G_RAID_DEBUG1(1, sc,
2168			    "%d volumes are still open.",
2169			    opens);
2170			sx_xunlock(&sc->sc_lock);
2171			return (EBUSY);
2172		case G_RAID_DESTROY_DELAYED:
2173			G_RAID_DEBUG1(1, sc,
2174			    "Array will be destroyed on last close.");
2175			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
2176			sx_xunlock(&sc->sc_lock);
2177			return (EBUSY);
2178		case G_RAID_DESTROY_HARD:
2179			G_RAID_DEBUG1(1, sc,
2180			    "%d volumes are still open.",
2181			    opens);
2182		}
2183	}
2184
2185	/* Mark node for destruction. */
2186	sc->sc_stopping = G_RAID_DESTROY_HARD;
2187	/* Wake up worker to let it selfdestruct. */
2188	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2189	/* Sleep until node destroyed. */
2190	error = sx_sleep(&sc->sc_stopping, &sc->sc_lock,
2191	    PRIBIO | PDROP, "r:destroy", hz * 3);
2192	return (error == EWOULDBLOCK ? EBUSY : 0);
2193}
2194
2195static void
2196g_raid_taste_orphan(struct g_consumer *cp)
2197{
2198
2199	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2200	    cp->provider->name));
2201}
2202
2203static struct g_geom *
2204g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2205{
2206	struct g_consumer *cp;
2207	struct g_geom *gp, *geom;
2208	struct g_raid_md_class *class;
2209	struct g_raid_md_object *obj;
2210	int status;
2211
2212	g_topology_assert();
2213	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2214	if (!g_raid_enable)
2215		return (NULL);
2216	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
2217
2218	geom = NULL;
2219	status = G_RAID_MD_TASTE_FAIL;
2220	gp = g_new_geomf(mp, "raid:taste");
2221	/*
2222	 * This orphan function should be never called.
2223	 */
2224	gp->orphan = g_raid_taste_orphan;
2225	cp = g_new_consumer(gp);
2226	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
2227	if (g_attach(cp, pp) != 0)
2228		goto ofail2;
2229	if (g_access(cp, 1, 0, 0) != 0)
2230		goto ofail;
2231
2232	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2233		if (!class->mdc_enable)
2234			continue;
2235		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
2236		    pp->name, class->name);
2237		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2238		    M_WAITOK);
2239		obj->mdo_class = class;
2240		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
2241		if (status != G_RAID_MD_TASTE_NEW)
2242			kobj_delete((kobj_t)obj, M_RAID);
2243		if (status != G_RAID_MD_TASTE_FAIL)
2244			break;
2245	}
2246
2247	if (status == G_RAID_MD_TASTE_FAIL)
2248		(void)g_access(cp, -1, 0, 0);
2249ofail:
2250	g_detach(cp);
2251ofail2:
2252	g_destroy_consumer(cp);
2253	g_destroy_geom(gp);
2254	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
2255	return (geom);
2256}
2257
2258int
2259g_raid_create_node_format(const char *format, struct gctl_req *req,
2260    struct g_geom **gp)
2261{
2262	struct g_raid_md_class *class;
2263	struct g_raid_md_object *obj;
2264	int status;
2265
2266	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
2267	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2268		if (strcasecmp(class->name, format) == 0)
2269			break;
2270	}
2271	if (class == NULL) {
2272		G_RAID_DEBUG(1, "No support for %s metadata.", format);
2273		return (G_RAID_MD_TASTE_FAIL);
2274	}
2275	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2276	    M_WAITOK);
2277	obj->mdo_class = class;
2278	status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
2279	if (status != G_RAID_MD_TASTE_NEW)
2280		kobj_delete((kobj_t)obj, M_RAID);
2281	return (status);
2282}
2283
2284static int
2285g_raid_destroy_geom(struct gctl_req *req __unused,
2286    struct g_class *mp __unused, struct g_geom *gp)
2287{
2288	struct g_raid_softc *sc;
2289	int error;
2290
2291	g_topology_unlock();
2292	sc = gp->softc;
2293	sx_xlock(&sc->sc_lock);
2294	g_cancel_event(sc);
2295	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
2296	g_topology_lock();
2297	return (error);
2298}
2299
2300void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
2301    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2302{
2303
2304	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2305		return;
2306	if (sc->sc_md)
2307		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
2308}
2309
2310void g_raid_fail_disk(struct g_raid_softc *sc,
2311    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2312{
2313
2314	if (disk == NULL)
2315		disk = sd->sd_disk;
2316	if (disk == NULL) {
2317		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
2318		return;
2319	}
2320	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2321		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
2322		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
2323		return;
2324	}
2325	if (sc->sc_md)
2326		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
2327}
2328
2329static void
2330g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2331    struct g_consumer *cp, struct g_provider *pp)
2332{
2333	struct g_raid_softc *sc;
2334	struct g_raid_volume *vol;
2335	struct g_raid_subdisk *sd;
2336	struct g_raid_disk *disk;
2337	int i, s;
2338
2339	g_topology_assert();
2340
2341	sc = gp->softc;
2342	if (sc == NULL)
2343		return;
2344	if (pp != NULL) {
2345		vol = pp->private;
2346		g_topology_unlock();
2347		sx_xlock(&sc->sc_lock);
2348		sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent,
2349		    sc->sc_md->mdo_class->name,
2350		    g_raid_volume_level2str(vol->v_raid_level,
2351		    vol->v_raid_level_qualifier));
2352		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
2353		    vol->v_name);
2354		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
2355		    g_raid_volume_level2str(vol->v_raid_level,
2356		    vol->v_raid_level_qualifier));
2357		sbuf_printf(sb,
2358		    "%s<Transformation>%s</Transformation>\n", indent,
2359		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
2360		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
2361		    vol->v_disks_count);
2362		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
2363		    vol->v_strip_size);
2364		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2365		    g_raid_volume_state2str(vol->v_state));
2366		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
2367		    vol->v_dirty ? "Yes" : "No");
2368		sbuf_printf(sb, "%s<Subdisks>", indent);
2369		for (i = 0; i < vol->v_disks_count; i++) {
2370			sd = &vol->v_subdisks[i];
2371			if (sd->sd_disk != NULL &&
2372			    sd->sd_disk->d_consumer != NULL) {
2373				sbuf_printf(sb, "%s ",
2374				    g_raid_get_diskname(sd->sd_disk));
2375			} else {
2376				sbuf_cat(sb, "NONE ");
2377			}
2378			sbuf_printf(sb, "(%s",
2379			    g_raid_subdisk_state2str(sd->sd_state));
2380			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2381			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2382				sbuf_printf(sb, " %d%%",
2383				    (int)(sd->sd_rebuild_pos * 100 /
2384				     sd->sd_size));
2385			}
2386			sbuf_cat(sb, ")");
2387			if (i + 1 < vol->v_disks_count)
2388				sbuf_cat(sb, ", ");
2389		}
2390		sbuf_cat(sb, "</Subdisks>\n");
2391		sx_xunlock(&sc->sc_lock);
2392		g_topology_lock();
2393	} else if (cp != NULL) {
2394		disk = cp->private;
2395		if (disk == NULL)
2396			return;
2397		g_topology_unlock();
2398		sx_xlock(&sc->sc_lock);
2399		sbuf_printf(sb, "%s<State>%s", indent,
2400		    g_raid_disk_state2str(disk->d_state));
2401		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
2402			sbuf_cat(sb, " (");
2403			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2404				sbuf_printf(sb, "%s",
2405				    g_raid_subdisk_state2str(sd->sd_state));
2406				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2407				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2408					sbuf_printf(sb, " %d%%",
2409					    (int)(sd->sd_rebuild_pos * 100 /
2410					     sd->sd_size));
2411				}
2412				if (TAILQ_NEXT(sd, sd_next))
2413					sbuf_cat(sb, ", ");
2414			}
2415			sbuf_cat(sb, ")");
2416		}
2417		sbuf_cat(sb, "</State>\n");
2418		sbuf_printf(sb, "%s<Subdisks>", indent);
2419		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2420			sbuf_printf(sb, "r%d(%s):%d@%ju",
2421			    sd->sd_volume->v_global_id,
2422			    sd->sd_volume->v_name,
2423			    sd->sd_pos, (uintmax_t)sd->sd_offset);
2424			if (TAILQ_NEXT(sd, sd_next))
2425				sbuf_cat(sb, ", ");
2426		}
2427		sbuf_cat(sb, "</Subdisks>\n");
2428		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
2429		    disk->d_read_errs);
2430		sx_xunlock(&sc->sc_lock);
2431		g_topology_lock();
2432	} else {
2433		g_topology_unlock();
2434		sx_xlock(&sc->sc_lock);
2435		if (sc->sc_md) {
2436			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
2437			    sc->sc_md->mdo_class->name);
2438		}
2439		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
2440			s = 0xff;
2441			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2442				if (vol->v_state < s)
2443					s = vol->v_state;
2444			}
2445			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2446			    g_raid_volume_state2str(s));
2447		}
2448		sx_xunlock(&sc->sc_lock);
2449		g_topology_lock();
2450	}
2451}
2452
2453static void
2454g_raid_shutdown_post_sync(void *arg, int howto)
2455{
2456	struct g_class *mp;
2457	struct g_geom *gp, *gp2;
2458	struct g_raid_softc *sc;
2459	struct g_raid_volume *vol;
2460
2461	if ((howto & RB_NOSYNC) != 0)
2462		return;
2463
2464	mp = arg;
2465	g_topology_lock();
2466	g_raid_shutdown = 1;
2467	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2468		if ((sc = gp->softc) == NULL)
2469			continue;
2470		g_topology_unlock();
2471		sx_xlock(&sc->sc_lock);
2472		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
2473			g_raid_clean(vol, -1);
2474		g_cancel_event(sc);
2475		g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
2476		g_topology_lock();
2477	}
2478	g_topology_unlock();
2479}
2480
2481static void
2482g_raid_init(struct g_class *mp)
2483{
2484
2485	g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
2486	    g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
2487	if (g_raid_post_sync == NULL)
2488		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
2489	g_raid_started = 1;
2490}
2491
2492static void
2493g_raid_fini(struct g_class *mp)
2494{
2495
2496	if (g_raid_post_sync != NULL)
2497		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
2498	g_raid_started = 0;
2499}
2500
2501int
2502g_raid_md_modevent(module_t mod, int type, void *arg)
2503{
2504	struct g_raid_md_class *class, *c, *nc;
2505	int error;
2506
2507	error = 0;
2508	class = arg;
2509	switch (type) {
2510	case MOD_LOAD:
2511		c = LIST_FIRST(&g_raid_md_classes);
2512		if (c == NULL || c->mdc_priority > class->mdc_priority)
2513			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
2514		else {
2515			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
2516			    nc->mdc_priority < class->mdc_priority)
2517				c = nc;
2518			LIST_INSERT_AFTER(c, class, mdc_list);
2519		}
2520		if (g_raid_started)
2521			g_retaste(&g_raid_class);
2522		break;
2523	case MOD_UNLOAD:
2524		LIST_REMOVE(class, mdc_list);
2525		break;
2526	default:
2527		error = EOPNOTSUPP;
2528		break;
2529	}
2530
2531	return (error);
2532}
2533
2534int
2535g_raid_tr_modevent(module_t mod, int type, void *arg)
2536{
2537	struct g_raid_tr_class *class, *c, *nc;
2538	int error;
2539
2540	error = 0;
2541	class = arg;
2542	switch (type) {
2543	case MOD_LOAD:
2544		c = LIST_FIRST(&g_raid_tr_classes);
2545		if (c == NULL || c->trc_priority > class->trc_priority)
2546			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
2547		else {
2548			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
2549			    nc->trc_priority < class->trc_priority)
2550				c = nc;
2551			LIST_INSERT_AFTER(c, class, trc_list);
2552		}
2553		break;
2554	case MOD_UNLOAD:
2555		LIST_REMOVE(class, trc_list);
2556		break;
2557	default:
2558		error = EOPNOTSUPP;
2559		break;
2560	}
2561
2562	return (error);
2563}
2564
2565/*
2566 * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
2567 * to reduce module priority, allowing submodules to register them first.
2568 */
2569static moduledata_t g_raid_mod = {
2570	"g_raid",
2571	g_modevent,
2572	&g_raid_class
2573};
2574DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
2575MODULE_VERSION(geom_raid, 0);
2576