md_promise.c revision 220209
1/*-
2 * Copyright (c) 2011 Alexander Motin <mav@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/geom/raid/md_promise.c 220209 2011-03-31 16:14:35Z mav $");
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/endian.h>
33#include <sys/kernel.h>
34#include <sys/kobj.h>
35#include <sys/limits.h>
36#include <sys/lock.h>
37#include <sys/malloc.h>
38#include <sys/mutex.h>
39#include <sys/systm.h>
40#include <geom/geom.h>
41#include "geom/raid/g_raid.h"
42#include "g_raid_md_if.h"
43
44static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");
45
46#define	PROMISE_MAX_DISKS	8
47#define	PROMISE_MAX_SUBDISKS	2
48#define	PROMISE_META_OFFSET	14
49
50struct promise_raid_disk {
51	uint8_t		flags;			/* Subdisk status. */
52#define PROMISE_F_VALID		0x01
53#define PROMISE_F_ONLINE	0x02
54#define PROMISE_F_ASSIGNED	0x04
55#define PROMISE_F_SPARE		0x08
56#define PROMISE_F_DUPLICATE	0x10
57#define PROMISE_F_REDIR		0x20
58#define PROMISE_F_DOWN		0x40
59#define PROMISE_F_READY		0x80
60
61	uint8_t		number;			/* Position in a volume. */
62	uint8_t		channel;		/* ATA channel number. */
63	uint8_t		device;			/* ATA device number. */
64	uint64_t	id __packed;		/* Subdisk ID. */
65} __packed;
66
67struct promise_raid_conf {
68	char		promise_id[24];
69#define PROMISE_MAGIC		"Promise Technology, Inc."
70#define FREEBSD_MAGIC		"FreeBSD ATA driver RAID "
71
72	uint32_t	dummy_0;
73	uint64_t	magic_0;
74#define PROMISE_MAGIC0(x)	(((uint64_t)(x.channel) << 48) | \
75				((uint64_t)(x.device != 0) << 56))
76	uint16_t	magic_1;
77	uint32_t	magic_2;
78	uint8_t		filler1[470];
79
80	uint32_t	integrity;
81#define PROMISE_I_VALID		0x00000080
82
83	struct promise_raid_disk	disk;	/* This subdisk info. */
84	uint32_t	disk_offset;		/* Subdisk offset. */
85	uint32_t	disk_sectors;		/* Subdisk size */
86	uint32_t	rebuild_lba;		/* Rebuild position. */
87	uint16_t	generation;		/* Generation number. */
88	uint8_t		status;			/* Volume status. */
89#define PROMISE_S_VALID		0x01
90#define PROMISE_S_ONLINE	0x02
91#define PROMISE_S_INITED	0x04
92#define PROMISE_S_READY		0x08
93#define PROMISE_S_DEGRADED	0x10
94#define PROMISE_S_MARKED	0x20
95#define PROMISE_S_MIGRATING	0x40
96#define PROMISE_S_FUNCTIONAL	0x80
97
98	uint8_t		type;			/* Volume type. */
99#define PROMISE_T_RAID0		0x00
100#define PROMISE_T_RAID1		0x01
101#define PROMISE_T_RAID3		0x02
102#define PROMISE_T_RAID5		0x04
103#define PROMISE_T_SPAN		0x08
104#define PROMISE_T_JBOD		0x10
105
106	uint8_t		total_disks;		/* Disks in this volume. */
107	uint8_t		stripe_shift;		/* Strip size. */
108	uint8_t		array_width;		/* Number of RAID0 stripes. */
109	uint8_t		array_number;		/* Global volume number. */
110	uint32_t	total_sectors;		/* Volume size. */
111	uint16_t	cylinders;		/* Volume geometry: C. */
112	uint8_t		heads;			/* Volume geometry: H. */
113	uint8_t		sectors;		/* Volume geometry: S. */
114	uint64_t	volume_id __packed;	/* Volume ID. */
115	struct promise_raid_disk	disks[PROMISE_MAX_DISKS];
116						/* Subdisks in this volume. */
117	char		name[32];		/* Volume label. */
118
119	uint32_t	filler2[8];
120	uint32_t	magic_3;	/* Something related to rebuild. */
121	uint64_t	rebuild_lba64;	/* Per-volume rebuild position. */
122	uint32_t	magic_4;
123	uint32_t	magic_5;
124	uint32_t	filler3[325];
125	uint32_t	checksum;
126} __packed;
127
128struct g_raid_md_promise_perdisk {
129	int		 pd_updated;
130	int		 pd_subdisks;
131	struct promise_raid_conf	*pd_meta[PROMISE_MAX_SUBDISKS];
132};
133
134struct g_raid_md_promise_pervolume {
135	struct promise_raid_conf	*pv_meta;
136	uint64_t			 pv_id;
137	uint16_t			 pv_generation;
138	int				 pv_disks_present;
139	int				 pv_started;
140	struct callout			 pv_start_co;	/* STARTING state timer. */
141};
142
143static g_raid_md_create_t g_raid_md_create_promise;
144static g_raid_md_taste_t g_raid_md_taste_promise;
145static g_raid_md_event_t g_raid_md_event_promise;
146static g_raid_md_volume_event_t g_raid_md_volume_event_promise;
147static g_raid_md_ctl_t g_raid_md_ctl_promise;
148static g_raid_md_write_t g_raid_md_write_promise;
149static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise;
150static g_raid_md_free_disk_t g_raid_md_free_disk_promise;
151static g_raid_md_free_volume_t g_raid_md_free_volume_promise;
152static g_raid_md_free_t g_raid_md_free_promise;
153
154static kobj_method_t g_raid_md_promise_methods[] = {
155	KOBJMETHOD(g_raid_md_create,	g_raid_md_create_promise),
156	KOBJMETHOD(g_raid_md_taste,	g_raid_md_taste_promise),
157	KOBJMETHOD(g_raid_md_event,	g_raid_md_event_promise),
158	KOBJMETHOD(g_raid_md_volume_event,	g_raid_md_volume_event_promise),
159	KOBJMETHOD(g_raid_md_ctl,	g_raid_md_ctl_promise),
160	KOBJMETHOD(g_raid_md_write,	g_raid_md_write_promise),
161	KOBJMETHOD(g_raid_md_fail_disk,	g_raid_md_fail_disk_promise),
162	KOBJMETHOD(g_raid_md_free_disk,	g_raid_md_free_disk_promise),
163	KOBJMETHOD(g_raid_md_free_volume,	g_raid_md_free_volume_promise),
164	KOBJMETHOD(g_raid_md_free,	g_raid_md_free_promise),
165	{ 0, 0 }
166};
167
168static struct g_raid_md_class g_raid_md_promise_class = {
169	"Promise",
170	g_raid_md_promise_methods,
171	sizeof(struct g_raid_md_object),
172	.mdc_priority = 100
173};
174
175
176static void
177g_raid_md_promise_print(struct promise_raid_conf *meta)
178{
179	int i;
180
181	if (g_raid_debug < 1)
182		return;
183
184	printf("********* ATA Promise Metadata *********\n");
185	printf("promise_id          <%.24s>\n", meta->promise_id);
186	printf("disk                %02x %02x %02x %02x %016jx\n",
187	    meta->disk.flags, meta->disk.number, meta->disk.channel,
188	    meta->disk.device, meta->disk.id);
189	printf("disk_offset         %u\n", meta->disk_offset);
190	printf("disk_sectors        %u\n", meta->disk_sectors);
191	printf("rebuild_lba         %u\n", meta->rebuild_lba);
192	printf("generation          %u\n", meta->generation);
193	printf("status              0x%02x\n", meta->status);
194	printf("type                %u\n", meta->type);
195	printf("total_disks         %u\n", meta->total_disks);
196	printf("stripe_shift        %u\n", meta->stripe_shift);
197	printf("array_width         %u\n", meta->array_width);
198	printf("array_number        %u\n", meta->array_number);
199	printf("total_sectors       %u\n", meta->total_sectors);
200	printf("cylinders           %u\n", meta->cylinders);
201	printf("heads               %u\n", meta->heads);
202	printf("sectors             %u\n", meta->sectors);
203	printf("volume_id           0x%016jx\n", meta->volume_id);
204	printf("disks:\n");
205	for (i = 0; i < PROMISE_MAX_DISKS; i++ ) {
206		printf("                    %02x %02x %02x %02x %016jx\n",
207		    meta->disks[i].flags, meta->disks[i].number,
208		    meta->disks[i].channel, meta->disks[i].device,
209		    meta->disks[i].id);
210	}
211	printf("name                <%.32s>\n", meta->name);
212	printf("magic_3             0x%08x\n", meta->magic_3);
213	printf("rebuild_lba64       %ju\n", meta->rebuild_lba64);
214	printf("magic_4             0x%08x\n", meta->magic_4);
215	printf("magic_5             0x%08x\n", meta->magic_5);
216	printf("=================================================\n");
217}
218
219static struct promise_raid_conf *
220promise_meta_copy(struct promise_raid_conf *meta)
221{
222	struct promise_raid_conf *nmeta;
223
224	nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
225	memcpy(nmeta, meta, sizeof(*nmeta));
226	return (nmeta);
227}
228
229static int
230promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
231{
232	int pos;
233
234	for (pos = 0; pos < meta->total_disks; pos++) {
235		if (meta->disks[pos].id == id)
236			return (pos);
237	}
238	return (-1);
239}
240
241static int
242promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
243    uint32_t sectors, uint32_t *off, uint32_t *size)
244{
245	uint32_t coff, csize;
246	int i, j;
247
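	/*
	 * Keep the last 131072 sectors (64MB with 512-byte sectors) out of
	 * the usable range; the on-disk metadata lives in that tail area.
	 */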
248	sectors -= 131072;
249	*off = 0;
250	*size = 0;
251	coff = 0;
252	csize = sectors;
253	i = 0;
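	/*
	 * Scan the gaps before, between and after the known subdisk
	 * extents and remember the largest one found.
	 */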
254	while (1) {
255		for (j = 0; j < nsd; j++) {
256			if (metaarr[j]->disk_offset >= coff) {
257				csize = MIN(csize,
258				    metaarr[j]->disk_offset - coff);
259			}
260		}
261		if (csize > *size) {
262			*off = coff;
263			*size = csize;
264		}
265		if (i >= nsd)
266			break;
267		coff = metaarr[i]->disk_offset + metaarr[i]->disk_sectors;
268		csize = sectors - coff;
269		i++;
270	};
271	return ((*size > 0) ? 1 : 0);
272}
273
274static int
275promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
276{
277	int disk_pos, width;
278
279	if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
280		width = vol->v_disks_count / 2;
281		disk_pos = (md_disk_pos / width) +
282		    (md_disk_pos % width) * width;
283	} else
284		disk_pos = md_disk_pos;
285	return (disk_pos);
286}
287
288static void
289promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
290{
291	int i;
292
293	strncpy(buf, meta->name, 32);
294	buf[32] = 0;
295	for (i = 31; i >= 0; i--) {
296		if (buf[i] > 0x20)
297			break;
298		buf[i] = 0;
299	}
300}
301
302static void
303promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
304{
305
306	memset(meta->name, 0x20, 32);
307	memcpy(meta->name, buf, MIN(strlen(buf), 32));
308}
309
310static int
311promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
312{
313	struct g_provider *pp;
314	struct promise_raid_conf *meta;
315	char *buf;
316	int error, i, subdisks;
317	uint32_t checksum, *ptr;
318
319	pp = cp->provider;
320	subdisks = 0;
321next:
322	/* Read metadata block. */
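	/*
	 * Metadata for subdisk N starts (63 - N * PROMISE_META_OFFSET)
	 * sectors before the end of the disk; four sectors are read.
	 */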
323	buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
324	    (63 - subdisks * PROMISE_META_OFFSET),
325	    pp->sectorsize * 4, &error);
326	if (buf == NULL) {
327		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
328		    pp->name, error);
329		return (subdisks);
330	}
331	meta = (struct promise_raid_conf *)buf;
332
333	/* Check if this is a Promise RAID struct. */
334	if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
335	    strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
336		if (subdisks == 0)
337			G_RAID_DEBUG(1,
338			    "Promise signature check failed on %s", pp->name);
339		g_free(buf);
340		return (subdisks);
341	}
342	meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
343	memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
344	g_free(buf);
345
346	/* Check metadata checksum. */
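	/*
	 * The checksum is the 32-bit sum of the first 511 32-bit words,
	 * i.e. everything in the 2048-byte block except the checksum itself.
	 */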
347	for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
348		checksum += *ptr++;
349	if (checksum != meta->checksum) {
350		G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
351		free(meta, M_MD_PROMISE);
352		return (subdisks);
353	}
354
355	if ((meta->integrity & PROMISE_I_VALID) == 0) {
356		G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
357		free(meta, M_MD_PROMISE);
358		return (subdisks);
359	}
360
361	if (meta->total_disks > PROMISE_MAX_DISKS) {
362		G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
363		    pp->name, meta->total_disks);
364		free(meta, M_MD_PROMISE);
365		return (subdisks);
366	}
367
368	/* Save this part and look for next. */
369	*metaarr = meta;
370	metaarr++;
371	subdisks++;
372	if (subdisks < PROMISE_MAX_SUBDISKS)
373		goto next;
374
375	return (subdisks);
376}
377
378static int
379promise_meta_write(struct g_consumer *cp,
380    struct promise_raid_conf **metaarr, int nsd)
381{
382	struct g_provider *pp;
383	struct promise_raid_conf *meta;
384	char *buf;
385	int error, i, subdisk, fake;
386	uint32_t checksum, *ptr, off, size;
387
388	pp = cp->provider;
389	subdisk = 0;
390	fake = 0;
391next:
392	buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
393	meta = NULL;
394	if (subdisk < nsd) {
395		meta = metaarr[subdisk];
396	} else if (!fake && promise_meta_unused_range(metaarr, nsd,
397	    cp->provider->mediasize / cp->provider->sectorsize,
398	    &off, &size)) {
399		/* Optionally add record for unused space. */
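		/* The fake record describes the largest unused extent found. */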
400		meta = (struct promise_raid_conf *)buf;
401		memcpy(&meta->promise_id[0], PROMISE_MAGIC,
402		    sizeof(PROMISE_MAGIC) - 1);
403		meta->dummy_0 = 0x00020000;
404		meta->integrity = PROMISE_I_VALID;
405		meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
406		meta->disk.number = 0xff;
407		arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
408		meta->disk_offset = off;
409		meta->disk_sectors = size;
410		meta->rebuild_lba = UINT32_MAX;
411		fake = 1;
412	}
413	if (meta != NULL) {
414		/* Recalculate the checksum in case the metadata was changed. */
415		meta->checksum = 0;
416		for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
417			checksum += *ptr++;
418		meta->checksum = checksum;
419		memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta)));
420	}
421	error = g_write_data(cp, pp->mediasize - pp->sectorsize *
422	    (63 - subdisk * PROMISE_META_OFFSET),
423	    buf, pp->sectorsize * 4);
424	if (error != 0) {
425		G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
426		    pp->name, error);
427	}
428	free(buf, M_MD_PROMISE);
429
430	subdisk++;
431	if (subdisk < PROMISE_MAX_SUBDISKS)
432		goto next;
433
434	return (error);
435}
436
437static int
438promise_meta_erase(struct g_consumer *cp)
439{
440	struct g_provider *pp;
441	char *buf;
442	int error, subdisk;
443
444	pp = cp->provider;
445	buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO);
446	for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) {
447		error = g_write_data(cp, pp->mediasize - pp->sectorsize *
448		    (63 - subdisk * PROMISE_META_OFFSET),
449		    buf, 4 * pp->sectorsize);
450		if (error != 0) {
451			G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
452			    pp->name, error);
453		}
454	}
455	free(buf, M_MD_PROMISE);
456	return (error);
457}
458
459static int
460promise_meta_write_spare(struct g_consumer *cp)
461{
462	struct promise_raid_conf *meta;
463	int error;
464
465	meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
466	memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1);
467	meta->dummy_0 = 0x00020000;
468	meta->integrity = PROMISE_I_VALID;
469	meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID;
470	meta->disk.number = 0xff;
471	arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
472	meta->disk_sectors = cp->provider->mediasize / cp->provider->sectorsize;
473	meta->disk_sectors -= 131072;
474	meta->rebuild_lba = UINT32_MAX;
475	error = promise_meta_write(cp, &meta, 1);
476	free(meta, M_MD_PROMISE);
477	return (error);
478}
479
480static struct g_raid_volume *
481g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id)
482{
483	struct g_raid_volume	*vol;
484	struct g_raid_md_promise_pervolume *pv;
485
486	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
487		pv = vol->v_md_data;
488		if (pv->pv_id == id)
489			break;
490	}
491	return (vol);
492}
493
494static int
495g_raid_md_promise_purge_volumes(struct g_raid_softc *sc)
496{
497	struct g_raid_volume	*vol, *tvol;
498	struct g_raid_md_promise_pervolume *pv;
499	int i, res;
500
501	res = 0;
502	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
503		pv = vol->v_md_data;
504		if (!pv->pv_started || vol->v_stopping)
505			continue;
506		for (i = 0; i < vol->v_disks_count; i++) {
507			if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
508				break;
509		}
510		if (i >= vol->v_disks_count) {
511			g_raid_destroy_volume(vol);
512			res = 1;
513		}
514	}
515	return (res);
516}
517
518static int
519g_raid_md_promise_purge_disks(struct g_raid_softc *sc)
520{
521	struct g_raid_disk	*disk, *tdisk;
522	struct g_raid_volume	*vol;
523	struct g_raid_md_promise_perdisk *pd;
524	int i, j, res;
525
526	res = 0;
527	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
528		if (disk->d_state == G_RAID_DISK_S_SPARE)
529			continue;
530		pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
531
532		/* Scan for deleted volumes. */
533		for (i = 0; i < pd->pd_subdisks; ) {
534			vol = g_raid_md_promise_get_volume(sc,
535			    pd->pd_meta[i]->volume_id);
536			if (vol != NULL && !vol->v_stopping) {
537				i++;
538				continue;
539			}
540			free(pd->pd_meta[i], M_MD_PROMISE);
541			for (j = i; j < pd->pd_subdisks - 1; j++)
542				pd->pd_meta[j] = pd->pd_meta[j + 1];
543			pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL;
544			pd->pd_subdisks--;
545			pd->pd_updated = 1;
546		}
547
548		/* If there is no metadata left, erase and delete the disk. */
549		if (pd->pd_subdisks == 0) {
550			promise_meta_erase(disk->d_consumer);
551			g_raid_destroy_disk(disk);
552			res = 1;
553		}
554	}
555	return (res);
556}
557
558static int
559g_raid_md_promise_supported(int level, int qual, int disks, int force)
560{
561
562	if (disks > PROMISE_MAX_DISKS)
563		return (0);
564	switch (level) {
565	case G_RAID_VOLUME_RL_RAID0:
566		if (disks < 1)
567			return (0);
568		if (!force && disks < 2)
569			return (0);
570		break;
571	case G_RAID_VOLUME_RL_RAID1:
572		if (disks < 1)
573			return (0);
574		if (!force && (disks != 2))
575			return (0);
576		break;
577	case G_RAID_VOLUME_RL_RAID1E:
578		if (disks < 2)
579			return (0);
580		if (disks % 2 != 0)
581			return (0);
582		if (!force && (disks != 4))
583			return (0);
584		break;
585	case G_RAID_VOLUME_RL_SINGLE:
586		if (disks != 1)
587			return (0);
588		break;
589	case G_RAID_VOLUME_RL_CONCAT:
590		if (disks < 2)
591			return (0);
592		break;
593	case G_RAID_VOLUME_RL_RAID5:
594		if (disks < 3)
595			return (0);
596		break;
597	default:
598		return (0);
599	}
600	if (qual != G_RAID_VOLUME_RLQ_NONE)
601		return (0);
602	return (1);
603}
604
605static int
606g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
607    struct g_raid_volume *vol)
608{
609	struct g_raid_softc *sc;
610	struct g_raid_subdisk *sd;
611	struct g_raid_md_promise_perdisk *pd;
612	struct g_raid_md_promise_pervolume *pv;
613	struct promise_raid_conf *meta;
614	off_t size;
615	int disk_pos, md_disk_pos, i, resurrection = 0;
616	uint32_t eoff, esize;
617
618	sc = disk->d_softc;
619	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
620
621	pv = vol->v_md_data;
622	meta = pv->pv_meta;
623
624	if (sdn >= 0) {
625		/* Find the disk position in the metadata by its ID. */
626		md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
627		/* For RAID0+1 we need to translate order. */
628		disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
629	} else {
630		md_disk_pos = -1;
631		disk_pos = -1;
632	}
633	if (disk_pos < 0) {
634		G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
635		    g_raid_get_diskname(disk), vol->v_name);
636		/* Failed stale disk is useless for us. */
637		if (sdn >= 0 &&
638		    pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
639			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
640			return (0);
641		}
642		/* If we were given a specific metadata subdisk, erase it. */
643		if (sdn >= 0) {
644			free(pd->pd_meta[sdn], M_MD_PROMISE);
645			for (i = sdn; i < pd->pd_subdisks - 1; i++)
646				pd->pd_meta[i] = pd->pd_meta[i + 1];
647			pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL;
648			pd->pd_subdisks--;
649		}
650		/* If we are in the start process, that's all for now. */
651		if (!pv->pv_started)
652			goto nofit;
653		/*
654		 * If we have already started, try to make use of the disk.
655		 * Try to replace OFFLINE disks first, then FAILED.
656		 */
657		promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks,
658		    disk->d_consumer->provider->mediasize /
659		    disk->d_consumer->provider->sectorsize,
660		    &eoff, &esize);
661		if (esize == 0) {
662			G_RAID_DEBUG1(1, sc, "No free space on disk %s",
663			    g_raid_get_diskname(disk));
664			goto nofit;
665		}
666		size = INT64_MAX;
667		for (i = 0; i < vol->v_disks_count; i++) {
668			sd = &vol->v_subdisks[i];
669			if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
670				size = sd->sd_size;
671			if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
672			    (disk_pos < 0 ||
673			     vol->v_subdisks[i].sd_state < sd->sd_state))
674				disk_pos = i;
675		}
676		if (disk_pos >= 0 &&
677		    vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
678		    (off_t)esize * 512 < size) {
679			G_RAID_DEBUG1(1, sc, "Disk %s free space "
680			    "is too small (%ju < %ju)",
681			    g_raid_get_diskname(disk),
682			    (off_t)esize * 512, size);
683			disk_pos = -1;
684		}
685		if (disk_pos >= 0) {
686			if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
687				esize = size / 512;
688			/* For RAID0+1 we need to translate order. */
689			md_disk_pos = promise_meta_translate_disk(vol, disk_pos);
690		} else {
691nofit:
692			if (pd->pd_subdisks == 0) {
693				g_raid_change_disk_state(disk,
694				    G_RAID_DISK_S_SPARE);
695			}
696			return (0);
697		}
698		G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
699		    g_raid_get_diskname(disk), disk_pos, vol->v_name);
700		resurrection = 1;
701	}
702
703	sd = &vol->v_subdisks[disk_pos];
704
705	if (resurrection && sd->sd_disk != NULL) {
706		g_raid_change_disk_state(sd->sd_disk,
707		    G_RAID_DISK_S_STALE_FAILED);
708		TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
709		    sd, sd_next);
710	}
711	vol->v_subdisks[disk_pos].sd_disk = disk;
712	TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
713
714	/* Welcome the new disk. */
715	if (resurrection)
716		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
717	else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN)
718		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
719	else
720		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
721
722	if (resurrection) {
723		sd->sd_offset = (off_t)eoff * 512;
724		sd->sd_size = (off_t)esize * 512;
725	} else {
726		sd->sd_offset = (off_t)pd->pd_meta[sdn]->disk_offset * 512;
727		sd->sd_size = (off_t)pd->pd_meta[sdn]->disk_sectors * 512;
728	}
729
730	if (resurrection) {
731		/* Stale disk, almost same as new. */
732		g_raid_change_subdisk_state(sd,
733		    G_RAID_SUBDISK_S_NEW);
734	} else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) {
735		/* Failed disk. */
736		g_raid_change_subdisk_state(sd,
737		    G_RAID_SUBDISK_S_FAILED);
738	} else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) {
739		/* Rebuilding disk. */
740		g_raid_change_subdisk_state(sd,
741		    G_RAID_SUBDISK_S_REBUILD);
742		if (pd->pd_meta[sdn]->generation != meta->generation)
743			sd->sd_rebuild_pos = 0;
744		else {
745			sd->sd_rebuild_pos =
746			    (off_t)pd->pd_meta[sdn]->rebuild_lba * 512;
747		}
748	} else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) {
749		/* Rebuilding disk. */
750		g_raid_change_subdisk_state(sd,
751		    G_RAID_SUBDISK_S_NEW);
752	} else if (pd->pd_meta[sdn]->generation != meta->generation ||
753	    (meta->status & PROMISE_S_MARKED)) {
754		/* Stale disk or dirty volume (unclean shutdown). */
755		g_raid_change_subdisk_state(sd,
756		    G_RAID_SUBDISK_S_STALE);
757	} else {
758		/* Up to date disk. */
759		g_raid_change_subdisk_state(sd,
760		    G_RAID_SUBDISK_S_ACTIVE);
761	}
762	g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
763	    G_RAID_EVENT_SUBDISK);
764
765	return (resurrection);
766}
767
768static void
769g_raid_md_promise_refill(struct g_raid_softc *sc)
770{
771	struct g_raid_volume *vol;
772	struct g_raid_subdisk *sd;
773	struct g_raid_disk *disk;
774	struct g_raid_md_object *md;
775	struct g_raid_md_promise_perdisk *pd;
776	struct g_raid_md_promise_pervolume *pv;
777	int update, updated, i, bad;
778
779	md = sc->sc_md;
780restart:
781	updated = 0;
782	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
783		pv = vol->v_md_data;
784		if (!pv->pv_started || vol->v_stopping)
785			continue;
786
787		/* Search for subdisk that needs replacement. */
788		bad = 0;
789		for (i = 0; i < vol->v_disks_count; i++) {
790			sd = &vol->v_subdisks[i];
791			if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
792			    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
793			        bad = 1;
794		}
795		if (!bad)
796			continue;
797
798		G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
799		    "trying to refill.", vol->v_name);
800
801		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
802			/* Skip failed. */
803			if (disk->d_state < G_RAID_DISK_S_SPARE)
804				continue;
805			/* Skip already used by this volume. */
806			for (i = 0; i < vol->v_disks_count; i++) {
807				sd = &vol->v_subdisks[i];
808				if (sd->sd_disk == disk)
809					break;
810			}
811			if (i < vol->v_disks_count)
812				continue;
813
814			/* Try to use disk if it has empty extents. */
815			pd = disk->d_md_data;
816			if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) {
817				update =
818				    g_raid_md_promise_start_disk(disk, -1, vol);
819			} else
820				update = 0;
821			if (update) {
822				updated = 1;
823				g_raid_md_write_promise(md, vol, NULL, disk);
824				break;
825			}
826		}
827	}
828	if (updated)
829		goto restart;
830}
831
832static void
833g_raid_md_promise_start(struct g_raid_volume *vol)
834{
835	struct g_raid_softc *sc;
836	struct g_raid_subdisk *sd;
837	struct g_raid_disk *disk;
838	struct g_raid_md_object *md;
839	struct g_raid_md_promise_perdisk *pd;
840	struct g_raid_md_promise_pervolume *pv;
841	struct promise_raid_conf *meta;
842	int i;
843
844	sc = vol->v_softc;
845	md = sc->sc_md;
846	pv = vol->v_md_data;
847	meta = pv->pv_meta;
848
849	if (meta->type == PROMISE_T_RAID0)
850		vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
851	else if (meta->type == PROMISE_T_RAID1) {
852		if (meta->array_width == 1)
853			vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
854		else
855			vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
856	} else if (meta->type == PROMISE_T_RAID3)
857		vol->v_raid_level = G_RAID_VOLUME_RL_RAID3;
858	else if (meta->type == PROMISE_T_RAID5)
859		vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
860	else if (meta->type == PROMISE_T_SPAN)
861		vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT;
862	else if (meta->type == PROMISE_T_JBOD)
863		vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE;
864	else
865		vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
866	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
867	vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ
868	vol->v_disks_count = meta->total_disks;
869	vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ
870	vol->v_sectorsize = 512; //ZZZ
871	for (i = 0; i < vol->v_disks_count; i++) {
872		sd = &vol->v_subdisks[i];
873		sd->sd_offset = (off_t)meta->disk_offset * 512; //ZZZ
874		sd->sd_size = (off_t)meta->disk_sectors * 512; //ZZZ
875	}
876	g_raid_start_volume(vol);
877
878	/* Make all disks found so far take their places. */
879	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
880		pd = disk->d_md_data;
881		for (i = 0; i < pd->pd_subdisks; i++) {
882			if (pd->pd_meta[i]->volume_id == meta->volume_id)
883				g_raid_md_promise_start_disk(disk, i, vol);
884		}
885	}
886
887	pv->pv_started = 1;
888	callout_stop(&pv->pv_start_co);
889	G_RAID_DEBUG1(0, sc, "Volume started.");
890	g_raid_md_write_promise(md, vol, NULL, NULL);
891
892	/* Pick up any STALE/SPARE disks to refill the array if needed. */
893	g_raid_md_promise_refill(sc);
894
895	g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
896}
897
898static void
899g_raid_promise_go(void *arg)
900{
901	struct g_raid_volume *vol;
902	struct g_raid_softc *sc;
903	struct g_raid_md_promise_pervolume *pv;
904
905	vol = arg;
906	pv = vol->v_md_data;
907	sc = vol->v_softc;
908	if (!pv->pv_started) {
909		G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
910		g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
911		    G_RAID_EVENT_VOLUME);
912	}
913}
914
915static void
916g_raid_md_promise_new_disk(struct g_raid_disk *disk)
917{
918	struct g_raid_softc *sc;
919	struct g_raid_md_object *md;
920	struct promise_raid_conf *pdmeta;
921	struct g_raid_md_promise_perdisk *pd;
922	struct g_raid_md_promise_pervolume *pv;
923	struct g_raid_volume *vol;
924	int i;
925	char buf[33];
926
927	sc = disk->d_softc;
928	md = sc->sc_md;
929	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
930
931	if (pd->pd_subdisks == 0) {
932		g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
933		g_raid_md_promise_refill(sc);
934		return;
935	}
936
937	for (i = 0; i < pd->pd_subdisks; i++) {
938		pdmeta = pd->pd_meta[i];
939
940		/* Look for volume with matching ID. */
941		vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
942		if (vol == NULL) {
943			promise_meta_get_name(pdmeta, buf);
944			vol = g_raid_create_volume(sc, buf, pdmeta->array_number);
945			pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
946			pv->pv_id = pdmeta->volume_id;
947			vol->v_md_data = pv;
948			callout_init(&pv->pv_start_co, 1);
949			callout_reset(&pv->pv_start_co,
950			    g_raid_start_timeout * hz,
951			    g_raid_promise_go, vol);
952		} else
953			pv = vol->v_md_data;
954
955		/* If we haven't started yet, check metadata freshness. */
956		if (pv->pv_meta == NULL || !pv->pv_started) {
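			/* Serial-number comparison handles 16-bit generation wraparound. */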
957			if (pv->pv_meta == NULL ||
958			    ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) {
959				G_RAID_DEBUG1(1, sc, "Newer disk");
960				if (pv->pv_meta != NULL)
961					free(pv->pv_meta, M_MD_PROMISE);
962				pv->pv_meta = promise_meta_copy(pdmeta);
963				pv->pv_generation = pv->pv_meta->generation;
964				pv->pv_disks_present = 1;
965			} else if (pdmeta->generation == pv->pv_generation) {
966				pv->pv_disks_present++;
967				G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
968				    pv->pv_disks_present,
969				    pv->pv_meta->total_disks);
970			} else {
971				G_RAID_DEBUG1(1, sc, "Older disk");
972			}
973		}
974	}
975
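	/*
	 * Second pass: attach this disk's subdisks to already running
	 * volumes, or start volumes for which all disks are now present.
	 */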
976	for (i = 0; i < pd->pd_subdisks; i++) {
977		pdmeta = pd->pd_meta[i];
978
979		/* Look for volume with matching ID. */
980		vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
981		if (vol == NULL)
982			continue;
983		pv = vol->v_md_data;
984
985		if (pv->pv_started) {
986			if (g_raid_md_promise_start_disk(disk, i, vol))
987				g_raid_md_write_promise(md, vol, NULL, NULL);
988		} else {
989			/* If we have collected all needed disks, start the array. */
990			if (pv->pv_disks_present == pv->pv_meta->total_disks)
991				g_raid_md_promise_start(vol);
992		}
993	}
994}
995
996static int
997g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp,
998    struct g_geom **gp)
999{
1000	struct g_geom *geom;
1001	struct g_raid_softc *sc;
1002
1003	/* Search for existing node. */
1004	LIST_FOREACH(geom, &mp->geom, geom) {
1005		sc = geom->softc;
1006		if (sc == NULL)
1007			continue;
1008		if (sc->sc_stopping != 0)
1009			continue;
1010		if (sc->sc_md->mdo_class != md->mdo_class)
1011			continue;
1012		break;
1013	}
1014	if (geom != NULL) {
1015		*gp = geom;
1016		return (G_RAID_MD_TASTE_EXISTING);
1017	}
1018
1019	/* Create new one if not found. */
1020	sc = g_raid_create_node(mp, "Promise", md);
1021	if (sc == NULL)
1022		return (G_RAID_MD_TASTE_FAIL);
1023	md->mdo_softc = sc;
1024	*gp = sc->sc_geom;
1025	return (G_RAID_MD_TASTE_NEW);
1026}
1027
1028static int
1029g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
1030                              struct g_consumer *cp, struct g_geom **gp)
1031{
1032	struct g_consumer *rcp;
1033	struct g_provider *pp;
1034	struct g_raid_softc *sc;
1035	struct g_raid_disk *disk;
1036	struct promise_raid_conf *meta, *metaarr[4];
1037	struct g_raid_md_promise_perdisk *pd;
1038	struct g_geom *geom;
1039	int error, i, j, result, len, subdisks;
1040	char name[16];
1041	uint16_t vendor;
1042
1043	G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
1044	pp = cp->provider;
1045
1046	/* Read metadata from device. */
1047	meta = NULL;
1048	vendor = 0xffff;
1049	if (g_access(cp, 1, 0, 0) != 0)
1050		return (G_RAID_MD_TASTE_FAIL);
1051	g_topology_unlock();
1052	len = 2;
1053	if (pp->geom->rank == 1)
1054		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
1055	subdisks = promise_meta_read(cp, metaarr);
1056	g_topology_lock();
1057	g_access(cp, -1, 0, 0);
1058	if (subdisks == 0) {
1059		if (g_raid_aggressive_spare) {
1060			if (vendor == 0x105a || vendor == 0x1002) {
1061				G_RAID_DEBUG(1,
1062				    "No Promise metadata, forcing spare.");
1063				goto search;
1064			} else {
1065				G_RAID_DEBUG(1,
1066				    "Promise/ATI vendor mismatch "
1067				    "0x%04x != 0x105a/0x1002",
1068				    vendor);
1069			}
1070		}
1071		return (G_RAID_MD_TASTE_FAIL);
1072	}
1073
1074	/* Metadata valid. Print it. */
1075	for (i = 0; i < subdisks; i++)
1076		g_raid_md_promise_print(metaarr[i]);
1077
1078	/* Purge meaningless (empty/spare) records. */
1079	for (i = 0; i < subdisks; ) {
1080		if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
1081			i++;
1082			continue;
1083		}
1084		free(metaarr[i], M_MD_PROMISE);
1085		for (j = i; j < subdisks - 1; j++)
1086			metaarr[j] = metaarr[j + 1];
1087		metaarr[PROMISE_MAX_SUBDISKS - 1] = NULL;
1088		subdisks--;
1089	}
1090
1091search:
1092	/* Search for matching node. */
1093	sc = NULL;
1094	LIST_FOREACH(geom, &mp->geom, geom) {
1095		sc = geom->softc;
1096		if (sc == NULL)
1097			continue;
1098		if (sc->sc_stopping != 0)
1099			continue;
1100		if (sc->sc_md->mdo_class != md->mdo_class)
1101			continue;
1102		break;
1103	}
1104
1105	/* Found matching node. */
1106	if (geom != NULL) {
1107		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
1108		result = G_RAID_MD_TASTE_EXISTING;
1109
1110	} else { /* No matching node found -- create one. */
1111		result = G_RAID_MD_TASTE_NEW;
1112		snprintf(name, sizeof(name), "Promise");
1113		sc = g_raid_create_node(mp, name, md);
1114		md->mdo_softc = sc;
1115		geom = sc->sc_geom;
1116	}
1117
1118	rcp = g_new_consumer(geom);
1119	g_attach(rcp, pp);
1120	if (g_access(rcp, 1, 1, 1) != 0)
1121		; //goto fail1;
1122
1123	g_topology_unlock();
1124	sx_xlock(&sc->sc_lock);
1125
1126	pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1127	pd->pd_subdisks = subdisks;
1128	for (i = 0; i < subdisks; i++)
1129		pd->pd_meta[i] = metaarr[i];
1130	disk = g_raid_create_disk(sc);
1131	disk->d_md_data = (void *)pd;
1132	disk->d_consumer = rcp;
1133	rcp->private = disk;
1134
1135	/* Read kernel dumping information. */
1136	disk->d_kd.offset = 0;
1137	disk->d_kd.length = OFF_MAX;
1138	len = sizeof(disk->d_kd);
1139	error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd);
1140	if (disk->d_kd.di.dumper == NULL)
1141		G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.",
1142		    rcp->provider->name, error);
1143
1144	g_raid_md_promise_new_disk(disk);
1145
1146	sx_xunlock(&sc->sc_lock);
1147	g_topology_lock();
1148	*gp = geom;
1149	return (result);
1150}
1151
1152static int
1153g_raid_md_event_promise(struct g_raid_md_object *md,
1154    struct g_raid_disk *disk, u_int event)
1155{
1156	struct g_raid_softc *sc;
1157	struct g_raid_md_promise_perdisk *pd;
1158
1159	sc = md->mdo_softc;
1160	if (disk == NULL)
1161		return (-1);
1162	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1163	switch (event) {
1164	case G_RAID_DISK_E_DISCONNECTED:
1165		/* Delete disk. */
1166		g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
1167		g_raid_destroy_disk(disk);
1168		g_raid_md_promise_purge_volumes(sc);
1169
1170		/* Write updated metadata to all disks. */
1171		g_raid_md_write_promise(md, NULL, NULL, NULL);
1172
1173		/* Check if anything left. */
1174		if (g_raid_ndisks(sc, -1) == 0)
1175			g_raid_destroy_node(sc, 0);
1176		else
1177			g_raid_md_promise_refill(sc);
1178		return (0);
1179	}
1180	return (-2);
1181}
1182
1183static int
1184g_raid_md_volume_event_promise(struct g_raid_md_object *md,
1185    struct g_raid_volume *vol, u_int event)
1186{
1187	struct g_raid_softc *sc;
1188	struct g_raid_md_promise_pervolume *pv;
1189
1190	sc = md->mdo_softc;
1191	pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1192	switch (event) {
1193	case G_RAID_VOLUME_E_STARTMD:
1194		if (!pv->pv_started)
1195			g_raid_md_promise_start(vol);
1196		return (0);
1197	}
1198	return (-2);
1199}
1200
1201static int
1202g_raid_md_ctl_promise(struct g_raid_md_object *md,
1203    struct gctl_req *req)
1204{
1205	struct g_raid_softc *sc;
1206	struct g_raid_volume *vol, *vol1;
1207	struct g_raid_subdisk *sd;
1208	struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS];
1209	struct g_raid_md_promise_perdisk *pd;
1210	struct g_raid_md_promise_pervolume *pv;
1211	struct g_consumer *cp;
1212	struct g_provider *pp;
1213	char arg[16];
1214	const char *verb, *volname, *levelname, *diskname;
1215	char *tmp;
1216	int *nargs, *force;
1217	off_t size, sectorsize, strip;
1218	intmax_t *sizearg, *striparg;
1219	uint32_t offs[PROMISE_MAX_DISKS], esize;
1220	int numdisks, i, len, level, qual;
1221	int error;
1222
1223	sc = md->mdo_softc;
1224	verb = gctl_get_param(req, "verb", NULL);
1225	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1226	error = 0;
1227	if (strcmp(verb, "label") == 0) {
1228
1229		if (*nargs < 4) {
1230			gctl_error(req, "Invalid number of arguments.");
1231			return (-1);
1232		}
1233		volname = gctl_get_asciiparam(req, "arg1");
1234		if (volname == NULL) {
1235			gctl_error(req, "No volume name.");
1236			return (-2);
1237		}
1238		levelname = gctl_get_asciiparam(req, "arg2");
1239		if (levelname == NULL) {
1240			gctl_error(req, "No RAID level.");
1241			return (-3);
1242		}
1243		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1244			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1245			return (-4);
1246		}
1247		numdisks = *nargs - 3;
1248		force = gctl_get_paraml(req, "force", sizeof(*force));
1249		if (!g_raid_md_promise_supported(level, qual, numdisks,
1250		    force ? *force : 0)) {
1251			gctl_error(req, "Unsupported RAID level "
1252			    "(0x%02x/0x%02x), or number of disks (%d).",
1253			    level, qual, numdisks);
1254			return (-5);
1255		}
1256
1257		/* Search for disks, connect them and probe. */
1258		size = INT64_MAX;
1259		sectorsize = 0;
1260		bzero(disks, sizeof(disks));
1261		bzero(offs, sizeof(offs));
1262		for (i = 0; i < numdisks; i++) {
1263			snprintf(arg, sizeof(arg), "arg%d", i + 3);
1264			diskname = gctl_get_asciiparam(req, arg);
1265			if (diskname == NULL) {
1266				gctl_error(req, "No disk name (%s).", arg);
1267				error = -6;
1268				break;
1269			}
1270			if (strcmp(diskname, "NONE") == 0)
1271				continue;
1272
1273			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1274				if (disk->d_consumer != NULL &&
1275				    disk->d_consumer->provider != NULL &&
1276				    strcmp(disk->d_consumer->provider->name,
1277				     diskname) == 0)
1278					break;
1279			}
1280			if (disk != NULL) {
1281				if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
1282					gctl_error(req, "Disk '%s' is in a "
1283					    "wrong state (%s).", diskname,
1284					    g_raid_disk_state2str(disk->d_state));
1285					error = -7;
1286					break;
1287				}
1288				pd = disk->d_md_data;
1289				if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) {
1290					gctl_error(req, "Disk '%s' already "
1291					    "used by %d volumes.",
1292					    diskname, pd->pd_subdisks);
1293					error = -7;
1294					break;
1295				}
1296				pp = disk->d_consumer->provider;
1297				disks[i] = disk;
1298				promise_meta_unused_range(pd->pd_meta,
1299				    pd->pd_subdisks,
1300				    pp->mediasize / pp->sectorsize,
1301				    &offs[i], &esize);
1302				size = MIN(size, (off_t)esize * pp->sectorsize);
1303				sectorsize = MAX(sectorsize, pp->sectorsize);
1304				continue;
1305			}
1306
1307			g_topology_lock();
1308			cp = g_raid_open_consumer(sc, diskname);
1309			if (cp == NULL) {
1310				gctl_error(req, "Can't open disk '%s'.",
1311				    diskname);
1312				g_topology_unlock();
1313				error = -8;
1314				break;
1315			}
1316			pp = cp->provider;
1317			pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1318			disk = g_raid_create_disk(sc);
1319			disk->d_md_data = (void *)pd;
1320			disk->d_consumer = cp;
1321			disks[i] = disk;
1322			cp->private = disk;
1323			g_topology_unlock();
1324
1325			/* Read kernel dumping information. */
1326			disk->d_kd.offset = 0;
1327			disk->d_kd.length = OFF_MAX;
1328			len = sizeof(disk->d_kd);
1329			g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
1330			if (disk->d_kd.di.dumper == NULL)
1331				G_RAID_DEBUG1(2, sc,
1332				    "Dumping not supported by %s.",
1333				    cp->provider->name);
1334
1335			/* Reserve some space for metadata. */
1336			size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize);
1337			sectorsize = MAX(sectorsize, pp->sectorsize);
1338		}
1339		if (error != 0) {
1340			for (i = 0; i < numdisks; i++) {
1341				if (disks[i] != NULL &&
1342				    disks[i]->d_state == G_RAID_DISK_S_NONE)
1343					g_raid_destroy_disk(disks[i]);
1344			}
1345			return (error);
1346		}
1347
1348		/* Handle size argument. */
1349		len = sizeof(*sizearg);
1350		sizearg = gctl_get_param(req, "size", &len);
1351		if (sizearg != NULL && len == sizeof(*sizearg) &&
1352		    *sizearg > 0) {
1353			if (*sizearg > size) {
1354				gctl_error(req, "Size too big %lld > %lld.",
1355				    (long long)*sizearg, (long long)size);
1356				return (-9);
1357			}
1358			size = *sizearg;
1359		}
1360
1361		/* Handle strip argument. */
1362		strip = 131072;
1363		len = sizeof(*striparg);
1364		striparg = gctl_get_param(req, "strip", &len);
1365		if (striparg != NULL && len == sizeof(*striparg) &&
1366		    *striparg > 0) {
1367			if (*striparg < sectorsize) {
1368				gctl_error(req, "Strip size too small.");
1369				return (-10);
1370			}
1371			if (*striparg % sectorsize != 0) {
1372				gctl_error(req, "Incorrect strip size.");
1373				return (-11);
1374			}
1375			strip = *striparg;
1376		}
1377
1378		/* Round size down to strip or sector. */
1379		if (level == G_RAID_VOLUME_RL_RAID1 ||
1380		    level == G_RAID_VOLUME_RL_SINGLE ||
1381		    level == G_RAID_VOLUME_RL_CONCAT)
1382			size -= (size % sectorsize);
1383		else if (level == G_RAID_VOLUME_RL_RAID1E &&
1384		    (numdisks & 1) != 0)
1385			size -= (size % (2 * strip));
1386		else
1387			size -= (size % strip);
1388		if (size <= 0) {
1389			gctl_error(req, "Size too small.");
1390			return (-13);
1391		}
1392		if (size > 0xffffffffllu * sectorsize) {
1393			gctl_error(req, "Size too big.");
1394			return (-14);
1395		}
1396
1397		/* We have all we need, create things: volume, ... */
1398		pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
1399		arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0);
1400		pv->pv_generation = 0;
1401		pv->pv_started = 1;
1402		vol = g_raid_create_volume(sc, volname, -1);
1403		vol->v_md_data = pv;
1404		vol->v_raid_level = level;
1405		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
1406		vol->v_strip_size = strip;
1407		vol->v_disks_count = numdisks;
1408		if (level == G_RAID_VOLUME_RL_RAID0 ||
1409		    level == G_RAID_VOLUME_RL_CONCAT ||
1410		    level == G_RAID_VOLUME_RL_SINGLE)
1411			vol->v_mediasize = size * numdisks;
1412		else if (level == G_RAID_VOLUME_RL_RAID1)
1413			vol->v_mediasize = size;
1414		else if (level == G_RAID_VOLUME_RL_RAID3 ||
1415		    level == G_RAID_VOLUME_RL_RAID5)
1416			vol->v_mediasize = size * (numdisks - 1);
1417		else { /* RAID1E */
1418			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1419			    strip;
1420		}
1421		vol->v_sectorsize = sectorsize;
1422		g_raid_start_volume(vol);
1423
1424		/* , and subdisks. */
1425		for (i = 0; i < numdisks; i++) {
1426			disk = disks[i];
1427			sd = &vol->v_subdisks[i];
1428			sd->sd_disk = disk;
1429			sd->sd_offset = (off_t)offs[i] * 512;
1430			sd->sd_size = size;
1431			if (disk == NULL)
1432				continue;
1433			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1434			g_raid_change_disk_state(disk,
1435			    G_RAID_DISK_S_ACTIVE);
1436			g_raid_change_subdisk_state(sd,
1437			    G_RAID_SUBDISK_S_ACTIVE);
1438			g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1439			    G_RAID_EVENT_SUBDISK);
1440		}
1441
1442		/* Write metadata based on created entities. */
1443		G_RAID_DEBUG1(0, sc, "Array started.");
1444		g_raid_md_write_promise(md, vol, NULL, NULL);
1445
1446		/* Pick up any STALE/SPARE disks to refill the array if needed. */
1447		g_raid_md_promise_refill(sc);
1448
1449		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1450		    G_RAID_EVENT_VOLUME);
1451		return (0);
1452	}
1453	if (strcmp(verb, "add") == 0) {
1454
1455		gctl_error(req, "`add` command is not applicable, "
1456		    "use `label` instead.");
1457		return (-99);
1458	}
1459	if (strcmp(verb, "delete") == 0) {
1460
1461		/* Full node destruction. */
1462		if (*nargs == 1) {
1463			/* Check if some volume is still open. */
1464			force = gctl_get_paraml(req, "force", sizeof(*force));
1465			if (force != NULL && *force == 0 &&
1466			    g_raid_nopens(sc) != 0) {
1467				gctl_error(req, "Some volume is still open.");
1468				return (-4);
1469			}
1470
1471			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1472				if (disk->d_consumer)
1473					promise_meta_erase(disk->d_consumer);
1474			}
1475			g_raid_destroy_node(sc, 0);
1476			return (0);
1477		}
1478
1479		/* Destroy specified volume. If it was last - all node. */
1480		if (*nargs != 2) {
1481			gctl_error(req, "Invalid number of arguments.");
1482			return (-1);
1483		}
1484		volname = gctl_get_asciiparam(req, "arg1");
1485		if (volname == NULL) {
1486			gctl_error(req, "No volume name.");
1487			return (-2);
1488		}
1489
1490		/* Search for volume. */
1491		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1492			if (strcmp(vol->v_name, volname) == 0)
1493				break;
1494		}
1495		if (vol == NULL) {
1496			i = strtol(volname, &tmp, 10);
1497			if (verb != volname && tmp[0] == 0) {
1498				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1499					if (vol->v_global_id == i)
1500						break;
1501				}
1502			}
1503		}
1504		if (vol == NULL) {
1505			gctl_error(req, "Volume '%s' not found.", volname);
1506			return (-3);
1507		}
1508
1509		/* Check if volume is still open. */
1510		force = gctl_get_paraml(req, "force", sizeof(*force));
1511		if (force != NULL && *force == 0 &&
1512		    vol->v_provider_open != 0) {
1513			gctl_error(req, "Volume is still open.");
1514			return (-4);
1515		}
1516
1517		/* Destroy volume and potentially node. */
1518		i = 0;
1519		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
1520			i++;
1521		if (i >= 2) {
1522			g_raid_destroy_volume(vol);
1523			g_raid_md_promise_purge_disks(sc);
1524			g_raid_md_write_promise(md, NULL, NULL, NULL);
1525		} else {
1526			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1527				if (disk->d_consumer)
1528					promise_meta_erase(disk->d_consumer);
1529			}
1530			g_raid_destroy_node(sc, 0);
1531		}
1532		return (0);
1533	}
1534	if (strcmp(verb, "remove") == 0 ||
1535	    strcmp(verb, "fail") == 0) {
1536		if (*nargs < 2) {
1537			gctl_error(req, "Invalid number of arguments.");
1538			return (-1);
1539		}
1540		for (i = 1; i < *nargs; i++) {
1541			snprintf(arg, sizeof(arg), "arg%d", i);
1542			diskname = gctl_get_asciiparam(req, arg);
1543			if (diskname == NULL) {
1544				gctl_error(req, "No disk name (%s).", arg);
1545				error = -2;
1546				break;
1547			}
1548			if (strncmp(diskname, "/dev/", 5) == 0)
1549				diskname += 5;
1550
1551			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1552				if (disk->d_consumer != NULL &&
1553				    disk->d_consumer->provider != NULL &&
1554				    strcmp(disk->d_consumer->provider->name,
1555				     diskname) == 0)
1556					break;
1557			}
1558			if (disk == NULL) {
1559				gctl_error(req, "Disk '%s' not found.",
1560				    diskname);
1561				error = -3;
1562				break;
1563			}
1564
1565			if (strcmp(verb, "fail") == 0) {
1566				g_raid_md_fail_disk_promise(md, NULL, disk);
1567				continue;
1568			}
1569
1570			pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1571
1572			/* Erase metadata on the disk being deleted and destroy it. */
1573			promise_meta_erase(disk->d_consumer);
1574			g_raid_destroy_disk(disk);
1575		}
1576		g_raid_md_promise_purge_volumes(sc);
1577
1578		/* Write updated metadata to remaining disks. */
1579		g_raid_md_write_promise(md, NULL, NULL, NULL);
1580
1581		/* Check if anything left. */
1582		if (g_raid_ndisks(sc, -1) == 0)
1583			g_raid_destroy_node(sc, 0);
1584		else
1585			g_raid_md_promise_refill(sc);
1586		return (error);
1587	}
1588	if (strcmp(verb, "insert") == 0) {
1589		if (*nargs < 2) {
1590			gctl_error(req, "Invalid number of arguments.");
1591			return (-1);
1592		}
1593		for (i = 1; i < *nargs; i++) {
1594			/* Get disk name. */
1595			snprintf(arg, sizeof(arg), "arg%d", i);
1596			diskname = gctl_get_asciiparam(req, arg);
1597			if (diskname == NULL) {
1598				gctl_error(req, "No disk name (%s).", arg);
1599				error = -3;
1600				break;
1601			}
1602
1603			/* Try to find provider with specified name. */
1604			g_topology_lock();
1605			cp = g_raid_open_consumer(sc, diskname);
1606			if (cp == NULL) {
1607				gctl_error(req, "Can't open disk '%s'.",
1608				    diskname);
1609				g_topology_unlock();
1610				error = -4;
1611				break;
1612			}
1613			pp = cp->provider;
1614			g_topology_unlock();
1615
1616			pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1617
1618			disk = g_raid_create_disk(sc);
1619			disk->d_consumer = cp;
1620			disk->d_consumer->private = disk;
1621			disk->d_md_data = (void *)pd;
1622			cp->private = disk;
1623
1624			/* Read kernel dumping information. */
1625			disk->d_kd.offset = 0;
1626			disk->d_kd.length = OFF_MAX;
1627			len = sizeof(disk->d_kd);
1628			g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
1629			if (disk->d_kd.di.dumper == NULL)
1630				G_RAID_DEBUG1(2, sc,
1631				    "Dumping not supported by %s.",
1632				    cp->provider->name);
1633
1634			/* Welcome the "new" disk. */
1635			g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
1636			promise_meta_write_spare(cp);
1637			g_raid_md_promise_refill(sc);
1638		}
1639		return (error);
1640	}
1641	return (-100);
1642}
1643
1644static int
1645g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol,
1646    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1647{
1648	struct g_raid_softc *sc;
1649	struct g_raid_volume *vol;
1650	struct g_raid_subdisk *sd;
1651	struct g_raid_disk *disk;
1652	struct g_raid_md_promise_perdisk *pd;
1653	struct g_raid_md_promise_pervolume *pv;
1654	struct promise_raid_conf *meta;
1655	off_t rebuild_lba64;
1656	int i, j, pos, rebuild;
1657
1658	sc = md->mdo_softc;
1659
1660	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1661		return (0);
1662
1663	/* Generate new per-volume metadata for affected volumes. */
1664	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1665		if (vol->v_stopping)
1666			continue;
1667
1668		/* Skip volumes not related to specified targets. */
1669		if (tvol != NULL && vol != tvol)
1670			continue;
1671		if (tsd != NULL && vol != tsd->sd_volume)
1672			continue;
1673		if (tdisk != NULL) {
1674			for (i = 0; i < vol->v_disks_count; i++) {
1675				if (vol->v_subdisks[i].sd_disk == tdisk)
1676					break;
1677			}
1678			if (i >= vol->v_disks_count)
1679				continue;
1680		}
1681
1682		pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1683		pv->pv_generation++;
1684
1685		meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
1686		if (pv->pv_meta != NULL)
1687			memcpy(meta, pv->pv_meta, sizeof(*meta));
1688		memcpy(meta->promise_id, PROMISE_MAGIC,
1689		    sizeof(PROMISE_MAGIC) - 1);
1690		meta->dummy_0 = 0x00020000;
1691		meta->integrity = PROMISE_I_VALID;
1692
1693		meta->generation = pv->pv_generation;
1694		meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE |
1695		    PROMISE_S_INITED | PROMISE_S_READY;
1696		if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
1697			meta->status |= PROMISE_S_DEGRADED;
1698		if (vol->v_dirty)
1699			meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */
1700		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 ||
1701		    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE)
1702			meta->type = PROMISE_T_RAID0;
1703		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1704		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1705			meta->type = PROMISE_T_RAID1;
1706		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3)
1707			meta->type = PROMISE_T_RAID3;
1708		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
1709			meta->type = PROMISE_T_RAID5;
1710		else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT)
1711			meta->type = PROMISE_T_SPAN;
1712		else
1713			meta->type = PROMISE_T_JBOD;
1714		meta->total_disks = vol->v_disks_count;
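		/*
		 * stripe_shift encodes the strip size as 512 << shift;
		 * ffs(strip / 1024) yields that shift for power-of-two strips.
		 */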
1715		meta->stripe_shift = ffs(vol->v_strip_size / 1024);
1716		meta->array_width = vol->v_disks_count;
1717		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1718		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1719			meta->array_width /= 2;
1720		meta->array_number = vol->v_global_id;
1721		meta->total_sectors = vol->v_mediasize / vol->v_sectorsize;
1722		meta->cylinders = meta->total_sectors / (255 * 63) - 1;
1723		meta->heads = 254;
1724		meta->sectors = 63;
1725		meta->volume_id = pv->pv_id;
1726		rebuild_lba64 = UINT64_MAX;
1727		rebuild = 0;
1728		for (i = 0; i < vol->v_disks_count; i++) {
1729			sd = &vol->v_subdisks[i];
1730			/* For RAID0+1 we need to translate order. */
1731			pos = promise_meta_translate_disk(vol, i);
1732			meta->disks[pos].flags = PROMISE_F_VALID |
1733			    PROMISE_F_ASSIGNED;
1734			if (sd->sd_state == G_RAID_SUBDISK_S_NONE) {
1735				meta->disks[pos].flags |= 0;
1736			} else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) {
1737				meta->disks[pos].flags |=
1738				    PROMISE_F_DOWN | PROMISE_F_REDIR;
1739			} else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) {
1740				meta->disks[pos].flags |=
1741				    PROMISE_F_ONLINE | PROMISE_F_REDIR;
1742				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1743					rebuild_lba64 = MIN(rebuild_lba64,
1744					    sd->sd_rebuild_pos / 512);
1745				} else
1746					rebuild_lba64 = 0;
1747				rebuild = 1;
1748			} else {
1749				meta->disks[pos].flags |= PROMISE_F_ONLINE;
1750				if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) {
1751					meta->status |= PROMISE_S_MARKED;
1752					if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
1753						rebuild_lba64 = MIN(rebuild_lba64,
1754						    sd->sd_rebuild_pos / 512);
1755					} else
1756						rebuild_lba64 = 0;
1757				}
1758			}
1759			if (pv->pv_meta != NULL) {
1760				meta->disks[pos].id = pv->pv_meta->disks[pos].id;
1761			} else {
1762				meta->disks[pos].number = i * 2;
1763				arc4rand(&meta->disks[pos].id,
1764				    sizeof(meta->disks[pos].id), 0);
1765			}
1766		}
1767		promise_meta_put_name(meta, vol->v_name);
1768
1769		/* Try to mimic AMD BIOS rebuild/resync behavior. */
1770		if (rebuild_lba64 != UINT64_MAX) {
1771			if (rebuild)
1772				meta->magic_3 = 0x03040010UL; /* Rebuild? */
1773			else
1774				meta->magic_3 = 0x03040008UL; /* Resync? */
1775			/* Translate from per-disk to per-volume LBA. */
1776			if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1777			    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
1778				rebuild_lba64 *= meta->array_width;
1779			} else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1780			    vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) {
1781				rebuild_lba64 *= meta->array_width - 1;
1782			} else
1783				rebuild_lba64 = 0;
1784		} else
1785			meta->magic_3 = 0x03000000UL;
1786		meta->rebuild_lba64 = rebuild_lba64;
1787		meta->magic_4 = 0x04010101UL;
1788
1789		/* Replace per-volume metadata with new. */
1790		if (pv->pv_meta != NULL)
1791			free(pv->pv_meta, M_MD_PROMISE);
1792		pv->pv_meta = meta;
1793
1794		/* Copy new metadata to the disks, adding or replacing old. */
1795		for (i = 0; i < vol->v_disks_count; i++) {
1796			sd = &vol->v_subdisks[i];
1797			disk = sd->sd_disk;
1798			if (disk == NULL)
1799				continue;
1800			/* For RAID0+1 we need to translate order. */
1801			pos = promise_meta_translate_disk(vol, i);
1802			pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1803			for (j = 0; j < pd->pd_subdisks; j++) {
1804				if (pd->pd_meta[j]->volume_id == meta->volume_id)
1805					break;
1806			}
1807			if (j == pd->pd_subdisks)
1808				pd->pd_subdisks++;
1809			if (pd->pd_meta[j] != NULL)
1810				free(pd->pd_meta[j], M_MD_PROMISE);
1811			pd->pd_meta[j] = promise_meta_copy(meta);
1812			pd->pd_meta[j]->disk = meta->disks[pos];
1813			pd->pd_meta[j]->disk.number = pos;
1814			pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
1815			pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
1816			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1817				pd->pd_meta[j]->rebuild_lba =
1818				    sd->sd_rebuild_pos / 512;
1819			} else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD)
1820				pd->pd_meta[j]->rebuild_lba = 0;
1821			else
1822				pd->pd_meta[j]->rebuild_lba = UINT32_MAX;
1823			pd->pd_updated = 1;
1824		}
1825	}
1826
1827	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1828		pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1829		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
1830			continue;
1831		if (!pd->pd_updated)
1832			continue;
1833		G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1834		    g_raid_get_diskname(disk));
1835		for (i = 0; i < pd->pd_subdisks; i++)
1836			g_raid_md_promise_print(pd->pd_meta[i]);
1837		promise_meta_write(disk->d_consumer,
1838		    pd->pd_meta, pd->pd_subdisks);
1839		pd->pd_updated = 0;
1840	}
1841
1842	return (0);
1843}
1844
1845static int
1846g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
1847    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1848{
1849	struct g_raid_softc *sc;
1850	struct g_raid_md_promise_perdisk *pd;
1851	struct g_raid_subdisk *sd;
1852	int i, pos;
1853
1854	sc = md->mdo_softc;
1855	pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
1856
1857	/* We can't fail a disk that is not part of the array now. */
1858	if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
1859		return (-1);
1860
1861	/*
1862	 * Mark disk as failed in metadata and try to write that metadata
1863	 * to the disk itself to prevent its later resurrection as STALE.
1864	 */
1865	if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
1866		G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1867		    g_raid_get_diskname(tdisk));
1868	for (i = 0; i < pd->pd_subdisks; i++) {
1869		pd->pd_meta[i]->disk.flags |=
1870		    PROMISE_F_DOWN | PROMISE_F_REDIR;
1871		pos = pd->pd_meta[i]->disk.number;
1872		if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
1873			pd->pd_meta[i]->disks[pos].flags |=
1874			    PROMISE_F_DOWN | PROMISE_F_REDIR;
1875		}
1876		g_raid_md_promise_print(pd->pd_meta[i]);
1877	}
1878	if (tdisk->d_consumer != NULL)
1879		promise_meta_write(tdisk->d_consumer,
1880		    pd->pd_meta, pd->pd_subdisks);
1881
1882	/* Change states. */
1883	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
1884	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
1885		g_raid_change_subdisk_state(sd,
1886		    G_RAID_SUBDISK_S_FAILED);
1887		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
1888		    G_RAID_EVENT_SUBDISK);
1889	}
1890
1891	/* Write updated metadata to remaining disks. */
1892	g_raid_md_write_promise(md, NULL, NULL, tdisk);
1893
1894	g_raid_md_promise_refill(sc);
1895	return (0);
1896}
1897
1898static int
1899g_raid_md_free_disk_promise(struct g_raid_md_object *md,
1900    struct g_raid_disk *disk)
1901{
1902	struct g_raid_md_promise_perdisk *pd;
1903	int i;
1904
1905	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1906	for (i = 0; i < pd->pd_subdisks; i++) {
1907		if (pd->pd_meta[i] != NULL) {
1908			free(pd->pd_meta[i], M_MD_PROMISE);
1909			pd->pd_meta[i] = NULL;
1910		}
1911	}
1912	free(pd, M_MD_PROMISE);
1913	disk->d_md_data = NULL;
1914	return (0);
1915}
1916
1917static int
1918g_raid_md_free_volume_promise(struct g_raid_md_object *md,
1919    struct g_raid_volume *vol)
1920{
1921	struct g_raid_md_promise_pervolume *pv;
1922
1923	pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1924	if (pv && pv->pv_meta != NULL) {
1925		free(pv->pv_meta, M_MD_PROMISE);
1926		pv->pv_meta = NULL;
1927	}
1928	if (pv && !pv->pv_started) {
1929		pv->pv_started = 1;
1930		callout_stop(&pv->pv_start_co);
1931	}
1932	return (0);
1933}
1934
1935static int
1936g_raid_md_free_promise(struct g_raid_md_object *md)
1937{
1938
1939	return (0);
1940}
1941
1942G_RAID_MD_DECLARE(g_raid_md_promise);
1943