md_promise.c revision 226816
1/*-
2 * Copyright (c) 2011 Alexander Motin <mav@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/geom/raid/md_promise.c 226816 2011-10-26 21:50:10Z mav $");
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/endian.h>
33#include <sys/kernel.h>
34#include <sys/kobj.h>
35#include <sys/limits.h>
36#include <sys/lock.h>
37#include <sys/malloc.h>
38#include <sys/mutex.h>
39#include <sys/systm.h>
40#include <geom/geom.h>
41#include "geom/raid/g_raid.h"
42#include "g_raid_md_if.h"
43
44static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");
45
46#define	PROMISE_MAX_DISKS	8
47#define	PROMISE_MAX_SUBDISKS	2
48#define	PROMISE_META_OFFSET	14
49
50struct promise_raid_disk {
51	uint8_t		flags;			/* Subdisk status. */
52#define PROMISE_F_VALID		0x01
53#define PROMISE_F_ONLINE	0x02
54#define PROMISE_F_ASSIGNED	0x04
55#define PROMISE_F_SPARE		0x08
56#define PROMISE_F_DUPLICATE	0x10
57#define PROMISE_F_REDIR		0x20
58#define PROMISE_F_DOWN		0x40
59#define PROMISE_F_READY		0x80
60
61	uint8_t		number;			/* Position in a volume. */
62	uint8_t		channel;		/* ATA channel number. */
63	uint8_t		device;			/* ATA device number. */
64	uint64_t	id __packed;		/* Subdisk ID. */
65} __packed;
66
67struct promise_raid_conf {
68	char		promise_id[24];
69#define PROMISE_MAGIC		"Promise Technology, Inc."
70#define FREEBSD_MAGIC		"FreeBSD ATA driver RAID "
71
72	uint32_t	dummy_0;
73	uint64_t	magic_0;
74#define PROMISE_MAGIC0(x)	(((uint64_t)(x.channel) << 48) | \
75				((uint64_t)(x.device != 0) << 56))
76	uint16_t	magic_1;
77	uint32_t	magic_2;
78	uint8_t		filler1[470];
79
80	uint32_t	integrity;
81#define PROMISE_I_VALID		0x00000080
82
83	struct promise_raid_disk	disk;	/* This subdisk info. */
84	uint32_t	disk_offset;		/* Subdisk offset. */
85	uint32_t	disk_sectors;		/* Subdisk size */
86	uint32_t	rebuild_lba;		/* Rebuild position. */
87	uint16_t	generation;		/* Generation number. */
88	uint8_t		status;			/* Volume status. */
89#define PROMISE_S_VALID		0x01
90#define PROMISE_S_ONLINE	0x02
91#define PROMISE_S_INITED	0x04
92#define PROMISE_S_READY		0x08
93#define PROMISE_S_DEGRADED	0x10
94#define PROMISE_S_MARKED	0x20
95#define PROMISE_S_MIGRATING	0x40
96#define PROMISE_S_FUNCTIONAL	0x80
97
98	uint8_t		type;			/* Voluem type. */
99#define PROMISE_T_RAID0		0x00
100#define PROMISE_T_RAID1		0x01
101#define PROMISE_T_RAID3		0x02
102#define PROMISE_T_RAID5		0x04
103#define PROMISE_T_SPAN		0x08
104#define PROMISE_T_JBOD		0x10
105
106	uint8_t		total_disks;		/* Disks in this volume. */
107	uint8_t		stripe_shift;		/* Strip size. */
108	uint8_t		array_width;		/* Number of RAID0 stripes. */
109	uint8_t		array_number;		/* Global volume number. */
110	uint32_t	total_sectors;		/* Volume size. */
111	uint16_t	cylinders;		/* Volume geometry: C. */
112	uint8_t		heads;			/* Volume geometry: H. */
113	uint8_t		sectors;		/* Volume geometry: S. */
114	uint64_t	volume_id __packed;	/* Volume ID, */
115	struct promise_raid_disk	disks[PROMISE_MAX_DISKS];
116						/* Subdisks in this volume. */
117	char		name[32];		/* Volume label. */
118
119	uint32_t	filler2[8];
120	uint32_t	magic_3;	/* Something related to rebuild. */
121	uint64_t	rebuild_lba64;	/* Per-volume rebuild position. */
122	uint32_t	magic_4;
123	uint32_t	magic_5;
124	uint32_t	total_sectors_high;
125	uint32_t	filler3[324];
126	uint32_t	checksum;
127} __packed;
128
129struct g_raid_md_promise_perdisk {
130	int		 pd_updated;
131	int		 pd_subdisks;
132	struct promise_raid_conf	*pd_meta[PROMISE_MAX_SUBDISKS];
133};
134
135struct g_raid_md_promise_pervolume {
136	struct promise_raid_conf	*pv_meta;
137	uint64_t			 pv_id;
138	uint16_t			 pv_generation;
139	int				 pv_disks_present;
140	int				 pv_started;
141	struct callout			 pv_start_co;	/* STARTING state timer. */
142};
143
144static g_raid_md_create_t g_raid_md_create_promise;
145static g_raid_md_taste_t g_raid_md_taste_promise;
146static g_raid_md_event_t g_raid_md_event_promise;
147static g_raid_md_volume_event_t g_raid_md_volume_event_promise;
148static g_raid_md_ctl_t g_raid_md_ctl_promise;
149static g_raid_md_write_t g_raid_md_write_promise;
150static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise;
151static g_raid_md_free_disk_t g_raid_md_free_disk_promise;
152static g_raid_md_free_volume_t g_raid_md_free_volume_promise;
153static g_raid_md_free_t g_raid_md_free_promise;
154
155static kobj_method_t g_raid_md_promise_methods[] = {
156	KOBJMETHOD(g_raid_md_create,	g_raid_md_create_promise),
157	KOBJMETHOD(g_raid_md_taste,	g_raid_md_taste_promise),
158	KOBJMETHOD(g_raid_md_event,	g_raid_md_event_promise),
159	KOBJMETHOD(g_raid_md_volume_event,	g_raid_md_volume_event_promise),
160	KOBJMETHOD(g_raid_md_ctl,	g_raid_md_ctl_promise),
161	KOBJMETHOD(g_raid_md_write,	g_raid_md_write_promise),
162	KOBJMETHOD(g_raid_md_fail_disk,	g_raid_md_fail_disk_promise),
163	KOBJMETHOD(g_raid_md_free_disk,	g_raid_md_free_disk_promise),
164	KOBJMETHOD(g_raid_md_free_volume,	g_raid_md_free_volume_promise),
165	KOBJMETHOD(g_raid_md_free,	g_raid_md_free_promise),
166	{ 0, 0 }
167};
168
169static struct g_raid_md_class g_raid_md_promise_class = {
170	"Promise",
171	g_raid_md_promise_methods,
172	sizeof(struct g_raid_md_object),
173	.mdc_priority = 100
174};
175
176
177static void
178g_raid_md_promise_print(struct promise_raid_conf *meta)
179{
180	int i;
181
182	if (g_raid_debug < 1)
183		return;
184
185	printf("********* ATA Promise Metadata *********\n");
186	printf("promise_id          <%.24s>\n", meta->promise_id);
187	printf("disk                %02x %02x %02x %02x %016jx\n",
188	    meta->disk.flags, meta->disk.number, meta->disk.channel,
189	    meta->disk.device, meta->disk.id);
190	printf("disk_offset         %u\n", meta->disk_offset);
191	printf("disk_sectors        %u\n", meta->disk_sectors);
192	printf("rebuild_lba         %u\n", meta->rebuild_lba);
193	printf("generation          %u\n", meta->generation);
194	printf("status              0x%02x\n", meta->status);
195	printf("type                %u\n", meta->type);
196	printf("total_disks         %u\n", meta->total_disks);
197	printf("stripe_shift        %u\n", meta->stripe_shift);
198	printf("array_width         %u\n", meta->array_width);
199	printf("array_number        %u\n", meta->array_number);
200	printf("total_sectors       %u\n", meta->total_sectors);
201	printf("cylinders           %u\n", meta->cylinders);
202	printf("heads               %u\n", meta->heads);
203	printf("sectors             %u\n", meta->sectors);
204	printf("volume_id           0x%016jx\n", meta->volume_id);
205	printf("disks:\n");
206	for (i = 0; i < PROMISE_MAX_DISKS; i++ ) {
207		printf("                    %02x %02x %02x %02x %016jx\n",
208		    meta->disks[i].flags, meta->disks[i].number,
209		    meta->disks[i].channel, meta->disks[i].device,
210		    meta->disks[i].id);
211	}
212	printf("name                <%.32s>\n", meta->name);
213	printf("magic_3             0x%08x\n", meta->magic_3);
214	printf("rebuild_lba64       %ju\n", meta->rebuild_lba64);
215	printf("magic_4             0x%08x\n", meta->magic_4);
216	printf("magic_5             0x%08x\n", meta->magic_5);
217	printf("total_sectors_high  0x%08x\n", meta->total_sectors_high);
218	printf("=================================================\n");
219}
220
221static struct promise_raid_conf *
222promise_meta_copy(struct promise_raid_conf *meta)
223{
224	struct promise_raid_conf *nmeta;
225
226	nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
227	memcpy(nmeta, meta, sizeof(*nmeta));
228	return (nmeta);
229}
230
231static int
232promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
233{
234	int pos;
235
236	for (pos = 0; pos < meta->total_disks; pos++) {
237		if (meta->disks[pos].id == id)
238			return (pos);
239	}
240	return (-1);
241}
242
243static int
244promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
245    uint32_t sectors, uint32_t *off, uint32_t *size)
246{
247	uint32_t coff, csize;
248	int i, j;
249
250	sectors -= 131072;
251	*off = 0;
252	*size = 0;
253	coff = 0;
254	csize = sectors;
255	i = 0;
256	while (1) {
257		for (j = 0; j < nsd; j++) {
258			if (metaarr[j]->disk_offset >= coff) {
259				csize = MIN(csize,
260				    metaarr[j]->disk_offset - coff);
261			}
262		}
263		if (csize > *size) {
264			*off = coff;
265			*size = csize;
266		}
267		if (i >= nsd)
268			break;
269		coff = metaarr[i]->disk_offset + metaarr[i]->disk_sectors;
270		csize = sectors - coff;
271		i++;
272	};
273	return ((*size > 0) ? 1 : 0);
274}
275
276static int
277promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
278{
279	int disk_pos, width;
280
281	if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
282		width = vol->v_disks_count / 2;
283		disk_pos = (md_disk_pos / width) +
284		    (md_disk_pos % width) * width;
285	} else
286		disk_pos = md_disk_pos;
287	return (disk_pos);
288}
289
290static void
291promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
292{
293	int i;
294
295	strncpy(buf, meta->name, 32);
296	buf[32] = 0;
297	for (i = 31; i >= 0; i--) {
298		if (buf[i] > 0x20)
299			break;
300		buf[i] = 0;
301	}
302}
303
304static void
305promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
306{
307
308	memset(meta->name, 0x20, 32);
309	memcpy(meta->name, buf, MIN(strlen(buf), 32));
310}
311
312static int
313promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
314{
315	struct g_provider *pp;
316	struct promise_raid_conf *meta;
317	char *buf;
318	int error, i, subdisks;
319	uint32_t checksum, *ptr;
320
321	pp = cp->provider;
322	subdisks = 0;
323next:
324	/* Read metadata block. */
325	buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
326	    (63 - subdisks * PROMISE_META_OFFSET),
327	    pp->sectorsize * 4, &error);
328	if (buf == NULL) {
329		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
330		    pp->name, error);
331		return (subdisks);
332	}
333	meta = (struct promise_raid_conf *)buf;
334
335	/* Check if this is an Promise RAID struct */
336	if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
337	    strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
338		if (subdisks == 0)
339			G_RAID_DEBUG(1,
340			    "Promise signature check failed on %s", pp->name);
341		g_free(buf);
342		return (subdisks);
343	}
344	meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
345	memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
346	g_free(buf);
347
348	/* Check metadata checksum. */
349	for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
350		checksum += *ptr++;
351	if (checksum != meta->checksum) {
352		G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
353		free(meta, M_MD_PROMISE);
354		return (subdisks);
355	}
356
357	if ((meta->integrity & PROMISE_I_VALID) == 0) {
358		G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
359		free(meta, M_MD_PROMISE);
360		return (subdisks);
361	}
362
363	if (meta->total_disks > PROMISE_MAX_DISKS) {
364		G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
365		    pp->name, meta->total_disks);
366		free(meta, M_MD_PROMISE);
367		return (subdisks);
368	}
369
370	/* Save this part and look for next. */
371	*metaarr = meta;
372	metaarr++;
373	subdisks++;
374	if (subdisks < PROMISE_MAX_SUBDISKS)
375		goto next;
376
377	return (subdisks);
378}
379
380static int
381promise_meta_write(struct g_consumer *cp,
382    struct promise_raid_conf **metaarr, int nsd)
383{
384	struct g_provider *pp;
385	struct promise_raid_conf *meta;
386	char *buf;
387	int error, i, subdisk, fake;
388	uint32_t checksum, *ptr, off, size;
389
390	pp = cp->provider;
391	subdisk = 0;
392	fake = 0;
393next:
394	buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
395	meta = NULL;
396	if (subdisk < nsd) {
397		meta = metaarr[subdisk];
398	} else if (!fake && promise_meta_unused_range(metaarr, nsd,
399	    cp->provider->mediasize / cp->provider->sectorsize,
400	    &off, &size)) {
401		/* Optionally add record for unused space. */
402		meta = (struct promise_raid_conf *)buf;
403		memcpy(&meta->promise_id[0], PROMISE_MAGIC,
404		    sizeof(PROMISE_MAGIC) - 1);
405		meta->dummy_0 = 0x00020000;
406		meta->integrity = PROMISE_I_VALID;
407		meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
408		meta->disk.number = 0xff;
409		arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
410		meta->disk_offset = off;
411		meta->disk_sectors = size;
412		meta->rebuild_lba = UINT32_MAX;
413		fake = 1;
414	}
415	if (meta != NULL) {
416		/* Recalculate checksum for case if metadata were changed. */
417		meta->checksum = 0;
418		for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
419			checksum += *ptr++;
420		meta->checksum = checksum;
421		memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta)));
422	}
423	error = g_write_data(cp, pp->mediasize - pp->sectorsize *
424	    (63 - subdisk * PROMISE_META_OFFSET),
425	    buf, pp->sectorsize * 4);
426	if (error != 0) {
427		G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
428		    pp->name, error);
429	}
430	free(buf, M_MD_PROMISE);
431
432	subdisk++;
433	if (subdisk < PROMISE_MAX_SUBDISKS)
434		goto next;
435
436	return (error);
437}
438
439static int
440promise_meta_erase(struct g_consumer *cp)
441{
442	struct g_provider *pp;
443	char *buf;
444	int error, subdisk;
445
446	pp = cp->provider;
447	buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO);
448	for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) {
449		error = g_write_data(cp, pp->mediasize - pp->sectorsize *
450		    (63 - subdisk * PROMISE_META_OFFSET),
451		    buf, 4 * pp->sectorsize);
452		if (error != 0) {
453			G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
454			    pp->name, error);
455		}
456	}
457	free(buf, M_MD_PROMISE);
458	return (error);
459}
460
461static int
462promise_meta_write_spare(struct g_consumer *cp)
463{
464	struct promise_raid_conf *meta;
465	int error;
466
467	meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
468	memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1);
469	meta->dummy_0 = 0x00020000;
470	meta->integrity = PROMISE_I_VALID;
471	meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID;
472	meta->disk.number = 0xff;
473	arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
474	meta->disk_sectors = cp->provider->mediasize / cp->provider->sectorsize;
475	meta->disk_sectors -= 131072;
476	meta->rebuild_lba = UINT32_MAX;
477	error = promise_meta_write(cp, &meta, 1);
478	free(meta, M_MD_PROMISE);
479	return (error);
480}
481
482static struct g_raid_volume *
483g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id)
484{
485	struct g_raid_volume	*vol;
486	struct g_raid_md_promise_pervolume *pv;
487
488	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
489		pv = vol->v_md_data;
490		if (pv->pv_id == id)
491			break;
492	}
493	return (vol);
494}
495
496static int
497g_raid_md_promise_purge_volumes(struct g_raid_softc *sc)
498{
499	struct g_raid_volume	*vol, *tvol;
500	struct g_raid_md_promise_pervolume *pv;
501	int i, res;
502
503	res = 0;
504	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
505		pv = vol->v_md_data;
506		if (!pv->pv_started || vol->v_stopping)
507			continue;
508		for (i = 0; i < vol->v_disks_count; i++) {
509			if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
510				break;
511		}
512		if (i >= vol->v_disks_count) {
513			g_raid_destroy_volume(vol);
514			res = 1;
515		}
516	}
517	return (res);
518}
519
520static int
521g_raid_md_promise_purge_disks(struct g_raid_softc *sc)
522{
523	struct g_raid_disk	*disk, *tdisk;
524	struct g_raid_volume	*vol;
525	struct g_raid_md_promise_perdisk *pd;
526	int i, j, res;
527
528	res = 0;
529	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
530		if (disk->d_state == G_RAID_DISK_S_SPARE)
531			continue;
532		pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
533
534		/* Scan for deleted volumes. */
535		for (i = 0; i < pd->pd_subdisks; ) {
536			vol = g_raid_md_promise_get_volume(sc,
537			    pd->pd_meta[i]->volume_id);
538			if (vol != NULL && !vol->v_stopping) {
539				i++;
540				continue;
541			}
542			free(pd->pd_meta[i], M_MD_PROMISE);
543			for (j = i; j < pd->pd_subdisks - 1; j++)
544				pd->pd_meta[j] = pd->pd_meta[j + 1];
545			pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL;
546			pd->pd_subdisks--;
547			pd->pd_updated = 1;
548		}
549
550		/* If there is no metadata left - erase and delete disk. */
551		if (pd->pd_subdisks == 0) {
552			promise_meta_erase(disk->d_consumer);
553			g_raid_destroy_disk(disk);
554			res = 1;
555		}
556	}
557	return (res);
558}
559
560static int
561g_raid_md_promise_supported(int level, int qual, int disks, int force)
562{
563
564	if (disks > PROMISE_MAX_DISKS)
565		return (0);
566	switch (level) {
567	case G_RAID_VOLUME_RL_RAID0:
568		if (disks < 1)
569			return (0);
570		if (!force && disks < 2)
571			return (0);
572		break;
573	case G_RAID_VOLUME_RL_RAID1:
574		if (disks < 1)
575			return (0);
576		if (!force && (disks != 2))
577			return (0);
578		break;
579	case G_RAID_VOLUME_RL_RAID1E:
580		if (disks < 2)
581			return (0);
582		if (disks % 2 != 0)
583			return (0);
584		if (!force && (disks != 4))
585			return (0);
586		break;
587	case G_RAID_VOLUME_RL_SINGLE:
588		if (disks != 1)
589			return (0);
590		break;
591	case G_RAID_VOLUME_RL_CONCAT:
592		if (disks < 2)
593			return (0);
594		break;
595	case G_RAID_VOLUME_RL_RAID5:
596		if (disks < 3)
597			return (0);
598		break;
599	default:
600		return (0);
601	}
602	if (qual != G_RAID_VOLUME_RLQ_NONE)
603		return (0);
604	return (1);
605}
606
607static int
608g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
609    struct g_raid_volume *vol)
610{
611	struct g_raid_softc *sc;
612	struct g_raid_subdisk *sd;
613	struct g_raid_md_promise_perdisk *pd;
614	struct g_raid_md_promise_pervolume *pv;
615	struct promise_raid_conf *meta;
616	off_t size;
617	int disk_pos, md_disk_pos, i, resurrection = 0;
618	uint32_t eoff, esize;
619
620	sc = disk->d_softc;
621	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
622
623	pv = vol->v_md_data;
624	meta = pv->pv_meta;
625
626	if (sdn >= 0) {
627		/* Find disk position in metadata by it's serial. */
628		md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
629		/* For RAID0+1 we need to translate order. */
630		disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
631	} else {
632		md_disk_pos = -1;
633		disk_pos = -1;
634	}
635	if (disk_pos < 0) {
636		G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
637		    g_raid_get_diskname(disk), vol->v_name);
638		/* Failed stale disk is useless for us. */
639		if (sdn >= 0 &&
640		    pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
641			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
642			return (0);
643		}
644		/* If we were given specific metadata subdisk - erase it. */
645		if (sdn >= 0) {
646			free(pd->pd_meta[sdn], M_MD_PROMISE);
647			for (i = sdn; i < pd->pd_subdisks - 1; i++)
648				pd->pd_meta[i] = pd->pd_meta[i + 1];
649			pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL;
650			pd->pd_subdisks--;
651		}
652		/* If we are in the start process, that's all for now. */
653		if (!pv->pv_started)
654			goto nofit;
655		/*
656		 * If we have already started - try to get use of the disk.
657		 * Try to replace OFFLINE disks first, then FAILED.
658		 */
659		promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks,
660		    disk->d_consumer->provider->mediasize /
661		    disk->d_consumer->provider->sectorsize,
662		    &eoff, &esize);
663		if (esize == 0) {
664			G_RAID_DEBUG1(1, sc, "No free space on disk %s",
665			    g_raid_get_diskname(disk));
666			goto nofit;
667		}
668		size = INT64_MAX;
669		for (i = 0; i < vol->v_disks_count; i++) {
670			sd = &vol->v_subdisks[i];
671			if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
672				size = sd->sd_size;
673			if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
674			    (disk_pos < 0 ||
675			     vol->v_subdisks[i].sd_state < sd->sd_state))
676				disk_pos = i;
677		}
678		if (disk_pos >= 0 &&
679		    vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
680		    (off_t)esize * 512 < size) {
681			G_RAID_DEBUG1(1, sc, "Disk %s free space "
682			    "is too small (%ju < %ju)",
683			    g_raid_get_diskname(disk),
684			    (off_t)esize * 512, size);
685			disk_pos = -1;
686		}
687		if (disk_pos >= 0) {
688			if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
689				esize = size / 512;
690			/* For RAID0+1 we need to translate order. */
691			md_disk_pos = promise_meta_translate_disk(vol, disk_pos);
692		} else {
693nofit:
694			if (pd->pd_subdisks == 0) {
695				g_raid_change_disk_state(disk,
696				    G_RAID_DISK_S_SPARE);
697			}
698			return (0);
699		}
700		G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
701		    g_raid_get_diskname(disk), disk_pos, vol->v_name);
702		resurrection = 1;
703	}
704
705	sd = &vol->v_subdisks[disk_pos];
706
707	if (resurrection && sd->sd_disk != NULL) {
708		g_raid_change_disk_state(sd->sd_disk,
709		    G_RAID_DISK_S_STALE_FAILED);
710		TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
711		    sd, sd_next);
712	}
713	vol->v_subdisks[disk_pos].sd_disk = disk;
714	TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
715
716	/* Welcome the new disk. */
717	if (resurrection)
718		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
719	else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN)
720		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
721	else
722		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
723
724	if (resurrection) {
725		sd->sd_offset = (off_t)eoff * 512;
726		sd->sd_size = (off_t)esize * 512;
727	} else {
728		sd->sd_offset = (off_t)pd->pd_meta[sdn]->disk_offset * 512;
729		sd->sd_size = (off_t)pd->pd_meta[sdn]->disk_sectors * 512;
730	}
731
732	if (resurrection) {
733		/* Stale disk, almost same as new. */
734		g_raid_change_subdisk_state(sd,
735		    G_RAID_SUBDISK_S_NEW);
736	} else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) {
737		/* Failed disk. */
738		g_raid_change_subdisk_state(sd,
739		    G_RAID_SUBDISK_S_FAILED);
740	} else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) {
741		/* Rebuilding disk. */
742		g_raid_change_subdisk_state(sd,
743		    G_RAID_SUBDISK_S_REBUILD);
744		if (pd->pd_meta[sdn]->generation != meta->generation)
745			sd->sd_rebuild_pos = 0;
746		else {
747			sd->sd_rebuild_pos =
748			    (off_t)pd->pd_meta[sdn]->rebuild_lba * 512;
749		}
750	} else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) {
751		/* Rebuilding disk. */
752		g_raid_change_subdisk_state(sd,
753		    G_RAID_SUBDISK_S_NEW);
754	} else if (pd->pd_meta[sdn]->generation != meta->generation ||
755	    (meta->status & PROMISE_S_MARKED)) {
756		/* Stale disk or dirty volume (unclean shutdown). */
757		g_raid_change_subdisk_state(sd,
758		    G_RAID_SUBDISK_S_STALE);
759	} else {
760		/* Up to date disk. */
761		g_raid_change_subdisk_state(sd,
762		    G_RAID_SUBDISK_S_ACTIVE);
763	}
764	g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
765	    G_RAID_EVENT_SUBDISK);
766
767	return (resurrection);
768}
769
770static void
771g_raid_md_promise_refill(struct g_raid_softc *sc)
772{
773	struct g_raid_volume *vol;
774	struct g_raid_subdisk *sd;
775	struct g_raid_disk *disk;
776	struct g_raid_md_object *md;
777	struct g_raid_md_promise_perdisk *pd;
778	struct g_raid_md_promise_pervolume *pv;
779	int update, updated, i, bad;
780
781	md = sc->sc_md;
782restart:
783	updated = 0;
784	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
785		pv = vol->v_md_data;
786		if (!pv->pv_started || vol->v_stopping)
787			continue;
788
789		/* Search for subdisk that needs replacement. */
790		bad = 0;
791		for (i = 0; i < vol->v_disks_count; i++) {
792			sd = &vol->v_subdisks[i];
793			if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
794			    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
795			        bad = 1;
796		}
797		if (!bad)
798			continue;
799
800		G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
801		    "trying to refill.", vol->v_name);
802
803		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
804			/* Skip failed. */
805			if (disk->d_state < G_RAID_DISK_S_SPARE)
806				continue;
807			/* Skip already used by this volume. */
808			for (i = 0; i < vol->v_disks_count; i++) {
809				sd = &vol->v_subdisks[i];
810				if (sd->sd_disk == disk)
811					break;
812			}
813			if (i < vol->v_disks_count)
814				continue;
815
816			/* Try to use disk if it has empty extents. */
817			pd = disk->d_md_data;
818			if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) {
819				update =
820				    g_raid_md_promise_start_disk(disk, -1, vol);
821			} else
822				update = 0;
823			if (update) {
824				updated = 1;
825				g_raid_md_write_promise(md, vol, NULL, disk);
826				break;
827			}
828		}
829	}
830	if (updated)
831		goto restart;
832}
833
834static void
835g_raid_md_promise_start(struct g_raid_volume *vol)
836{
837	struct g_raid_softc *sc;
838	struct g_raid_subdisk *sd;
839	struct g_raid_disk *disk;
840	struct g_raid_md_object *md;
841	struct g_raid_md_promise_perdisk *pd;
842	struct g_raid_md_promise_pervolume *pv;
843	struct promise_raid_conf *meta;
844	int i;
845
846	sc = vol->v_softc;
847	md = sc->sc_md;
848	pv = vol->v_md_data;
849	meta = pv->pv_meta;
850
851	if (meta->type == PROMISE_T_RAID0)
852		vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
853	else if (meta->type == PROMISE_T_RAID1) {
854		if (meta->array_width == 1)
855			vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
856		else
857			vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
858	} else if (meta->type == PROMISE_T_RAID3)
859		vol->v_raid_level = G_RAID_VOLUME_RL_RAID3;
860	else if (meta->type == PROMISE_T_RAID5)
861		vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
862	else if (meta->type == PROMISE_T_SPAN)
863		vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT;
864	else if (meta->type == PROMISE_T_JBOD)
865		vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE;
866	else
867		vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
868	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
869	vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ
870	vol->v_disks_count = meta->total_disks;
871	vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ
872	if (meta->total_sectors_high < 256) /* If value looks sane. */
873		vol->v_mediasize |=
874		    ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ
875	vol->v_sectorsize = 512; //ZZZ
876	for (i = 0; i < vol->v_disks_count; i++) {
877		sd = &vol->v_subdisks[i];
878		sd->sd_offset = (off_t)meta->disk_offset * 512; //ZZZ
879		sd->sd_size = (off_t)meta->disk_sectors * 512; //ZZZ
880	}
881	g_raid_start_volume(vol);
882
883	/* Make all disks found till the moment take their places. */
884	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
885		pd = disk->d_md_data;
886		for (i = 0; i < pd->pd_subdisks; i++) {
887			if (pd->pd_meta[i]->volume_id == meta->volume_id)
888				g_raid_md_promise_start_disk(disk, i, vol);
889		}
890	}
891
892	pv->pv_started = 1;
893	callout_stop(&pv->pv_start_co);
894	G_RAID_DEBUG1(0, sc, "Volume started.");
895	g_raid_md_write_promise(md, vol, NULL, NULL);
896
897	/* Pickup any STALE/SPARE disks to refill array if needed. */
898	g_raid_md_promise_refill(sc);
899
900	g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
901}
902
903static void
904g_raid_promise_go(void *arg)
905{
906	struct g_raid_volume *vol;
907	struct g_raid_softc *sc;
908	struct g_raid_md_promise_pervolume *pv;
909
910	vol = arg;
911	pv = vol->v_md_data;
912	sc = vol->v_softc;
913	if (!pv->pv_started) {
914		G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
915		g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
916		    G_RAID_EVENT_VOLUME);
917	}
918}
919
920static void
921g_raid_md_promise_new_disk(struct g_raid_disk *disk)
922{
923	struct g_raid_softc *sc;
924	struct g_raid_md_object *md;
925	struct promise_raid_conf *pdmeta;
926	struct g_raid_md_promise_perdisk *pd;
927	struct g_raid_md_promise_pervolume *pv;
928	struct g_raid_volume *vol;
929	int i;
930	char buf[33];
931
932	sc = disk->d_softc;
933	md = sc->sc_md;
934	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
935
936	if (pd->pd_subdisks == 0) {
937		g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
938		g_raid_md_promise_refill(sc);
939		return;
940	}
941
942	for (i = 0; i < pd->pd_subdisks; i++) {
943		pdmeta = pd->pd_meta[i];
944
945		/* Look for volume with matching ID. */
946		vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
947		if (vol == NULL) {
948			promise_meta_get_name(pdmeta, buf);
949			vol = g_raid_create_volume(sc, buf, pdmeta->array_number);
950			pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
951			pv->pv_id = pdmeta->volume_id;
952			vol->v_md_data = pv;
953			callout_init(&pv->pv_start_co, 1);
954			callout_reset(&pv->pv_start_co,
955			    g_raid_start_timeout * hz,
956			    g_raid_promise_go, vol);
957		} else
958			pv = vol->v_md_data;
959
960		/* If we haven't started yet - check metadata freshness. */
961		if (pv->pv_meta == NULL || !pv->pv_started) {
962			if (pv->pv_meta == NULL ||
963			    ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) {
964				G_RAID_DEBUG1(1, sc, "Newer disk");
965				if (pv->pv_meta != NULL)
966					free(pv->pv_meta, M_MD_PROMISE);
967				pv->pv_meta = promise_meta_copy(pdmeta);
968				pv->pv_generation = pv->pv_meta->generation;
969				pv->pv_disks_present = 1;
970			} else if (pdmeta->generation == pv->pv_generation) {
971				pv->pv_disks_present++;
972				G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
973				    pv->pv_disks_present,
974				    pv->pv_meta->total_disks);
975			} else {
976				G_RAID_DEBUG1(1, sc, "Older disk");
977			}
978		}
979	}
980
981	for (i = 0; i < pd->pd_subdisks; i++) {
982		pdmeta = pd->pd_meta[i];
983
984		/* Look for volume with matching ID. */
985		vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
986		if (vol == NULL)
987			continue;
988		pv = vol->v_md_data;
989
990		if (pv->pv_started) {
991			if (g_raid_md_promise_start_disk(disk, i, vol))
992				g_raid_md_write_promise(md, vol, NULL, NULL);
993		} else {
994			/* If we collected all needed disks - start array. */
995			if (pv->pv_disks_present == pv->pv_meta->total_disks)
996				g_raid_md_promise_start(vol);
997		}
998	}
999}
1000
1001static int
1002g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp,
1003    struct g_geom **gp)
1004{
1005	struct g_geom *geom;
1006	struct g_raid_softc *sc;
1007
1008	/* Search for existing node. */
1009	LIST_FOREACH(geom, &mp->geom, geom) {
1010		sc = geom->softc;
1011		if (sc == NULL)
1012			continue;
1013		if (sc->sc_stopping != 0)
1014			continue;
1015		if (sc->sc_md->mdo_class != md->mdo_class)
1016			continue;
1017		break;
1018	}
1019	if (geom != NULL) {
1020		*gp = geom;
1021		return (G_RAID_MD_TASTE_EXISTING);
1022	}
1023
1024	/* Create new one if not found. */
1025	sc = g_raid_create_node(mp, "Promise", md);
1026	if (sc == NULL)
1027		return (G_RAID_MD_TASTE_FAIL);
1028	md->mdo_softc = sc;
1029	*gp = sc->sc_geom;
1030	return (G_RAID_MD_TASTE_NEW);
1031}
1032
1033static int
1034g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
1035                              struct g_consumer *cp, struct g_geom **gp)
1036{
1037	struct g_consumer *rcp;
1038	struct g_provider *pp;
1039	struct g_raid_softc *sc;
1040	struct g_raid_disk *disk;
1041	struct promise_raid_conf *meta, *metaarr[4];
1042	struct g_raid_md_promise_perdisk *pd;
1043	struct g_geom *geom;
1044	int error, i, j, result, len, subdisks;
1045	char name[16];
1046	uint16_t vendor;
1047
1048	G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
1049	pp = cp->provider;
1050
1051	/* Read metadata from device. */
1052	meta = NULL;
1053	vendor = 0xffff;
1054	if (g_access(cp, 1, 0, 0) != 0)
1055		return (G_RAID_MD_TASTE_FAIL);
1056	g_topology_unlock();
1057	len = 2;
1058	if (pp->geom->rank == 1)
1059		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
1060	subdisks = promise_meta_read(cp, metaarr);
1061	g_topology_lock();
1062	g_access(cp, -1, 0, 0);
1063	if (subdisks == 0) {
1064		if (g_raid_aggressive_spare) {
1065			if (vendor == 0x105a || vendor == 0x1002) {
1066				G_RAID_DEBUG(1,
1067				    "No Promise metadata, forcing spare.");
1068				goto search;
1069			} else {
1070				G_RAID_DEBUG(1,
1071				    "Promise/ATI vendor mismatch "
1072				    "0x%04x != 0x105a/0x1002",
1073				    vendor);
1074			}
1075		}
1076		return (G_RAID_MD_TASTE_FAIL);
1077	}
1078
1079	/* Metadata valid. Print it. */
1080	for (i = 0; i < subdisks; i++)
1081		g_raid_md_promise_print(metaarr[i]);
1082
1083	/* Purge meaningless (empty/spare) records. */
1084	for (i = 0; i < subdisks; ) {
1085		if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
1086			i++;
1087			continue;
1088		}
1089		free(metaarr[i], M_MD_PROMISE);
1090		for (j = i; j < subdisks - 1; j++)
1091			metaarr[i] = metaarr[j + 1];
1092		metaarr[PROMISE_MAX_SUBDISKS - 1] = NULL;
1093		subdisks--;
1094	}
1095
1096search:
1097	/* Search for matching node. */
1098	sc = NULL;
1099	LIST_FOREACH(geom, &mp->geom, geom) {
1100		sc = geom->softc;
1101		if (sc == NULL)
1102			continue;
1103		if (sc->sc_stopping != 0)
1104			continue;
1105		if (sc->sc_md->mdo_class != md->mdo_class)
1106			continue;
1107		break;
1108	}
1109
1110	/* Found matching node. */
1111	if (geom != NULL) {
1112		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
1113		result = G_RAID_MD_TASTE_EXISTING;
1114
1115	} else { /* Not found matching node -- create one. */
1116		result = G_RAID_MD_TASTE_NEW;
1117		snprintf(name, sizeof(name), "Promise");
1118		sc = g_raid_create_node(mp, name, md);
1119		md->mdo_softc = sc;
1120		geom = sc->sc_geom;
1121	}
1122
1123	rcp = g_new_consumer(geom);
1124	g_attach(rcp, pp);
1125	if (g_access(rcp, 1, 1, 1) != 0)
1126		; //goto fail1;
1127
1128	g_topology_unlock();
1129	sx_xlock(&sc->sc_lock);
1130
1131	pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1132	pd->pd_subdisks = subdisks;
1133	for (i = 0; i < subdisks; i++)
1134		pd->pd_meta[i] = metaarr[i];
1135	disk = g_raid_create_disk(sc);
1136	disk->d_md_data = (void *)pd;
1137	disk->d_consumer = rcp;
1138	rcp->private = disk;
1139
1140	/* Read kernel dumping information. */
1141	disk->d_kd.offset = 0;
1142	disk->d_kd.length = OFF_MAX;
1143	len = sizeof(disk->d_kd);
1144	error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd);
1145	if (disk->d_kd.di.dumper == NULL)
1146		G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.",
1147		    rcp->provider->name, error);
1148
1149	g_raid_md_promise_new_disk(disk);
1150
1151	sx_xunlock(&sc->sc_lock);
1152	g_topology_lock();
1153	*gp = geom;
1154	return (result);
1155}
1156
1157static int
1158g_raid_md_event_promise(struct g_raid_md_object *md,
1159    struct g_raid_disk *disk, u_int event)
1160{
1161	struct g_raid_softc *sc;
1162
1163	sc = md->mdo_softc;
1164	if (disk == NULL)
1165		return (-1);
1166	switch (event) {
1167	case G_RAID_DISK_E_DISCONNECTED:
1168		/* Delete disk. */
1169		g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
1170		g_raid_destroy_disk(disk);
1171		g_raid_md_promise_purge_volumes(sc);
1172
1173		/* Write updated metadata to all disks. */
1174		g_raid_md_write_promise(md, NULL, NULL, NULL);
1175
1176		/* Check if anything left. */
1177		if (g_raid_ndisks(sc, -1) == 0)
1178			g_raid_destroy_node(sc, 0);
1179		else
1180			g_raid_md_promise_refill(sc);
1181		return (0);
1182	}
1183	return (-2);
1184}
1185
1186static int
1187g_raid_md_volume_event_promise(struct g_raid_md_object *md,
1188    struct g_raid_volume *vol, u_int event)
1189{
1190	struct g_raid_md_promise_pervolume *pv;
1191
1192	pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1193	switch (event) {
1194	case G_RAID_VOLUME_E_STARTMD:
1195		if (!pv->pv_started)
1196			g_raid_md_promise_start(vol);
1197		return (0);
1198	}
1199	return (-2);
1200}
1201
1202static int
1203g_raid_md_ctl_promise(struct g_raid_md_object *md,
1204    struct gctl_req *req)
1205{
1206	struct g_raid_softc *sc;
1207	struct g_raid_volume *vol, *vol1;
1208	struct g_raid_subdisk *sd;
1209	struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS];
1210	struct g_raid_md_promise_perdisk *pd;
1211	struct g_raid_md_promise_pervolume *pv;
1212	struct g_consumer *cp;
1213	struct g_provider *pp;
1214	char arg[16];
1215	const char *verb, *volname, *levelname, *diskname;
1216	char *tmp;
1217	int *nargs, *force;
1218	off_t size, sectorsize, strip;
1219	intmax_t *sizearg, *striparg;
1220	uint32_t offs[PROMISE_MAX_DISKS], esize;
1221	int numdisks, i, len, level, qual;
1222	int error;
1223
1224	sc = md->mdo_softc;
1225	verb = gctl_get_param(req, "verb", NULL);
1226	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1227	error = 0;
1228	if (strcmp(verb, "label") == 0) {
1229
1230		if (*nargs < 4) {
1231			gctl_error(req, "Invalid number of arguments.");
1232			return (-1);
1233		}
1234		volname = gctl_get_asciiparam(req, "arg1");
1235		if (volname == NULL) {
1236			gctl_error(req, "No volume name.");
1237			return (-2);
1238		}
1239		levelname = gctl_get_asciiparam(req, "arg2");
1240		if (levelname == NULL) {
1241			gctl_error(req, "No RAID level.");
1242			return (-3);
1243		}
1244		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1245			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1246			return (-4);
1247		}
1248		numdisks = *nargs - 3;
1249		force = gctl_get_paraml(req, "force", sizeof(*force));
1250		if (!g_raid_md_promise_supported(level, qual, numdisks,
1251		    force ? *force : 0)) {
1252			gctl_error(req, "Unsupported RAID level "
1253			    "(0x%02x/0x%02x), or number of disks (%d).",
1254			    level, qual, numdisks);
1255			return (-5);
1256		}
1257
1258		/* Search for disks, connect them and probe. */
1259		size = INT64_MAX;
1260		sectorsize = 0;
1261		bzero(disks, sizeof(disks));
1262		bzero(offs, sizeof(offs));
1263		for (i = 0; i < numdisks; i++) {
1264			snprintf(arg, sizeof(arg), "arg%d", i + 3);
1265			diskname = gctl_get_asciiparam(req, arg);
1266			if (diskname == NULL) {
1267				gctl_error(req, "No disk name (%s).", arg);
1268				error = -6;
1269				break;
1270			}
1271			if (strcmp(diskname, "NONE") == 0)
1272				continue;
1273
1274			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1275				if (disk->d_consumer != NULL &&
1276				    disk->d_consumer->provider != NULL &&
1277				    strcmp(disk->d_consumer->provider->name,
1278				     diskname) == 0)
1279					break;
1280			}
1281			if (disk != NULL) {
1282				if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
1283					gctl_error(req, "Disk '%s' is in a "
1284					    "wrong state (%s).", diskname,
1285					    g_raid_disk_state2str(disk->d_state));
1286					error = -7;
1287					break;
1288				}
1289				pd = disk->d_md_data;
1290				if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) {
1291					gctl_error(req, "Disk '%s' already "
1292					    "used by %d volumes.",
1293					    diskname, pd->pd_subdisks);
1294					error = -7;
1295					break;
1296				}
1297				pp = disk->d_consumer->provider;
1298				disks[i] = disk;
1299				promise_meta_unused_range(pd->pd_meta,
1300				    pd->pd_subdisks,
1301				    pp->mediasize / pp->sectorsize,
1302				    &offs[i], &esize);
1303				size = MIN(size, (off_t)esize * pp->sectorsize);
1304				sectorsize = MAX(sectorsize, pp->sectorsize);
1305				continue;
1306			}
1307
1308			g_topology_lock();
1309			cp = g_raid_open_consumer(sc, diskname);
1310			if (cp == NULL) {
1311				gctl_error(req, "Can't open disk '%s'.",
1312				    diskname);
1313				g_topology_unlock();
1314				error = -8;
1315				break;
1316			}
1317			pp = cp->provider;
1318			pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1319			disk = g_raid_create_disk(sc);
1320			disk->d_md_data = (void *)pd;
1321			disk->d_consumer = cp;
1322			disks[i] = disk;
1323			cp->private = disk;
1324			g_topology_unlock();
1325
1326			if (pp->mediasize / pp->sectorsize > UINT32_MAX) {
1327				gctl_error(req,
1328				    "Disk '%s' is too big.", diskname);
1329				error = -8;
1330				break;
1331			}
1332
1333			/* Read kernel dumping information. */
1334			disk->d_kd.offset = 0;
1335			disk->d_kd.length = OFF_MAX;
1336			len = sizeof(disk->d_kd);
1337			g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
1338			if (disk->d_kd.di.dumper == NULL)
1339				G_RAID_DEBUG1(2, sc,
1340				    "Dumping not supported by %s.",
1341				    cp->provider->name);
1342
1343			/* Reserve some space for metadata. */
1344			size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize);
1345			sectorsize = MAX(sectorsize, pp->sectorsize);
1346		}
1347		if (error != 0) {
1348			for (i = 0; i < numdisks; i++) {
1349				if (disks[i] != NULL &&
1350				    disks[i]->d_state == G_RAID_DISK_S_NONE)
1351					g_raid_destroy_disk(disks[i]);
1352			}
1353			return (error);
1354		}
1355
1356		if (sectorsize <= 0) {
1357			gctl_error(req, "Can't get sector size.");
1358			return (-8);
1359		}
1360
1361		/* Handle size argument. */
1362		len = sizeof(*sizearg);
1363		sizearg = gctl_get_param(req, "size", &len);
1364		if (sizearg != NULL && len == sizeof(*sizearg) &&
1365		    *sizearg > 0) {
1366			if (*sizearg > size) {
1367				gctl_error(req, "Size too big %lld > %lld.",
1368				    (long long)*sizearg, (long long)size);
1369				return (-9);
1370			}
1371			size = *sizearg;
1372		}
1373
1374		/* Handle strip argument. */
1375		strip = 131072;
1376		len = sizeof(*striparg);
1377		striparg = gctl_get_param(req, "strip", &len);
1378		if (striparg != NULL && len == sizeof(*striparg) &&
1379		    *striparg > 0) {
1380			if (*striparg < sectorsize) {
1381				gctl_error(req, "Strip size too small.");
1382				return (-10);
1383			}
1384			if (*striparg % sectorsize != 0) {
1385				gctl_error(req, "Incorrect strip size.");
1386				return (-11);
1387			}
1388			strip = *striparg;
1389		}
1390
1391		/* Round size down to strip or sector. */
1392		if (level == G_RAID_VOLUME_RL_RAID1 ||
1393		    level == G_RAID_VOLUME_RL_SINGLE ||
1394		    level == G_RAID_VOLUME_RL_CONCAT)
1395			size -= (size % sectorsize);
1396		else if (level == G_RAID_VOLUME_RL_RAID1E &&
1397		    (numdisks & 1) != 0)
1398			size -= (size % (2 * strip));
1399		else
1400			size -= (size % strip);
1401		if (size <= 0) {
1402			gctl_error(req, "Size too small.");
1403			return (-13);
1404		}
1405		if (size > 0xffffffffllu * sectorsize) {
1406			gctl_error(req, "Size too big.");
1407			return (-14);
1408		}
1409
1410		/* We have all we need, create things: volume, ... */
1411		pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
1412		arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0);
1413		pv->pv_generation = 0;
1414		pv->pv_started = 1;
1415		vol = g_raid_create_volume(sc, volname, -1);
1416		vol->v_md_data = pv;
1417		vol->v_raid_level = level;
1418		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
1419		vol->v_strip_size = strip;
1420		vol->v_disks_count = numdisks;
1421		if (level == G_RAID_VOLUME_RL_RAID0 ||
1422		    level == G_RAID_VOLUME_RL_CONCAT ||
1423		    level == G_RAID_VOLUME_RL_SINGLE)
1424			vol->v_mediasize = size * numdisks;
1425		else if (level == G_RAID_VOLUME_RL_RAID1)
1426			vol->v_mediasize = size;
1427		else if (level == G_RAID_VOLUME_RL_RAID3 ||
1428		    level == G_RAID_VOLUME_RL_RAID5)
1429			vol->v_mediasize = size * (numdisks - 1);
1430		else { /* RAID1E */
1431			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1432			    strip;
1433		}
1434		vol->v_sectorsize = sectorsize;
1435		g_raid_start_volume(vol);
1436
1437		/* , and subdisks. */
1438		for (i = 0; i < numdisks; i++) {
1439			disk = disks[i];
1440			sd = &vol->v_subdisks[i];
1441			sd->sd_disk = disk;
1442			sd->sd_offset = (off_t)offs[i] * 512;
1443			sd->sd_size = size;
1444			if (disk == NULL)
1445				continue;
1446			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1447			g_raid_change_disk_state(disk,
1448			    G_RAID_DISK_S_ACTIVE);
1449			g_raid_change_subdisk_state(sd,
1450			    G_RAID_SUBDISK_S_ACTIVE);
1451			g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1452			    G_RAID_EVENT_SUBDISK);
1453		}
1454
1455		/* Write metadata based on created entities. */
1456		G_RAID_DEBUG1(0, sc, "Array started.");
1457		g_raid_md_write_promise(md, vol, NULL, NULL);
1458
1459		/* Pickup any STALE/SPARE disks to refill array if needed. */
1460		g_raid_md_promise_refill(sc);
1461
1462		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1463		    G_RAID_EVENT_VOLUME);
1464		return (0);
1465	}
1466	if (strcmp(verb, "add") == 0) {
1467
1468		gctl_error(req, "`add` command is not applicable, "
1469		    "use `label` instead.");
1470		return (-99);
1471	}
1472	if (strcmp(verb, "delete") == 0) {
1473
1474		/* Full node destruction. */
1475		if (*nargs == 1) {
1476			/* Check if some volume is still open. */
1477			force = gctl_get_paraml(req, "force", sizeof(*force));
1478			if (force != NULL && *force == 0 &&
1479			    g_raid_nopens(sc) != 0) {
1480				gctl_error(req, "Some volume is still open.");
1481				return (-4);
1482			}
1483
1484			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1485				if (disk->d_consumer)
1486					promise_meta_erase(disk->d_consumer);
1487			}
1488			g_raid_destroy_node(sc, 0);
1489			return (0);
1490		}
1491
1492		/* Destroy specified volume. If it was last - all node. */
1493		if (*nargs != 2) {
1494			gctl_error(req, "Invalid number of arguments.");
1495			return (-1);
1496		}
1497		volname = gctl_get_asciiparam(req, "arg1");
1498		if (volname == NULL) {
1499			gctl_error(req, "No volume name.");
1500			return (-2);
1501		}
1502
1503		/* Search for volume. */
1504		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1505			if (strcmp(vol->v_name, volname) == 0)
1506				break;
1507		}
1508		if (vol == NULL) {
1509			i = strtol(volname, &tmp, 10);
1510			if (verb != volname && tmp[0] == 0) {
1511				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1512					if (vol->v_global_id == i)
1513						break;
1514				}
1515			}
1516		}
1517		if (vol == NULL) {
1518			gctl_error(req, "Volume '%s' not found.", volname);
1519			return (-3);
1520		}
1521
1522		/* Check if volume is still open. */
1523		force = gctl_get_paraml(req, "force", sizeof(*force));
1524		if (force != NULL && *force == 0 &&
1525		    vol->v_provider_open != 0) {
1526			gctl_error(req, "Volume is still open.");
1527			return (-4);
1528		}
1529
1530		/* Destroy volume and potentially node. */
1531		i = 0;
1532		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
1533			i++;
1534		if (i >= 2) {
1535			g_raid_destroy_volume(vol);
1536			g_raid_md_promise_purge_disks(sc);
1537			g_raid_md_write_promise(md, NULL, NULL, NULL);
1538		} else {
1539			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1540				if (disk->d_consumer)
1541					promise_meta_erase(disk->d_consumer);
1542			}
1543			g_raid_destroy_node(sc, 0);
1544		}
1545		return (0);
1546	}
1547	if (strcmp(verb, "remove") == 0 ||
1548	    strcmp(verb, "fail") == 0) {
1549		if (*nargs < 2) {
1550			gctl_error(req, "Invalid number of arguments.");
1551			return (-1);
1552		}
1553		for (i = 1; i < *nargs; i++) {
1554			snprintf(arg, sizeof(arg), "arg%d", i);
1555			diskname = gctl_get_asciiparam(req, arg);
1556			if (diskname == NULL) {
1557				gctl_error(req, "No disk name (%s).", arg);
1558				error = -2;
1559				break;
1560			}
1561			if (strncmp(diskname, "/dev/", 5) == 0)
1562				diskname += 5;
1563
1564			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1565				if (disk->d_consumer != NULL &&
1566				    disk->d_consumer->provider != NULL &&
1567				    strcmp(disk->d_consumer->provider->name,
1568				     diskname) == 0)
1569					break;
1570			}
1571			if (disk == NULL) {
1572				gctl_error(req, "Disk '%s' not found.",
1573				    diskname);
1574				error = -3;
1575				break;
1576			}
1577
1578			if (strcmp(verb, "fail") == 0) {
1579				g_raid_md_fail_disk_promise(md, NULL, disk);
1580				continue;
1581			}
1582
1583			/* Erase metadata on deleting disk and destroy it. */
1584			promise_meta_erase(disk->d_consumer);
1585			g_raid_destroy_disk(disk);
1586		}
1587		g_raid_md_promise_purge_volumes(sc);
1588
1589		/* Write updated metadata to remaining disks. */
1590		g_raid_md_write_promise(md, NULL, NULL, NULL);
1591
1592		/* Check if anything left. */
1593		if (g_raid_ndisks(sc, -1) == 0)
1594			g_raid_destroy_node(sc, 0);
1595		else
1596			g_raid_md_promise_refill(sc);
1597		return (error);
1598	}
1599	if (strcmp(verb, "insert") == 0) {
1600		if (*nargs < 2) {
1601			gctl_error(req, "Invalid number of arguments.");
1602			return (-1);
1603		}
1604		for (i = 1; i < *nargs; i++) {
1605			/* Get disk name. */
1606			snprintf(arg, sizeof(arg), "arg%d", i);
1607			diskname = gctl_get_asciiparam(req, arg);
1608			if (diskname == NULL) {
1609				gctl_error(req, "No disk name (%s).", arg);
1610				error = -3;
1611				break;
1612			}
1613
1614			/* Try to find provider with specified name. */
1615			g_topology_lock();
1616			cp = g_raid_open_consumer(sc, diskname);
1617			if (cp == NULL) {
1618				gctl_error(req, "Can't open disk '%s'.",
1619				    diskname);
1620				g_topology_unlock();
1621				error = -4;
1622				break;
1623			}
1624			pp = cp->provider;
1625			g_topology_unlock();
1626
1627			if (pp->mediasize / pp->sectorsize > UINT32_MAX) {
1628				gctl_error(req,
1629				    "Disk '%s' is too big.", diskname);
1630				g_raid_kill_consumer(sc, cp);
1631				error = -8;
1632				break;
1633			}
1634
1635			pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1636
1637			disk = g_raid_create_disk(sc);
1638			disk->d_consumer = cp;
1639			disk->d_md_data = (void *)pd;
1640			cp->private = disk;
1641
1642			/* Read kernel dumping information. */
1643			disk->d_kd.offset = 0;
1644			disk->d_kd.length = OFF_MAX;
1645			len = sizeof(disk->d_kd);
1646			g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
1647			if (disk->d_kd.di.dumper == NULL)
1648				G_RAID_DEBUG1(2, sc,
1649				    "Dumping not supported by %s.",
1650				    cp->provider->name);
1651
1652			/* Welcome the "new" disk. */
1653			g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
1654			promise_meta_write_spare(cp);
1655			g_raid_md_promise_refill(sc);
1656		}
1657		return (error);
1658	}
1659	return (-100);
1660}
1661
1662static int
1663g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol,
1664    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1665{
1666	struct g_raid_softc *sc;
1667	struct g_raid_volume *vol;
1668	struct g_raid_subdisk *sd;
1669	struct g_raid_disk *disk;
1670	struct g_raid_md_promise_perdisk *pd;
1671	struct g_raid_md_promise_pervolume *pv;
1672	struct promise_raid_conf *meta;
1673	off_t rebuild_lba64;
1674	int i, j, pos, rebuild;
1675
1676	sc = md->mdo_softc;
1677
1678	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1679		return (0);
1680
1681	/* Generate new per-volume metadata for affected volumes. */
1682	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1683		if (vol->v_stopping)
1684			continue;
1685
1686		/* Skip volumes not related to specified targets. */
1687		if (tvol != NULL && vol != tvol)
1688			continue;
1689		if (tsd != NULL && vol != tsd->sd_volume)
1690			continue;
1691		if (tdisk != NULL) {
1692			for (i = 0; i < vol->v_disks_count; i++) {
1693				if (vol->v_subdisks[i].sd_disk == tdisk)
1694					break;
1695			}
1696			if (i >= vol->v_disks_count)
1697				continue;
1698		}
1699
1700		pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1701		pv->pv_generation++;
1702
1703		meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
1704		if (pv->pv_meta != NULL)
1705			memcpy(meta, pv->pv_meta, sizeof(*meta));
1706		memcpy(meta->promise_id, PROMISE_MAGIC,
1707		    sizeof(PROMISE_MAGIC) - 1);
1708		meta->dummy_0 = 0x00020000;
1709		meta->integrity = PROMISE_I_VALID;
1710
1711		meta->generation = pv->pv_generation;
1712		meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE |
1713		    PROMISE_S_INITED | PROMISE_S_READY;
1714		if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
1715			meta->status |= PROMISE_S_DEGRADED;
1716		if (vol->v_dirty)
1717			meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */
1718		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 ||
1719		    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE)
1720			meta->type = PROMISE_T_RAID0;
1721		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1722		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1723			meta->type = PROMISE_T_RAID1;
1724		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3)
1725			meta->type = PROMISE_T_RAID3;
1726		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
1727			meta->type = PROMISE_T_RAID5;
1728		else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT)
1729			meta->type = PROMISE_T_SPAN;
1730		else
1731			meta->type = PROMISE_T_JBOD;
1732		meta->total_disks = vol->v_disks_count;
1733		meta->stripe_shift = ffs(vol->v_strip_size / 1024);
1734		meta->array_width = vol->v_disks_count;
1735		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1736		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1737			meta->array_width /= 2;
1738		meta->array_number = vol->v_global_id;
1739		meta->total_sectors = vol->v_mediasize / vol->v_sectorsize;
1740		meta->total_sectors_high =
1741		    (vol->v_mediasize / vol->v_sectorsize) >> 32;
1742		meta->cylinders = meta->total_sectors / (255 * 63) - 1;
1743		meta->heads = 254;
1744		meta->sectors = 63;
1745		meta->volume_id = pv->pv_id;
1746		rebuild_lba64 = UINT64_MAX;
1747		rebuild = 0;
1748		for (i = 0; i < vol->v_disks_count; i++) {
1749			sd = &vol->v_subdisks[i];
1750			/* For RAID0+1 we need to translate order. */
1751			pos = promise_meta_translate_disk(vol, i);
1752			meta->disks[pos].flags = PROMISE_F_VALID |
1753			    PROMISE_F_ASSIGNED;
1754			if (sd->sd_state == G_RAID_SUBDISK_S_NONE) {
1755				meta->disks[pos].flags |= 0;
1756			} else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) {
1757				meta->disks[pos].flags |=
1758				    PROMISE_F_DOWN | PROMISE_F_REDIR;
1759			} else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) {
1760				meta->disks[pos].flags |=
1761				    PROMISE_F_ONLINE | PROMISE_F_REDIR;
1762				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1763					rebuild_lba64 = MIN(rebuild_lba64,
1764					    sd->sd_rebuild_pos / 512);
1765				} else
1766					rebuild_lba64 = 0;
1767				rebuild = 1;
1768			} else {
1769				meta->disks[pos].flags |= PROMISE_F_ONLINE;
1770				if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) {
1771					meta->status |= PROMISE_S_MARKED;
1772					if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
1773						rebuild_lba64 = MIN(rebuild_lba64,
1774						    sd->sd_rebuild_pos / 512);
1775					} else
1776						rebuild_lba64 = 0;
1777				}
1778			}
1779			if (pv->pv_meta != NULL) {
1780				meta->disks[pos].id = pv->pv_meta->disks[pos].id;
1781			} else {
1782				meta->disks[pos].number = i * 2;
1783				arc4rand(&meta->disks[pos].id,
1784				    sizeof(meta->disks[pos].id), 0);
1785			}
1786		}
1787		promise_meta_put_name(meta, vol->v_name);
1788
1789		/* Try to mimic AMD BIOS rebuild/resync behavior. */
1790		if (rebuild_lba64 != UINT64_MAX) {
1791			if (rebuild)
1792				meta->magic_3 = 0x03040010UL; /* Rebuild? */
1793			else
1794				meta->magic_3 = 0x03040008UL; /* Resync? */
1795			/* Translate from per-disk to per-volume LBA. */
1796			if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1797			    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
1798				rebuild_lba64 *= meta->array_width;
1799			} else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1800			    vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) {
1801				rebuild_lba64 *= meta->array_width - 1;
1802			} else
1803				rebuild_lba64 = 0;
1804		} else
1805			meta->magic_3 = 0x03000000UL;
1806		meta->rebuild_lba64 = rebuild_lba64;
1807		meta->magic_4 = 0x04010101UL;
1808
1809		/* Replace per-volume metadata with new. */
1810		if (pv->pv_meta != NULL)
1811			free(pv->pv_meta, M_MD_PROMISE);
1812		pv->pv_meta = meta;
1813
1814		/* Copy new metadata to the disks, adding or replacing old. */
1815		for (i = 0; i < vol->v_disks_count; i++) {
1816			sd = &vol->v_subdisks[i];
1817			disk = sd->sd_disk;
1818			if (disk == NULL)
1819				continue;
1820			/* For RAID0+1 we need to translate order. */
1821			pos = promise_meta_translate_disk(vol, i);
1822			pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1823			for (j = 0; j < pd->pd_subdisks; j++) {
1824				if (pd->pd_meta[j]->volume_id == meta->volume_id)
1825					break;
1826			}
1827			if (j == pd->pd_subdisks)
1828				pd->pd_subdisks++;
1829			if (pd->pd_meta[j] != NULL)
1830				free(pd->pd_meta[j], M_MD_PROMISE);
1831			pd->pd_meta[j] = promise_meta_copy(meta);
1832			pd->pd_meta[j]->disk = meta->disks[pos];
1833			pd->pd_meta[j]->disk.number = pos;
1834			pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
1835			pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
1836			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1837				pd->pd_meta[j]->rebuild_lba =
1838				    sd->sd_rebuild_pos / 512;
1839			} else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD)
1840				pd->pd_meta[j]->rebuild_lba = 0;
1841			else
1842				pd->pd_meta[j]->rebuild_lba = UINT32_MAX;
1843			pd->pd_updated = 1;
1844		}
1845	}
1846
1847	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1848		pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1849		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
1850			continue;
1851		if (!pd->pd_updated)
1852			continue;
1853		G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1854		    g_raid_get_diskname(disk));
1855		for (i = 0; i < pd->pd_subdisks; i++)
1856			g_raid_md_promise_print(pd->pd_meta[i]);
1857		promise_meta_write(disk->d_consumer,
1858		    pd->pd_meta, pd->pd_subdisks);
1859		pd->pd_updated = 0;
1860	}
1861
1862	return (0);
1863}
1864
1865static int
1866g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
1867    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1868{
1869	struct g_raid_softc *sc;
1870	struct g_raid_md_promise_perdisk *pd;
1871	struct g_raid_subdisk *sd;
1872	int i, pos;
1873
1874	sc = md->mdo_softc;
1875	pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
1876
1877	/* We can't fail disk that is not a part of array now. */
1878	if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
1879		return (-1);
1880
1881	/*
1882	 * Mark disk as failed in metadata and try to write that metadata
1883	 * to the disk itself to prevent it's later resurrection as STALE.
1884	 */
1885	if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
1886		G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1887		    g_raid_get_diskname(tdisk));
1888	for (i = 0; i < pd->pd_subdisks; i++) {
1889		pd->pd_meta[i]->disk.flags |=
1890		    PROMISE_F_DOWN | PROMISE_F_REDIR;
1891		pos = pd->pd_meta[i]->disk.number;
1892		if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
1893			pd->pd_meta[i]->disks[pos].flags |=
1894			    PROMISE_F_DOWN | PROMISE_F_REDIR;
1895		}
1896		g_raid_md_promise_print(pd->pd_meta[i]);
1897	}
1898	if (tdisk->d_consumer != NULL)
1899		promise_meta_write(tdisk->d_consumer,
1900		    pd->pd_meta, pd->pd_subdisks);
1901
1902	/* Change states. */
1903	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
1904	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
1905		g_raid_change_subdisk_state(sd,
1906		    G_RAID_SUBDISK_S_FAILED);
1907		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
1908		    G_RAID_EVENT_SUBDISK);
1909	}
1910
1911	/* Write updated metadata to remaining disks. */
1912	g_raid_md_write_promise(md, NULL, NULL, tdisk);
1913
1914	g_raid_md_promise_refill(sc);
1915	return (0);
1916}
1917
1918static int
1919g_raid_md_free_disk_promise(struct g_raid_md_object *md,
1920    struct g_raid_disk *disk)
1921{
1922	struct g_raid_md_promise_perdisk *pd;
1923	int i;
1924
1925	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1926	for (i = 0; i < pd->pd_subdisks; i++) {
1927		if (pd->pd_meta[i] != NULL) {
1928			free(pd->pd_meta[i], M_MD_PROMISE);
1929			pd->pd_meta[i] = NULL;
1930		}
1931	}
1932	free(pd, M_MD_PROMISE);
1933	disk->d_md_data = NULL;
1934	return (0);
1935}
1936
1937static int
1938g_raid_md_free_volume_promise(struct g_raid_md_object *md,
1939    struct g_raid_volume *vol)
1940{
1941	struct g_raid_md_promise_pervolume *pv;
1942
1943	pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1944	if (pv && pv->pv_meta != NULL) {
1945		free(pv->pv_meta, M_MD_PROMISE);
1946		pv->pv_meta = NULL;
1947	}
1948	if (pv && !pv->pv_started) {
1949		pv->pv_started = 1;
1950		callout_stop(&pv->pv_start_co);
1951	}
1952	return (0);
1953}
1954
/*
 * Metadata object destructor.  This implementation releases nothing at
 * this level and always returns 0.
 */
static int
g_raid_md_free_promise(struct g_raid_md_object *md)
{
	int error = 0;	/* nothing to tear down, so this cannot fail */

	return (error);
}
1961
1962G_RAID_MD_DECLARE(g_raid_md_promise);
1963