1/* $OpenBSD: softraid.c,v 1.430 2024/02/03 18:51:58 beck Exp $ */
2/*
3 * Copyright (c) 2007, 2008, 2009 Marco Peereboom <marco@peereboom.us>
4 * Copyright (c) 2008 Chris Kuethe <ckuethe@openbsd.org>
5 * Copyright (c) 2009 Joel Sing <jsing@openbsd.org>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20#include "bio.h"
21
22#include <sys/param.h>
23#include <sys/systm.h>
24#include <sys/buf.h>
25#include <sys/device.h>
26#include <sys/ioctl.h>
27#include <sys/malloc.h>
28#include <sys/pool.h>
29#include <sys/kernel.h>
30#include <sys/disk.h>
31#include <sys/rwlock.h>
32#include <sys/queue.h>
33#include <sys/fcntl.h>
34#include <sys/disklabel.h>
35#include <sys/vnode.h>
36#include <sys/lock.h>
37#include <sys/mount.h>
38#include <sys/sensors.h>
39#include <sys/stat.h>
40#include <sys/conf.h>
41#include <sys/uio.h>
42#include <sys/task.h>
43#include <sys/kthread.h>
44#include <sys/dkio.h>
45#include <sys/stdint.h>
46
47#include <scsi/scsi_all.h>
48#include <scsi/scsiconf.h>
49#include <scsi/scsi_disk.h>
50
51#include <dev/softraidvar.h>
52
53#ifdef HIBERNATE
54#include <lib/libsa/aes_xts.h>
55#include <sys/hibernate.h>
56#include <scsi/sdvar.h>
57#endif /* HIBERNATE */
58
59/* #define SR_FANCY_STATS */
60
#ifdef SR_DEBUG
#define SR_FANCY_STATS
/*
 * Bitmask of enabled SR_D_* debug categories; it is tested elsewhere in
 * this file as (sr_debug & SR_D_META).  Every category is commented out
 * by default, so the mask starts as 0.  Uncomment a line below to OR
 * that category into the initial value.
 */
uint32_t	sr_debug = 0
		    /* | SR_D_CMD */
		    /* | SR_D_MISC */
		    /* | SR_D_INTR */
		    /* | SR_D_IOCTL */
		    /* | SR_D_CCB */
		    /* | SR_D_WU */
		    /* | SR_D_META */
		    /* | SR_D_DIS */
		    /* | SR_D_STATE */
		    /* | SR_D_REBUILD */
		;
#endif
76
/*
 * File-scope globals with external linkage.
 * NOTE(review): none of these is referenced in the visible part of the
 * file; presumably softraid0 points at the attached softraid instance and
 * sr_bootuuid/sr_bootkey carry boot volume information -- confirm against
 * their users.
 */
struct sr_softc	*softraid0;
struct sr_uuid	sr_bootuuid;
u_int8_t	sr_bootkey[SR_CRYPTO_MAXKEYBYTES];
80
/* autoconf entry points referenced by softraid_ca below */
int		sr_match(struct device *, void *, void *);
void		sr_attach(struct device *, struct device *, void *);
int		sr_detach(struct device *, int);
void		sr_map_root(void);

/* attachment glue: softc size plus the match/attach/detach callbacks */
const struct cfattach softraid_ca = {
	sizeof(struct sr_softc), sr_match, sr_attach, sr_detach,
};

/* driver record for the "softraid" device name */
struct cfdriver softraid_cd = {
	NULL, "softraid", DV_DULL
};
93
/*
 * scsi & discipline
 *
 * sr_scsi_cmd, sr_scsi_probe and sr_scsi_ioctl are the callbacks placed in
 * the sr_switch adapter table further down; the remaining prototypes cover
 * the bio ioctl handlers and the discipline/chunk helpers.
 */
void			sr_scsi_cmd(struct scsi_xfer *);
int			sr_scsi_probe(struct scsi_link *);
int			sr_scsi_ioctl(struct scsi_link *, u_long,
			    caddr_t, int);
int			sr_bio_ioctl(struct device *, u_long, caddr_t);
int			sr_bio_handler(struct sr_softc *,
			    struct sr_discipline *, u_long, struct bio *);
int			sr_ioctl_inq(struct sr_softc *, struct bioc_inq *);
int			sr_ioctl_vol(struct sr_softc *, struct bioc_vol *);
int			sr_ioctl_disk(struct sr_softc *, struct bioc_disk *);
int			sr_ioctl_setstate(struct sr_softc *,
			    struct bioc_setstate *);
int			sr_ioctl_createraid(struct sr_softc *,
			    struct bioc_createraid *, int, void *);
int			sr_ioctl_deleteraid(struct sr_softc *,
			    struct sr_discipline *, struct bioc_deleteraid *);
int			sr_ioctl_discipline(struct sr_softc *,
			    struct sr_discipline *, struct bioc_discipline *);
int			sr_ioctl_installboot(struct sr_softc *,
			    struct sr_discipline *, struct bioc_installboot *);
void			sr_chunks_unwind(struct sr_softc *,
			    struct sr_chunk_head *);
void			sr_discipline_free(struct sr_discipline *);
void			sr_discipline_shutdown(struct sr_discipline *, int, int);
int			sr_discipline_init(struct sr_discipline *, int);
int			sr_alloc_resources(struct sr_discipline *);
void			sr_free_resources(struct sr_discipline *);
void			sr_set_chunk_state(struct sr_discipline *, int, int);
void			sr_set_vol_state(struct sr_discipline *);
124
/*
 * utility functions
 *
 * Of these only sr_rw() is defined in the visible part of the file; it
 * performs bounce-buffered block I/O on a device.
 */
void			sr_shutdown(int);
void			sr_uuid_generate(struct sr_uuid *);
char			*sr_uuid_format(struct sr_uuid *);
void			sr_uuid_print(struct sr_uuid *, int);
void			sr_checksum_print(u_int8_t *);
int			sr_boot_assembly(struct sr_softc *);
int			sr_already_assembled(struct sr_discipline *);
int			sr_hotspare(struct sr_softc *, dev_t);
void			sr_hotspare_rebuild(struct sr_discipline *);
int			sr_rebuild_init(struct sr_discipline *, dev_t, int);
void			sr_rebuild_start(void *);
void			sr_rebuild_thread(void *);
void			sr_rebuild(struct sr_discipline *);
void			sr_roam_chunks(struct sr_discipline *);
int			sr_chunk_in_use(struct sr_softc *, dev_t);
int			sr_rw(struct sr_softc *, dev_t, char *, size_t,
			    daddr_t, long);
void			sr_wu_done_callback(void *);
struct sr_discipline	*sr_find_discipline(struct sr_softc *sc, const char *);

/* don't include these on RAMDISK (compiled out when SMALL_KERNEL is set) */
#ifndef SMALL_KERNEL
void			sr_sensors_refresh(void *);
int			sr_sensors_create(struct sr_discipline *);
void			sr_sensors_delete(struct sr_discipline *);
#endif
152
/*
 * metadata
 *
 * Format-independent metadata helpers; they dispatch to the per-format
 * callbacks stored in the smd[] table.
 */
int			sr_meta_probe(struct sr_discipline *, dev_t *, int);
int			sr_meta_attach(struct sr_discipline *, int, int);
int			sr_meta_rw(struct sr_discipline *, dev_t, void *, long);
int			sr_meta_clear(struct sr_discipline *);
void			sr_meta_init(struct sr_discipline *, int, int);
void			sr_meta_init_complete(struct sr_discipline *);
void			sr_meta_opt_handler(struct sr_discipline *,
			    struct sr_meta_opt_hdr *);
162
/* hotplug magic */
void			sr_disk_attach(struct disk *, int);

/*
 * One registered hotplug callback: the function to invoke and the
 * discipline it is associated with.  Entries are chained on a singly
 * linked list through shl_link.
 */
struct sr_hotplug_list {
	void			(*sh_hotplug)(struct sr_discipline *,
				    struct disk *, int);
	struct sr_discipline	*sh_sd;

	SLIST_ENTRY(sr_hotplug_list) shl_link;
};
SLIST_HEAD(sr_hotplug_list_head, sr_hotplug_list);

/* head of the registered callback list */
struct			sr_hotplug_list_head	sr_hotplug_callbacks;
/* function pointer defined outside this file */
extern void		(*softraid_disk_attach)(struct disk *, int);
177
/*
 * scsi glue: adapter callback table wiring the sr_scsi_* functions
 * declared above; the two NULL slots are callbacks softraid does not
 * provide.
 */
const struct scsi_adapter sr_switch = {
	sr_scsi_cmd, NULL, sr_scsi_probe, NULL, sr_scsi_ioctl
};
182
/* native metadata format */
int			sr_meta_native_bootprobe(struct sr_softc *, dev_t,
			    struct sr_boot_chunk_head *);
/* result codes used by sr_meta_native_bootprobe() */
#define SR_META_NOTCLAIMED	(0)
#define SR_META_CLAIMED		(1)
int			sr_meta_native_probe(struct sr_softc *,
			   struct sr_chunk *);
int			sr_meta_native_attach(struct sr_discipline *, int);
int			sr_meta_native_write(struct sr_discipline *, dev_t,
			    struct sr_metadata *,void *);

/* sr_meta_print() expands to nothing unless SR_DEBUG is defined */
#ifdef SR_DEBUG
void			sr_meta_print(struct sr_metadata *);
#else
#define			sr_meta_print(m)
#endif
199
/*
 * the metadata driver should remain stateless
 *
 * One entry per supported on-disk metadata format.  The table is indexed
 * by sd->sd_meta_type, and sr_meta_probe() walks it until it reaches an
 * entry whose smd_probe is NULL.
 */
struct sr_meta_driver {
	daddr_t			smd_offset;	/* metadata location */
	u_int32_t		smd_size;	/* size of metadata */

	/* returns the metadata type found, or SR_META_F_INVALID */
	int			(*smd_probe)(struct sr_softc *,
				   struct sr_chunk *);
	/* non-zero return aborts sr_meta_attach() */
	int			(*smd_attach)(struct sr_discipline *, int);
	int			(*smd_detach)(struct sr_discipline *);
	/* non-zero return marks the chunk offline in sr_meta_read() */
	int			(*smd_read)(struct sr_discipline *, dev_t,
				    struct sr_metadata *, void *);
	/* non-zero return marks the chunk offline in sr_meta_save() */
	int			(*smd_write)(struct sr_discipline *, dev_t,
				    struct sr_metadata *, void *);
	/* only called for non-native metadata in sr_meta_validate() */
	int			(*smd_validate)(struct sr_discipline *,
				    struct sr_metadata *, void *);
} smd[] = {
	/* native softraid metadata */
	{ SR_META_OFFSET, SR_META_SIZE * DEV_BSIZE,
	  sr_meta_native_probe, sr_meta_native_attach, NULL,
	  sr_meta_native_read, sr_meta_native_write, NULL },
	/* terminator: smd_probe == NULL ends the probe loop */
	{ 0, 0, NULL, NULL, NULL, NULL }
};
221
222int
223sr_meta_attach(struct sr_discipline *sd, int chunk_no, int force)
224{
225	struct sr_softc		*sc = sd->sd_sc;
226	struct sr_chunk_head	*cl;
227	struct sr_chunk		*ch_entry, *chunk1, *chunk2;
228	int			rv = 1, i = 0;
229
230	DNPRINTF(SR_D_META, "%s: sr_meta_attach(%d)\n", DEVNAME(sc), chunk_no);
231
232	/* in memory copy of metadata */
233	sd->sd_meta = malloc(SR_META_SIZE * DEV_BSIZE, M_DEVBUF,
234	    M_ZERO | M_NOWAIT);
235	if (!sd->sd_meta) {
236		sr_error(sc, "could not allocate memory for metadata");
237		goto bad;
238	}
239
240	if (sd->sd_meta_type != SR_META_F_NATIVE) {
241		/* in memory copy of foreign metadata */
242		sd->sd_meta_foreign = malloc(smd[sd->sd_meta_type].smd_size,
243		    M_DEVBUF, M_ZERO | M_NOWAIT);
244		if (!sd->sd_meta_foreign) {
245			/* unwind frees sd_meta */
246			sr_error(sc, "could not allocate memory for foreign "
247			    "metadata");
248			goto bad;
249		}
250	}
251
252	/* we have a valid list now create an array index */
253	cl = &sd->sd_vol.sv_chunk_list;
254	sd->sd_vol.sv_chunks = mallocarray(chunk_no, sizeof(struct sr_chunk *),
255	    M_DEVBUF, M_WAITOK | M_ZERO);
256
257	/* fill out chunk array */
258	i = 0;
259	SLIST_FOREACH(ch_entry, cl, src_link)
260		sd->sd_vol.sv_chunks[i++] = ch_entry;
261
262	/* attach metadata */
263	if (smd[sd->sd_meta_type].smd_attach(sd, force))
264		goto bad;
265
266	/* Force chunks into correct order now that metadata is attached. */
267	SLIST_INIT(cl);
268	for (i = 0; i < chunk_no; i++) {
269		ch_entry = sd->sd_vol.sv_chunks[i];
270		chunk2 = NULL;
271		SLIST_FOREACH(chunk1, cl, src_link) {
272			if (chunk1->src_meta.scmi.scm_chunk_id >
273			    ch_entry->src_meta.scmi.scm_chunk_id)
274				break;
275			chunk2 = chunk1;
276		}
277		if (chunk2 == NULL)
278			SLIST_INSERT_HEAD(cl, ch_entry, src_link);
279		else
280			SLIST_INSERT_AFTER(chunk2, ch_entry, src_link);
281	}
282	i = 0;
283	SLIST_FOREACH(ch_entry, cl, src_link)
284		sd->sd_vol.sv_chunks[i++] = ch_entry;
285
286	rv = 0;
287bad:
288	return (rv);
289}
290
291int
292sr_meta_probe(struct sr_discipline *sd, dev_t *dt, int no_chunk)
293{
294	struct sr_softc		*sc = sd->sd_sc;
295	struct vnode		*vn;
296	struct sr_chunk		*ch_entry, *ch_prev = NULL;
297	struct sr_chunk_head	*cl;
298	char			devname[32];
299	int			i, d, type, found, prevf, error;
300	dev_t			dev;
301
302	DNPRINTF(SR_D_META, "%s: sr_meta_probe(%d)\n", DEVNAME(sc), no_chunk);
303
304	if (no_chunk == 0)
305		goto unwind;
306
307	cl = &sd->sd_vol.sv_chunk_list;
308
309	for (d = 0, prevf = SR_META_F_INVALID; d < no_chunk; d++) {
310		ch_entry = malloc(sizeof(struct sr_chunk), M_DEVBUF,
311		    M_WAITOK | M_ZERO);
312		/* keep disks in user supplied order */
313		if (ch_prev)
314			SLIST_INSERT_AFTER(ch_prev, ch_entry, src_link);
315		else
316			SLIST_INSERT_HEAD(cl, ch_entry, src_link);
317		ch_prev = ch_entry;
318		dev = dt[d];
319		ch_entry->src_dev_mm = dev;
320
321		if (dev == NODEV) {
322			ch_entry->src_meta.scm_status = BIOC_SDOFFLINE;
323			continue;
324		} else {
325			sr_meta_getdevname(sc, dev, devname, sizeof(devname));
326			if (bdevvp(dev, &vn)) {
327				sr_error(sc, "sr_meta_probe: cannot allocate "
328				    "vnode");
329				goto unwind;
330			}
331
332			/*
333			 * XXX leaving dev open for now; move this to attach
334			 * and figure out the open/close dance for unwind.
335			 */
336			error = VOP_OPEN(vn, FREAD | FWRITE, NOCRED, curproc);
337			if (error) {
338				DNPRINTF(SR_D_META,"%s: sr_meta_probe can't "
339				    "open %s\n", DEVNAME(sc), devname);
340				vput(vn);
341				goto unwind;
342			}
343
344			strlcpy(ch_entry->src_devname, devname,
345			    sizeof(ch_entry->src_devname));
346			ch_entry->src_vn = vn;
347		}
348
349		/* determine if this is a device we understand */
350		for (i = 0, found = SR_META_F_INVALID; smd[i].smd_probe; i++) {
351			type = smd[i].smd_probe(sc, ch_entry);
352			if (type == SR_META_F_INVALID)
353				continue;
354			else {
355				found = type;
356				break;
357			}
358		}
359
360		if (found == SR_META_F_INVALID)
361			goto unwind;
362		if (prevf == SR_META_F_INVALID)
363			prevf = found;
364		if (prevf != found) {
365			DNPRINTF(SR_D_META, "%s: prevf != found\n",
366			    DEVNAME(sc));
367			goto unwind;
368		}
369	}
370
371	return (prevf);
372unwind:
373	return (SR_META_F_INVALID);
374}
375
376void
377sr_meta_getdevname(struct sr_softc *sc, dev_t dev, char *buf, int size)
378{
379	int			maj, unit, part;
380	char			*name;
381
382	DNPRINTF(SR_D_META, "%s: sr_meta_getdevname(%p, %d)\n",
383	    DEVNAME(sc), buf, size);
384
385	if (!buf)
386		return;
387
388	maj = major(dev);
389	part = DISKPART(dev);
390	unit = DISKUNIT(dev);
391
392	name = findblkname(maj);
393	if (name == NULL)
394		return;
395
396	snprintf(buf, size, "%s%d%c", name, unit, part + 'a');
397}
398
399int
400sr_rw(struct sr_softc *sc, dev_t dev, char *buf, size_t size, daddr_t blkno,
401    long flags)
402{
403	struct vnode		*vp;
404	struct buf		b;
405	size_t			bufsize, dma_bufsize;
406	int			rv = 1;
407	char			*dma_buf;
408	int			s;
409
410	DNPRINTF(SR_D_MISC, "%s: sr_rw(0x%x, %p, %zu, %lld 0x%lx)\n",
411	    DEVNAME(sc), dev, buf, size, (long long)blkno, flags);
412
413	dma_bufsize = (size > MAXPHYS) ? MAXPHYS : size;
414	dma_buf = dma_alloc(dma_bufsize, PR_WAITOK);
415
416	if (bdevvp(dev, &vp)) {
417		printf("%s: sr_rw: failed to allocate vnode\n", DEVNAME(sc));
418		goto done;
419	}
420
421	while (size > 0) {
422		DNPRINTF(SR_D_MISC, "%s: dma_buf %p, size %zu, blkno %lld)\n",
423		    DEVNAME(sc), dma_buf, size, (long long)blkno);
424
425		bufsize = (size > MAXPHYS) ? MAXPHYS : size;
426		if (flags == B_WRITE)
427			memcpy(dma_buf, buf, bufsize);
428
429		bzero(&b, sizeof(b));
430		b.b_flags = flags | B_PHYS;
431		b.b_proc = curproc;
432		b.b_dev = dev;
433		b.b_iodone = NULL;
434		b.b_error = 0;
435		b.b_blkno = blkno;
436		b.b_data = dma_buf;
437		b.b_bcount = bufsize;
438		b.b_bufsize = bufsize;
439		b.b_resid = bufsize;
440		b.b_vp = vp;
441
442		if ((b.b_flags & B_READ) == 0) {
443			s = splbio();
444			vp->v_numoutput++;
445			splx(s);
446		}
447
448		VOP_STRATEGY(vp, &b);
449		biowait(&b);
450
451		if (b.b_flags & B_ERROR) {
452			printf("%s: I/O error %d on dev 0x%x at block %llu\n",
453			    DEVNAME(sc), b.b_error, dev, b.b_blkno);
454			goto done;
455		}
456
457		if (flags == B_READ)
458			memcpy(buf, dma_buf, bufsize);
459
460		size -= bufsize;
461		buf += bufsize;
462		blkno += howmany(bufsize, DEV_BSIZE);
463	}
464
465	rv = 0;
466
467done:
468	if (vp)
469		vput(vp);
470
471	dma_free(dma_buf, dma_bufsize);
472
473	return (rv);
474}
475
476int
477sr_meta_rw(struct sr_discipline *sd, dev_t dev, void *md, long flags)
478{
479	int			rv = 1;
480
481	DNPRINTF(SR_D_META, "%s: sr_meta_rw(0x%x, %p, 0x%lx)\n",
482	    DEVNAME(sd->sd_sc), dev, md, flags);
483
484	if (md == NULL) {
485		printf("%s: sr_meta_rw: invalid metadata pointer\n",
486		    DEVNAME(sd->sd_sc));
487		goto done;
488	}
489
490	rv = sr_rw(sd->sd_sc, dev, md, SR_META_SIZE * DEV_BSIZE,
491	    SR_META_OFFSET, flags);
492
493done:
494	return (rv);
495}
496
497int
498sr_meta_clear(struct sr_discipline *sd)
499{
500	struct sr_softc		*sc = sd->sd_sc;
501	struct sr_chunk_head	*cl = &sd->sd_vol.sv_chunk_list;
502	struct sr_chunk		*ch_entry;
503	void			*m;
504	int			rv = 1;
505
506	DNPRINTF(SR_D_META, "%s: sr_meta_clear\n", DEVNAME(sc));
507
508	if (sd->sd_meta_type != SR_META_F_NATIVE) {
509		sr_error(sc, "cannot clear foreign metadata");
510		goto done;
511	}
512
513	m = malloc(SR_META_SIZE * DEV_BSIZE, M_DEVBUF, M_WAITOK | M_ZERO);
514	SLIST_FOREACH(ch_entry, cl, src_link) {
515		if (sr_meta_native_write(sd, ch_entry->src_dev_mm, m, NULL)) {
516			/* XXX mark disk offline */
517			DNPRINTF(SR_D_META, "%s: sr_meta_clear failed to "
518			    "clear %s\n", DEVNAME(sc), ch_entry->src_devname);
519			rv++;
520			continue;
521		}
522		bzero(&ch_entry->src_meta, sizeof(ch_entry->src_meta));
523	}
524
525	bzero(sd->sd_meta, SR_META_SIZE * DEV_BSIZE);
526
527	free(m, M_DEVBUF, SR_META_SIZE * DEV_BSIZE);
528	rv = 0;
529done:
530	return (rv);
531}
532
533void
534sr_meta_init(struct sr_discipline *sd, int level, int no_chunk)
535{
536	struct sr_softc		*sc = sd->sd_sc;
537	struct sr_metadata	*sm = sd->sd_meta;
538	struct sr_chunk_head	*cl = &sd->sd_vol.sv_chunk_list;
539	struct sr_meta_chunk	*scm;
540	struct sr_chunk		*chunk;
541	int			cid = 0;
542	u_int64_t		max_chunk_sz = 0, min_chunk_sz = 0;
543	u_int32_t		secsize = DEV_BSIZE;
544
545	DNPRINTF(SR_D_META, "%s: sr_meta_init\n", DEVNAME(sc));
546
547	if (!sm)
548		return;
549
550	/* Initialise volume metadata. */
551	sm->ssdi.ssd_magic = SR_MAGIC;
552	sm->ssdi.ssd_version = SR_META_VERSION;
553	sm->ssdi.ssd_vol_flags = sd->sd_meta_flags;
554	sm->ssdi.ssd_volid = 0;
555	sm->ssdi.ssd_chunk_no = no_chunk;
556	sm->ssdi.ssd_level = level;
557
558	sm->ssd_data_blkno = SR_DATA_OFFSET;
559	sm->ssd_ondisk = 0;
560
561	sr_uuid_generate(&sm->ssdi.ssd_uuid);
562
563	/* Initialise chunk metadata and get min/max chunk sizes & secsize. */
564	SLIST_FOREACH(chunk, cl, src_link) {
565		scm = &chunk->src_meta;
566		scm->scmi.scm_size = chunk->src_size;
567		scm->scmi.scm_chunk_id = cid++;
568		scm->scm_status = BIOC_SDONLINE;
569		scm->scmi.scm_volid = 0;
570		strlcpy(scm->scmi.scm_devname, chunk->src_devname,
571		    sizeof(scm->scmi.scm_devname));
572		memcpy(&scm->scmi.scm_uuid, &sm->ssdi.ssd_uuid,
573		    sizeof(scm->scmi.scm_uuid));
574		sr_checksum(sc, scm, &scm->scm_checksum,
575		    sizeof(scm->scm_checksum));
576
577		if (min_chunk_sz == 0)
578			min_chunk_sz = scm->scmi.scm_size;
579		if (chunk->src_secsize > secsize)
580			secsize = chunk->src_secsize;
581		min_chunk_sz = MIN(min_chunk_sz, scm->scmi.scm_size);
582		max_chunk_sz = MAX(max_chunk_sz, scm->scmi.scm_size);
583	}
584
585	sm->ssdi.ssd_secsize = secsize;
586
587	/* Equalize chunk sizes. */
588	SLIST_FOREACH(chunk, cl, src_link)
589		chunk->src_meta.scmi.scm_coerced_size = min_chunk_sz;
590
591	sd->sd_vol.sv_chunk_minsz = min_chunk_sz;
592	sd->sd_vol.sv_chunk_maxsz = max_chunk_sz;
593}
594
595void
596sr_meta_init_complete(struct sr_discipline *sd)
597{
598#ifdef SR_DEBUG
599	struct sr_softc		*sc = sd->sd_sc;
600#endif
601	struct sr_metadata	*sm = sd->sd_meta;
602
603	DNPRINTF(SR_D_META, "%s: sr_meta_complete\n", DEVNAME(sc));
604
605	/* Complete initialisation of volume metadata. */
606	strlcpy(sm->ssdi.ssd_vendor, "OPENBSD", sizeof(sm->ssdi.ssd_vendor));
607	snprintf(sm->ssdi.ssd_product, sizeof(sm->ssdi.ssd_product),
608	    "SR %s", sd->sd_name);
609	snprintf(sm->ssdi.ssd_revision, sizeof(sm->ssdi.ssd_revision),
610	    "%03d", sm->ssdi.ssd_version);
611}
612
613void
614sr_meta_opt_handler(struct sr_discipline *sd, struct sr_meta_opt_hdr *om)
615{
616	if (om->som_type != SR_OPT_BOOT)
617		panic("unknown optional metadata type");
618}
619
620void
621sr_meta_save_callback(void *xsd)
622{
623	struct sr_discipline	*sd = xsd;
624	int			s;
625
626	s = splbio();
627
628	if (sr_meta_save(sd, SR_META_DIRTY))
629		printf("%s: save metadata failed\n", DEVNAME(sd->sd_sc));
630
631	sd->sd_must_flush = 0;
632	splx(s);
633}
634
635int
636sr_meta_save(struct sr_discipline *sd, u_int32_t flags)
637{
638	struct sr_softc		*sc = sd->sd_sc;
639	struct sr_metadata	*sm = sd->sd_meta, *m;
640	struct sr_meta_driver	*s;
641	struct sr_chunk		*src;
642	struct sr_meta_chunk	*cm;
643	struct sr_workunit	wu;
644	struct sr_meta_opt_hdr	*omh;
645	struct sr_meta_opt_item *omi;
646	int			i;
647
648	DNPRINTF(SR_D_META, "%s: sr_meta_save %s\n",
649	    DEVNAME(sc), sd->sd_meta->ssd_devname);
650
651	if (!sm) {
652		printf("%s: no in memory copy of metadata\n", DEVNAME(sc));
653		goto bad;
654	}
655
656	/* meta scratchpad */
657	s = &smd[sd->sd_meta_type];
658	m = malloc(SR_META_SIZE * DEV_BSIZE, M_DEVBUF, M_ZERO | M_NOWAIT);
659	if (!m) {
660		printf("%s: could not allocate metadata scratch area\n",
661		    DEVNAME(sc));
662		goto bad;
663	}
664
665	/* from here on out metadata is updated */
666restart:
667	sm->ssd_ondisk++;
668	sm->ssd_meta_flags = flags;
669	memcpy(m, sm, sizeof(*m));
670
671	/* Chunk metadata. */
672	cm = (struct sr_meta_chunk *)(m + 1);
673	for (i = 0; i < sm->ssdi.ssd_chunk_no; i++) {
674		src = sd->sd_vol.sv_chunks[i];
675		memcpy(cm, &src->src_meta, sizeof(*cm));
676		cm++;
677	}
678
679	/* Optional metadata. */
680	omh = (struct sr_meta_opt_hdr *)(cm);
681	SLIST_FOREACH(omi, &sd->sd_meta_opt, omi_link) {
682		DNPRINTF(SR_D_META, "%s: saving optional metadata type %u with "
683		    "length %u\n", DEVNAME(sc), omi->omi_som->som_type,
684		    omi->omi_som->som_length);
685		bzero(&omi->omi_som->som_checksum, MD5_DIGEST_LENGTH);
686		sr_checksum(sc, omi->omi_som, &omi->omi_som->som_checksum,
687		    omi->omi_som->som_length);
688		memcpy(omh, omi->omi_som, omi->omi_som->som_length);
689		omh = (struct sr_meta_opt_hdr *)((u_int8_t *)omh +
690		    omi->omi_som->som_length);
691	}
692
693	for (i = 0; i < sm->ssdi.ssd_chunk_no; i++) {
694		src = sd->sd_vol.sv_chunks[i];
695
696		/* skip disks that are offline */
697		if (src->src_meta.scm_status == BIOC_SDOFFLINE)
698			continue;
699
700		/* calculate metadata checksum for correct chunk */
701		m->ssdi.ssd_chunk_id = i;
702		sr_checksum(sc, m, &m->ssd_checksum,
703		    sizeof(struct sr_meta_invariant));
704
705#ifdef SR_DEBUG
706		DNPRINTF(SR_D_META, "%s: sr_meta_save %s: volid: %d "
707		    "chunkid: %d checksum: ",
708		    DEVNAME(sc), src->src_meta.scmi.scm_devname,
709		    m->ssdi.ssd_volid, m->ssdi.ssd_chunk_id);
710
711		if (sr_debug & SR_D_META)
712			sr_checksum_print((u_int8_t *)&m->ssd_checksum);
713		DNPRINTF(SR_D_META, "\n");
714		sr_meta_print(m);
715#endif
716
717		/* translate and write to disk */
718		if (s->smd_write(sd, src->src_dev_mm, m, NULL /* XXX */)) {
719			printf("%s: could not write metadata to %s\n",
720			    DEVNAME(sc), src->src_devname);
721			/* restart the meta write */
722			src->src_meta.scm_status = BIOC_SDOFFLINE;
723			/* XXX recalculate volume status */
724			goto restart;
725		}
726	}
727
728	/* not all disciplines have sync */
729	if (sd->sd_scsi_sync) {
730		bzero(&wu, sizeof(wu));
731		wu.swu_flags |= SR_WUF_FAKE;
732		wu.swu_dis = sd;
733		sd->sd_scsi_sync(&wu);
734	}
735	free(m, M_DEVBUF, SR_META_SIZE * DEV_BSIZE);
736	return (0);
737bad:
738	return (1);
739}
740
741int
742sr_meta_read(struct sr_discipline *sd)
743{
744	struct sr_softc		*sc = sd->sd_sc;
745	struct sr_chunk_head	*cl = &sd->sd_vol.sv_chunk_list;
746	struct sr_metadata	*sm;
747	struct sr_chunk		*ch_entry;
748	struct sr_meta_chunk	*cp;
749	struct sr_meta_driver	*s;
750	void			*fm = NULL;
751	int			no_disk = 0, got_meta = 0;
752
753	DNPRINTF(SR_D_META, "%s: sr_meta_read\n", DEVNAME(sc));
754
755	sm = malloc(SR_META_SIZE * DEV_BSIZE, M_DEVBUF, M_WAITOK | M_ZERO);
756	s = &smd[sd->sd_meta_type];
757	if (sd->sd_meta_type != SR_META_F_NATIVE)
758		fm = malloc(s->smd_size, M_DEVBUF, M_WAITOK | M_ZERO);
759
760	cp = (struct sr_meta_chunk *)(sm + 1);
761	SLIST_FOREACH(ch_entry, cl, src_link) {
762		/* skip disks that are offline */
763		if (ch_entry->src_meta.scm_status == BIOC_SDOFFLINE) {
764			DNPRINTF(SR_D_META,
765			    "%s: %s chunk marked offline, spoofing status\n",
766			    DEVNAME(sc), ch_entry->src_devname);
767			cp++; /* adjust chunk pointer to match failure */
768			continue;
769		} else if (s->smd_read(sd, ch_entry->src_dev_mm, sm, fm)) {
770			/* read and translate */
771			/* XXX mark chunk offline, elsewhere!! */
772			ch_entry->src_meta.scm_status = BIOC_SDOFFLINE;
773			cp++; /* adjust chunk pointer to match failure */
774			DNPRINTF(SR_D_META, "%s: sr_meta_read failed\n",
775			    DEVNAME(sc));
776			continue;
777		}
778
779		if (sm->ssdi.ssd_magic != SR_MAGIC) {
780			DNPRINTF(SR_D_META, "%s: sr_meta_read !SR_MAGIC\n",
781			    DEVNAME(sc));
782			continue;
783		}
784
785		/* validate metadata */
786		if (sr_meta_validate(sd, ch_entry->src_dev_mm, sm, fm)) {
787			DNPRINTF(SR_D_META, "%s: invalid metadata\n",
788			    DEVNAME(sc));
789			no_disk = -1;
790			goto done;
791		}
792
793		/* assume first chunk contains metadata */
794		if (got_meta == 0) {
795			sr_meta_opt_load(sc, sm, &sd->sd_meta_opt);
796			memcpy(sd->sd_meta, sm, sizeof(*sd->sd_meta));
797			got_meta = 1;
798		}
799
800		memcpy(&ch_entry->src_meta, cp, sizeof(ch_entry->src_meta));
801
802		no_disk++;
803		cp++;
804	}
805
806	free(sm, M_DEVBUF, SR_META_SIZE * DEV_BSIZE);
807	free(fm, M_DEVBUF, s->smd_size);
808
809done:
810	DNPRINTF(SR_D_META, "%s: sr_meta_read found %d parts\n", DEVNAME(sc),
811	    no_disk);
812	return (no_disk);
813}
814
/*
 * Load the optional metadata items stored in metadata buffer sm onto the
 * list som.
 *
 * The items start right after the ssd_chunk_no chunk entries that follow
 * the volume metadata, and ssd_opt_no of them are processed.  Each item
 * gets a newly allocated list entry (inserted at the head of som) with a
 * private copy of the item.  Two layouts are handled:
 *  - som_length == 0: old fixed-size items of SR_OLD_META_OPT_SIZE bytes;
 *    the real length is derived from som_type and written back into the
 *    header in sm before the payload is copied.
 *  - otherwise: variable-length items that are som_length bytes long.
 *
 * Panics on a checksum mismatch or on an unknown old item type.
 * NOTE(review): som_length comes from the metadata buffer and is not
 * checked against the size of that buffer here.
 */
void
sr_meta_opt_load(struct sr_softc *sc, struct sr_metadata *sm,
    struct sr_meta_opt_head *som)
{
	struct sr_meta_opt_hdr	*omh;
	struct sr_meta_opt_item *omi;
	u_int8_t		checksum[MD5_DIGEST_LENGTH];
	int			i;

	/* Process optional metadata. */
	omh = (struct sr_meta_opt_hdr *)((u_int8_t *)(sm + 1) +
	    sizeof(struct sr_meta_chunk) * sm->ssdi.ssd_chunk_no);
	for (i = 0; i < sm->ssdi.ssd_opt_no; i++) {

		/* the entry is linked before its payload is allocated */
		omi = malloc(sizeof(struct sr_meta_opt_item), M_DEVBUF,
		    M_WAITOK | M_ZERO);
		SLIST_INSERT_HEAD(som, omi, omi_link);

		if (omh->som_length == 0) {

			/* Load old fixed length optional metadata. */
			DNPRINTF(SR_D_META, "%s: old optional metadata of type "
			    "%u\n", DEVNAME(sc), omh->som_type);

			/* Validate checksum. */
			sr_checksum(sc, (void *)omh, &checksum,
			    SR_OLD_META_OPT_SIZE - MD5_DIGEST_LENGTH);
			if (bcmp(&checksum, (void *)omh + SR_OLD_META_OPT_MD5,
			    sizeof(checksum)))
				panic("%s: invalid optional metadata checksum",
				    DEVNAME(sc));

			/* Determine correct length. */
			switch (omh->som_type) {
			case SR_OPT_CRYPTO:
				omh->som_length = sizeof(struct sr_meta_crypto);
				break;
			case SR_OPT_BOOT:
				omh->som_length = sizeof(struct sr_meta_boot);
				break;
			case SR_OPT_KEYDISK:
				omh->som_length =
				    sizeof(struct sr_meta_keydisk);
				break;
			default:
				panic("unknown old optional metadata type %u",
				    omh->som_type);
			}

			/*
			 * Copy the payload in behind a fresh header; the old
			 * payload starts SR_OLD_META_OPT_OFFSET bytes into
			 * the item.
			 */
			omi->omi_som = malloc(omh->som_length, M_DEVBUF,
			    M_WAITOK | M_ZERO);
			memcpy((u_int8_t *)omi->omi_som + sizeof(*omi->omi_som),
			    (u_int8_t *)omh + SR_OLD_META_OPT_OFFSET,
			    omh->som_length - sizeof(*omi->omi_som));
			omi->omi_som->som_type = omh->som_type;
			omi->omi_som->som_length = omh->som_length;

			/* old items are spaced at the fixed old size */
			omh = (struct sr_meta_opt_hdr *)((void *)omh +
			    SR_OLD_META_OPT_SIZE);
		} else {

			/* Load variable length optional metadata. */
			DNPRINTF(SR_D_META, "%s: optional metadata of type %u, "
			    "length %u\n", DEVNAME(sc), omh->som_type,
			    omh->som_length);
			omi->omi_som = malloc(omh->som_length, M_DEVBUF,
			    M_WAITOK | M_ZERO);
			memcpy(omi->omi_som, omh, omh->som_length);

			/*
			 * Validate checksum: save the stored value, zero the
			 * field, recompute into it and compare.
			 */
			memcpy(&checksum, &omi->omi_som->som_checksum,
			    MD5_DIGEST_LENGTH);
			bzero(&omi->omi_som->som_checksum, MD5_DIGEST_LENGTH);
			sr_checksum(sc, omi->omi_som,
			    &omi->omi_som->som_checksum, omh->som_length);
			if (bcmp(&checksum, &omi->omi_som->som_checksum,
			    sizeof(checksum)))
				panic("%s: invalid optional metadata checksum",
				    DEVNAME(sc));

			/* advance by the item's own length */
			omh = (struct sr_meta_opt_hdr *)((void *)omh +
			    omh->som_length);
		}
	}
}
900
901int
902sr_meta_validate(struct sr_discipline *sd, dev_t dev, struct sr_metadata *sm,
903    void *fm)
904{
905	struct sr_softc		*sc = sd->sd_sc;
906	struct sr_meta_driver	*s;
907#ifdef SR_DEBUG
908	struct sr_meta_chunk	*mc;
909#endif
910	u_int8_t		checksum[MD5_DIGEST_LENGTH];
911	char			devname[32];
912	int			rv = 1;
913
914	DNPRINTF(SR_D_META, "%s: sr_meta_validate(%p)\n", DEVNAME(sc), sm);
915
916	sr_meta_getdevname(sc, dev, devname, sizeof(devname));
917
918	s = &smd[sd->sd_meta_type];
919	if (sd->sd_meta_type != SR_META_F_NATIVE)
920		if (s->smd_validate(sd, sm, fm)) {
921			sr_error(sc, "invalid foreign metadata");
922			goto done;
923		}
924
925	/*
926	 * at this point all foreign metadata has been translated to the native
927	 * format and will be treated just like the native format
928	 */
929
930	if (sm->ssdi.ssd_magic != SR_MAGIC) {
931		sr_error(sc, "not valid softraid metadata");
932		goto done;
933	}
934
935	/* Verify metadata checksum. */
936	sr_checksum(sc, sm, &checksum, sizeof(struct sr_meta_invariant));
937	if (bcmp(&checksum, &sm->ssd_checksum, sizeof(checksum))) {
938		sr_error(sc, "invalid metadata checksum");
939		goto done;
940	}
941
942	/* Handle changes between versions. */
943	if (sm->ssdi.ssd_version == 3) {
944
945		/*
946		 * Version 3 - update metadata version and fix up data blkno
947		 * value since this did not exist in version 3.
948		 */
949		if (sm->ssd_data_blkno == 0)
950			sm->ssd_data_blkno = SR_META_V3_DATA_OFFSET;
951		sm->ssdi.ssd_secsize = DEV_BSIZE;
952
953	} else if (sm->ssdi.ssd_version == 4) {
954
955		/*
956		 * Version 4 - original metadata format did not store
957		 * data blkno so fix this up if necessary.
958		 */
959		if (sm->ssd_data_blkno == 0)
960			sm->ssd_data_blkno = SR_DATA_OFFSET;
961		sm->ssdi.ssd_secsize = DEV_BSIZE;
962
963	} else if (sm->ssdi.ssd_version == 5) {
964
965		/*
966		 * Version 5 - variable length optional metadata. Migration
967		 * from earlier fixed length optional metadata is handled
968		 * in sr_meta_read().
969		 */
970		sm->ssdi.ssd_secsize = DEV_BSIZE;
971
972	} else if (sm->ssdi.ssd_version == SR_META_VERSION) {
973
974		/*
975		 * Version 6 - store & report a sector size.
976		 */
977
978	} else {
979
980		sr_error(sc, "cannot read metadata version %u on %s, "
981		    "expected version %u or earlier",
982		    sm->ssdi.ssd_version, devname, SR_META_VERSION);
983		goto done;
984
985	}
986
987	/* Update version number and revision string. */
988	sm->ssdi.ssd_version = SR_META_VERSION;
989	snprintf(sm->ssdi.ssd_revision, sizeof(sm->ssdi.ssd_revision),
990	    "%03d", SR_META_VERSION);
991
992#ifdef SR_DEBUG
993	/* warn if disk changed order */
994	mc = (struct sr_meta_chunk *)(sm + 1);
995	if (strncmp(mc[sm->ssdi.ssd_chunk_id].scmi.scm_devname, devname,
996	    sizeof(mc[sm->ssdi.ssd_chunk_id].scmi.scm_devname)))
997		DNPRINTF(SR_D_META, "%s: roaming device %s -> %s\n",
998		    DEVNAME(sc), mc[sm->ssdi.ssd_chunk_id].scmi.scm_devname,
999		    devname);
1000#endif
1001
1002	/* we have meta data on disk */
1003	DNPRINTF(SR_D_META, "%s: sr_meta_validate valid metadata %s\n",
1004	    DEVNAME(sc), devname);
1005
1006	rv = 0;
1007done:
1008	return (rv);
1009}
1010
/*
 * Probe one disk for native softraid metadata at boot time.
 *
 * The disklabel of devno is read through the raw character device.  Every
 * partition of type FS_RAID is then opened as a block device and its
 * metadata is read with sr_meta_native_read().  For each partition whose
 * metadata carries SR_MAGIC and passes sr_meta_validate(), a newly allocated
 * sr_boot_chunk holding a copy of the metadata is inserted at the head of
 * bch.
 *
 * Returns SR_META_CLAIMED if at least one chunk was added to bch, otherwise
 * SR_META_NOTCLAIMED (including every early failure path).
 *
 * sr_error() asserts that sc->sc_lock is write-locked, so this function has
 * to be called with that lock held.
 */
int
sr_meta_native_bootprobe(struct sr_softc *sc, dev_t devno,
    struct sr_boot_chunk_head *bch)
{
	struct vnode		*vn;
	struct disklabel	label;
	struct sr_metadata	*md = NULL;
	struct sr_discipline	*fake_sd = NULL;
	struct sr_boot_chunk	*bc;
	char			devname[32];
	dev_t			chrdev, rawdev;
	int			error, i;
	int			rv = SR_META_NOTCLAIMED;

	DNPRINTF(SR_D_META, "%s: sr_meta_native_bootprobe\n", DEVNAME(sc));

	/*
	 * Use character raw device to avoid SCSI complaints about missing
	 * media on removable media devices.
	 */
	chrdev = blktochr(devno);
	rawdev = MAKEDISKDEV(major(chrdev), DISKUNIT(devno), RAW_PART);
	if (cdevvp(rawdev, &vn)) {
		sr_error(sc, "sr_meta_native_bootprobe: cannot allocate vnode");
		goto done;
	}

	/* open device */
	error = VOP_OPEN(vn, FREAD, NOCRED, curproc);
	if (error) {
		DNPRINTF(SR_D_META, "%s: sr_meta_native_bootprobe open "
		    "failed\n", DEVNAME(sc));
		vput(vn);
		goto done;
	}

	/* get disklabel */
	error = VOP_IOCTL(vn, DIOCGDINFO, (caddr_t)&label, FREAD, NOCRED,
	    curproc);
	if (error) {
		DNPRINTF(SR_D_META, "%s: sr_meta_native_bootprobe ioctl "
		    "failed\n", DEVNAME(sc));
		VOP_CLOSE(vn, FREAD, NOCRED, curproc);
		vput(vn);
		goto done;
	}

	/* we are done, close device */
	error = VOP_CLOSE(vn, FREAD, NOCRED, curproc);
	if (error) {
		DNPRINTF(SR_D_META, "%s: sr_meta_native_bootprobe close "
		    "failed\n", DEVNAME(sc));
		vput(vn);
		goto done;
	}
	vput(vn);

	/* One metadata buffer is reused for every partition probed below. */
	md = malloc(SR_META_SIZE * DEV_BSIZE, M_DEVBUF, M_ZERO | M_NOWAIT);
	if (md == NULL) {
		sr_error(sc, "not enough memory for metadata buffer");
		goto done;
	}

	/* create fake sd to use utility functions */
	fake_sd = malloc(sizeof(struct sr_discipline), M_DEVBUF,
	    M_ZERO | M_NOWAIT);
	if (fake_sd == NULL) {
		sr_error(sc, "not enough memory for fake discipline");
		goto done;
	}
	fake_sd->sd_sc = sc;
	fake_sd->sd_meta_type = SR_META_F_NATIVE;

	for (i = 0; i < MAXPARTITIONS; i++) {
		if (label.d_partitions[i].p_fstype != FS_RAID)
			continue;

		/* open partition */
		/* Note: rawdev now names the block device of partition i. */
		rawdev = MAKEDISKDEV(major(devno), DISKUNIT(devno), i);
		if (bdevvp(rawdev, &vn)) {
			/* Gives up on the whole disk, not just this partition. */
			sr_error(sc, "sr_meta_native_bootprobe: cannot "
			    "allocate vnode for partition");
			goto done;
		}
		error = VOP_OPEN(vn, FREAD, NOCRED, curproc);
		if (error) {
			DNPRINTF(SR_D_META, "%s: sr_meta_native_bootprobe "
			    "open failed, partition %d\n",
			    DEVNAME(sc), i);
			vput(vn);
			continue;
		}

		if (sr_meta_native_read(fake_sd, rawdev, md, NULL)) {
			sr_error(sc, "native bootprobe could not read native "
			    "metadata");
			VOP_CLOSE(vn, FREAD, NOCRED, curproc);
			vput(vn);
			continue;
		}

		/* are we a softraid partition? */
		if (md->ssdi.ssd_magic != SR_MAGIC) {
			VOP_CLOSE(vn, FREAD, NOCRED, curproc);
			vput(vn);
			continue;
		}

		/* devname is filled in here but not read again in this function. */
		sr_meta_getdevname(sc, rawdev, devname, sizeof(devname));
		if (sr_meta_validate(fake_sd, rawdev, md, NULL) == 0) {
			/* XXX fix M_WAITOK, this is boot time */
			bc = malloc(sizeof(struct sr_boot_chunk),
			    M_DEVBUF, M_WAITOK | M_ZERO);
			bc->sbc_metadata = malloc(sizeof(struct sr_metadata),
			    M_DEVBUF, M_WAITOK | M_ZERO);
			/* Only the sr_metadata header is copied, not the full buffer. */
			memcpy(bc->sbc_metadata, md, sizeof(struct sr_metadata));
			bc->sbc_mm = rawdev;
			/* bc and its metadata copy are freed by sr_boot_assembly(). */
			SLIST_INSERT_HEAD(bch, bc, sbc_link);
			rv = SR_META_CLAIMED;
		}

		/* we are done, close partition */
		VOP_CLOSE(vn, FREAD, NOCRED, curproc);
		vput(vn);
	}

done:
	/* Either pointer may still be NULL when an early path jumped here. */
	free(fake_sd, M_DEVBUF, sizeof(struct sr_discipline));
	free(md, M_DEVBUF, SR_META_SIZE * DEV_BSIZE);

	return (rv);
}
1143
1144int
1145sr_boot_assembly(struct sr_softc *sc)
1146{
1147	struct sr_boot_volume_head bvh;
1148	struct sr_boot_chunk_head bch, kdh;
1149	struct sr_boot_volume	*bv, *bv1, *bv2;
1150	struct sr_boot_chunk	*bc, *bcnext, *bc1, *bc2;
1151	struct sr_disk_head	sdklist;
1152	struct sr_disk		*sdk;
1153	struct disk		*dk;
1154	struct bioc_createraid	bcr;
1155	struct sr_meta_chunk	*hm;
1156	struct sr_chunk_head	*cl;
1157	struct sr_chunk		*hotspare, *chunk, *last;
1158	u_int64_t		*ondisk = NULL;
1159	dev_t			*devs = NULL;
1160	void			*data;
1161	char			devname[32];
1162	int			rv = 0, i;
1163
1164	DNPRINTF(SR_D_META, "%s: sr_boot_assembly\n", DEVNAME(sc));
1165
1166	SLIST_INIT(&sdklist);
1167	SLIST_INIT(&bvh);
1168	SLIST_INIT(&bch);
1169	SLIST_INIT(&kdh);
1170
1171	dk = TAILQ_FIRST(&disklist);
1172	while (dk != NULL) {
1173
1174		/* See if this disk has been checked. */
1175		SLIST_FOREACH(sdk, &sdklist, sdk_link)
1176			if (sdk->sdk_devno == dk->dk_devno)
1177				break;
1178
1179		if (sdk != NULL || dk->dk_devno == NODEV) {
1180			dk = TAILQ_NEXT(dk, dk_link);
1181			continue;
1182		}
1183
1184		/* Add this disk to the list that we've checked. */
1185		sdk = malloc(sizeof(struct sr_disk), M_DEVBUF,
1186		    M_NOWAIT | M_ZERO);
1187		if (sdk == NULL)
1188			goto unwind;
1189		sdk->sdk_devno = dk->dk_devno;
1190		SLIST_INSERT_HEAD(&sdklist, sdk, sdk_link);
1191
1192		/* Only check sd(4) and wd(4) devices. */
1193		if (strncmp(dk->dk_name, "sd", 2) &&
1194		    strncmp(dk->dk_name, "wd", 2)) {
1195			dk = TAILQ_NEXT(dk, dk_link);
1196			continue;
1197		}
1198
1199		/* native softraid uses partitions */
1200		rw_enter_write(&sc->sc_lock);
1201		bio_status_init(&sc->sc_status, &sc->sc_dev);
1202		sr_meta_native_bootprobe(sc, dk->dk_devno, &bch);
1203		rw_exit_write(&sc->sc_lock);
1204
1205		/* probe non-native disks if native failed. */
1206
1207		/* Restart scan since we may have slept. */
1208		dk = TAILQ_FIRST(&disklist);
1209	}
1210
1211	/*
1212	 * Create a list of volumes and associate chunks with each volume.
1213	 */
1214	for (bc = SLIST_FIRST(&bch); bc != NULL; bc = bcnext) {
1215
1216		bcnext = SLIST_NEXT(bc, sbc_link);
1217		SLIST_REMOVE(&bch, bc, sr_boot_chunk, sbc_link);
1218		bc->sbc_chunk_id = bc->sbc_metadata->ssdi.ssd_chunk_id;
1219
1220		/* Handle key disks separately. */
1221		if (bc->sbc_metadata->ssdi.ssd_level == SR_KEYDISK_LEVEL) {
1222			SLIST_INSERT_HEAD(&kdh, bc, sbc_link);
1223			continue;
1224		}
1225
1226		SLIST_FOREACH(bv, &bvh, sbv_link) {
1227			if (bcmp(&bc->sbc_metadata->ssdi.ssd_uuid,
1228			    &bv->sbv_uuid,
1229			    sizeof(bc->sbc_metadata->ssdi.ssd_uuid)) == 0)
1230				break;
1231		}
1232
1233		if (bv == NULL) {
1234			bv = malloc(sizeof(struct sr_boot_volume),
1235			    M_DEVBUF, M_NOWAIT | M_ZERO);
1236			if (bv == NULL) {
1237				printf("%s: failed to allocate boot volume\n",
1238				    DEVNAME(sc));
1239				goto unwind;
1240			}
1241
1242			bv->sbv_level = bc->sbc_metadata->ssdi.ssd_level;
1243			bv->sbv_volid = bc->sbc_metadata->ssdi.ssd_volid;
1244			bv->sbv_chunk_no = bc->sbc_metadata->ssdi.ssd_chunk_no;
1245			bv->sbv_flags = bc->sbc_metadata->ssdi.ssd_vol_flags;
1246			memcpy(&bv->sbv_uuid, &bc->sbc_metadata->ssdi.ssd_uuid,
1247			    sizeof(bc->sbc_metadata->ssdi.ssd_uuid));
1248			SLIST_INIT(&bv->sbv_chunks);
1249
1250			/* Maintain volume order. */
1251			bv2 = NULL;
1252			SLIST_FOREACH(bv1, &bvh, sbv_link) {
1253				if (bv1->sbv_volid > bv->sbv_volid)
1254					break;
1255				bv2 = bv1;
1256			}
1257			if (bv2 == NULL) {
1258				DNPRINTF(SR_D_META, "%s: insert volume %u "
1259				    "at head\n", DEVNAME(sc), bv->sbv_volid);
1260				SLIST_INSERT_HEAD(&bvh, bv, sbv_link);
1261			} else {
1262				DNPRINTF(SR_D_META, "%s: insert volume %u "
1263				    "after %u\n", DEVNAME(sc), bv->sbv_volid,
1264				    bv2->sbv_volid);
1265				SLIST_INSERT_AFTER(bv2, bv, sbv_link);
1266			}
1267		}
1268
1269		/* Maintain chunk order. */
1270		bc2 = NULL;
1271		SLIST_FOREACH(bc1, &bv->sbv_chunks, sbc_link) {
1272			if (bc1->sbc_chunk_id > bc->sbc_chunk_id)
1273				break;
1274			bc2 = bc1;
1275		}
1276		if (bc2 == NULL) {
1277			DNPRINTF(SR_D_META, "%s: volume %u insert chunk %u "
1278			    "at head\n", DEVNAME(sc), bv->sbv_volid,
1279			    bc->sbc_chunk_id);
1280			SLIST_INSERT_HEAD(&bv->sbv_chunks, bc, sbc_link);
1281		} else {
1282			DNPRINTF(SR_D_META, "%s: volume %u insert chunk %u "
1283			    "after %u\n", DEVNAME(sc), bv->sbv_volid,
1284			    bc->sbc_chunk_id, bc2->sbc_chunk_id);
1285			SLIST_INSERT_AFTER(bc2, bc, sbc_link);
1286		}
1287
1288		bv->sbv_chunks_found++;
1289	}
1290
1291	/* Allocate memory for device and ondisk version arrays. */
1292	devs = mallocarray(BIOC_CRMAXLEN, sizeof(dev_t), M_DEVBUF,
1293	    M_NOWAIT);
1294	if (devs == NULL) {
1295		printf("%s: failed to allocate device array\n", DEVNAME(sc));
1296		goto unwind;
1297	}
1298	ondisk = mallocarray(BIOC_CRMAXLEN, sizeof(u_int64_t), M_DEVBUF,
1299	    M_NOWAIT);
1300	if (ondisk == NULL) {
1301		printf("%s: failed to allocate ondisk array\n", DEVNAME(sc));
1302		goto unwind;
1303	}
1304
1305	/*
1306	 * Assemble hotspare "volumes".
1307	 */
1308	SLIST_FOREACH(bv, &bvh, sbv_link) {
1309
1310		/* Check if this is a hotspare "volume". */
1311		if (bv->sbv_level != SR_HOTSPARE_LEVEL ||
1312		    bv->sbv_chunk_no != 1)
1313			continue;
1314
1315#ifdef SR_DEBUG
1316		DNPRINTF(SR_D_META, "%s: assembling hotspare volume ",
1317		    DEVNAME(sc));
1318		if (sr_debug & SR_D_META)
1319			sr_uuid_print(&bv->sbv_uuid, 0);
1320		DNPRINTF(SR_D_META, " volid %u with %u chunks\n",
1321		    bv->sbv_volid, bv->sbv_chunk_no);
1322#endif
1323
1324		/* Create hotspare chunk metadata. */
1325		hotspare = malloc(sizeof(struct sr_chunk), M_DEVBUF,
1326		    M_NOWAIT | M_ZERO);
1327		if (hotspare == NULL) {
1328			printf("%s: failed to allocate hotspare\n",
1329			    DEVNAME(sc));
1330			goto unwind;
1331		}
1332
1333		bc = SLIST_FIRST(&bv->sbv_chunks);
1334		sr_meta_getdevname(sc, bc->sbc_mm, devname, sizeof(devname));
1335		hotspare->src_dev_mm = bc->sbc_mm;
1336		strlcpy(hotspare->src_devname, devname,
1337		    sizeof(hotspare->src_devname));
1338		hotspare->src_size = bc->sbc_metadata->ssdi.ssd_size;
1339
1340		hm = &hotspare->src_meta;
1341		hm->scmi.scm_volid = SR_HOTSPARE_VOLID;
1342		hm->scmi.scm_chunk_id = 0;
1343		hm->scmi.scm_size = bc->sbc_metadata->ssdi.ssd_size;
1344		hm->scmi.scm_coerced_size = bc->sbc_metadata->ssdi.ssd_size;
1345		strlcpy(hm->scmi.scm_devname, devname,
1346		    sizeof(hm->scmi.scm_devname));
1347		memcpy(&hm->scmi.scm_uuid, &bc->sbc_metadata->ssdi.ssd_uuid,
1348		    sizeof(struct sr_uuid));
1349
1350		sr_checksum(sc, hm, &hm->scm_checksum,
1351		    sizeof(struct sr_meta_chunk_invariant));
1352
1353		hm->scm_status = BIOC_SDHOTSPARE;
1354
1355		/* Add chunk to hotspare list. */
1356		rw_enter_write(&sc->sc_hs_lock);
1357		cl = &sc->sc_hotspare_list;
1358		if (SLIST_EMPTY(cl))
1359			SLIST_INSERT_HEAD(cl, hotspare, src_link);
1360		else {
1361			SLIST_FOREACH(chunk, cl, src_link)
1362				last = chunk;
1363			SLIST_INSERT_AFTER(last, hotspare, src_link);
1364		}
1365		sc->sc_hotspare_no++;
1366		rw_exit_write(&sc->sc_hs_lock);
1367
1368	}
1369
1370	/*
1371	 * Assemble RAID volumes.
1372	 */
1373	SLIST_FOREACH(bv, &bvh, sbv_link) {
1374
1375		bzero(&bcr, sizeof(bcr));
1376		data = NULL;
1377
1378		/* Check if this is a hotspare "volume". */
1379		if (bv->sbv_level == SR_HOTSPARE_LEVEL &&
1380		    bv->sbv_chunk_no == 1)
1381			continue;
1382
1383		/*
1384		 * Skip volumes that are marked as no auto assemble, unless
1385		 * this was the volume which we actually booted from.
1386		 */
1387		if (bcmp(&sr_bootuuid, &bv->sbv_uuid, sizeof(sr_bootuuid)) != 0)
1388			if (bv->sbv_flags & BIOC_SCNOAUTOASSEMBLE)
1389				continue;
1390
1391#ifdef SR_DEBUG
1392		DNPRINTF(SR_D_META, "%s: assembling volume ", DEVNAME(sc));
1393		if (sr_debug & SR_D_META)
1394			sr_uuid_print(&bv->sbv_uuid, 0);
1395		DNPRINTF(SR_D_META, " volid %u with %u chunks\n",
1396		    bv->sbv_volid, bv->sbv_chunk_no);
1397#endif
1398
1399		/*
1400		 * If this is a crypto volume, try to find a matching
1401		 * key disk...
1402		 */
1403		bcr.bc_key_disk = NODEV;
1404		if (bv->sbv_level == 'C' || bv->sbv_level == 0x1C) {
1405			SLIST_FOREACH(bc, &kdh, sbc_link) {
1406				if (bcmp(&bc->sbc_metadata->ssdi.ssd_uuid,
1407				    &bv->sbv_uuid,
1408				    sizeof(bc->sbc_metadata->ssdi.ssd_uuid))
1409				    == 0)
1410					bcr.bc_key_disk = bc->sbc_mm;
1411			}
1412		}
1413
1414		for (i = 0; i < BIOC_CRMAXLEN; i++) {
1415			devs[i] = NODEV; /* mark device as illegal */
1416			ondisk[i] = 0;
1417		}
1418
1419		SLIST_FOREACH(bc, &bv->sbv_chunks, sbc_link) {
1420			if (devs[bc->sbc_chunk_id] != NODEV) {
1421				bv->sbv_chunks_found--;
1422				sr_meta_getdevname(sc, bc->sbc_mm, devname,
1423				    sizeof(devname));
1424				printf("%s: found duplicate chunk %u for "
1425				    "volume %u on device %s\n", DEVNAME(sc),
1426				    bc->sbc_chunk_id, bv->sbv_volid, devname);
1427			}
1428
1429			if (devs[bc->sbc_chunk_id] == NODEV ||
1430			    bc->sbc_metadata->ssd_ondisk >
1431			    ondisk[bc->sbc_chunk_id]) {
1432				devs[bc->sbc_chunk_id] = bc->sbc_mm;
1433				ondisk[bc->sbc_chunk_id] =
1434				    bc->sbc_metadata->ssd_ondisk;
1435				DNPRINTF(SR_D_META, "%s: using ondisk "
1436				    "metadata version %llu for chunk %u\n",
1437				    DEVNAME(sc), ondisk[bc->sbc_chunk_id],
1438				    bc->sbc_chunk_id);
1439			}
1440		}
1441
1442		if (bv->sbv_chunk_no != bv->sbv_chunks_found) {
1443			printf("%s: not all chunks were provided; "
1444			    "attempting to bring volume %d online\n",
1445			    DEVNAME(sc), bv->sbv_volid);
1446		}
1447
1448		bcr.bc_level = bv->sbv_level;
1449		bcr.bc_dev_list_len = bv->sbv_chunk_no * sizeof(dev_t);
1450		bcr.bc_dev_list = devs;
1451		bcr.bc_flags = BIOC_SCDEVT |
1452		    (bv->sbv_flags & BIOC_SCNOAUTOASSEMBLE);
1453
1454		if ((bv->sbv_level == 'C' || bv->sbv_level == 0x1C) &&
1455		    bcmp(&sr_bootuuid, &bv->sbv_uuid, sizeof(sr_bootuuid)) == 0)
1456			data = sr_bootkey;
1457
1458		rw_enter_write(&sc->sc_lock);
1459		bio_status_init(&sc->sc_status, &sc->sc_dev);
1460		sr_ioctl_createraid(sc, &bcr, 0, data);
1461		rw_exit_write(&sc->sc_lock);
1462
1463		rv++;
1464	}
1465
1466	/* done with metadata */
1467unwind:
1468	/* Free boot volumes and associated chunks. */
1469	for (bv1 = SLIST_FIRST(&bvh); bv1 != NULL; bv1 = bv2) {
1470		bv2 = SLIST_NEXT(bv1, sbv_link);
1471		for (bc1 = SLIST_FIRST(&bv1->sbv_chunks); bc1 != NULL;
1472		    bc1 = bc2) {
1473			bc2 = SLIST_NEXT(bc1, sbc_link);
1474			free(bc1->sbc_metadata, M_DEVBUF,
1475			    sizeof(*bc1->sbc_metadata));
1476			free(bc1, M_DEVBUF, sizeof(*bc1));
1477		}
1478		free(bv1, M_DEVBUF, sizeof(*bv1));
1479	}
1480	/* Free keydisks chunks. */
1481	for (bc1 = SLIST_FIRST(&kdh); bc1 != NULL; bc1 = bc2) {
1482		bc2 = SLIST_NEXT(bc1, sbc_link);
1483		free(bc1->sbc_metadata, M_DEVBUF, sizeof(*bc1->sbc_metadata));
1484		free(bc1, M_DEVBUF, sizeof(*bc1));
1485	}
1486	/* Free unallocated chunks. */
1487	for (bc1 = SLIST_FIRST(&bch); bc1 != NULL; bc1 = bc2) {
1488		bc2 = SLIST_NEXT(bc1, sbc_link);
1489		free(bc1->sbc_metadata, M_DEVBUF, sizeof(*bc1->sbc_metadata));
1490		free(bc1, M_DEVBUF, sizeof(*bc1));
1491	}
1492
1493	while (!SLIST_EMPTY(&sdklist)) {
1494		sdk = SLIST_FIRST(&sdklist);
1495		SLIST_REMOVE_HEAD(&sdklist, sdk_link);
1496		free(sdk, M_DEVBUF, sizeof(*sdk));
1497	}
1498
1499	free(devs, M_DEVBUF, BIOC_CRMAXLEN * sizeof(dev_t));
1500	free(ondisk, M_DEVBUF, BIOC_CRMAXLEN * sizeof(u_int64_t));
1501
1502	return (rv);
1503}
1504
1505void
1506sr_map_root(void)
1507{
1508	struct sr_softc		*sc = softraid0;
1509	struct sr_discipline	*sd;
1510	struct sr_meta_opt_item	*omi;
1511	struct sr_meta_boot	*sbm;
1512	u_char			duid[8];
1513	int			i;
1514
1515	if (sc == NULL)
1516		return;
1517
1518	DNPRINTF(SR_D_MISC, "%s: sr_map_root\n", DEVNAME(sc));
1519
1520	bzero(duid, sizeof(duid));
1521	if (bcmp(rootduid, duid, sizeof(duid)) == 0) {
1522		DNPRINTF(SR_D_MISC, "%s: root duid is zero\n", DEVNAME(sc));
1523		return;
1524	}
1525
1526	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link) {
1527		SLIST_FOREACH(omi, &sd->sd_meta_opt, omi_link) {
1528			if (omi->omi_som->som_type != SR_OPT_BOOT)
1529				continue;
1530			sbm = (struct sr_meta_boot *)omi->omi_som;
1531			for (i = 0; i < SR_MAX_BOOT_DISKS; i++) {
1532				if (bcmp(rootduid, sbm->sbm_boot_duid[i],
1533				    sizeof(rootduid)) == 0) {
1534					memcpy(rootduid, sbm->sbm_root_duid,
1535					    sizeof(rootduid));
1536					DNPRINTF(SR_D_MISC, "%s: root duid "
1537					    "mapped to %s\n", DEVNAME(sc),
1538					    duid_format(rootduid));
1539					return;
1540				}
1541			}
1542		}
1543	}
1544}
1545
1546int
1547sr_meta_native_probe(struct sr_softc *sc, struct sr_chunk *ch_entry)
1548{
1549	struct disklabel	label;
1550	char			*devname;
1551	int			error, part;
1552	u_int64_t		size;
1553
1554	DNPRINTF(SR_D_META, "%s: sr_meta_native_probe(%s)\n",
1555	   DEVNAME(sc), ch_entry->src_devname);
1556
1557	devname = ch_entry->src_devname;
1558	part = DISKPART(ch_entry->src_dev_mm);
1559
1560	/* get disklabel */
1561	error = VOP_IOCTL(ch_entry->src_vn, DIOCGDINFO, (caddr_t)&label, FREAD,
1562	    NOCRED, curproc);
1563	if (error) {
1564		DNPRINTF(SR_D_META, "%s: %s can't obtain disklabel\n",
1565		    DEVNAME(sc), devname);
1566		goto unwind;
1567	}
1568	memcpy(ch_entry->src_duid, label.d_uid, sizeof(ch_entry->src_duid));
1569
1570	/* make sure the partition is of the right type */
1571	if (label.d_partitions[part].p_fstype != FS_RAID) {
1572		DNPRINTF(SR_D_META,
1573		    "%s: %s partition not of type RAID (%d)\n", DEVNAME(sc),
1574		    devname,
1575		    label.d_partitions[part].p_fstype);
1576		goto unwind;
1577	}
1578
1579	size = DL_SECTOBLK(&label, DL_GETPSIZE(&label.d_partitions[part]));
1580	if (size <= SR_DATA_OFFSET) {
1581		DNPRINTF(SR_D_META, "%s: %s partition too small\n", DEVNAME(sc),
1582		    devname);
1583		goto unwind;
1584	}
1585	size -= SR_DATA_OFFSET;
1586	if (size > INT64_MAX) {
1587		DNPRINTF(SR_D_META, "%s: %s partition too large\n", DEVNAME(sc),
1588		    devname);
1589		goto unwind;
1590	}
1591	ch_entry->src_size = size;
1592	ch_entry->src_secsize = label.d_secsize;
1593
1594	DNPRINTF(SR_D_META, "%s: probe found %s size %lld\n", DEVNAME(sc),
1595	    devname, (long long)size);
1596
1597	return (SR_META_F_NATIVE);
1598unwind:
1599	DNPRINTF(SR_D_META, "%s: invalid device: %s\n", DEVNAME(sc),
1600	    devname ? devname : "nodev");
1601	return (SR_META_F_INVALID);
1602}
1603
/*
 * Read the native metadata of every chunk in the discipline's chunk list and
 * check that the chunks belong together.
 *
 * First pass: chunks with src_dev_mm == NODEV are skipped.  The first chunk
 * that carries SR_MAGIC supplies the reference UUID, the expected chunk
 * count and the on-disk version; every later softraid chunk must have the
 * same UUID.  Chunks whose on-disk version differs from the current
 * reference bump old_meta, and version is raised to the highest value seen.
 *
 * If both softraid and non-softraid chunks were found and force is not set,
 * the attach fails.  If versions were mixed, a second pass re-reads every
 * chunk and marks those that are not at the highest version BIOC_SDOFFLINE.
 *
 * Returns 0 on success and 1 on failure.
 */
int
sr_meta_native_attach(struct sr_discipline *sd, int force)
{
	struct sr_softc		*sc = sd->sd_sc;
	struct sr_chunk_head	*cl = &sd->sd_vol.sv_chunk_list;
	struct sr_metadata	*md = NULL;
	struct sr_chunk		*ch_entry, *ch_next;
	struct sr_uuid		uuid;
	u_int64_t		version = 0;
	int			sr, not_sr, rv = 1, d, expected = -1, old_meta = 0;

	DNPRINTF(SR_D_META, "%s: sr_meta_native_attach\n", DEVNAME(sc));

	/* Scratch buffer reused for the metadata of every chunk. */
	md = malloc(SR_META_SIZE * DEV_BSIZE, M_DEVBUF, M_ZERO | M_NOWAIT);
	if (md == NULL) {
		sr_error(sc, "not enough memory for metadata buffer");
		goto bad;
	}

	bzero(&uuid, sizeof uuid);

	/*
	 * sr counts chunks with softraid metadata, not_sr those without;
	 * d is 0 until the first softraid chunk has been seen.
	 */
	sr = not_sr = d = 0;
	SLIST_FOREACH(ch_entry, cl, src_link) {
		if (ch_entry->src_dev_mm == NODEV)
			continue;

		if (sr_meta_native_read(sd, ch_entry->src_dev_mm, md, NULL)) {
			sr_error(sc, "could not read native metadata");
			goto bad;
		}

		if (md->ssdi.ssd_magic == SR_MAGIC) {
			sr++;
			ch_entry->src_meta.scmi.scm_chunk_id =
			    md->ssdi.ssd_chunk_id;
			if (d == 0) {
				/* First softraid chunk becomes the reference. */
				memcpy(&uuid, &md->ssdi.ssd_uuid, sizeof uuid);
				expected = md->ssdi.ssd_chunk_no;
				version = md->ssd_ondisk;
				d++;
				continue;
			} else if (bcmp(&md->ssdi.ssd_uuid, &uuid,
			    sizeof uuid)) {
				sr_error(sc, "not part of the same volume");
				goto bad;
			}
			if (md->ssd_ondisk != version) {
				old_meta++;
				version = MAX(md->ssd_ondisk, version);
			}
		} else
			not_sr++;
	}

	if (sr && not_sr && !force) {
		sr_error(sc, "not all chunks are of the native metadata "
		    "format");
		goto bad;
	}

	/* mixed metadata versions; mark bad disks offline */
	if (old_meta) {
		/*
		 * d is reused as the index into sv_chunks[] — assumes that
		 * array is in chunk list order; TODO confirm.
		 */
		d = 0;
		for (ch_entry = SLIST_FIRST(cl); ch_entry != NULL;
		    ch_entry = ch_next, d++) {
			ch_next = SLIST_NEXT(ch_entry, src_link);

			/* XXX do we want to read this again? */
			/*
			 * NOTE(review): the first pass skips NODEV chunks,
			 * this pass panics on them.
			 */
			if (ch_entry->src_dev_mm == NODEV)
				panic("src_dev_mm == NODEV");
			/* A failed read only warns; md is still compared below. */
			if (sr_meta_native_read(sd, ch_entry->src_dev_mm, md,
			    NULL))
				sr_warn(sc, "could not read native metadata");
			if (md->ssd_ondisk != version)
				sd->sd_vol.sv_chunks[d]->src_meta.scm_status =
				    BIOC_SDOFFLINE;
		}
	}

	/* A chunk count mismatch is only reported in debug output. */
	if (expected != sr && !force && expected != -1) {
		DNPRINTF(SR_D_META, "%s: not all chunks were provided, trying "
		    "anyway\n", DEVNAME(sc));
	}

	rv = 0;
bad:
	free(md, M_DEVBUF, SR_META_SIZE * DEV_BSIZE);
	return (rv);
}
1693
1694int
1695sr_meta_native_read(struct sr_discipline *sd, dev_t dev,
1696    struct sr_metadata *md, void *fm)
1697{
1698#ifdef SR_DEBUG
1699	struct sr_softc		*sc = sd->sd_sc;
1700#endif
1701	DNPRINTF(SR_D_META, "%s: sr_meta_native_read(0x%x, %p)\n",
1702	    DEVNAME(sc), dev, md);
1703
1704	return (sr_meta_rw(sd, dev, md, B_READ));
1705}
1706
1707int
1708sr_meta_native_write(struct sr_discipline *sd, dev_t dev,
1709    struct sr_metadata *md, void *fm)
1710{
1711#ifdef SR_DEBUG
1712	struct sr_softc		*sc = sd->sd_sc;
1713#endif
1714	DNPRINTF(SR_D_META, "%s: sr_meta_native_write(0x%x, %p)\n",
1715	    DEVNAME(sc), dev, md);
1716
1717	return (sr_meta_rw(sd, dev, md, B_WRITE));
1718}
1719
1720void
1721sr_hotplug_register(struct sr_discipline *sd, void *func)
1722{
1723	struct sr_hotplug_list	*mhe;
1724
1725	DNPRINTF(SR_D_MISC, "%s: sr_hotplug_register: %p\n",
1726	    DEVNAME(sd->sd_sc), func);
1727
1728	/* make sure we aren't on the list yet */
1729	SLIST_FOREACH(mhe, &sr_hotplug_callbacks, shl_link)
1730		if (mhe->sh_hotplug == func)
1731			return;
1732
1733	mhe = malloc(sizeof(struct sr_hotplug_list), M_DEVBUF,
1734	    M_WAITOK | M_ZERO);
1735	mhe->sh_hotplug = func;
1736	mhe->sh_sd = sd;
1737	SLIST_INSERT_HEAD(&sr_hotplug_callbacks, mhe, shl_link);
1738}
1739
1740void
1741sr_hotplug_unregister(struct sr_discipline *sd, void *func)
1742{
1743	struct sr_hotplug_list	*mhe;
1744
1745	DNPRINTF(SR_D_MISC, "%s: sr_hotplug_unregister: %s %p\n",
1746	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, func);
1747
1748	/* make sure we are on the list yet */
1749	SLIST_FOREACH(mhe, &sr_hotplug_callbacks, shl_link) {
1750		if (mhe->sh_hotplug == func)
1751			break;
1752	}
1753	if (mhe != NULL) {
1754		SLIST_REMOVE(&sr_hotplug_callbacks, mhe,
1755		    sr_hotplug_list, shl_link);
1756		free(mhe, M_DEVBUF, sizeof(*mhe));
1757	}
1758}
1759
1760void
1761sr_disk_attach(struct disk *diskp, int action)
1762{
1763	struct sr_hotplug_list	*mhe;
1764
1765	SLIST_FOREACH(mhe, &sr_hotplug_callbacks, shl_link)
1766		if (mhe->sh_sd->sd_ready)
1767			mhe->sh_hotplug(mhe->sh_sd, diskp, action);
1768}
1769
/*
 * Match routine: unconditionally reports a match (returns 1); none of the
 * arguments are examined.
 */
int
sr_match(struct device *parent, void *match, void *aux)
{
	const int		matched = 1;

	return (matched);
}
1775
1776void
1777sr_attach(struct device *parent, struct device *self, void *aux)
1778{
1779	struct sr_softc		*sc = (void *)self;
1780	struct scsibus_attach_args saa;
1781
1782	DNPRINTF(SR_D_MISC, "\n%s: sr_attach", DEVNAME(sc));
1783
1784	if (softraid0 == NULL)
1785		softraid0 = sc;
1786
1787	rw_init(&sc->sc_lock, "sr_lock");
1788	rw_init(&sc->sc_hs_lock, "sr_hs_lock");
1789
1790	SLIST_INIT(&sr_hotplug_callbacks);
1791	TAILQ_INIT(&sc->sc_dis_list);
1792	SLIST_INIT(&sc->sc_hotspare_list);
1793
1794#if NBIO > 0
1795	if (bio_register(&sc->sc_dev, sr_bio_ioctl) != 0)
1796		printf("%s: controller registration failed", DEVNAME(sc));
1797#endif /* NBIO > 0 */
1798
1799#ifndef SMALL_KERNEL
1800	strlcpy(sc->sc_sensordev.xname, DEVNAME(sc),
1801	    sizeof(sc->sc_sensordev.xname));
1802	sensordev_install(&sc->sc_sensordev);
1803#endif /* SMALL_KERNEL */
1804
1805	printf("\n");
1806
1807	saa.saa_adapter_softc = sc;
1808	saa.saa_adapter = &sr_switch;
1809	saa.saa_adapter_target = SDEV_NO_ADAPTER_TARGET;
1810	saa.saa_adapter_buswidth = SR_MAX_LD;
1811	saa.saa_luns = 1;
1812	saa.saa_openings = 0;
1813	saa.saa_pool = NULL;
1814	saa.saa_quirks = saa.saa_flags = 0;
1815	saa.saa_wwpn = saa.saa_wwnn = 0;
1816
1817	sc->sc_scsibus = (struct scsibus_softc *)config_found(&sc->sc_dev, &saa,
1818	    scsiprint);
1819
1820	softraid_disk_attach = sr_disk_attach;
1821
1822	sr_boot_assembly(sc);
1823
1824	explicit_bzero(sr_bootkey, sizeof(sr_bootkey));
1825}
1826
1827int
1828sr_detach(struct device *self, int flags)
1829{
1830	struct sr_softc		*sc = (void *)self;
1831	int			rv;
1832
1833	DNPRINTF(SR_D_MISC, "%s: sr_detach\n", DEVNAME(sc));
1834
1835	softraid_disk_attach = NULL;
1836
1837	sr_shutdown(0);
1838
1839#ifndef SMALL_KERNEL
1840	if (sc->sc_sensor_task != NULL)
1841		sensor_task_unregister(sc->sc_sensor_task);
1842	sensordev_deinstall(&sc->sc_sensordev);
1843#endif /* SMALL_KERNEL */
1844
1845	if (sc->sc_scsibus != NULL) {
1846		rv = config_detach((struct device *)sc->sc_scsibus, flags);
1847		if (rv != 0)
1848			return (rv);
1849		sc->sc_scsibus = NULL;
1850	}
1851
1852	return (0);
1853}
1854
1855void
1856sr_info(struct sr_softc *sc, const char *fmt, ...)
1857{
1858	va_list			ap;
1859
1860	rw_assert_wrlock(&sc->sc_lock);
1861
1862	va_start(ap, fmt);
1863	bio_status(&sc->sc_status, 0, BIO_MSG_INFO, fmt, &ap);
1864	va_end(ap);
1865}
1866
1867void
1868sr_warn(struct sr_softc *sc, const char *fmt, ...)
1869{
1870	va_list			ap;
1871
1872	rw_assert_wrlock(&sc->sc_lock);
1873
1874	va_start(ap, fmt);
1875	bio_status(&sc->sc_status, 1, BIO_MSG_WARN, fmt, &ap);
1876	va_end(ap);
1877}
1878
1879void
1880sr_error(struct sr_softc *sc, const char *fmt, ...)
1881{
1882	va_list			ap;
1883
1884	rw_assert_wrlock(&sc->sc_lock);
1885
1886	va_start(ap, fmt);
1887	bio_status(&sc->sc_status, 1, BIO_MSG_ERROR, fmt, &ap);
1888	va_end(ap);
1889}
1890
1891int
1892sr_ccb_alloc(struct sr_discipline *sd)
1893{
1894	struct sr_ccb		*ccb;
1895	int			i;
1896
1897	if (!sd)
1898		return (1);
1899
1900	DNPRINTF(SR_D_CCB, "%s: sr_ccb_alloc\n", DEVNAME(sd->sd_sc));
1901
1902	if (sd->sd_ccb)
1903		return (1);
1904
1905	sd->sd_ccb = mallocarray(sd->sd_max_wu,
1906	    sd->sd_max_ccb_per_wu * sizeof(struct sr_ccb),
1907	    M_DEVBUF, M_WAITOK | M_ZERO);
1908	TAILQ_INIT(&sd->sd_ccb_freeq);
1909	for (i = 0; i < sd->sd_max_wu * sd->sd_max_ccb_per_wu; i++) {
1910		ccb = &sd->sd_ccb[i];
1911		ccb->ccb_dis = sd;
1912		sr_ccb_put(ccb);
1913	}
1914
1915	DNPRINTF(SR_D_CCB, "%s: sr_ccb_alloc ccb: %d\n",
1916	    DEVNAME(sd->sd_sc), sd->sd_max_wu * sd->sd_max_ccb_per_wu);
1917
1918	return (0);
1919}
1920
1921void
1922sr_ccb_free(struct sr_discipline *sd)
1923{
1924	struct sr_ccb		*ccb;
1925
1926	if (!sd)
1927		return;
1928
1929	DNPRINTF(SR_D_CCB, "%s: sr_ccb_free %p\n", DEVNAME(sd->sd_sc), sd);
1930
1931	while ((ccb = TAILQ_FIRST(&sd->sd_ccb_freeq)) != NULL)
1932		TAILQ_REMOVE(&sd->sd_ccb_freeq, ccb, ccb_link);
1933
1934	free(sd->sd_ccb, M_DEVBUF, sd->sd_max_wu * sd->sd_max_ccb_per_wu *
1935	    sizeof(struct sr_ccb));
1936}
1937
1938struct sr_ccb *
1939sr_ccb_get(struct sr_discipline *sd)
1940{
1941	struct sr_ccb		*ccb;
1942	int			s;
1943
1944	s = splbio();
1945
1946	ccb = TAILQ_FIRST(&sd->sd_ccb_freeq);
1947	if (ccb) {
1948		TAILQ_REMOVE(&sd->sd_ccb_freeq, ccb, ccb_link);
1949		ccb->ccb_state = SR_CCB_INPROGRESS;
1950	}
1951
1952	splx(s);
1953
1954	DNPRINTF(SR_D_CCB, "%s: sr_ccb_get: %p\n", DEVNAME(sd->sd_sc),
1955	    ccb);
1956
1957	return (ccb);
1958}
1959
1960void
1961sr_ccb_put(struct sr_ccb *ccb)
1962{
1963	struct sr_discipline	*sd = ccb->ccb_dis;
1964	int			s;
1965
1966	DNPRINTF(SR_D_CCB, "%s: sr_ccb_put: %p\n", DEVNAME(sd->sd_sc),
1967	    ccb);
1968
1969	s = splbio();
1970
1971	ccb->ccb_wu = NULL;
1972	ccb->ccb_state = SR_CCB_FREE;
1973	ccb->ccb_target = -1;
1974	ccb->ccb_opaque = NULL;
1975
1976	TAILQ_INSERT_TAIL(&sd->sd_ccb_freeq, ccb, ccb_link);
1977
1978	splx(s);
1979}
1980
1981struct sr_ccb *
1982sr_ccb_rw(struct sr_discipline *sd, int chunk, daddr_t blkno,
1983    long len, u_int8_t *data, int xsflags, int ccbflags)
1984{
1985	struct sr_chunk		*sc = sd->sd_vol.sv_chunks[chunk];
1986	struct sr_ccb		*ccb = NULL;
1987	int			s;
1988
1989	ccb = sr_ccb_get(sd);
1990	if (ccb == NULL)
1991		goto out;
1992
1993	ccb->ccb_flags = ccbflags;
1994	ccb->ccb_target = chunk;
1995
1996	ccb->ccb_buf.b_flags = B_PHYS | B_CALL;
1997	if (ISSET(xsflags, SCSI_DATA_IN))
1998		ccb->ccb_buf.b_flags |= B_READ;
1999	else
2000		ccb->ccb_buf.b_flags |= B_WRITE;
2001
2002	ccb->ccb_buf.b_blkno = blkno + sd->sd_meta->ssd_data_blkno;
2003	ccb->ccb_buf.b_bcount = len;
2004	ccb->ccb_buf.b_bufsize = len;
2005	ccb->ccb_buf.b_resid = len;
2006	ccb->ccb_buf.b_data = data;
2007	ccb->ccb_buf.b_error = 0;
2008	ccb->ccb_buf.b_iodone = sd->sd_scsi_intr;
2009	ccb->ccb_buf.b_proc = curproc;
2010	ccb->ccb_buf.b_dev = sc->src_dev_mm;
2011	ccb->ccb_buf.b_vp = sc->src_vn;
2012	ccb->ccb_buf.b_bq = NULL;
2013
2014	if (!ISSET(ccb->ccb_buf.b_flags, B_READ)) {
2015		s = splbio();
2016		ccb->ccb_buf.b_vp->v_numoutput++;
2017		splx(s);
2018	}
2019
2020	DNPRINTF(SR_D_DIS, "%s: %s %s ccb "
2021	    "b_bcount %ld b_blkno %lld b_flags 0x%0lx b_data %p\n",
2022	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, sd->sd_name,
2023	    ccb->ccb_buf.b_bcount, (long long)ccb->ccb_buf.b_blkno,
2024	    ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data);
2025
2026out:
2027	return ccb;
2028}
2029
2030void
2031sr_ccb_done(struct sr_ccb *ccb)
2032{
2033	struct sr_workunit	*wu = ccb->ccb_wu;
2034	struct sr_discipline	*sd = wu->swu_dis;
2035	struct sr_softc		*sc = sd->sd_sc;
2036
2037	DNPRINTF(SR_D_INTR, "%s: %s %s ccb done b_bcount %ld b_resid %zu"
2038	    " b_flags 0x%0lx block %lld target %d\n",
2039	    DEVNAME(sc), sd->sd_meta->ssd_devname, sd->sd_name,
2040	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags,
2041	    (long long)ccb->ccb_buf.b_blkno, ccb->ccb_target);
2042
2043	splassert(IPL_BIO);
2044
2045	if (ccb->ccb_target == -1)
2046		panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
2047
2048	if (ccb->ccb_buf.b_flags & B_ERROR) {
2049		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target %d\n",
2050		    DEVNAME(sc), (long long)ccb->ccb_buf.b_blkno,
2051		    ccb->ccb_target);
2052		if (ISSET(sd->sd_capabilities, SR_CAP_REDUNDANT))
2053			sd->sd_set_chunk_state(sd, ccb->ccb_target,
2054			    BIOC_SDOFFLINE);
2055		else
2056			printf("%s: %s: i/o error %d @ %s block %lld\n",
2057			    DEVNAME(sc), sd->sd_meta->ssd_devname,
2058			    ccb->ccb_buf.b_error, sd->sd_name,
2059			    (long long)ccb->ccb_buf.b_blkno);
2060		ccb->ccb_state = SR_CCB_FAILED;
2061		wu->swu_ios_failed++;
2062	} else {
2063		ccb->ccb_state = SR_CCB_OK;
2064		wu->swu_ios_succeeded++;
2065	}
2066
2067	wu->swu_ios_complete++;
2068}
2069
2070int
2071sr_wu_alloc(struct sr_discipline *sd)
2072{
2073	struct sr_workunit	*wu;
2074	int			i, no_wu;
2075
2076	DNPRINTF(SR_D_WU, "%s: sr_wu_alloc %p %d\n", DEVNAME(sd->sd_sc),
2077	    sd, sd->sd_max_wu);
2078
2079	no_wu = sd->sd_max_wu;
2080	sd->sd_wu_pending = no_wu;
2081
2082	mtx_init(&sd->sd_wu_mtx, IPL_BIO);
2083	TAILQ_INIT(&sd->sd_wu);
2084	TAILQ_INIT(&sd->sd_wu_freeq);
2085	TAILQ_INIT(&sd->sd_wu_pendq);
2086	TAILQ_INIT(&sd->sd_wu_defq);
2087
2088	for (i = 0; i < no_wu; i++) {
2089		wu = malloc(sd->sd_wu_size, M_DEVBUF, M_WAITOK | M_ZERO);
2090		TAILQ_INSERT_TAIL(&sd->sd_wu, wu, swu_next);
2091		TAILQ_INIT(&wu->swu_ccb);
2092		wu->swu_dis = sd;
2093		task_set(&wu->swu_task, sr_wu_done_callback, wu);
2094		sr_wu_put(sd, wu);
2095	}
2096
2097	return (0);
2098}
2099
2100void
2101sr_wu_free(struct sr_discipline *sd)
2102{
2103	struct sr_workunit	*wu;
2104
2105	DNPRINTF(SR_D_WU, "%s: sr_wu_free %p\n", DEVNAME(sd->sd_sc), sd);
2106
2107	while ((wu = TAILQ_FIRST(&sd->sd_wu_freeq)) != NULL)
2108		TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link);
2109	while ((wu = TAILQ_FIRST(&sd->sd_wu_pendq)) != NULL)
2110		TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
2111	while ((wu = TAILQ_FIRST(&sd->sd_wu_defq)) != NULL)
2112		TAILQ_REMOVE(&sd->sd_wu_defq, wu, swu_link);
2113
2114	while ((wu = TAILQ_FIRST(&sd->sd_wu)) != NULL) {
2115		TAILQ_REMOVE(&sd->sd_wu, wu, swu_next);
2116		free(wu, M_DEVBUF, sd->sd_wu_size);
2117	}
2118}
2119
2120void *
2121sr_wu_get(void *xsd)
2122{
2123	struct sr_discipline	*sd = (struct sr_discipline *)xsd;
2124	struct sr_workunit	*wu;
2125
2126	mtx_enter(&sd->sd_wu_mtx);
2127	wu = TAILQ_FIRST(&sd->sd_wu_freeq);
2128	if (wu) {
2129		TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link);
2130		sd->sd_wu_pending++;
2131	}
2132	mtx_leave(&sd->sd_wu_mtx);
2133
2134	DNPRINTF(SR_D_WU, "%s: sr_wu_get: %p\n", DEVNAME(sd->sd_sc), wu);
2135
2136	return (wu);
2137}
2138
2139void
2140sr_wu_put(void *xsd, void *xwu)
2141{
2142	struct sr_discipline	*sd = (struct sr_discipline *)xsd;
2143	struct sr_workunit	*wu = (struct sr_workunit *)xwu;
2144
2145	DNPRINTF(SR_D_WU, "%s: sr_wu_put: %p\n", DEVNAME(sd->sd_sc), wu);
2146
2147	sr_wu_release_ccbs(wu);
2148	sr_wu_init(sd, wu);
2149
2150	mtx_enter(&sd->sd_wu_mtx);
2151	TAILQ_INSERT_TAIL(&sd->sd_wu_freeq, wu, swu_link);
2152	sd->sd_wu_pending--;
2153	mtx_leave(&sd->sd_wu_mtx);
2154}
2155
2156void
2157sr_wu_init(struct sr_discipline *sd, struct sr_workunit *wu)
2158{
2159	int			s;
2160
2161	s = splbio();
2162	if (wu->swu_cb_active == 1)
2163		panic("%s: sr_wu_init got active wu", DEVNAME(sd->sd_sc));
2164	splx(s);
2165
2166	wu->swu_xs = NULL;
2167	wu->swu_state = SR_WU_FREE;
2168	wu->swu_flags = 0;
2169	wu->swu_blk_start = 0;
2170	wu->swu_blk_end = 0;
2171	wu->swu_collider = NULL;
2172}
2173
2174void
2175sr_wu_enqueue_ccb(struct sr_workunit *wu, struct sr_ccb *ccb)
2176{
2177	struct sr_discipline	*sd = wu->swu_dis;
2178	int			s;
2179
2180	s = splbio();
2181	if (wu->swu_cb_active == 1)
2182		panic("%s: sr_wu_enqueue_ccb got active wu",
2183		    DEVNAME(sd->sd_sc));
2184	ccb->ccb_wu = wu;
2185	wu->swu_io_count++;
2186	TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);
2187	splx(s);
2188}
2189
2190void
2191sr_wu_release_ccbs(struct sr_workunit *wu)
2192{
2193	struct sr_ccb		*ccb;
2194
2195	/* Return all ccbs that are associated with this workunit. */
2196	while ((ccb = TAILQ_FIRST(&wu->swu_ccb)) != NULL) {
2197		TAILQ_REMOVE(&wu->swu_ccb, ccb, ccb_link);
2198		sr_ccb_put(ccb);
2199	}
2200
2201	wu->swu_io_count = 0;
2202	wu->swu_ios_complete = 0;
2203	wu->swu_ios_failed = 0;
2204	wu->swu_ios_succeeded = 0;
2205}
2206
2207void
2208sr_wu_done(struct sr_workunit *wu)
2209{
2210	struct sr_discipline	*sd = wu->swu_dis;
2211
2212	DNPRINTF(SR_D_INTR, "%s: sr_wu_done count %d completed %d failed %d\n",
2213	    DEVNAME(sd->sd_sc), wu->swu_io_count, wu->swu_ios_complete,
2214	    wu->swu_ios_failed);
2215
2216	if (wu->swu_ios_complete < wu->swu_io_count)
2217		return;
2218
2219	task_add(sd->sd_taskq, &wu->swu_task);
2220}
2221
/*
 * Task callback that finishes a work unit; xwu is the work unit.
 *
 * Runs at splbio for its whole body.  It sets the scsi_xfer's error from
 * swu_ios_failed (when there is a scsi_xfer), gives the discipline's
 * sd_scsi_wu_done hook a chance to restart the work unit, removes the work
 * unit from the pending queue, starts a collider if one is attached and
 * finally completes the request through one of sd_scsi_done,
 * sr_scsi_wu_put() or sr_scsi_done().
 *
 * Panics if the work unit is not on the discipline's pending queue.
 */
void
sr_wu_done_callback(void *xwu)
{
	struct sr_workunit	*wu = xwu;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_workunit	*wup;
	int			s;

	/*
	 * The SR_WUF_DISCIPLINE or SR_WUF_REBUILD flag must be set if
	 * the work unit is not associated with a scsi_xfer.
	 */
	KASSERT(xs != NULL ||
	    (wu->swu_flags & (SR_WUF_DISCIPLINE|SR_WUF_REBUILD)));

	s = splbio();

	/* Any failed I/O fails the whole transfer. */
	if (xs != NULL) {
		if (wu->swu_ios_failed)
			xs->error = XS_DRIVER_STUFFUP;
		else
			xs->error = XS_NOERROR;
	}

	/*
	 * If the discipline hook returns SR_WU_RESTART the work unit is left
	 * on the pending queue and none of the completion steps below run.
	 */
	if (sd->sd_scsi_wu_done) {
		if (sd->sd_scsi_wu_done(wu) == SR_WU_RESTART)
			goto done;
	}

	/* Remove work unit from pending queue. */
	TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link)
		if (wup == wu)
			break;
	if (wup == NULL)
		panic("%s: wu %p not on pending queue",
		    DEVNAME(sd->sd_sc), wu);
	TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);

	/*
	 * Start the collider now that this work unit has left the pending
	 * queue; it is recreated first if this work unit had failed I/O.
	 */
	if (wu->swu_collider) {
		if (wu->swu_ios_failed)
			sr_raid_recreate_wu(wu->swu_collider);

		/* XXX Should the collider be failed if this xs failed? */
		sr_raid_startwu(wu->swu_collider);
	}

	/*
	 * If a discipline provides its own sd_scsi_done function, then it
	 * is responsible for calling sr_scsi_done() once I/O is complete.
	 */
	if (wu->swu_flags & SR_WUF_REBUILD)
		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
	if (wu->swu_flags & SR_WUF_WAKEUP)
		wakeup(wu);
	if (sd->sd_scsi_done)
		sd->sd_scsi_done(wu);
	else if (wu->swu_flags & SR_WUF_DISCIPLINE)
		sr_scsi_wu_put(sd, wu);
	else if (!(wu->swu_flags & SR_WUF_REBUILD))
		sr_scsi_done(sd, xs);

done:
	splx(s);
}
2287
2288struct sr_workunit *
2289sr_scsi_wu_get(struct sr_discipline *sd, int flags)
2290{
2291	return scsi_io_get(&sd->sd_iopool, flags);
2292}
2293
2294void
2295sr_scsi_wu_put(struct sr_discipline *sd, struct sr_workunit *wu)
2296{
2297	scsi_io_put(&sd->sd_iopool, wu);
2298
2299	if (sd->sd_sync && sd->sd_wu_pending == 0)
2300		wakeup(sd);
2301}
2302
2303void
2304sr_scsi_done(struct sr_discipline *sd, struct scsi_xfer *xs)
2305{
2306	DNPRINTF(SR_D_DIS, "%s: sr_scsi_done: xs %p\n", DEVNAME(sd->sd_sc), xs);
2307
2308	if (xs->error == XS_NOERROR)
2309		xs->resid = 0;
2310
2311	scsi_done(xs);
2312
2313	if (sd->sd_sync && sd->sd_wu_pending == 0)
2314		wakeup(sd);
2315}
2316
/*
 * SCSI command entry point.
 *
 * The target number of the link selects the discipline, the work unit that
 * arrives in xs->io is reset and bound to xs, and the command is dispatched
 * to the matching discipline hook by opcode.
 *
 * Read and write commands return without completing xs here when
 * sd_scsi_rw() succeeds (completion presumably follows once the work unit's
 * I/O finishes -- confirm against the discipline).  All other supported
 * commands complete xs immediately through sr_scsi_done().  A failing hook,
 * an unsupported opcode or a discipline marked deleted ends in the stuffup
 * path, which reports pending sense data if the discipline holds any and
 * XS_DRIVER_STUFFUP otherwise.
 *
 * Panics if no discipline is attached to the target.
 */
void
sr_scsi_cmd(struct scsi_xfer *xs)
{
	struct scsi_link	*link = xs->sc_link;
	struct sr_softc		*sc = link->bus->sb_adapter_softc;
	struct sr_workunit	*wu = xs->io;
	struct sr_discipline	*sd;

	DNPRINTF(SR_D_CMD, "%s: sr_scsi_cmd target %d xs %p flags %#x\n",
	    DEVNAME(sc), link->target, xs, xs->flags);

	sd = sc->sc_targets[link->target];
	if (sd == NULL)
		panic("%s: sr_scsi_cmd NULL discipline", DEVNAME(sc));

	if (sd->sd_deleted) {
		printf("%s: %s device is being deleted, failing io\n",
		    DEVNAME(sc), sd->sd_meta->ssd_devname);
		goto stuffup;
	}

	/* scsi layer *can* re-send wu without calling sr_wu_put(). */
	sr_wu_release_ccbs(wu);
	sr_wu_init(sd, wu);
	wu->swu_state = SR_WU_INPROGRESS;
	wu->swu_xs = xs;

	switch (xs->cmd.opcode) {
	case READ_COMMAND:
	case READ_10:
	case READ_16:
	case WRITE_COMMAND:
	case WRITE_10:
	case WRITE_16:
		DNPRINTF(SR_D_CMD, "%s: sr_scsi_cmd: READ/WRITE %02x\n",
		    DEVNAME(sc), xs->cmd.opcode);
		if (sd->sd_scsi_rw(wu))
			goto stuffup;
		/* On success xs is not completed in this function. */
		break;

	case SYNCHRONIZE_CACHE:
		DNPRINTF(SR_D_CMD, "%s: sr_scsi_cmd: SYNCHRONIZE_CACHE\n",
		    DEVNAME(sc));
		if (sd->sd_scsi_sync(wu))
			goto stuffup;
		goto complete;

	case TEST_UNIT_READY:
		DNPRINTF(SR_D_CMD, "%s: sr_scsi_cmd: TEST_UNIT_READY\n",
		    DEVNAME(sc));
		if (sd->sd_scsi_tur(wu))
			goto stuffup;
		goto complete;

	case START_STOP:
		DNPRINTF(SR_D_CMD, "%s: sr_scsi_cmd: START_STOP\n",
		    DEVNAME(sc));
		if (sd->sd_scsi_start_stop(wu))
			goto stuffup;
		goto complete;

	case INQUIRY:
		DNPRINTF(SR_D_CMD, "%s: sr_scsi_cmd: INQUIRY\n",
		    DEVNAME(sc));
		if (sd->sd_scsi_inquiry(wu))
			goto stuffup;
		goto complete;

	case READ_CAPACITY:
	case READ_CAPACITY_16:
		DNPRINTF(SR_D_CMD, "%s: sr_scsi_cmd READ CAPACITY 0x%02x\n",
		    DEVNAME(sc), xs->cmd.opcode);
		if (sd->sd_scsi_read_cap(wu))
			goto stuffup;
		goto complete;

	case REQUEST_SENSE:
		DNPRINTF(SR_D_CMD, "%s: sr_scsi_cmd REQUEST SENSE\n",
		    DEVNAME(sc));
		if (sd->sd_scsi_req_sense(wu))
			goto stuffup;
		goto complete;

	default:
		DNPRINTF(SR_D_CMD, "%s: unsupported scsi command %x\n",
		    DEVNAME(sc), xs->cmd.opcode);
		/* XXX might need to add generic function to handle others */
		goto stuffup;
	}

	return;
stuffup:
	/*
	 * Prefer sense data recorded in the discipline; it is consumed
	 * (zeroed) once it has been copied into the transfer.
	 */
	if (sd->sd_scsi_sense.error_code) {
		xs->error = XS_SENSE;
		memcpy(&xs->sense, &sd->sd_scsi_sense, sizeof(xs->sense));
		bzero(&sd->sd_scsi_sense, sizeof(sd->sd_scsi_sense));
	} else {
		xs->error = XS_DRIVER_STUFFUP;
	}
complete:
	sr_scsi_done(sd, xs);
}
2419
2420int
2421sr_scsi_probe(struct scsi_link *link)
2422{
2423	struct sr_softc		*sc = link->bus->sb_adapter_softc;
2424	struct sr_discipline	*sd;
2425
2426	KASSERT(link->target < SR_MAX_LD && link->lun == 0);
2427
2428	sd = sc->sc_targets[link->target];
2429	if (sd == NULL)
2430		return (ENODEV);
2431
2432	link->pool = &sd->sd_iopool;
2433	if (sd->sd_openings)
2434		link->openings = sd->sd_openings(sd);
2435	else
2436		link->openings = sd->sd_max_wu;
2437
2438	return (0);
2439}
2440
2441int
2442sr_scsi_ioctl(struct scsi_link *link, u_long cmd, caddr_t addr, int flag)
2443{
2444	struct sr_softc		*sc = link->bus->sb_adapter_softc;
2445	struct sr_discipline	*sd;
2446
2447	sd = sc->sc_targets[link->target];
2448	if (sd == NULL)
2449		return (ENODEV);
2450
2451	DNPRINTF(SR_D_IOCTL, "%s: %s sr_scsi_ioctl cmd: %#lx\n",
2452	    DEVNAME(sc), sd->sd_meta->ssd_devname, cmd);
2453
2454	/* Pass bio ioctls through to the bio handler. */
2455	if (IOCGROUP(cmd) == 'B')
2456		return (sr_bio_handler(sc, sd, cmd, (struct bio *)addr));
2457
2458	switch (cmd) {
2459	case DIOCGCACHE:
2460	case DIOCSCACHE:
2461		return (EOPNOTSUPP);
2462	default:
2463		return (ENOTTY);
2464	}
2465}
2466
2467int
2468sr_bio_ioctl(struct device *dev, u_long cmd, caddr_t addr)
2469{
2470	struct sr_softc *sc = (struct sr_softc *) dev;
2471	DNPRINTF(SR_D_IOCTL, "%s: sr_bio_ioctl\n", DEVNAME(sc));
2472
2473	return sr_bio_handler(sc, NULL, cmd, (struct bio *)addr);
2474}
2475
2476int
2477sr_bio_handler(struct sr_softc *sc, struct sr_discipline *sd, u_long cmd,
2478    struct bio *bio)
2479{
2480	int			rv = 0;
2481
2482	DNPRINTF(SR_D_IOCTL, "%s: sr_bio_handler ", DEVNAME(sc));
2483
2484	rw_enter_write(&sc->sc_lock);
2485
2486	bio_status_init(&sc->sc_status, &sc->sc_dev);
2487
2488	switch (cmd) {
2489	case BIOCINQ:
2490		DNPRINTF(SR_D_IOCTL, "inq\n");
2491		rv = sr_ioctl_inq(sc, (struct bioc_inq *)bio);
2492		break;
2493
2494	case BIOCVOL:
2495		DNPRINTF(SR_D_IOCTL, "vol\n");
2496		rv = sr_ioctl_vol(sc, (struct bioc_vol *)bio);
2497		break;
2498
2499	case BIOCDISK:
2500		DNPRINTF(SR_D_IOCTL, "disk\n");
2501		rv = sr_ioctl_disk(sc, (struct bioc_disk *)bio);
2502		break;
2503
2504	case BIOCALARM:
2505		DNPRINTF(SR_D_IOCTL, "alarm\n");
2506		/*rv = sr_ioctl_alarm(sc, (struct bioc_alarm *)bio); */
2507		break;
2508
2509	case BIOCBLINK:
2510		DNPRINTF(SR_D_IOCTL, "blink\n");
2511		/*rv = sr_ioctl_blink(sc, (struct bioc_blink *)bio); */
2512		break;
2513
2514	case BIOCSETSTATE:
2515		DNPRINTF(SR_D_IOCTL, "setstate\n");
2516		rv = sr_ioctl_setstate(sc, (struct bioc_setstate *)bio);
2517		break;
2518
2519	case BIOCCREATERAID:
2520		DNPRINTF(SR_D_IOCTL, "createraid\n");
2521		rv = sr_ioctl_createraid(sc, (struct bioc_createraid *)bio,
2522		    1, NULL);
2523		break;
2524
2525	case BIOCDELETERAID:
2526		DNPRINTF(SR_D_IOCTL, "deleteraid\n");
2527		rv = sr_ioctl_deleteraid(sc, sd, (struct bioc_deleteraid *)bio);
2528		break;
2529
2530	case BIOCDISCIPLINE:
2531		DNPRINTF(SR_D_IOCTL, "discipline\n");
2532		rv = sr_ioctl_discipline(sc, sd, (struct bioc_discipline *)bio);
2533		break;
2534
2535	case BIOCINSTALLBOOT:
2536		DNPRINTF(SR_D_IOCTL, "installboot\n");
2537		rv = sr_ioctl_installboot(sc, sd,
2538		    (struct bioc_installboot *)bio);
2539		break;
2540
2541	default:
2542		DNPRINTF(SR_D_IOCTL, "invalid ioctl\n");
2543		rv = ENOTTY;
2544	}
2545
2546	sc->sc_status.bs_status = (rv ? BIO_STATUS_ERROR : BIO_STATUS_SUCCESS);
2547
2548	if (sc->sc_status.bs_msg_count > 0)
2549		rv = 0;
2550
2551	memcpy(&bio->bio_status, &sc->sc_status, sizeof(struct bio_status));
2552
2553	rw_exit_write(&sc->sc_lock);
2554
2555	return (rv);
2556}
2557
2558int
2559sr_ioctl_inq(struct sr_softc *sc, struct bioc_inq *bi)
2560{
2561	struct sr_discipline	*sd;
2562	int			vol = 0, disk = 0;
2563
2564	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link) {
2565		vol++;
2566		disk += sd->sd_meta->ssdi.ssd_chunk_no;
2567	}
2568
2569	strlcpy(bi->bi_dev, sc->sc_dev.dv_xname, sizeof(bi->bi_dev));
2570	bi->bi_novol = vol + sc->sc_hotspare_no;
2571	bi->bi_nodisk = disk + sc->sc_hotspare_no;
2572
2573	return (0);
2574}
2575
2576int
2577sr_ioctl_vol(struct sr_softc *sc, struct bioc_vol *bv)
2578{
2579	int			vol = -1, rv = EINVAL;
2580	struct sr_discipline	*sd;
2581	struct sr_chunk		*hotspare;
2582
2583	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link) {
2584		vol++;
2585		if (vol != bv->bv_volid)
2586			continue;
2587
2588		bv->bv_status = sd->sd_vol_status;
2589		bv->bv_size = sd->sd_meta->ssdi.ssd_size << DEV_BSHIFT;
2590		bv->bv_level = sd->sd_meta->ssdi.ssd_level;
2591		bv->bv_nodisk = sd->sd_meta->ssdi.ssd_chunk_no;
2592
2593#ifdef CRYPTO
2594		if (sd->sd_meta->ssdi.ssd_level == 'C' &&
2595		    sd->mds.mdd_crypto.key_disk != NULL)
2596			bv->bv_nodisk++;
2597		else if (sd->sd_meta->ssdi.ssd_level == 0x1C &&
2598		    sd->mds.mdd_raid1c.sr1c_crypto.key_disk != NULL)
2599			bv->bv_nodisk++;
2600#endif
2601		if (bv->bv_status == BIOC_SVREBUILD)
2602			bv->bv_percent = sr_rebuild_percent(sd);
2603
2604		strlcpy(bv->bv_dev, sd->sd_meta->ssd_devname,
2605		    sizeof(bv->bv_dev));
2606		strlcpy(bv->bv_vendor, sd->sd_meta->ssdi.ssd_vendor,
2607		    sizeof(bv->bv_vendor));
2608		rv = 0;
2609		goto done;
2610	}
2611
2612	/* Check hotspares list. */
2613	SLIST_FOREACH(hotspare, &sc->sc_hotspare_list, src_link) {
2614		vol++;
2615		if (vol != bv->bv_volid)
2616			continue;
2617
2618		bv->bv_status = BIOC_SVONLINE;
2619		bv->bv_size = hotspare->src_meta.scmi.scm_size << DEV_BSHIFT;
2620		bv->bv_level = -1;	/* Hotspare. */
2621		bv->bv_nodisk = 1;
2622		strlcpy(bv->bv_dev, hotspare->src_meta.scmi.scm_devname,
2623		    sizeof(bv->bv_dev));
2624		strlcpy(bv->bv_vendor, hotspare->src_meta.scmi.scm_devname,
2625		    sizeof(bv->bv_vendor));
2626		rv = 0;
2627		goto done;
2628	}
2629
2630done:
2631	return (rv);
2632}
2633
2634int
2635sr_ioctl_disk(struct sr_softc *sc, struct bioc_disk *bd)
2636{
2637	struct sr_discipline	*sd;
2638	struct sr_chunk		*src, *hotspare;
2639	int			vol = -1, rv = EINVAL;
2640
2641	if (bd->bd_diskid < 0)
2642		goto done;
2643
2644	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link) {
2645		vol++;
2646		if (vol != bd->bd_volid)
2647			continue;
2648
2649		if (bd->bd_diskid < sd->sd_meta->ssdi.ssd_chunk_no)
2650			src = sd->sd_vol.sv_chunks[bd->bd_diskid];
2651#ifdef CRYPTO
2652		else if (bd->bd_diskid == sd->sd_meta->ssdi.ssd_chunk_no &&
2653		    sd->sd_meta->ssdi.ssd_level == 'C' &&
2654		    sd->mds.mdd_crypto.key_disk != NULL)
2655			src = sd->mds.mdd_crypto.key_disk;
2656		else if (bd->bd_diskid == sd->sd_meta->ssdi.ssd_chunk_no &&
2657		    sd->sd_meta->ssdi.ssd_level == 0x1C &&
2658		    sd->mds.mdd_raid1c.sr1c_crypto.key_disk != NULL)
2659			src = sd->mds.mdd_crypto.key_disk;
2660#endif
2661		else
2662			break;
2663
2664		bd->bd_status = src->src_meta.scm_status;
2665		bd->bd_size = src->src_meta.scmi.scm_size << DEV_BSHIFT;
2666		bd->bd_channel = vol;
2667		bd->bd_target = bd->bd_diskid;
2668		strlcpy(bd->bd_vendor, src->src_meta.scmi.scm_devname,
2669		    sizeof(bd->bd_vendor));
2670		rv = 0;
2671		goto done;
2672	}
2673
2674	/* Check hotspares list. */
2675	SLIST_FOREACH(hotspare, &sc->sc_hotspare_list, src_link) {
2676		vol++;
2677		if (vol != bd->bd_volid)
2678			continue;
2679
2680		if (bd->bd_diskid != 0)
2681			break;
2682
2683		bd->bd_status = hotspare->src_meta.scm_status;
2684		bd->bd_size = hotspare->src_meta.scmi.scm_size << DEV_BSHIFT;
2685		bd->bd_channel = vol;
2686		bd->bd_target = bd->bd_diskid;
2687		strlcpy(bd->bd_vendor, hotspare->src_meta.scmi.scm_devname,
2688		    sizeof(bd->bd_vendor));
2689		rv = 0;
2690		goto done;
2691	}
2692
2693done:
2694	return (rv);
2695}
2696
2697int
2698sr_ioctl_setstate(struct sr_softc *sc, struct bioc_setstate *bs)
2699{
2700	int			rv = EINVAL;
2701	int			vol = -1, found, c;
2702	struct sr_discipline	*sd;
2703	struct sr_chunk		*ch_entry;
2704	struct sr_chunk_head	*cl;
2705
2706	if (bs->bs_other_id_type == BIOC_SSOTHER_UNUSED)
2707		goto done;
2708
2709	if (bs->bs_status == BIOC_SSHOTSPARE) {
2710		rv = sr_hotspare(sc, (dev_t)bs->bs_other_id);
2711		goto done;
2712	}
2713
2714	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link) {
2715		vol++;
2716		if (vol == bs->bs_volid)
2717			break;
2718	}
2719	if (sd == NULL)
2720		goto done;
2721
2722	switch (bs->bs_status) {
2723	case BIOC_SSOFFLINE:
2724		/* Take chunk offline */
2725		found = c = 0;
2726		cl = &sd->sd_vol.sv_chunk_list;
2727		SLIST_FOREACH(ch_entry, cl, src_link) {
2728			if (ch_entry->src_dev_mm == bs->bs_other_id) {
2729				found = 1;
2730				break;
2731			}
2732			c++;
2733		}
2734		if (found == 0) {
2735			sr_error(sc, "chunk not part of array");
2736			goto done;
2737		}
2738
2739		/* XXX: check current state first */
2740		sd->sd_set_chunk_state(sd, c, BIOC_SDOFFLINE);
2741
2742		if (sr_meta_save(sd, SR_META_DIRTY)) {
2743			sr_error(sc, "could not save metadata for %s",
2744			    sd->sd_meta->ssd_devname);
2745			goto done;
2746		}
2747		rv = 0;
2748		break;
2749
2750	case BIOC_SDSCRUB:
2751		break;
2752
2753	case BIOC_SSREBUILD:
2754		rv = sr_rebuild_init(sd, (dev_t)bs->bs_other_id, 0);
2755		break;
2756
2757	default:
2758		sr_error(sc, "unsupported state request %d", bs->bs_status);
2759	}
2760
2761done:
2762	return (rv);
2763}
2764
2765int
2766sr_chunk_in_use(struct sr_softc *sc, dev_t dev)
2767{
2768	struct sr_discipline	*sd;
2769	struct sr_chunk		*chunk;
2770	int			i;
2771
2772	DNPRINTF(SR_D_MISC, "%s: sr_chunk_in_use(%d)\n", DEVNAME(sc), dev);
2773
2774	if (dev == NODEV)
2775		return BIOC_SDINVALID;
2776
2777	/* See if chunk is already in use. */
2778	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link) {
2779		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
2780			chunk = sd->sd_vol.sv_chunks[i];
2781			if (chunk->src_dev_mm == dev)
2782				return chunk->src_meta.scm_status;
2783		}
2784	}
2785
2786	/* Check hotspares list. */
2787	SLIST_FOREACH(chunk, &sc->sc_hotspare_list, src_link)
2788		if (chunk->src_dev_mm == dev)
2789			return chunk->src_meta.scm_status;
2790
2791	return BIOC_SDINVALID;
2792}
2793
2794int
2795sr_hotspare(struct sr_softc *sc, dev_t dev)
2796{
2797	struct sr_discipline	*sd = NULL;
2798	struct sr_metadata	*sm = NULL;
2799	struct sr_meta_chunk    *hm;
2800	struct sr_chunk_head	*cl;
2801	struct sr_chunk		*chunk, *last, *hotspare = NULL;
2802	struct sr_uuid		uuid;
2803	struct disklabel	label;
2804	struct vnode		*vn;
2805	u_int64_t		size;
2806	char			devname[32];
2807	int			rv = EINVAL;
2808	int			c, part, open = 0;
2809
2810	/*
2811	 * Add device to global hotspares list.
2812	 */
2813
2814	sr_meta_getdevname(sc, dev, devname, sizeof(devname));
2815
2816	/* Make sure chunk is not already in use. */
2817	c = sr_chunk_in_use(sc, dev);
2818	if (c != BIOC_SDINVALID && c != BIOC_SDOFFLINE) {
2819		if (c == BIOC_SDHOTSPARE)
2820			sr_error(sc, "%s is already a hotspare", devname);
2821		else
2822			sr_error(sc, "%s is already in use", devname);
2823		goto done;
2824	}
2825
2826	/* XXX - See if there is an existing degraded volume... */
2827
2828	/* Open device. */
2829	if (bdevvp(dev, &vn)) {
2830		sr_error(sc, "sr_hotspare: cannot allocate vnode");
2831		goto done;
2832	}
2833	if (VOP_OPEN(vn, FREAD | FWRITE, NOCRED, curproc)) {
2834		DNPRINTF(SR_D_META,"%s: sr_hotspare cannot open %s\n",
2835		    DEVNAME(sc), devname);
2836		vput(vn);
2837		goto fail;
2838	}
2839	open = 1; /* close dev on error */
2840
2841	/* Get partition details. */
2842	part = DISKPART(dev);
2843	if (VOP_IOCTL(vn, DIOCGDINFO, (caddr_t)&label, FREAD,
2844	    NOCRED, curproc)) {
2845		DNPRINTF(SR_D_META, "%s: sr_hotspare ioctl failed\n",
2846		    DEVNAME(sc));
2847		goto fail;
2848	}
2849	if (label.d_partitions[part].p_fstype != FS_RAID) {
2850		sr_error(sc, "%s partition not of type RAID (%d)",
2851		    devname, label.d_partitions[part].p_fstype);
2852		goto fail;
2853	}
2854
2855	/* Calculate partition size. */
2856	size = DL_SECTOBLK(&label, DL_GETPSIZE(&label.d_partitions[part]));
2857	if (size <= SR_DATA_OFFSET) {
2858		DNPRINTF(SR_D_META, "%s: %s partition too small\n", DEVNAME(sc),
2859		    devname);
2860		goto fail;
2861	}
2862	size -= SR_DATA_OFFSET;
2863	if (size > INT64_MAX) {
2864		DNPRINTF(SR_D_META, "%s: %s partition too large\n", DEVNAME(sc),
2865		    devname);
2866		goto fail;
2867	}
2868
2869	/*
2870	 * Create and populate chunk metadata.
2871	 */
2872
2873	sr_uuid_generate(&uuid);
2874	hotspare = malloc(sizeof(struct sr_chunk), M_DEVBUF, M_WAITOK | M_ZERO);
2875
2876	hotspare->src_dev_mm = dev;
2877	hotspare->src_vn = vn;
2878	strlcpy(hotspare->src_devname, devname, sizeof(hm->scmi.scm_devname));
2879	hotspare->src_size = size;
2880
2881	hm = &hotspare->src_meta;
2882	hm->scmi.scm_volid = SR_HOTSPARE_VOLID;
2883	hm->scmi.scm_chunk_id = 0;
2884	hm->scmi.scm_size = size;
2885	hm->scmi.scm_coerced_size = size;
2886	strlcpy(hm->scmi.scm_devname, devname, sizeof(hm->scmi.scm_devname));
2887	memcpy(&hm->scmi.scm_uuid, &uuid, sizeof(struct sr_uuid));
2888
2889	sr_checksum(sc, hm, &hm->scm_checksum,
2890	    sizeof(struct sr_meta_chunk_invariant));
2891
2892	hm->scm_status = BIOC_SDHOTSPARE;
2893
2894	/*
2895	 * Create and populate our own discipline and metadata.
2896	 */
2897
2898	sm = malloc(sizeof(struct sr_metadata), M_DEVBUF, M_WAITOK | M_ZERO);
2899	sm->ssdi.ssd_magic = SR_MAGIC;
2900	sm->ssdi.ssd_version = SR_META_VERSION;
2901	sm->ssd_ondisk = 0;
2902	sm->ssdi.ssd_vol_flags = 0;
2903	memcpy(&sm->ssdi.ssd_uuid, &uuid, sizeof(struct sr_uuid));
2904	sm->ssdi.ssd_chunk_no = 1;
2905	sm->ssdi.ssd_volid = SR_HOTSPARE_VOLID;
2906	sm->ssdi.ssd_level = SR_HOTSPARE_LEVEL;
2907	sm->ssdi.ssd_size = size;
2908	sm->ssdi.ssd_secsize = label.d_secsize;
2909	strlcpy(sm->ssdi.ssd_vendor, "OPENBSD", sizeof(sm->ssdi.ssd_vendor));
2910	snprintf(sm->ssdi.ssd_product, sizeof(sm->ssdi.ssd_product),
2911	    "SR %s", "HOTSPARE");
2912	snprintf(sm->ssdi.ssd_revision, sizeof(sm->ssdi.ssd_revision),
2913	    "%03d", SR_META_VERSION);
2914
2915	sd = malloc(sizeof(struct sr_discipline), M_DEVBUF, M_WAITOK | M_ZERO);
2916	sd->sd_sc = sc;
2917	sd->sd_meta = sm;
2918	sd->sd_meta_type = SR_META_F_NATIVE;
2919	sd->sd_vol_status = BIOC_SVONLINE;
2920	strlcpy(sd->sd_name, "HOTSPARE", sizeof(sd->sd_name));
2921	SLIST_INIT(&sd->sd_meta_opt);
2922
2923	/* Add chunk to volume. */
2924	sd->sd_vol.sv_chunks = malloc(sizeof(struct sr_chunk *), M_DEVBUF,
2925	    M_WAITOK | M_ZERO);
2926	sd->sd_vol.sv_chunks[0] = hotspare;
2927	SLIST_INIT(&sd->sd_vol.sv_chunk_list);
2928	SLIST_INSERT_HEAD(&sd->sd_vol.sv_chunk_list, hotspare, src_link);
2929
2930	/* Save metadata. */
2931	if (sr_meta_save(sd, SR_META_DIRTY)) {
2932		sr_error(sc, "could not save metadata to %s", devname);
2933		goto fail;
2934	}
2935
2936	/*
2937	 * Add chunk to hotspare list.
2938	 */
2939	rw_enter_write(&sc->sc_hs_lock);
2940	cl = &sc->sc_hotspare_list;
2941	if (SLIST_EMPTY(cl))
2942		SLIST_INSERT_HEAD(cl, hotspare, src_link);
2943	else {
2944		SLIST_FOREACH(chunk, cl, src_link)
2945			last = chunk;
2946		SLIST_INSERT_AFTER(last, hotspare, src_link);
2947	}
2948	sc->sc_hotspare_no++;
2949	rw_exit_write(&sc->sc_hs_lock);
2950
2951	rv = 0;
2952	goto done;
2953
2954fail:
2955	free(hotspare, M_DEVBUF, sizeof(*hotspare));
2956
2957done:
2958	if (sd)
2959		free(sd->sd_vol.sv_chunks, M_DEVBUF,
2960		    sizeof(sd->sd_vol.sv_chunks));
2961	free(sd, M_DEVBUF, sizeof(*sd));
2962	free(sm, M_DEVBUF, sizeof(*sm));
2963	if (open) {
2964		VOP_CLOSE(vn, FREAD | FWRITE, NOCRED, curproc);
2965		vput(vn);
2966	}
2967
2968	return (rv);
2969}
2970
/*
 * Callback wrapper with a void pointer argument: xsd is the discipline to
 * hand to sr_hotspare_rebuild().
 */
void
sr_hotspare_rebuild_callback(void *xsd)
{
	sr_hotspare_rebuild(xsd);
}
2977
2978void
2979sr_hotspare_rebuild(struct sr_discipline *sd)
2980{
2981	struct sr_softc		*sc = sd->sd_sc;
2982	struct sr_chunk_head	*cl;
2983	struct sr_chunk		*hotspare, *chunk = NULL;
2984	struct sr_workunit	*wu;
2985	struct sr_ccb		*ccb;
2986	int			i, s, cid, busy;
2987
2988	/*
2989	 * Attempt to locate a hotspare and initiate rebuild.
2990	 */
2991
2992	/* Find first offline chunk. */
2993	for (cid = 0; cid < sd->sd_meta->ssdi.ssd_chunk_no; cid++) {
2994		if (sd->sd_vol.sv_chunks[cid]->src_meta.scm_status ==
2995		    BIOC_SDOFFLINE) {
2996			chunk = sd->sd_vol.sv_chunks[cid];
2997			break;
2998		}
2999	}
3000	if (chunk == NULL) {
3001		printf("%s: no offline chunk found on %s!\n",
3002		    DEVNAME(sc), sd->sd_meta->ssd_devname);
3003		return;
3004	}
3005
3006	/* See if we have a suitable hotspare... */
3007	rw_enter_write(&sc->sc_hs_lock);
3008	cl = &sc->sc_hotspare_list;
3009	SLIST_FOREACH(hotspare, cl, src_link)
3010		if (hotspare->src_size >= chunk->src_size &&
3011		    hotspare->src_secsize <= sd->sd_meta->ssdi.ssd_secsize)
3012			break;
3013
3014	if (hotspare != NULL) {
3015
3016		printf("%s: %s volume degraded, will attempt to "
3017		    "rebuild on hotspare %s\n", DEVNAME(sc),
3018		    sd->sd_meta->ssd_devname, hotspare->src_devname);
3019
3020		/*
3021		 * Ensure that all pending I/O completes on the failed chunk
3022		 * before trying to initiate a rebuild.
3023		 */
3024		i = 0;
3025		do {
3026			busy = 0;
3027
3028			s = splbio();
3029			TAILQ_FOREACH(wu, &sd->sd_wu_pendq, swu_link) {
3030				TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
3031					if (ccb->ccb_target == cid)
3032						busy = 1;
3033				}
3034			}
3035			TAILQ_FOREACH(wu, &sd->sd_wu_defq, swu_link) {
3036				TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
3037					if (ccb->ccb_target == cid)
3038						busy = 1;
3039				}
3040			}
3041			splx(s);
3042
3043			if (busy) {
3044				tsleep_nsec(sd, PRIBIO, "sr_hotspare",
3045				    SEC_TO_NSEC(1));
3046				i++;
3047			}
3048
3049		} while (busy && i < 120);
3050
3051		DNPRINTF(SR_D_META, "%s: waited %i seconds for I/O to "
3052		    "complete on failed chunk %s\n", DEVNAME(sc),
3053		    i, chunk->src_devname);
3054
3055		if (busy) {
3056			printf("%s: pending I/O failed to complete on "
3057			    "failed chunk %s, hotspare rebuild aborted...\n",
3058			    DEVNAME(sc), chunk->src_devname);
3059			goto done;
3060		}
3061
3062		s = splbio();
3063		rw_enter_write(&sc->sc_lock);
3064		bio_status_init(&sc->sc_status, &sc->sc_dev);
3065		if (sr_rebuild_init(sd, hotspare->src_dev_mm, 1) == 0) {
3066
3067			/* Remove hotspare from available list. */
3068			sc->sc_hotspare_no--;
3069			SLIST_REMOVE(cl, hotspare, sr_chunk, src_link);
3070			free(hotspare, M_DEVBUF, sizeof(*hotspare));
3071
3072		}
3073		rw_exit_write(&sc->sc_lock);
3074		splx(s);
3075	}
3076done:
3077	rw_exit_write(&sc->sc_hs_lock);
3078}
3079
/*
 * Start rebuilding a degraded volume onto device dev.
 *
 * sd is the volume's discipline.  hotspare is non-zero when dev comes from
 * the hotspare list; in that case a device whose current status is
 * BIOC_SDHOTSPARE is accepted as the rebuild target.
 *
 * The discipline must advertise SR_CAP_REBUILD, the volume must be degraded
 * and must have both an offline chunk (the one that gets replaced) and an
 * online chunk (which supplies the coerced size).  The device must be a
 * partition of type FS_RAID, at least as large as the coerced size once the
 * metadata area is subtracted, and its sector size must not exceed the
 * volume's.  On success the offline chunk is rewired to the new device, its
 * metadata is rebuilt, it is put into BIOC_SDREBUILD state, the metadata is
 * saved and the rebuild thread is scheduled with kthread_create_deferred().
 *
 * Returns 0 on success and EINVAL on any failure.
 */
int
sr_rebuild_init(struct sr_discipline *sd, dev_t dev, int hotspare)
{
	struct sr_softc		*sc = sd->sd_sc;
	struct sr_chunk		*chunk = NULL;
	struct sr_meta_chunk	*meta;
	struct disklabel	label;
	struct vnode		*vn;
	u_int64_t		size;
	int64_t			csize;
	char			devname[32];
	int			rv = EINVAL, open = 0;
	int			cid, i, part, status;

	/*
	 * Attempt to initiate a rebuild onto the specified device.
	 */

	if (!(sd->sd_capabilities & SR_CAP_REBUILD)) {
		sr_error(sc, "discipline does not support rebuild");
		goto done;
	}

	/* make sure volume is in the right state */
	if (sd->sd_vol_status == BIOC_SVREBUILD) {
		sr_error(sc, "rebuild already in progress");
		goto done;
	}
	if (sd->sd_vol_status != BIOC_SVDEGRADED) {
		sr_error(sc, "volume not degraded");
		goto done;
	}

	/* Find first offline chunk. */
	for (cid = 0; cid < sd->sd_meta->ssdi.ssd_chunk_no; cid++) {
		if (sd->sd_vol.sv_chunks[cid]->src_meta.scm_status ==
		    BIOC_SDOFFLINE) {
			chunk = sd->sd_vol.sv_chunks[cid];
			break;
		}
	}
	if (chunk == NULL) {
		sr_error(sc, "no offline chunks available to rebuild");
		goto done;
	}

	/* Get coerced size from another online chunk. */
	csize = 0;
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
		    BIOC_SDONLINE) {
			meta = &sd->sd_vol.sv_chunks[i]->src_meta;
			csize = meta->scmi.scm_coerced_size;
			break;
		}
	}
	if (csize == 0) {
		sr_error(sc, "no online chunks available for rebuild");
		goto done;
	}

	sr_meta_getdevname(sc, dev, devname, sizeof(devname));
	if (bdevvp(dev, &vn)) {
		printf("%s: sr_rebuild_init: can't allocate vnode\n",
		    DEVNAME(sc));
		goto done;
	}
	if (VOP_OPEN(vn, FREAD | FWRITE, NOCRED, curproc)) {
		DNPRINTF(SR_D_META,"%s: sr_ioctl_setstate can't "
		    "open %s\n", DEVNAME(sc), devname);
		vput(vn);
		goto done;
	}
	open = 1; /* close dev on error */

	/* Get disklabel and check partition. */
	part = DISKPART(dev);
	if (VOP_IOCTL(vn, DIOCGDINFO, (caddr_t)&label, FREAD,
	    NOCRED, curproc)) {
		DNPRINTF(SR_D_META, "%s: sr_ioctl_setstate ioctl failed\n",
		    DEVNAME(sc));
		goto done;
	}
	if (label.d_partitions[part].p_fstype != FS_RAID) {
		sr_error(sc, "%s partition not of type RAID (%d)",
		    devname, label.d_partitions[part].p_fstype);
		goto done;
	}

	/* Is the partition large enough? */
	size = DL_SECTOBLK(&label, DL_GETPSIZE(&label.d_partitions[part]));
	if (size <= sd->sd_meta->ssd_data_blkno) {
		sr_error(sc, "%s: %s partition too small", DEVNAME(sc),
		    devname);
		goto done;
	}
	/* From here on size is the space left after the metadata area. */
	size -= sd->sd_meta->ssd_data_blkno;
	if (size > INT64_MAX) {
		sr_error(sc, "%s: %s partition too large", DEVNAME(sc),
		    devname);
		goto done;
	}
	if (size < csize) {
		sr_error(sc, "%s partition too small, at least %lld bytes "
		    "required", devname, (long long)(csize << DEV_BSHIFT));
		goto done;
	} else if (size > csize)
		sr_warn(sc, "%s partition too large, wasting %lld bytes",
		    devname, (long long)((size - csize) << DEV_BSHIFT));
	if (label.d_secsize > sd->sd_meta->ssdi.ssd_secsize) {
		sr_error(sc, "%s sector size too large, <= %u bytes "
		    "required", devname, sd->sd_meta->ssdi.ssd_secsize);
		goto done;
	}

	/* Ensure that this chunk is not already in use. */
	status = sr_chunk_in_use(sc, dev);
	if (status != BIOC_SDINVALID && status != BIOC_SDOFFLINE &&
	    !(hotspare && status == BIOC_SDHOTSPARE)) {
		sr_error(sc, "%s is already in use", devname);
		goto done;
	}

	/* Reset rebuild counter since we are rebuilding onto a new chunk. */
	sd->sd_meta->ssd_rebuild = 0;

	open = 0; /* leave dev open from here on out */

	/* Fix up chunk. */
	memcpy(chunk->src_duid, label.d_uid, sizeof(chunk->src_duid));
	chunk->src_dev_mm = dev;
	chunk->src_vn = vn;

	/* Reconstruct metadata. */
	meta = &chunk->src_meta;
	meta->scmi.scm_volid = sd->sd_meta->ssdi.ssd_volid;
	meta->scmi.scm_chunk_id = cid;
	strlcpy(meta->scmi.scm_devname, devname,
	    sizeof(meta->scmi.scm_devname));
	meta->scmi.scm_size = size;
	meta->scmi.scm_coerced_size = csize;
	memcpy(&meta->scmi.scm_uuid, &sd->sd_meta->ssdi.ssd_uuid,
	    sizeof(meta->scmi.scm_uuid));
	sr_checksum(sc, meta, &meta->scm_checksum,
	    sizeof(struct sr_meta_chunk_invariant));

	sd->sd_set_chunk_state(sd, cid, BIOC_SDREBUILD);

	/*
	 * If the metadata cannot be saved the device is closed again below.
	 * NOTE(review): the chunk has already been pointed at dev/vn and
	 * switched to BIOC_SDREBUILD at this point and none of that is
	 * undone on this path -- confirm whether that is intended.
	 */
	if (sr_meta_save(sd, SR_META_DIRTY)) {
		sr_error(sc, "could not save metadata to %s", devname);
		open = 1;
		goto done;
	}

	sr_warn(sc, "rebuild of %s started on %s",
	    sd->sd_meta->ssd_devname, devname);

	sd->sd_reb_abort = 0;
	kthread_create_deferred(sr_rebuild_start, sd);

	rv = 0;
done:
	if (open) {
		VOP_CLOSE(vn, FREAD | FWRITE, NOCRED, curproc);
		vput(vn);
	}

	return (rv);
}
3249
3250int
3251sr_rebuild_percent(struct sr_discipline *sd)
3252{
3253	daddr_t			rb, sz;
3254
3255	sz = sd->sd_meta->ssdi.ssd_size;
3256	rb = sd->sd_meta->ssd_rebuild;
3257
3258	if (rb > 0)
3259		return (100 - ((sz * 100 - rb * 100) / sz) - 1);
3260
3261	return (0);
3262}
3263
3264void
3265sr_roam_chunks(struct sr_discipline *sd)
3266{
3267	struct sr_softc		*sc = sd->sd_sc;
3268	struct sr_chunk		*chunk;
3269	struct sr_meta_chunk	*meta;
3270	int			roamed = 0;
3271
3272	/* Have any chunks roamed? */
3273	SLIST_FOREACH(chunk, &sd->sd_vol.sv_chunk_list, src_link) {
3274		meta = &chunk->src_meta;
3275		if (strncmp(meta->scmi.scm_devname, chunk->src_devname,
3276		    sizeof(meta->scmi.scm_devname))) {
3277
3278			printf("%s: roaming device %s -> %s\n", DEVNAME(sc),
3279			    meta->scmi.scm_devname, chunk->src_devname);
3280
3281			strlcpy(meta->scmi.scm_devname, chunk->src_devname,
3282			    sizeof(meta->scmi.scm_devname));
3283
3284			roamed++;
3285		}
3286	}
3287
3288	if (roamed)
3289		sr_meta_save(sd, SR_META_DIRTY);
3290}
3291
/*
 * Create a new softraid volume, or assemble an existing one, from the
 * devices listed in the request.
 *
 * sc	softraid controller the volume is attached to.
 * bc	request: device list and its length, RAID level and flags.
 * user	non-zero when bc->bc_dev_list must be fetched with copyin();
 *	zero when it can be copied directly.
 * data	handed unchanged to the discipline's assemble hook.
 *
 * When sr_meta_read() finds no metadata the volume is created; when it finds
 * metadata the volume is assembled from it.  Returns 0 on success or an
 * error value.  rv starts out as EINVAL, becomes ENXIO once the system-disk
 * checks have passed, and an EAGAIN result on the unwind path is reported
 * as 0.  On failure the partially built discipline is torn down with
 * sr_discipline_shutdown().
 */
int
sr_ioctl_createraid(struct sr_softc *sc, struct bioc_createraid *bc,
    int user, void *data)
{
	struct sr_meta_opt_item *omi;
	struct sr_chunk_head	*cl;
	struct sr_discipline	*sd = NULL;
	struct sr_chunk		*ch_entry;
	struct scsi_link	*link;
	struct device		*dev;
	char			*uuid, devname[32];
	dev_t			*dt = NULL;
	int			i, no_chunk, rv = EINVAL, target, vol;
	int			no_meta;

	DNPRINTF(SR_D_IOCTL, "%s: sr_ioctl_createraid(%d)\n",
	    DEVNAME(sc), user);

	/* user input */
	if (bc->bc_dev_list_len > BIOC_CRMAXLEN)
		goto unwind;

	/* Take a private copy of the device list. */
	dt = malloc(bc->bc_dev_list_len, M_DEVBUF, M_WAITOK | M_ZERO);
	if (user) {
		if (copyin(bc->bc_dev_list, dt, bc->bc_dev_list_len) != 0)
			goto unwind;
	} else
		memcpy(dt, bc->bc_dev_list, bc->bc_dev_list_len);

	/* Initialise discipline. */
	sd = malloc(sizeof(struct sr_discipline), M_DEVBUF, M_WAITOK | M_ZERO);
	sd->sd_sc = sc;
	SLIST_INIT(&sd->sd_meta_opt);
	sd->sd_taskq = taskq_create("srdis", 1, IPL_BIO, 0);
	if (sd->sd_taskq == NULL) {
		sr_error(sc, "could not create discipline taskq");
		goto unwind;
	}
	if (sr_discipline_init(sd, bc->bc_level)) {
		sr_error(sc, "could not initialize discipline");
		goto unwind;
	}

	/* The list length is in bytes; convert it to a chunk count. */
	no_chunk = bc->bc_dev_list_len / sizeof(dev_t);
	cl = &sd->sd_vol.sv_chunk_list;
	SLIST_INIT(cl);

	/* Ensure that chunks are not already in use. */
	for (i = 0; i < no_chunk; i++) {
		if (sr_chunk_in_use(sc, dt[i]) != BIOC_SDINVALID) {
			sr_meta_getdevname(sc, dt[i], devname, sizeof(devname));
			sr_error(sc, "chunk %s already in use", devname);
			goto unwind;
		}
	}

	sd->sd_meta_type = sr_meta_probe(sd, dt, no_chunk);
	if (sd->sd_meta_type == SR_META_F_INVALID) {
		sr_error(sc, "invalid metadata format");
		goto unwind;
	}

	if (sr_meta_attach(sd, no_chunk, bc->bc_flags & BIOC_SCFORCE))
		goto unwind;

	/* force the raid volume by clearing metadata region */
	if (bc->bc_flags & BIOC_SCFORCE) {
		/* make sure disk isn't up and running */
		if (sr_meta_read(sd))
			if (sr_already_assembled(sd)) {
				uuid = sr_uuid_format(
				    &sd->sd_meta->ssdi.ssd_uuid);
				sr_error(sc, "disk %s is currently in use; "
				    "cannot force create", uuid);
				free(uuid, M_DEVBUF, 37);
				goto unwind;
			}

		if (sr_meta_clear(sd)) {
			sr_error(sc, "failed to clear metadata");
			goto unwind;
		}
	}

	/*
	 * no_meta selects the path below: -1 is treated as corrupt metadata,
	 * 0 as "no metadata" (create), anything else as "assemble".
	 */
	no_meta = sr_meta_read(sd);
	if (no_meta == -1) {

		/* Corrupt metadata on one or more chunks. */
		sr_error(sc, "one of the chunks has corrupt metadata; "
		    "aborting assembly");
		goto unwind;

	} else if (no_meta == 0) {

		/* Initialise volume and chunk metadata. */
		sr_meta_init(sd, bc->bc_level, no_chunk);
		sd->sd_vol_status = BIOC_SVONLINE;
		sd->sd_meta_flags = bc->bc_flags & BIOC_SCNOAUTOASSEMBLE;
		if (sd->sd_create) {
			/* A non-zero hook result becomes the return value. */
			if ((i = sd->sd_create(sd, bc, no_chunk,
			    sd->sd_vol.sv_chunk_minsz))) {
				rv = i;
				goto unwind;
			}
		}
		sr_meta_init_complete(sd);

		DNPRINTF(SR_D_IOCTL,
		    "%s: sr_ioctl_createraid: vol_size: %lld\n",
		    DEVNAME(sc), sd->sd_meta->ssdi.ssd_size);

		/* Warn if we've wasted chunk space due to coercing. */
		if ((sd->sd_capabilities & SR_CAP_NON_COERCED) == 0 &&
		    sd->sd_vol.sv_chunk_minsz != sd->sd_vol.sv_chunk_maxsz)
			sr_warn(sc, "chunk sizes are not equal; up to %llu "
			    "blocks wasted per chunk",
			    sd->sd_vol.sv_chunk_maxsz -
			    sd->sd_vol.sv_chunk_minsz);

	} else {

		/*
		 * Ensure we are assembling the correct # of chunks.  Level
		 * 0x1C is allowed to come up with fewer chunks than the
		 * metadata records; every other case must match exactly.
		 */
		if (bc->bc_level == 0x1C &&
		    sd->sd_meta->ssdi.ssd_chunk_no > no_chunk) {
			sr_warn(sc, "trying to bring up %s degraded",
			    sd->sd_meta->ssd_devname);
		} else if (sd->sd_meta->ssdi.ssd_chunk_no != no_chunk) {
			sr_error(sc, "volume chunk count does not match metadata "
			    "chunk count");
			goto unwind;
		}

		/* Ensure metadata level matches requested assembly level. */
		if (sd->sd_meta->ssdi.ssd_level != bc->bc_level) {
			sr_error(sc, "volume level does not match metadata "
			    "level");
			goto unwind;
		}

		if (sr_already_assembled(sd)) {
			uuid = sr_uuid_format(&sd->sd_meta->ssdi.ssd_uuid);
			sr_error(sc, "disk %s already assembled", uuid);
			free(uuid, M_DEVBUF, 37);
			goto unwind;
		}

		/* Non-user requests skip volumes flagged "no auto assemble". */
		if (user == 0 && sd->sd_meta_flags & BIOC_SCNOAUTOASSEMBLE) {
			DNPRINTF(SR_D_META, "%s: disk not auto assembled from "
			    "metadata\n", DEVNAME(sc));
			goto unwind;
		}

		if (no_meta != no_chunk)
			sr_warn(sc, "trying to bring up %s degraded",
			    sd->sd_meta->ssd_devname);

		if (sd->sd_meta->ssd_meta_flags & SR_META_DIRTY)
			sr_warn(sc, "%s was not shutdown properly",
			    sd->sd_meta->ssd_devname);

		/*
		 * Offer each optional metadata item to the discipline's
		 * handler; fall back to the generic handler when there is
		 * none or it returns non-zero.
		 */
		SLIST_FOREACH(omi, &sd->sd_meta_opt, omi_link)
			if (sd->sd_meta_opt_handler == NULL ||
			    sd->sd_meta_opt_handler(sd, omi->omi_som) != 0)
				sr_meta_opt_handler(sd, omi->omi_som);

		if (sd->sd_assemble) {
			if ((i = sd->sd_assemble(sd, bc, no_chunk, data))) {
				rv = i;
				goto unwind;
			}
		}

		DNPRINTF(SR_D_META, "%s: disk assembled from metadata\n",
		    DEVNAME(sc));

	}

	/* Metadata MUST be fully populated by this point. */
	TAILQ_INSERT_TAIL(&sc->sc_dis_list, sd, sd_link);

	/* Allocate all resources. */
	if ((rv = sd->sd_alloc_resources(sd)))
		goto unwind;

	/* Adjust flags if necessary. */
	if ((sd->sd_capabilities & SR_CAP_AUTO_ASSEMBLE) &&
	    (bc->bc_flags & BIOC_SCNOAUTOASSEMBLE) !=
	    (sd->sd_meta->ssdi.ssd_vol_flags & BIOC_SCNOAUTOASSEMBLE)) {
		sd->sd_meta->ssdi.ssd_vol_flags &= ~BIOC_SCNOAUTOASSEMBLE;
		sd->sd_meta->ssdi.ssd_vol_flags |=
		    bc->bc_flags & BIOC_SCNOAUTOASSEMBLE;
	}

	if (sd->sd_capabilities & SR_CAP_SYSTEM_DISK) {
		/* Initialise volume state. */
		sd->sd_set_vol_state(sd);
		if (sd->sd_vol_status == BIOC_SVOFFLINE) {
			sr_error(sc, "%s is offline, will not be brought "
			    "online", sd->sd_meta->ssd_devname);
			goto unwind;
		}

		/* Setup SCSI iopool. */
		scsi_iopool_init(&sd->sd_iopool, sd, sr_wu_get, sr_wu_put);

		/*
		 * All checks passed - return ENXIO if volume cannot be created.
		 */
		rv = ENXIO;

		/*
		 * Find a free target.
		 *
		 * XXX: We reserve sd_target == 0 to indicate the
		 * discipline is not linked into sc->sc_targets, so begin
		 * the search with target = 1.
		 */
		for (target = 1; target < SR_MAX_LD; target++)
			if (sc->sc_targets[target] == NULL)
				break;
		if (target == SR_MAX_LD) {
			sr_error(sc, "no free target for %s",
			    sd->sd_meta->ssd_devname);
			goto unwind;
		}

		/* Clear sense data. */
		bzero(&sd->sd_scsi_sense, sizeof(sd->sd_scsi_sense));

		/* Attach discipline and get midlayer to probe it. */
		sd->sd_target = target;
		sc->sc_targets[target] = sd;
		if (scsi_probe_lun(sc->sc_scsibus, target, 0) != 0) {
			sr_error(sc, "scsi_probe_lun failed");
			/* Undo the target claim before unwinding. */
			sc->sc_targets[target] = NULL;
			sd->sd_target = 0;
			goto unwind;
		}

		link = scsi_get_link(sc->sc_scsibus, target, 0);
		if (link == NULL)
			goto unwind;

		dev = link->device_softc;
		DNPRINTF(SR_D_IOCTL, "%s: sr device added: %s at target %d\n",
		    DEVNAME(sc), dev->dv_xname, sd->sd_target);

		/* XXX - Count volumes, not targets. */
		for (i = 0, vol = -1; i <= sd->sd_target; i++)
			if (sc->sc_targets[i])
				vol++;

		rv = 0;

		/* A recorded name that differs from the new one means roaming. */
		if (sd->sd_meta->ssd_devname[0] != '\0' &&
		    strncmp(sd->sd_meta->ssd_devname, dev->dv_xname,
		    sizeof(dev->dv_xname)))
			sr_warn(sc, "volume %s is roaming, it used to be %s, "
			    "updating metadata", dev->dv_xname,
			    sd->sd_meta->ssd_devname);

		/* Populate remaining volume metadata. */
		sd->sd_meta->ssdi.ssd_volid = vol;
		strlcpy(sd->sd_meta->ssd_devname, dev->dv_xname,
		    sizeof(sd->sd_meta->ssd_devname));

		sr_info(sc, "%s volume attached as %s",
		    sd->sd_name, sd->sd_meta->ssd_devname);

		/* Update device name on any roaming chunks. */
		sr_roam_chunks(sd);

#ifndef SMALL_KERNEL
		if (sr_sensors_create(sd))
			sr_warn(sc, "unable to create sensor for %s",
			    dev->dv_xname);
#endif /* SMALL_KERNEL */
	} else {
		/* This volume does not attach as a system disk. */
		ch_entry = SLIST_FIRST(cl); /* XXX */
		strlcpy(sd->sd_meta->ssd_devname, ch_entry->src_devname,
		    sizeof(sd->sd_meta->ssd_devname));

		if (sd->sd_start_discipline(sd))
			goto unwind;
	}

	/* Save current metadata to disk. */
	rv = sr_meta_save(sd, SR_META_DIRTY);

	/* Kick off the rebuild thread if the volume came up rebuilding. */
	if (sd->sd_vol_status == BIOC_SVREBUILD)
		kthread_create_deferred(sr_rebuild_start, sd);

	sd->sd_ready = 1;

	free(dt, M_DEVBUF, bc->bc_dev_list_len);

	return (rv);

unwind:
	/* dt and sd may still be NULL when an early check failed. */
	free(dt, M_DEVBUF, bc->bc_dev_list_len);

	sr_discipline_shutdown(sd, 0, 0);

	/* EAGAIN from a discipline hook is reported to the caller as 0. */
	if (rv == EAGAIN)
		rv = 0;

	return (rv);
}
3601
3602int
3603sr_ioctl_deleteraid(struct sr_softc *sc, struct sr_discipline *sd,
3604    struct bioc_deleteraid *bd)
3605{
3606	int			rv = 1;
3607
3608	DNPRINTF(SR_D_IOCTL, "%s: sr_ioctl_deleteraid %s\n",
3609	    DEVNAME(sc), bd->bd_dev);
3610
3611	if (sd == NULL && (sd = sr_find_discipline(sc, bd->bd_dev)) == NULL) {
3612		sr_error(sc, "volume %s not found", bd->bd_dev);
3613		goto bad;
3614	}
3615
3616	/*
3617	 * XXX Better check for mounted file systems and refuse to detach any
3618	 * volume that is actively in use.
3619	 */
3620	if (bcmp(&sr_bootuuid, &sd->sd_meta->ssdi.ssd_uuid,
3621	    sizeof(sr_bootuuid)) == 0) {
3622		sr_error(sc, "refusing to delete boot volume");
3623		goto bad;
3624	}
3625
3626	sd->sd_deleted = 1;
3627	sd->sd_meta->ssdi.ssd_vol_flags = BIOC_SCNOAUTOASSEMBLE;
3628	sr_discipline_shutdown(sd, 1, 0);
3629
3630	rv = 0;
3631bad:
3632	return (rv);
3633}
3634
3635int
3636sr_ioctl_discipline(struct sr_softc *sc, struct sr_discipline *sd,
3637    struct bioc_discipline *bd)
3638{
3639	int			rv = 1;
3640
3641	/* Dispatch a discipline specific ioctl. */
3642
3643	DNPRINTF(SR_D_IOCTL, "%s: sr_ioctl_discipline %s\n", DEVNAME(sc),
3644	    bd->bd_dev);
3645
3646	if (sd == NULL && (sd = sr_find_discipline(sc, bd->bd_dev)) == NULL) {
3647		sr_error(sc, "volume %s not found", bd->bd_dev);
3648		goto bad;
3649	}
3650
3651	if (sd->sd_ioctl_handler)
3652		rv = sd->sd_ioctl_handler(sd, bd);
3653
3654bad:
3655	return (rv);
3656}
3657
/*
 * Install a boot block and boot loader on a softraid volume.
 *
 * sc	softraid controller.
 * sd	volume to operate on, or NULL to look it up by bb->bb_dev.
 * bb	request: volume name plus user pointers and sizes of the boot block
 *	and boot loader images.
 *
 * The images are copied in from userland, padded to a multiple of the
 * volume's sector size, and written to every chunk that is online or
 * rebuilding.  A boot optional-metadata item is created or updated with the
 * volume's DUID and the chunks' DUIDs, the volume is flagged bootable and
 * the metadata is saved.
 *
 * Returns 0 on success and EINVAL on any failure.
 */
int
sr_ioctl_installboot(struct sr_softc *sc, struct sr_discipline *sd,
    struct bioc_installboot *bb)
{
	void			*bootblk = NULL, *bootldr = NULL;
	struct sr_chunk		*chunk;
	struct sr_meta_opt_item *omi;
	struct sr_meta_boot	*sbm;
	struct disk		*dk;
	u_int32_t		bbs = 0, bls = 0, secsize;
	u_char			duid[8];
	int			rv = EINVAL;
	int			i;

	DNPRINTF(SR_D_IOCTL, "%s: sr_ioctl_installboot %s\n", DEVNAME(sc),
	    bb->bb_dev);

	if (sd == NULL && (sd = sr_find_discipline(sc, bb->bb_dev)) == NULL) {
		sr_error(sc, "volume %s not found", bb->bb_dev);
		goto done;
	}

	/* Locate the disk with the volume's name to obtain its DUID. */
	TAILQ_FOREACH(dk, &disklist,  dk_link)
		if (!strncmp(dk->dk_name, bb->bb_dev, sizeof(bb->bb_dev)))
			break;
	if (dk == NULL || dk->dk_label == NULL ||
	    duid_iszero(dk->dk_label->d_uid)) {
		sr_error(sc, "failed to get DUID for softraid volume");
		goto done;
	}
	memcpy(duid, dk->dk_label->d_uid, sizeof(duid));

	/* Ensure that boot storage area is large enough. */
	if (sd->sd_meta->ssd_data_blkno < (SR_BOOT_OFFSET + SR_BOOT_SIZE)) {
		sr_error(sc, "insufficient boot storage");
		goto done;
	}

	if (bb->bb_bootblk_size > SR_BOOT_BLOCKS_SIZE * DEV_BSIZE) {
		sr_error(sc, "boot block too large (%d > %d)",
		    bb->bb_bootblk_size, SR_BOOT_BLOCKS_SIZE * DEV_BSIZE);
		goto done;
	}

	if (bb->bb_bootldr_size > SR_BOOT_LOADER_SIZE * DEV_BSIZE) {
		sr_error(sc, "boot loader too large (%d > %d)",
		    bb->bb_bootldr_size, SR_BOOT_LOADER_SIZE * DEV_BSIZE);
		goto done;
	}

	secsize = sd->sd_meta->ssdi.ssd_secsize;

	/*
	 * Copy in boot block.  The buffer is rounded up to whole sectors and
	 * zero-filled, so the tail beyond the image is zero padding.
	 */
	bbs = howmany(bb->bb_bootblk_size, secsize) * secsize;
	bootblk = malloc(bbs, M_DEVBUF, M_WAITOK | M_ZERO);
	if (copyin(bb->bb_bootblk, bootblk, bb->bb_bootblk_size) != 0)
		goto done;

	/* Copy in boot loader (same rounding and padding as above). */
	bls = howmany(bb->bb_bootldr_size, secsize) * secsize;
	bootldr = malloc(bls, M_DEVBUF, M_WAITOK | M_ZERO);
	if (copyin(bb->bb_bootldr, bootldr, bb->bb_bootldr_size) != 0)
		goto done;

	/* Create or update optional meta for bootable volumes. */
	SLIST_FOREACH(omi, &sd->sd_meta_opt, omi_link)
		if (omi->omi_som->som_type == SR_OPT_BOOT)
			break;
	if (omi == NULL) {
		/* No boot item yet: allocate one and link it in. */
		omi = malloc(sizeof(struct sr_meta_opt_item), M_DEVBUF,
		    M_WAITOK | M_ZERO);
		omi->omi_som = malloc(sizeof(struct sr_meta_boot), M_DEVBUF,
		    M_WAITOK | M_ZERO);
		omi->omi_som->som_type = SR_OPT_BOOT;
		omi->omi_som->som_length = sizeof(struct sr_meta_boot);
		SLIST_INSERT_HEAD(&sd->sd_meta_opt, omi, omi_link);
		sd->sd_meta->ssdi.ssd_opt_no++;
	}
	sbm = (struct sr_meta_boot *)omi->omi_som;

	/* Record the padded sizes, not the sizes supplied by the caller. */
	memcpy(sbm->sbm_root_duid, duid, sizeof(sbm->sbm_root_duid));
	bzero(&sbm->sbm_boot_duid, sizeof(sbm->sbm_boot_duid));
	sbm->sbm_bootblk_size = bbs;
	sbm->sbm_bootldr_size = bls;

	DNPRINTF(SR_D_IOCTL, "sr_ioctl_installboot: root duid is %s\n",
	    duid_format(sbm->sbm_root_duid));

	/* Save boot block and boot loader to each chunk. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {

		/* Chunks that are neither online nor rebuilding are skipped. */
		chunk = sd->sd_vol.sv_chunks[i];
		if (chunk->src_meta.scm_status != BIOC_SDONLINE &&
		    chunk->src_meta.scm_status != BIOC_SDREBUILD)
			continue;

		/* DUIDs are recorded only for chunk indexes below the limit. */
		if (i < SR_MAX_BOOT_DISKS)
			memcpy(&sbm->sbm_boot_duid[i], chunk->src_duid,
			    sizeof(sbm->sbm_boot_duid[i]));

		/* Save boot blocks. */
		DNPRINTF(SR_D_IOCTL,
		    "sr_ioctl_installboot: saving boot block to %s "
		    "(%u bytes)\n", chunk->src_devname, bbs);

		if (sr_rw(sc, chunk->src_dev_mm, bootblk, bbs,
		    SR_BOOT_BLOCKS_OFFSET, B_WRITE)) {
			sr_error(sc, "failed to write boot block");
			goto done;
		}

		/* Save boot loader.*/
		DNPRINTF(SR_D_IOCTL,
		    "sr_ioctl_installboot: saving boot loader to %s "
		    "(%u bytes)\n", chunk->src_devname, bls);

		if (sr_rw(sc, chunk->src_dev_mm, bootldr, bls,
		    SR_BOOT_LOADER_OFFSET, B_WRITE)) {
			sr_error(sc, "failed to write boot loader");
			goto done;
		}
	}

	/* XXX - Install boot block on disk - MD code. */

	/* Mark volume as bootable and save metadata. */
	sd->sd_meta->ssdi.ssd_vol_flags |= BIOC_SCBOOTABLE;
	if (sr_meta_save(sd, SR_META_DIRTY)) {
		sr_error(sc, "could not save metadata to %s", DEVNAME(sc));
		goto done;
	}

	rv = 0;

done:
	/* Either buffer may still be NULL (with size 0) on early failure. */
	free(bootblk, M_DEVBUF, bbs);
	free(bootldr, M_DEVBUF, bls);

	return (rv);
}
3798
3799void
3800sr_chunks_unwind(struct sr_softc *sc, struct sr_chunk_head *cl)
3801{
3802	struct sr_chunk		*ch_entry, *ch_next;
3803
3804	DNPRINTF(SR_D_IOCTL, "%s: sr_chunks_unwind\n", DEVNAME(sc));
3805
3806	if (!cl)
3807		return;
3808
3809	for (ch_entry = SLIST_FIRST(cl); ch_entry != NULL; ch_entry = ch_next) {
3810		ch_next = SLIST_NEXT(ch_entry, src_link);
3811
3812		DNPRINTF(SR_D_IOCTL, "%s: sr_chunks_unwind closing: %s\n",
3813		    DEVNAME(sc), ch_entry->src_devname);
3814		if (ch_entry->src_vn) {
3815			/*
3816			 * XXX - explicitly lock the vnode until we can resolve
3817			 * the problem introduced by vnode aliasing... specfs
3818			 * has no locking, whereas ufs/ffs does!
3819			 */
3820			vn_lock(ch_entry->src_vn, LK_EXCLUSIVE | LK_RETRY);
3821			VOP_CLOSE(ch_entry->src_vn, FREAD | FWRITE, NOCRED,
3822			    curproc);
3823			vput(ch_entry->src_vn);
3824		}
3825		free(ch_entry, M_DEVBUF, sizeof(*ch_entry));
3826	}
3827	SLIST_INIT(cl);
3828}
3829
3830void
3831sr_discipline_free(struct sr_discipline *sd)
3832{
3833	struct sr_softc		*sc;
3834	struct sr_discipline	*sdtmp1;
3835	struct sr_meta_opt_head *som;
3836	struct sr_meta_opt_item	*omi, *omi_next;
3837
3838	if (!sd)
3839		return;
3840
3841	sc = sd->sd_sc;
3842
3843	DNPRINTF(SR_D_DIS, "%s: sr_discipline_free %s\n",
3844	    DEVNAME(sc),
3845	    sd->sd_meta ? sd->sd_meta->ssd_devname : "nodev");
3846	if (sd->sd_free_resources)
3847		sd->sd_free_resources(sd);
3848	free(sd->sd_vol.sv_chunks, M_DEVBUF, 0);
3849	free(sd->sd_meta, M_DEVBUF, SR_META_SIZE * DEV_BSIZE);
3850	free(sd->sd_meta_foreign, M_DEVBUF, smd[sd->sd_meta_type].smd_size);
3851
3852	som = &sd->sd_meta_opt;
3853	for (omi = SLIST_FIRST(som); omi != NULL; omi = omi_next) {
3854		omi_next = SLIST_NEXT(omi, omi_link);
3855		free(omi->omi_som, M_DEVBUF, 0);
3856		free(omi, M_DEVBUF, sizeof(*omi));
3857	}
3858
3859	if (sd->sd_target != 0) {
3860		KASSERT(sc->sc_targets[sd->sd_target] == sd);
3861		sc->sc_targets[sd->sd_target] = NULL;
3862	}
3863
3864	TAILQ_FOREACH(sdtmp1, &sc->sc_dis_list, sd_link) {
3865		if (sdtmp1 == sd)
3866			break;
3867	}
3868	if (sdtmp1 != NULL)
3869		TAILQ_REMOVE(&sc->sc_dis_list, sd, sd_link);
3870
3871	explicit_bzero(sd, sizeof *sd);
3872	free(sd, M_DEVBUF, sizeof(*sd));
3873}
3874
/*
 * Quiesce a discipline and, unless dying is -1, tear it down completely.
 *
 * sd		discipline to shut down; NULL is ignored.
 * meta_save	when non-zero, sr_meta_save(sd, 0) is called before the
 *		discipline is quiesced.
 * dying	-1: only wait for pending sync/flush work, then mark the
 *		discipline ready again and return without freeing anything.
 *		Otherwise the discipline is detached and freed; a non-zero
 *		value detaches the LUN with flags 0, zero uses DETACH_FORCE.
 *
 * After a full shutdown sd has been freed and must not be used.
 */
void
sr_discipline_shutdown(struct sr_discipline *sd, int meta_save, int dying)
{
	struct sr_softc		*sc;
	int			ret, s;

	if (!sd)
		return;
	sc = sd->sd_sc;

	DNPRINTF(SR_D_DIS, "%s: sr_discipline_shutdown %s\n", DEVNAME(sc),
	    sd->sd_meta ? sd->sd_meta->ssd_devname : "nodev");

	/* If rebuilding, abort rebuild and drain I/O. */
	if (sd->sd_reb_active) {
		sd->sd_reb_abort = 1;
		/* Poll in 1ms steps until sd_reb_active is cleared. */
		while (sd->sd_reb_active)
			tsleep_nsec(sd, PWAIT, "sr_shutdown", MSEC_TO_NSEC(1));
	}

	if (meta_save)
		sr_meta_save(sd, 0);

	s = splbio();

	sd->sd_ready = 0;

	/* make sure there isn't a sync pending and yield */
	wakeup(sd);
	while (sd->sd_sync || sd->sd_must_flush) {
		ret = tsleep_nsec(&sd->sd_sync, MAXPRI, "sr_down",
		    SEC_TO_NSEC(60));
		/* Give up waiting after a 60 second timeout. */
		if (ret == EWOULDBLOCK)
			break;
	}
	if (dying == -1) {
		/* Quiesce-only request: restore readiness and keep sd. */
		sd->sd_ready = 1;
		splx(s);
		return;
	}

#ifndef SMALL_KERNEL
	sr_sensors_delete(sd);
#endif /* SMALL_KERNEL */

	/* sd_target == 0 means the discipline was never attached. */
	if (sd->sd_target != 0)
		scsi_detach_lun(sc->sc_scsibus, sd->sd_target, 0,
		    dying ? 0 : DETACH_FORCE);

	sr_chunks_unwind(sc, &sd->sd_vol.sv_chunk_list);

	if (sd->sd_taskq)
		taskq_destroy(sd->sd_taskq);

	/* sd is invalid from here on. */
	sr_discipline_free(sd);

	splx(s);
}
3933
3934int
3935sr_discipline_init(struct sr_discipline *sd, int level)
3936{
3937	int			rv = 1;
3938
3939	/* Initialise discipline function pointers with defaults. */
3940	sd->sd_alloc_resources = sr_alloc_resources;
3941	sd->sd_assemble = NULL;
3942	sd->sd_create = NULL;
3943	sd->sd_free_resources = sr_free_resources;
3944	sd->sd_ioctl_handler = NULL;
3945	sd->sd_openings = NULL;
3946	sd->sd_meta_opt_handler = NULL;
3947	sd->sd_rebuild = sr_rebuild;
3948	sd->sd_scsi_inquiry = sr_raid_inquiry;
3949	sd->sd_scsi_read_cap = sr_raid_read_cap;
3950	sd->sd_scsi_tur = sr_raid_tur;
3951	sd->sd_scsi_req_sense = sr_raid_request_sense;
3952	sd->sd_scsi_start_stop = sr_raid_start_stop;
3953	sd->sd_scsi_sync = sr_raid_sync;
3954	sd->sd_scsi_rw = NULL;
3955	sd->sd_scsi_intr = sr_raid_intr;
3956	sd->sd_scsi_wu_done = NULL;
3957	sd->sd_scsi_done = NULL;
3958	sd->sd_set_chunk_state = sr_set_chunk_state;
3959	sd->sd_set_vol_state = sr_set_vol_state;
3960	sd->sd_start_discipline = NULL;
3961
3962	task_set(&sd->sd_meta_save_task, sr_meta_save_callback, sd);
3963	task_set(&sd->sd_hotspare_rebuild_task, sr_hotspare_rebuild_callback,
3964	    sd);
3965
3966	sd->sd_wu_size = sizeof(struct sr_workunit);
3967	switch (level) {
3968	case 0:
3969		sr_raid0_discipline_init(sd);
3970		break;
3971	case 1:
3972		sr_raid1_discipline_init(sd);
3973		break;
3974	case 5:
3975		sr_raid5_discipline_init(sd);
3976		break;
3977	case 6:
3978		sr_raid6_discipline_init(sd);
3979		break;
3980#ifdef CRYPTO
3981	case 'C':
3982		sr_crypto_discipline_init(sd);
3983		break;
3984	case 0x1C:
3985		sr_raid1c_discipline_init(sd);
3986		break;
3987#endif
3988	case 'c':
3989		sr_concat_discipline_init(sd);
3990		break;
3991	default:
3992		goto bad;
3993	}
3994
3995	rv = 0;
3996bad:
3997	return (rv);
3998}
3999
4000int
4001sr_raid_inquiry(struct sr_workunit *wu)
4002{
4003	struct sr_discipline	*sd = wu->swu_dis;
4004	struct scsi_xfer	*xs = wu->swu_xs;
4005	struct scsi_inquiry	*cdb = (struct scsi_inquiry *)&xs->cmd;
4006	struct scsi_inquiry_data inq;
4007
4008	DNPRINTF(SR_D_DIS, "%s: sr_raid_inquiry\n", DEVNAME(sd->sd_sc));
4009
4010	if (xs->cmdlen != sizeof(*cdb))
4011		return (EINVAL);
4012
4013	if (ISSET(cdb->flags, SI_EVPD))
4014		return (EOPNOTSUPP);
4015
4016	bzero(&inq, sizeof(inq));
4017	inq.device = T_DIRECT;
4018	inq.dev_qual2 = 0;
4019	inq.version = SCSI_REV_2;
4020	inq.response_format = SID_SCSI2_RESPONSE;
4021	inq.additional_length = SID_SCSI2_ALEN;
4022	inq.flags |= SID_CmdQue;
4023	strlcpy(inq.vendor, sd->sd_meta->ssdi.ssd_vendor,
4024	    sizeof(inq.vendor));
4025	strlcpy(inq.product, sd->sd_meta->ssdi.ssd_product,
4026	    sizeof(inq.product));
4027	strlcpy(inq.revision, sd->sd_meta->ssdi.ssd_revision,
4028	    sizeof(inq.revision));
4029	scsi_copy_internal_data(xs, &inq, sizeof(inq));
4030
4031	return (0);
4032}
4033
4034int
4035sr_raid_read_cap(struct sr_workunit *wu)
4036{
4037	struct sr_discipline	*sd = wu->swu_dis;
4038	struct scsi_xfer	*xs = wu->swu_xs;
4039	struct scsi_read_cap_data rcd;
4040	struct scsi_read_cap_data_16 rcd16;
4041	u_int64_t		addr;
4042	int			rv = 1;
4043	u_int32_t		secsize;
4044
4045	DNPRINTF(SR_D_DIS, "%s: sr_raid_read_cap\n", DEVNAME(sd->sd_sc));
4046
4047	secsize = sd->sd_meta->ssdi.ssd_secsize;
4048
4049	addr = ((sd->sd_meta->ssdi.ssd_size * DEV_BSIZE) / secsize) - 1;
4050	if (xs->cmd.opcode == READ_CAPACITY) {
4051		bzero(&rcd, sizeof(rcd));
4052		if (addr > 0xffffffffllu)
4053			_lto4b(0xffffffff, rcd.addr);
4054		else
4055			_lto4b(addr, rcd.addr);
4056		_lto4b(secsize, rcd.length);
4057		scsi_copy_internal_data(xs, &rcd, sizeof(rcd));
4058		rv = 0;
4059	} else if (xs->cmd.opcode == READ_CAPACITY_16) {
4060		bzero(&rcd16, sizeof(rcd16));
4061		_lto8b(addr, rcd16.addr);
4062		_lto4b(secsize, rcd16.length);
4063		scsi_copy_internal_data(xs, &rcd16, sizeof(rcd16));
4064		rv = 0;
4065	}
4066
4067	return (rv);
4068}
4069
4070int
4071sr_raid_tur(struct sr_workunit *wu)
4072{
4073	struct sr_discipline	*sd = wu->swu_dis;
4074
4075	DNPRINTF(SR_D_DIS, "%s: sr_raid_tur\n", DEVNAME(sd->sd_sc));
4076
4077	if (sd->sd_vol_status == BIOC_SVOFFLINE) {
4078		sd->sd_scsi_sense.error_code = SSD_ERRCODE_CURRENT;
4079		sd->sd_scsi_sense.flags = SKEY_NOT_READY;
4080		sd->sd_scsi_sense.add_sense_code = 0x04;
4081		sd->sd_scsi_sense.add_sense_code_qual = 0x11;
4082		sd->sd_scsi_sense.extra_len = 4;
4083		return (1);
4084	} else if (sd->sd_vol_status == BIOC_SVINVALID) {
4085		sd->sd_scsi_sense.error_code = SSD_ERRCODE_CURRENT;
4086		sd->sd_scsi_sense.flags = SKEY_HARDWARE_ERROR;
4087		sd->sd_scsi_sense.add_sense_code = 0x05;
4088		sd->sd_scsi_sense.add_sense_code_qual = 0x00;
4089		sd->sd_scsi_sense.extra_len = 4;
4090		return (1);
4091	}
4092
4093	return (0);
4094}
4095
/*
 * Answer SCSI REQUEST SENSE for a softraid volume.
 *
 * Copies the sense data currently stored in the discipline into the
 * transfer and then clears the stored copy, so each set of sense data is
 * handed out once.  Always returns 0.
 */
int
sr_raid_request_sense(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	DNPRINTF(SR_D_DIS, "%s: sr_raid_request_sense\n",
	    DEVNAME(sd->sd_sc));

	/* use latest sense data */
	memcpy(&xs->sense, &sd->sd_scsi_sense, sizeof(xs->sense));

	/* clear sense data */
	bzero(&sd->sd_scsi_sense, sizeof(sd->sd_scsi_sense));

	return (0);
}
4113
4114int
4115sr_raid_start_stop(struct sr_workunit *wu)
4116{
4117	struct scsi_xfer	*xs = wu->swu_xs;
4118	struct scsi_start_stop	*ss = (struct scsi_start_stop *)&xs->cmd;
4119
4120	DNPRINTF(SR_D_DIS, "%s: sr_raid_start_stop\n",
4121	    DEVNAME(wu->swu_dis->sd_sc));
4122
4123	if (!ss)
4124		return (1);
4125
4126	/*
4127	 * do nothing!
4128	 * a softraid discipline should always reflect correct status
4129	 */
4130	return (0);
4131}
4132
4133int
4134sr_raid_sync(struct sr_workunit *wu)
4135{
4136	struct sr_discipline	*sd = wu->swu_dis;
4137	int			s, ret, rv = 0, ios;
4138
4139	DNPRINTF(SR_D_DIS, "%s: sr_raid_sync\n", DEVNAME(sd->sd_sc));
4140
4141	/* when doing a fake sync don't count the wu */
4142	ios = (wu->swu_flags & SR_WUF_FAKE) ? 0 : 1;
4143
4144	s = splbio();
4145	sd->sd_sync = 1;
4146	while (sd->sd_wu_pending > ios) {
4147		ret = tsleep_nsec(sd, PRIBIO, "sr_sync", SEC_TO_NSEC(15));
4148		if (ret == EWOULDBLOCK) {
4149			DNPRINTF(SR_D_DIS, "%s: sr_raid_sync timeout\n",
4150			    DEVNAME(sd->sd_sc));
4151			rv = 1;
4152			break;
4153		}
4154	}
4155	sd->sd_sync = 0;
4156	splx(s);
4157
4158	wakeup(&sd->sd_sync);
4159
4160	return (rv);
4161}
4162
4163void
4164sr_raid_intr(struct buf *bp)
4165{
4166	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
4167	struct sr_workunit	*wu = ccb->ccb_wu;
4168#ifdef SR_DEBUG
4169	struct sr_discipline	*sd = wu->swu_dis;
4170	struct scsi_xfer	*xs = wu->swu_xs;
4171#endif
4172	int			s;
4173
4174	DNPRINTF(SR_D_INTR, "%s: %s %s intr bp %p xs %p\n",
4175	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, sd->sd_name, bp, xs);
4176
4177	s = splbio();
4178	sr_ccb_done(ccb);
4179	sr_wu_done(wu);
4180	splx(s);
4181}
4182
4183void
4184sr_schedule_wu(struct sr_workunit *wu)
4185{
4186	struct sr_discipline	*sd = wu->swu_dis;
4187	struct sr_workunit	*wup;
4188	int			s;
4189
4190	DNPRINTF(SR_D_WU, "sr_schedule_wu: schedule wu %p state %i "
4191	    "flags 0x%x\n", wu, wu->swu_state, wu->swu_flags);
4192
4193	KASSERT(wu->swu_io_count > 0);
4194
4195	s = splbio();
4196
4197	/* Construct the work unit, do not schedule it. */
4198	if (wu->swu_state == SR_WU_CONSTRUCT)
4199		goto queued;
4200
4201	/* Deferred work unit being reconstructed, do not start. */
4202	if (wu->swu_state == SR_WU_REQUEUE)
4203		goto queued;
4204
4205	/* Current work unit failed, restart. */
4206	if (wu->swu_state == SR_WU_RESTART)
4207		goto start;
4208
4209	if (wu->swu_state != SR_WU_INPROGRESS)
4210		panic("sr_schedule_wu: work unit not in progress (state %i)",
4211		    wu->swu_state);
4212
4213	/* Walk queue backwards and fill in collider if we have one. */
4214	TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link) {
4215		if (wu->swu_blk_end < wup->swu_blk_start ||
4216		    wup->swu_blk_end < wu->swu_blk_start)
4217			continue;
4218
4219		/* Defer work unit due to LBA collision. */
4220		DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p\n",
4221		    wu);
4222		wu->swu_state = SR_WU_DEFERRED;
4223		while (wup->swu_collider)
4224			wup = wup->swu_collider;
4225		wup->swu_collider = wu;
4226		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
4227		sd->sd_wu_collisions++;
4228		goto queued;
4229	}
4230
4231start:
4232	sr_raid_startwu(wu);
4233
4234queued:
4235	splx(s);
4236}
4237
4238void
4239sr_raid_startwu(struct sr_workunit *wu)
4240{
4241	struct sr_discipline	*sd = wu->swu_dis;
4242	struct sr_ccb		*ccb;
4243
4244	DNPRINTF(SR_D_WU, "sr_raid_startwu: start wu %p\n", wu);
4245
4246	splassert(IPL_BIO);
4247
4248	if (wu->swu_state == SR_WU_DEFERRED) {
4249		TAILQ_REMOVE(&sd->sd_wu_defq, wu, swu_link);
4250		wu->swu_state = SR_WU_INPROGRESS;
4251	}
4252
4253	if (wu->swu_state != SR_WU_RESTART)
4254		TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
4255
4256	/* Start all of the individual I/Os. */
4257	if (wu->swu_cb_active == 1)
4258		panic("%s: sr_startwu_callback", DEVNAME(sd->sd_sc));
4259	wu->swu_cb_active = 1;
4260
4261	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
4262		VOP_STRATEGY(ccb->ccb_buf.b_vp, &ccb->ccb_buf);
4263
4264	wu->swu_cb_active = 0;
4265}
4266
4267void
4268sr_raid_recreate_wu(struct sr_workunit *wu)
4269{
4270	struct sr_discipline	*sd = wu->swu_dis;
4271	struct sr_workunit	*wup = wu;
4272
4273	/*
4274	 * Recreate a work unit by releasing the associated CCBs and reissuing
4275	 * the SCSI I/O request. This process is then repeated for all of the
4276	 * colliding work units.
4277	 */
4278	do {
4279		sr_wu_release_ccbs(wup);
4280
4281		wup->swu_state = SR_WU_REQUEUE;
4282		if (sd->sd_scsi_rw(wup))
4283			panic("could not requeue I/O");
4284
4285		wup = wup->swu_collider;
4286	} while (wup);
4287}
4288
4289int
4290sr_alloc_resources(struct sr_discipline *sd)
4291{
4292	if (sr_wu_alloc(sd)) {
4293		sr_error(sd->sd_sc, "unable to allocate work units");
4294		return (ENOMEM);
4295	}
4296	if (sr_ccb_alloc(sd)) {
4297		sr_error(sd->sd_sc, "unable to allocate ccbs");
4298		return (ENOMEM);
4299	}
4300
4301	return (0);
4302}
4303
/*
 * Default resource release: free the discipline's work units, then its
 * ccbs.  Counterpart of sr_alloc_resources().
 */
void
sr_free_resources(struct sr_discipline *sd)
{
	sr_wu_free(sd);
	sr_ccb_free(sd);
}
4310
4311void
4312sr_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
4313{
4314	int			old_state, s;
4315
4316	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_set_chunk_state %d -> %d\n",
4317	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
4318	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
4319
4320	/* ok to go to splbio since this only happens in error path */
4321	s = splbio();
4322	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
4323
4324	/* multiple IOs to the same chunk that fail will come through here */
4325	if (old_state == new_state)
4326		goto done;
4327
4328	switch (old_state) {
4329	case BIOC_SDONLINE:
4330		if (new_state == BIOC_SDOFFLINE)
4331			break;
4332		else
4333			goto die;
4334		break;
4335
4336	case BIOC_SDOFFLINE:
4337		goto die;
4338
4339	default:
4340die:
4341		splx(s); /* XXX */
4342		panic("%s: %s: %s: invalid chunk state transition %d -> %d",
4343		    DEVNAME(sd->sd_sc),
4344		    sd->sd_meta->ssd_devname,
4345		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
4346		    old_state, new_state);
4347		/* NOTREACHED */
4348	}
4349
4350	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
4351	sd->sd_set_vol_state(sd);
4352
4353	sd->sd_must_flush = 1;
4354	task_add(systq, &sd->sd_meta_save_task);
4355done:
4356	splx(s);
4357}
4358
4359void
4360sr_set_vol_state(struct sr_discipline *sd)
4361{
4362	int			states[SR_MAX_STATES];
4363	int			new_state, i, nd;
4364	int			old_state = sd->sd_vol_status;
4365	u_int32_t		s;
4366
4367	DNPRINTF(SR_D_STATE, "%s: %s: sr_set_vol_state\n",
4368	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
4369
4370	nd = sd->sd_meta->ssdi.ssd_chunk_no;
4371
4372	for (i = 0; i < SR_MAX_STATES; i++)
4373		states[i] = 0;
4374
4375	for (i = 0; i < nd; i++) {
4376		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
4377		if (s >= SR_MAX_STATES)
4378			panic("%s: %s: %s: invalid chunk state",
4379			    DEVNAME(sd->sd_sc),
4380			    sd->sd_meta->ssd_devname,
4381			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
4382		states[s]++;
4383	}
4384
4385	if (states[BIOC_SDONLINE] == nd)
4386		new_state = BIOC_SVONLINE;
4387	else
4388		new_state = BIOC_SVOFFLINE;
4389
4390	DNPRINTF(SR_D_STATE, "%s: %s: sr_set_vol_state %d -> %d\n",
4391	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
4392	    old_state, new_state);
4393
4394	switch (old_state) {
4395	case BIOC_SVONLINE:
4396		if (new_state == BIOC_SVOFFLINE || new_state == BIOC_SVONLINE)
4397			break;
4398		else
4399			goto die;
4400		break;
4401
4402	case BIOC_SVOFFLINE:
4403		/* XXX this might be a little too much */
4404		goto die;
4405
4406	default:
4407die:
4408		panic("%s: %s: invalid volume state transition %d -> %d",
4409		    DEVNAME(sd->sd_sc),
4410		    sd->sd_meta->ssd_devname,
4411		    old_state, new_state);
4412		/* NOTREACHED */
4413	}
4414
4415	sd->sd_vol_status = new_state;
4416}
4417
4418void *
4419sr_block_get(struct sr_discipline *sd, long length)
4420{
4421	return dma_alloc(length, PR_NOWAIT | PR_ZERO);
4422}
4423
/*
 * Release a buffer with dma_free().  `length' is handed to dma_free()
 * unchanged; the discipline argument is not used.
 */
void
sr_block_put(struct sr_discipline *sd, void *ptr, int length)
{
	dma_free(ptr, length);
}
4429
4430void
4431sr_checksum_print(u_int8_t *md5)
4432{
4433	int			i;
4434
4435	for (i = 0; i < MD5_DIGEST_LENGTH; i++)
4436		printf("%02x", md5[i]);
4437}
4438
4439void
4440sr_checksum(struct sr_softc *sc, void *src, void *md5, u_int32_t len)
4441{
4442	MD5_CTX			ctx;
4443
4444	DNPRINTF(SR_D_MISC, "%s: sr_checksum(%p %p %d)\n", DEVNAME(sc), src,
4445	    md5, len);
4446
4447	MD5Init(&ctx);
4448	MD5Update(&ctx, src, len);
4449	MD5Final(md5, &ctx);
4450}
4451
4452void
4453sr_uuid_generate(struct sr_uuid *uuid)
4454{
4455	arc4random_buf(uuid->sui_id, sizeof(uuid->sui_id));
4456	/* UUID version 4: random */
4457	uuid->sui_id[6] &= 0x0f;
4458	uuid->sui_id[6] |= 0x40;
4459	/* RFC4122 variant */
4460	uuid->sui_id[8] &= 0x3f;
4461	uuid->sui_id[8] |= 0x80;
4462}
4463
4464char *
4465sr_uuid_format(struct sr_uuid *uuid)
4466{
4467	char *uuidstr;
4468
4469	uuidstr = malloc(37, M_DEVBUF, M_WAITOK);
4470
4471	snprintf(uuidstr, 37,
4472	    "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-"
4473	    "%02x%02x%02x%02x%02x%02x",
4474	    uuid->sui_id[0], uuid->sui_id[1],
4475	    uuid->sui_id[2], uuid->sui_id[3],
4476	    uuid->sui_id[4], uuid->sui_id[5],
4477	    uuid->sui_id[6], uuid->sui_id[7],
4478	    uuid->sui_id[8], uuid->sui_id[9],
4479	    uuid->sui_id[10], uuid->sui_id[11],
4480	    uuid->sui_id[12], uuid->sui_id[13],
4481	    uuid->sui_id[14], uuid->sui_id[15]);
4482
4483	return uuidstr;
4484}
4485
4486void
4487sr_uuid_print(struct sr_uuid *uuid, int cr)
4488{
4489	char *uuidstr;
4490
4491	uuidstr = sr_uuid_format(uuid);
4492	printf("%s%s", uuidstr, (cr ? "\n" : ""));
4493	free(uuidstr, M_DEVBUF, 37);
4494}
4495
4496int
4497sr_already_assembled(struct sr_discipline *sd)
4498{
4499	struct sr_softc		*sc = sd->sd_sc;
4500	struct sr_discipline	*sdtmp;
4501
4502	TAILQ_FOREACH(sdtmp, &sc->sc_dis_list, sd_link) {
4503		if (!bcmp(&sd->sd_meta->ssdi.ssd_uuid,
4504		    &sdtmp->sd_meta->ssdi.ssd_uuid,
4505		    sizeof(sd->sd_meta->ssdi.ssd_uuid)))
4506			return (1);
4507	}
4508
4509	return (0);
4510}
4511
/*
 * Validate a strip size given in bytes.
 *
 * A strip size is accepted when it is a non-zero power of two that is
 * also a multiple of DEV_BSIZE.  Returns the base-2 logarithm of `b' on
 * success and -1 when the size is rejected.
 */
int32_t
sr_validate_stripsize(u_int32_t b)
{
	int			s = 0;

	/*
	 * Zero is a multiple of DEV_BSIZE and never has its low bit set,
	 * so it has to be rejected here; otherwise the shift loop below
	 * would never terminate.
	 */
	if (b == 0 || b % DEV_BSIZE)
		return (-1);

	/* count trailing zero bits; b != 0 guarantees termination */
	while ((b & 1) == 0) {
		b >>= 1;
		s++;
	}

	/* only powers of two: nothing may remain above the lowest set bit */
	b >>= 1;
	if (b)
		return (-1);

	return (s);
}
4532
4533void
4534sr_quiesce(void)
4535{
4536	struct sr_softc		*sc = softraid0;
4537	struct sr_discipline	*sd, *nsd;
4538
4539	if (sc == NULL)
4540		return;
4541
4542	/* Shutdown disciplines in reverse attach order. */
4543	TAILQ_FOREACH_REVERSE_SAFE(sd, &sc->sc_dis_list,
4544	    sr_discipline_list, sd_link, nsd)
4545		sr_discipline_shutdown(sd, 1, -1);
4546}
4547
4548void
4549sr_shutdown(int dying)
4550{
4551	struct sr_softc		*sc = softraid0;
4552	struct sr_discipline	*sd;
4553
4554	if (sc == NULL)
4555		return;
4556
4557	DNPRINTF(SR_D_MISC, "%s: sr_shutdown\n", DEVNAME(sc));
4558
4559	/*
4560	 * Since softraid is not under mainbus, we have to explicitly
4561	 * notify its children that the power is going down, so they
4562	 * can execute their shutdown hooks.
4563	 */
4564	config_suspend((struct device *)sc, DVACT_POWERDOWN);
4565
4566	/* Shutdown disciplines in reverse attach order. */
4567	while ((sd = TAILQ_LAST(&sc->sc_dis_list, sr_discipline_list)) != NULL)
4568		sr_discipline_shutdown(sd, 1, dying);
4569}
4570
4571int
4572sr_validate_io(struct sr_workunit *wu, daddr_t *blkno, char *func)
4573{
4574	struct sr_discipline	*sd = wu->swu_dis;
4575	struct scsi_xfer	*xs = wu->swu_xs;
4576	int			rv = 1;
4577
4578	DNPRINTF(SR_D_DIS, "%s: %s 0x%02x\n", DEVNAME(sd->sd_sc), func,
4579	    xs->cmd.opcode);
4580
4581	if (sd->sd_meta->ssd_data_blkno == 0)
4582		panic("invalid data blkno");
4583
4584	if (sd->sd_vol_status == BIOC_SVOFFLINE) {
4585		DNPRINTF(SR_D_DIS, "%s: %s device offline\n",
4586		    DEVNAME(sd->sd_sc), func);
4587		goto bad;
4588	}
4589
4590	if (xs->datalen == 0) {
4591		printf("%s: %s: illegal block count for %s\n",
4592		    DEVNAME(sd->sd_sc), func, sd->sd_meta->ssd_devname);
4593		goto bad;
4594	}
4595
4596	if (xs->cmdlen == 10)
4597		*blkno = _4btol(((struct scsi_rw_10 *)&xs->cmd)->addr);
4598	else if (xs->cmdlen == 16)
4599		*blkno = _8btol(((struct scsi_rw_16 *)&xs->cmd)->addr);
4600	else if (xs->cmdlen == 6)
4601		*blkno = _3btol(((struct scsi_rw *)&xs->cmd)->addr);
4602	else {
4603		printf("%s: %s: illegal cmdlen for %s\n",
4604		    DEVNAME(sd->sd_sc), func, sd->sd_meta->ssd_devname);
4605		goto bad;
4606	}
4607
4608	*blkno *= (sd->sd_meta->ssdi.ssd_secsize / DEV_BSIZE);
4609
4610	wu->swu_blk_start = *blkno;
4611	wu->swu_blk_end = *blkno + (xs->datalen >> DEV_BSHIFT) - 1;
4612
4613	if (wu->swu_blk_end > sd->sd_meta->ssdi.ssd_size) {
4614		DNPRINTF(SR_D_DIS, "%s: %s out of bounds start: %lld "
4615		    "end: %lld length: %d\n",
4616		    DEVNAME(sd->sd_sc), func, (long long)wu->swu_blk_start,
4617		    (long long)wu->swu_blk_end, xs->datalen);
4618
4619		sd->sd_scsi_sense.error_code = SSD_ERRCODE_CURRENT |
4620		    SSD_ERRCODE_VALID;
4621		sd->sd_scsi_sense.flags = SKEY_ILLEGAL_REQUEST;
4622		sd->sd_scsi_sense.add_sense_code = 0x21;
4623		sd->sd_scsi_sense.add_sense_code_qual = 0x00;
4624		sd->sd_scsi_sense.extra_len = 4;
4625		goto bad;
4626	}
4627
4628	rv = 0;
4629bad:
4630	return (rv);
4631}
4632
4633void
4634sr_rebuild_start(void *arg)
4635{
4636	struct sr_discipline	*sd = arg;
4637	struct sr_softc		*sc = sd->sd_sc;
4638
4639	DNPRINTF(SR_D_REBUILD, "%s: %s starting rebuild thread\n",
4640	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
4641
4642	if (kthread_create(sr_rebuild_thread, sd, &sd->sd_background_proc,
4643	    DEVNAME(sc)) != 0)
4644		printf("%s: unable to start background operation\n",
4645		    DEVNAME(sc));
4646}
4647
4648void
4649sr_rebuild_thread(void *arg)
4650{
4651	struct sr_discipline	*sd = arg;
4652
4653	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild thread started\n",
4654	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
4655
4656	sd->sd_reb_active = 1;
4657	sd->sd_rebuild(sd);
4658	sd->sd_reb_active = 0;
4659
4660	kthread_exit(0);
4661}
4662
/*
 * Rebuild the volume by copying it through the discipline in pieces of
 * SR_REBUILD_IO_SIZE blocks.
 *
 * For every piece a READ_16 and a WRITE_16 work unit are built on the
 * same bounce buffer and handed to sd->sd_scsi_rw(); both carry
 * SR_WUF_REBUILD.  The write is queued as the read's collider so it is
 * started when the read finishes, and this function sleeps until the
 * write is flagged SR_WUF_REBUILDIOCOMP.  Which chunks the read and the
 * write actually touch is decided by the discipline, not here.
 *
 * Progress is kept in sd_meta->ssd_rebuild and the metadata is saved
 * each time the integer percentage changes.  Setting sd->sd_reb_abort
 * stops the loop after the current piece.  On normal completion
 * ssd_rebuild is reset to 0 and the first chunk found in BIOC_SDREBUILD
 * is moved to BIOC_SDONLINE.
 */
void
sr_rebuild(struct sr_discipline *sd)
{
	struct sr_softc		*sc = sd->sd_sc;
	u_int64_t		sz, whole_blk, partial_blk, blk, restart;
	daddr_t			lba;
	struct sr_workunit	*wu_r, *wu_w;
	struct scsi_xfer	xs_r, xs_w;
	struct scsi_rw_16	*cr, *cw;
	int			c, s, slept, percent = 0, old_percent = -1;
	u_int8_t		*buf;

	/* number of full-sized pieces, and the size of the trailing one */
	whole_blk = sd->sd_meta->ssdi.ssd_size / SR_REBUILD_IO_SIZE;
	partial_blk = sd->sd_meta->ssdi.ssd_size % SR_REBUILD_IO_SIZE;

	/* resume from the piece recorded in the metadata, if it is sane */
	restart = sd->sd_meta->ssd_rebuild / SR_REBUILD_IO_SIZE;
	if (restart > whole_blk) {
		printf("%s: bogus rebuild restart offset, starting from 0\n",
		    DEVNAME(sc));
		restart = 0;
	}
	if (restart) {
		/*
		 * XXX there is a hole here; there is a possibility that we
		 * had a restart however the chunk that was supposed to
		 * be rebuilt is no longer valid; we can reach this situation
		 * when a rebuild is in progress and the box crashes and
		 * on reboot the rebuild chunk is different (like zero'd or
		 * replaced).  We need to check the uuid of the chunk that is
		 * being rebuilt to assert this.
		 */
		percent = sr_rebuild_percent(sd);
		printf("%s: resuming rebuild on %s at %d%%\n",
		    DEVNAME(sc), sd->sd_meta->ssd_devname, percent);
	}

	/* currently this is 64k therefore we can use dma_alloc */
	buf = dma_alloc(SR_REBUILD_IO_SIZE << DEV_BSHIFT, PR_WAITOK);
	for (blk = restart; blk <= whole_blk; blk++) {
		lba = blk * SR_REBUILD_IO_SIZE;
		sz = SR_REBUILD_IO_SIZE;
		/* the final iteration handles the partial piece, if any */
		if (blk == whole_blk) {
			if (partial_blk == 0)
				break;
			sz = partial_blk;
		}

		/* get some wu */
		wu_r = sr_scsi_wu_get(sd, 0);
		wu_w = sr_scsi_wu_get(sd, 0);

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild wu_r %p, wu_w %p\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, wu_r, wu_w);

		/* setup read io */
		bzero(&xs_r, sizeof xs_r);
		xs_r.error = XS_NOERROR;
		xs_r.flags = SCSI_DATA_IN;
		xs_r.datalen = sz << DEV_BSHIFT;
		xs_r.data = buf;
		xs_r.cmdlen = sizeof(*cr);
		cr = (struct scsi_rw_16 *)&xs_r.cmd;
		cr->opcode = READ_16;
		_lto4b(sz, cr->length);
		_lto8b(lba, cr->addr);
		wu_r->swu_state = SR_WU_CONSTRUCT;
		wu_r->swu_flags |= SR_WUF_REBUILD;
		wu_r->swu_xs = &xs_r;
		/*
		 * NOTE(review): both "goto fail" exits below skip the
		 * sr_scsi_wu_put() calls for wu_r/wu_w taken in this
		 * iteration -- confirm whether they are leaked.
		 */
		if (sd->sd_scsi_rw(wu_r)) {
			printf("%s: could not create read io\n",
			    DEVNAME(sc));
			goto fail;
		}

		/* setup write io */
		bzero(&xs_w, sizeof xs_w);
		xs_w.error = XS_NOERROR;
		xs_w.flags = SCSI_DATA_OUT;
		xs_w.datalen = sz << DEV_BSHIFT;
		xs_w.data = buf;
		xs_w.cmdlen = sizeof(*cw);
		cw = (struct scsi_rw_16 *)&xs_w.cmd;
		cw->opcode = WRITE_16;
		_lto4b(sz, cw->length);
		_lto8b(lba, cw->addr);
		wu_w->swu_state = SR_WU_CONSTRUCT;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_w->swu_xs = &xs_w;
		if (sd->sd_scsi_rw(wu_w)) {
			printf("%s: could not create write io\n",
			    DEVNAME(sc));
			goto fail;
		}

		/*
		 * collide with the read io so that we get automatically
		 * started when the read is done
		 */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_r->swu_collider = wu_w;
		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild scheduling wu_r %p\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, wu_r);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		/* wait for write completion */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep_nsec(wu_w, PRIBIO, "sr_rebuild", INFSLP);
			slept = 1;
		}
		/* yield if we didn't sleep */
		if (slept == 0)
			tsleep_nsec(sc, PWAIT, "sr_yield", MSEC_TO_NSEC(1));

		sr_scsi_wu_put(sd, wu_r);
		sr_scsi_wu_put(sd, wu_w);

		/*
		 * Record the start of the piece just copied; a restart
		 * resumes at this piece and copies it again.
		 */
		sd->sd_meta->ssd_rebuild = lba;

		/* XXX - this should be based on size, not percentage. */
		/* save metadata every percent */
		percent = sr_rebuild_percent(sd);
		if (percent != old_percent && blk != whole_blk) {
			if (sr_meta_save(sd, SR_META_DIRTY))
				printf("%s: could not save metadata to %s\n",
				    DEVNAME(sc), sd->sd_meta->ssd_devname);
			old_percent = percent;
		}

		if (sd->sd_reb_abort)
			goto abort;
	}

	/* all done */
	sd->sd_meta->ssd_rebuild = 0;
	/* bring the first chunk still marked as rebuilding back online */
	for (c = 0; c < sd->sd_meta->ssdi.ssd_chunk_no; c++) {
		if (sd->sd_vol.sv_chunks[c]->src_meta.scm_status ==
		    BIOC_SDREBUILD) {
			sd->sd_set_chunk_state(sd, c, BIOC_SDONLINE);
			break;
		}
	}

	/* both normal completion and an abort save the metadata */
abort:
	if (sr_meta_save(sd, SR_META_DIRTY))
		printf("%s: could not save metadata to %s\n",
		    DEVNAME(sc), sd->sd_meta->ssd_devname);
	/* an I/O setup failure skips the metadata save */
fail:
	dma_free(buf, SR_REBUILD_IO_SIZE << DEV_BSHIFT);
}
4819
4820struct sr_discipline *
4821sr_find_discipline(struct sr_softc *sc, const char *devname)
4822{
4823	struct sr_discipline	*sd;
4824
4825	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link)
4826		if (!strncmp(sd->sd_meta->ssd_devname, devname,
4827		    sizeof(sd->sd_meta->ssd_devname)))
4828			break;
4829	return sd;
4830}
4831
4832#ifndef SMALL_KERNEL
4833int
4834sr_sensors_create(struct sr_discipline *sd)
4835{
4836	struct sr_softc		*sc = sd->sd_sc;
4837	int			rv = 1;
4838
4839	DNPRINTF(SR_D_STATE, "%s: %s: sr_sensors_create\n",
4840	    DEVNAME(sc), sd->sd_meta->ssd_devname);
4841
4842	sd->sd_vol.sv_sensor.type = SENSOR_DRIVE;
4843	sd->sd_vol.sv_sensor.status = SENSOR_S_UNKNOWN;
4844	strlcpy(sd->sd_vol.sv_sensor.desc, sd->sd_meta->ssd_devname,
4845	    sizeof(sd->sd_vol.sv_sensor.desc));
4846
4847	sensor_attach(&sc->sc_sensordev, &sd->sd_vol.sv_sensor);
4848	sd->sd_vol.sv_sensor_attached = 1;
4849
4850	if (sc->sc_sensor_task == NULL) {
4851		sc->sc_sensor_task = sensor_task_register(sc,
4852		    sr_sensors_refresh, 10);
4853		if (sc->sc_sensor_task == NULL)
4854			goto bad;
4855	}
4856
4857	rv = 0;
4858bad:
4859	return (rv);
4860}
4861
4862void
4863sr_sensors_delete(struct sr_discipline *sd)
4864{
4865	DNPRINTF(SR_D_STATE, "%s: sr_sensors_delete\n", DEVNAME(sd->sd_sc));
4866
4867	if (sd->sd_vol.sv_sensor_attached)
4868		sensor_detach(&sd->sd_sc->sc_sensordev, &sd->sd_vol.sv_sensor);
4869}
4870
4871void
4872sr_sensors_refresh(void *arg)
4873{
4874	struct sr_softc		*sc = arg;
4875	struct sr_volume	*sv;
4876	struct sr_discipline	*sd;
4877
4878	DNPRINTF(SR_D_STATE, "%s: sr_sensors_refresh\n", DEVNAME(sc));
4879
4880	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link) {
4881		sv = &sd->sd_vol;
4882
4883		switch(sd->sd_vol_status) {
4884		case BIOC_SVOFFLINE:
4885			sv->sv_sensor.value = SENSOR_DRIVE_FAIL;
4886			sv->sv_sensor.status = SENSOR_S_CRIT;
4887			break;
4888
4889		case BIOC_SVDEGRADED:
4890			sv->sv_sensor.value = SENSOR_DRIVE_PFAIL;
4891			sv->sv_sensor.status = SENSOR_S_WARN;
4892			break;
4893
4894		case BIOC_SVREBUILD:
4895			sv->sv_sensor.value = SENSOR_DRIVE_REBUILD;
4896			sv->sv_sensor.status = SENSOR_S_WARN;
4897			break;
4898
4899		case BIOC_SVSCRUB:
4900		case BIOC_SVONLINE:
4901			sv->sv_sensor.value = SENSOR_DRIVE_ONLINE;
4902			sv->sv_sensor.status = SENSOR_S_OK;
4903			break;
4904
4905		default:
4906			sv->sv_sensor.value = 0; /* unknown */
4907			sv->sv_sensor.status = SENSOR_S_UNKNOWN;
4908		}
4909	}
4910}
4911#endif /* SMALL_KERNEL */
4912
4913#ifdef SR_FANCY_STATS
4914void				sr_print_stats(void);
4915
4916void
4917sr_print_stats(void)
4918{
4919	struct sr_softc		*sc = softraid0;
4920	struct sr_discipline	*sd;
4921
4922	if (sc == NULL) {
4923		printf("no softraid softc found\n");
4924		return;
4925	}
4926
4927	TAILQ_FOREACH(sd, &sc->sc_dis_list, sd_link) {
4928		printf("%s: ios pending %d, collisions %llu\n",
4929		    sd->sd_meta->ssd_devname,
4930		    sd->sd_wu_pending,
4931		    sd->sd_wu_collisions);
4932	}
4933}
4934#endif /* SR_FANCY_STATS */
4935
4936#ifdef SR_DEBUG
4937void
4938sr_meta_print(struct sr_metadata *m)
4939{
4940	int			i;
4941	struct sr_meta_chunk	*mc;
4942	struct sr_meta_opt_hdr	*omh;
4943
4944	if (!(sr_debug & SR_D_META))
4945		return;
4946
4947	printf("\tssd_magic 0x%llx\n", m->ssdi.ssd_magic);
4948	printf("\tssd_version %d\n", m->ssdi.ssd_version);
4949	printf("\tssd_vol_flags 0x%x\n", m->ssdi.ssd_vol_flags);
4950	printf("\tssd_uuid ");
4951	sr_uuid_print(&m->ssdi.ssd_uuid, 1);
4952	printf("\tssd_chunk_no %d\n", m->ssdi.ssd_chunk_no);
4953	printf("\tssd_chunk_id %d\n", m->ssdi.ssd_chunk_id);
4954	printf("\tssd_opt_no %d\n", m->ssdi.ssd_opt_no);
4955	printf("\tssd_volid %d\n", m->ssdi.ssd_volid);
4956	printf("\tssd_level %d\n", m->ssdi.ssd_level);
4957	printf("\tssd_size %lld\n", m->ssdi.ssd_size);
4958	printf("\tssd_devname %s\n", m->ssd_devname);
4959	printf("\tssd_vendor %s\n", m->ssdi.ssd_vendor);
4960	printf("\tssd_product %s\n", m->ssdi.ssd_product);
4961	printf("\tssd_revision %s\n", m->ssdi.ssd_revision);
4962	printf("\tssd_strip_size %d\n", m->ssdi.ssd_strip_size);
4963	printf("\tssd_checksum ");
4964	sr_checksum_print(m->ssd_checksum);
4965	printf("\n");
4966	printf("\tssd_meta_flags 0x%x\n", m->ssd_meta_flags);
4967	printf("\tssd_ondisk %llu\n", m->ssd_ondisk);
4968
4969	mc = (struct sr_meta_chunk *)(m + 1);
4970	for (i = 0; i < m->ssdi.ssd_chunk_no; i++, mc++) {
4971		printf("\t\tscm_volid %d\n", mc->scmi.scm_volid);
4972		printf("\t\tscm_chunk_id %d\n", mc->scmi.scm_chunk_id);
4973		printf("\t\tscm_devname %s\n", mc->scmi.scm_devname);
4974		printf("\t\tscm_size %lld\n", mc->scmi.scm_size);
4975		printf("\t\tscm_coerced_size %lld\n",mc->scmi.scm_coerced_size);
4976		printf("\t\tscm_uuid ");
4977		sr_uuid_print(&mc->scmi.scm_uuid, 1);
4978		printf("\t\tscm_checksum ");
4979		sr_checksum_print(mc->scm_checksum);
4980		printf("\n");
4981		printf("\t\tscm_status %d\n", mc->scm_status);
4982	}
4983
4984	omh = (struct sr_meta_opt_hdr *)((u_int8_t *)(m + 1) +
4985	    sizeof(struct sr_meta_chunk) * m->ssdi.ssd_chunk_no);
4986	for (i = 0; i < m->ssdi.ssd_opt_no; i++) {
4987		printf("\t\t\tsom_type %d\n", omh->som_type);
4988		printf("\t\t\tsom_checksum ");
4989		sr_checksum_print(omh->som_checksum);
4990		printf("\n");
4991		omh = (struct sr_meta_opt_hdr *)((void *)omh +
4992		    omh->som_length);
4993	}
4994}
4995
/*
 * Debug helper: hex/ASCII dump of the `len' bytes at `blk', 16 bytes
 * per output line.
 *
 * Each line shows the bytes in hex followed by the same bytes as
 * characters, with anything outside ' '..'z' shown as '.'.  When `len'
 * is not a multiple of 16 the hex column of the last line is padded
 * with blanks and the character column simply ends early; no byte at
 * or beyond blk[len] is ever read.
 */
void
sr_dump_block(void *blk, int len)
{
	uint8_t			*b = blk;
	int			i, j, c;

	for (i = 0; i < len; i += 16) {
		for (j = 0; j < 16; j++) {
			if (i + j < len)
				printf("%.2x ", b[i + j]);
			else
				printf("   ");	/* keep the columns aligned */
		}
		printf("  ");
		/* stop at len: the buffer may end in the middle of a line */
		for (j = 0; j < 16 && i + j < len; j++) {
			c = b[i + j];
			if (c < ' ' || c > 'z')
				c = '.';
			printf("%c", c);
		}
		printf("\n");
	}
}
5015
/*
 * Debug helper: print the `len' bytes at `p' as two-digit hex values
 * separated by blanks, all on one line, followed by a newline.
 */
void
sr_dump_mem(u_int8_t *p, int len)
{
	u_int8_t		*cur = p;
	int			left;

	for (left = len; left > 0; left--) {
		printf("%02x ", *cur);
		cur++;
	}
	printf("\n");
}
5025
5026#endif /* SR_DEBUG */
5027
5028#ifdef HIBERNATE
5029/*
5030 * Side-effect free (no malloc, printf, pool, splx) softraid crypto writer.
5031 *
5032 * This function must perform the following:
5033 * 1. Determine the underlying device's own side-effect free I/O function
5034 *    (eg, ahci_hibernate_io, wd_hibernate_io, etc).
 * 2. Store enough information in the provided page argument for subsequent
 *    I/O calls (such as the crypto discipline structure for the keys, the
 *    offset of the softraid partition on the underlying disk, as well as
 *    the offset of the swap partition within the crypto volume).
5039 * 3. Encrypt the incoming data using the sr_discipline keys, then pass
5040 *    the request to the underlying device's own I/O function.
5041 */
/*
 * Parameters:
 *   dev    softraid volume device; only used during HIB_INIT
 *   blkno  HIB_INIT: offset of the swap partition; HIB_W: block to write
 *   addr   address of the data to write (HIB_W)
 *   size   HIB_INIT: swap partition size; HIB_W: byte count to write
 *   op     HIB_INIT or HIB_W; anything else is rejected
 *   page   scratch page shared with the underlying device's I/O function
 *
 * Returns 0 on success, ENOTSUP for an unsupported op, a non-CRYPTO
 * discipline, an unreadable disklabel or a non-RAID/empty partition,
 * ENODEV when the underlying device has no hibernate I/O function, and
 * otherwise whatever the underlying I/O function returns.
 */
int
sr_hibernate_io(dev_t dev, daddr_t blkno, vaddr_t addr, size_t size, int op, void *page)
{
	/* Struct for stashing data obtained on HIB_INIT.
	 * XXX
	 * We share the page with the underlying device's own
	 * side-effect free I/O function, so we pad our data to
	 * the end of the page. Presently this does not overlap
	 * with either of the two other side-effect free i/o
	 * functions (ahci/wd).
	 */
	struct {
		char pad[3072];
		struct sr_discipline *srd;
		hibio_fn subfn;		/* underlying device i/o fn */
		dev_t subdev;		/* underlying device dev_t */
		daddr_t sr_swapoff;	/* ofs of swap part in sr volume */
		char buf[DEV_BSIZE];	/* encryption performed into this buf */
	} *my = page;
	extern struct cfdriver sd_cd;
	char errstr[128], *dl_ret;
	struct sr_chunk *schunk;
	struct sd_softc *sd;
	struct aes_xts_ctx ctx;
	struct sr_softc *sc;
	struct device *dv;
	daddr_t key_blkno;
	/*
	 * NOTE(review): sub_raidoff is 32 bits wide while the values summed
	 * into it below look like 64-bit block numbers -- confirm that
	 * offsets past 2^32 blocks cannot occur here.
	 */
	uint32_t sub_raidoff;  /* ofs of sr part in underlying dev */
	struct disklabel dl;
	struct partition *pp;
	size_t i, j;
	u_char iv[8];

	/*
	 * In HIB_INIT, we are passed the swap partition size and offset
	 * in 'size' and 'blkno' respectively. These are relative to the
	 * start of the softraid partition, and we need to save these
	 * for later translation to the underlying device's layout.
	 */
	if (op == HIB_INIT) {
		/*
		 * NOTE(review): the result of disk_lookup() is dereferenced
		 * without a NULL check -- confirm it cannot fail here.
		 */
		dv = disk_lookup(&sd_cd, DISKUNIT(dev));
		sd = (struct sd_softc *)dv;
		sc = (struct sr_softc *)dv->dv_parent->dv_parent;

		/*
		 * Look up the sr discipline. This is used to determine
		 * if we are SR crypto and what the underlying device is.
		 */
		my->srd = sc->sc_targets[sd->sc_link->target];
		DNPRINTF(SR_D_MISC, "sr_hibernate_io: discipline is %s\n",
			my->srd->sd_name);
		if (strncmp(my->srd->sd_name, "CRYPTO",
		    sizeof(my->srd->sd_name)))
			return (ENOTSUP);

		/* Find the underlying device */
		schunk = my->srd->sd_vol.sv_chunks[0];
		my->subdev = schunk->src_dev_mm;

		/*
		 * Find the appropriate underlying device side effect free
		 * I/O function, based on the type of device it is.
		 */
		my->subfn = get_hibernate_io_function(my->subdev);
		if (!my->subfn)
			return (ENODEV);

		/*
		 * Find blkno where this raid partition starts on
		 * the underlying disk.
		 */
		dl_ret = disk_readlabel(&dl, my->subdev, errstr,
		    sizeof(errstr));
		if (dl_ret) {
			printf("Hibernate error reading disklabel: %s\n", dl_ret);
			return (ENOTSUP);
		}

		/* the chunk must live on a non-empty RAID partition */
		pp = &dl.d_partitions[DISKPART(my->subdev)];
		if (pp->p_fstype != FS_RAID || DL_GETPSIZE(pp) == 0)
			return (ENOTSUP);

		/* Find the blkno of the SR part in the underlying device */
		sub_raidoff = my->srd->sd_meta->ssd_data_blkno +
		    DL_SECTOBLK(&dl, DL_GETPOFFSET(pp));
		DNPRINTF(SR_D_MISC,"sr_hibernate_io: blk trans ofs: %d blks\n",
		    sub_raidoff);

		/* Save the blkno of the swap partition in the SR disk */
		my->sr_swapoff = blkno;

		/* Initialize the sub-device */
		return my->subfn(my->subdev, sub_raidoff + blkno,
		    addr, size, op, page);
	}

	/* Hibernate only uses (and we only support) writes */
	if (op != HIB_W)
		return (ENOTSUP);

	/*
	 * Blocks act as the IV for the encryption. These block numbers
	 * are relative to the start of the sr partition, but the 'blkno'
	 * passed above is relative to the start of the swap partition
	 * inside the sr partition, so bias appropriately.
	 */
	key_blkno = my->sr_swapoff + blkno;

	/* Process each disk block one at a time. */
	for (i = 0; i < size; i += DEV_BSIZE) {
		int res;

		bzero(&ctx, sizeof(ctx));

		/*
		 * Set encryption key (from the sr discipline stashed
		 * during HIB_INIT). This code is based on the softraid
		 * bootblock code.
		 */
		aes_xts_setkey(&ctx, my->srd->mds.mdd_crypto.scr_key[0], 64);
		/* We encrypt DEV_BSIZE bytes at a time in my->buf */
		memcpy(my->buf, ((char *)addr) + i, DEV_BSIZE);

		/* Block number is the IV */
		memcpy(&iv, &key_blkno, sizeof(key_blkno));
		aes_xts_reinit(&ctx, iv);

		/* Encrypt DEV_BSIZE bytes, AES_XTS_BLOCKSIZE bytes at a time */
		for (j = 0; j < DEV_BSIZE; j += AES_XTS_BLOCKSIZE)
			aes_xts_encrypt(&ctx, my->buf + j);

		/*
		 * Write one block out from my->buf to the underlying device
		 * using its own side-effect free I/O function.
		 *
		 * NOTE(review): blkno is passed on without sub_raidoff;
		 * presumably the underlying function keeps the offset it
		 * was given during HIB_INIT -- confirm.
		 */
		res = my->subfn(my->subdev, blkno + (i / DEV_BSIZE),
		    (vaddr_t)(my->buf), DEV_BSIZE, op, page);
		if (res != 0)
			return (res);
		/* the next block is encrypted with the next IV */
		key_blkno++;
	}
	return (0);
}
5185#endif /* HIBERNATE */
5186