geom_ccd.c revision 71773
1/* $FreeBSD: head/sys/geom/geom_ccd.c 71773 2001-01-29 06:18:14Z phk $ */
2
3/*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4
5/*
6 * Copyright (c) 1995 Jason R. Thorpe.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed for the NetBSD Project
20 *	by Jason R. Thorpe.
21 * 4. The name of the author may not be used to endorse or promote products
22 *    derived from this software without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37/*
38 * Copyright (c) 1988 University of Utah.
39 * Copyright (c) 1990, 1993
40 *	The Regents of the University of California.  All rights reserved.
41 *
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
44 * Science Department.
45 *
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
48 * are met:
49 * 1. Redistributions of source code must retain the above copyright
50 *    notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 *    notice, this list of conditions and the following disclaimer in the
53 *    documentation and/or other materials provided with the distribution.
54 * 3. All advertising materials mentioning features or use of this software
55 *    must display the following acknowledgement:
56 *	This product includes software developed by the University of
57 *	California, Berkeley and its contributors.
58 * 4. Neither the name of the University nor the names of its contributors
59 *    may be used to endorse or promote products derived from this software
60 *    without specific prior written permission.
61 *
62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72 * SUCH DAMAGE.
73 *
74 * from: Utah $Hdr: cd.c 1.6 90/11/28$
75 *
76 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77 */
78
79/*
80 * "Concatenated" disk driver.
81 *
82 * Dynamic configuration and disklabel support by:
83 *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84 *	Numerical Aerodynamic Simulation Facility
85 *	Mail Stop 258-6
86 *	NASA Ames Research Center
87 *	Moffett Field, CA 94035
88 */
89
90#include "ccd.h"
91
92#include <sys/param.h>
93#include <sys/systm.h>
94#include <sys/kernel.h>
95#include <sys/module.h>
96#include <sys/proc.h>
97#include <sys/bio.h>
98#include <sys/malloc.h>
99#include <sys/namei.h>
100#include <sys/conf.h>
101#include <sys/stat.h>
102#include <sys/sysctl.h>
103#include <sys/disklabel.h>
104#include <ufs/ffs/fs.h>
105#include <sys/devicestat.h>
106#include <sys/fcntl.h>
107#include <sys/vnode.h>
108
109#include <sys/ccdvar.h>
110
/* Building with CCDDEBUG turns on the generic DEBUG switch as well. */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/*
 * Bits of the ccddebug mask.  Each bit gates a family of diagnostic
 * printf()s in the #ifdef DEBUG sections of this file.
 */
#define CCDB_FOLLOW	0x01	/* trace entry into the driver routines */
#define CCDB_INIT	0x02	/* ccdinit()/ccdinterleave() diagnostics */
#define CCDB_IO		0x04	/* per-component I/O in ccdbuffer()/ccdiodone() */
#define CCDB_LABEL	0x08	/* presumably disklabel handling -- not referenced in the code reviewed here */
#define CCDB_VNODE	0x10	/* presumably vnode lookup -- not referenced in the code reviewed here */
/* All categories are enabled by default; the mask is exported as debug.ccddebug. */
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/*
 * DEBUG is undefined again right here, so every "#ifdef DEBUG" section
 * further down in this file is compiled out; only the ccddebug variable
 * and its sysctl node remain.
 */
#undef DEBUG
#endif

/* Unit and partition number encoded in a ccd dev_t (standard disk minor layout). */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
129
130/*
131   This is how mirroring works (only writes are special):
132
133   When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
134   linked together by the cb_mirror field.  "cb_pflags &
135   CCDPF_MIRROR_DONE" is set to 0 on both of them.
136
137   When a component returns to ccdiodone(), it checks if "cb_pflags &
138   CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
139   flag and returns.  If it is, it means its partner has already
140   returned, so it will go to the regular cleanup.
141
142 */
143
/*
 * Per-component I/O request.  One of these is built by ccdbuffer() for
 * every piece of an original request that is sent to a component.
 *
 * cb_buf has to stay the first member: ccdiodone() receives a pointer
 * to cb_buf and casts it straight back to a struct ccdbuf pointer.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/*
 * Map any dev_t of a ccd unit to the dev_t of that unit's raw partition
 * (same major, same unit, slice 0, partition RAW_PART).
 */
#define CCDLABELDEV(dev)	\
	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
159
/* Device switch entry points implemented by this driver. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_dump_t ccddump;
static d_psize_t ccdsize;

/* Upper bound on the number of ccdbufs kept on the private free list. */
#define NCCDFREEHIWAT	16

/* Character and block device major numbers used in ccd_cdevsw below. */
#define CDEV_MAJOR 74
#define BDEV_MAJOR 21

/*
 * Device switch for the ccd driver.  Raw reads and writes go through
 * physread()/physwrite(), which end up in ccdstrategy().
 */
static struct cdevsw ccd_cdevsw = {
	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* dump */	ccddump,
	/* psize */	ccdsize,
	/* flags */	D_DISK,
	/* bmaj */	BDEV_MAJOR
};
188
/* called during module initialization */
static	void ccdattach __P((void));
static	int ccd_modevent __P((module_t, int, void *));

/* called by biodone() at interrupt time */
static	void ccdiodone __P((struct bio *bp));

/* internal helpers: request splitting, configuration, labels, locking */
static	void ccdstart __P((struct ccd_softc *, struct bio *));
static	void ccdinterleave __P((struct ccd_softc *, int));
static	void ccdintr __P((struct ccd_softc *, struct bio *));
static	int ccdinit __P((struct ccddevice *, char **, struct proc *));
static	int ccdlookup __P((char *, struct proc *p, struct vnode **));
static	void ccdbuffer __P((struct ccdbuf **ret, struct ccd_softc *,
		struct bio *, daddr_t, caddr_t, long));
static	void ccdgetdisklabel __P((dev_t));
static	void ccdmakedisklabel __P((struct ccd_softc *));
static	int ccdlock __P((struct ccd_softc *));
static	void ccdunlock __P((struct ccd_softc *));

#ifdef DEBUG
static	void printiinfo __P((struct ccdiinfo *));
#endif

/* Non-private for the benefit of libkvm. */
struct	ccd_softc *ccd_softc;	/* per-unit state, numccd entries */
struct	ccddevice *ccddevs;	/* per-unit configuration, numccd entries */
struct	ccdbuf *ccdfreebufs;	/* head of the ccdbuf free list */
static	int numccdfreebufs;	/* number of ccdbufs on the free list */
static	int numccd = 0;		/* units allocated by ccdattach(); 0 until then */
218
219/*
220 * getccdbuf() -	Allocate and zero a ccd buffer.
221 *
222 *	This routine is called at splbio().
223 */
224
225static __inline
226struct ccdbuf *
227getccdbuf(struct ccdbuf *cpy)
228{
229	struct ccdbuf *cbp;
230
231	/*
232	 * Allocate from freelist or malloc as necessary
233	 */
234	if ((cbp = ccdfreebufs) != NULL) {
235		ccdfreebufs = cbp->cb_freenext;
236		--numccdfreebufs;
237	} else {
238		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
239	}
240
241	/*
242	 * Used by mirroring code
243	 */
244	if (cpy)
245		bcopy(cpy, cbp, sizeof(struct ccdbuf));
246	else
247		bzero(cbp, sizeof(struct ccdbuf));
248
249	/*
250	 * independant struct bio initialization
251	 */
252
253	return(cbp);
254}
255
256/*
257 * putccdbuf() -	Free a ccd buffer.
258 *
259 *	This routine is called at splbio().
260 */
261
262static __inline
263void
264putccdbuf(struct ccdbuf *cbp)
265{
266
267	if (numccdfreebufs < NCCDFREEHIWAT) {
268		cbp->cb_freenext = ccdfreebufs;
269		ccdfreebufs = cbp;
270		++numccdfreebufs;
271	} else {
272		free((caddr_t)cbp, M_DEVBUF);
273	}
274}
275
276
/*
 * Number of blocks to leave untouched at the front of a component
 * partition.  This is to avoid overwriting the component's disklabel
 * area when the partition starts at the beginning of the slice.
 * ccdinit() subtracts this from each component's size and ccdbuffer()
 * adds it to every component block number.  May be overridden at
 * build time.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif
285
286static void
287ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
288{
289	int i, u;
290	char *s;
291
292	if (*dev != NODEV)
293		return;
294	i = dev_stdclone(name, &s, "ccd", &u);
295	if (i != 2)
296		return;
297	if (u >= numccd)
298		return;
299	if (*s < 'a' || *s > 'h')
300		return;
301	if (s[1] != '\0')
302		return;
303	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
304		UID_ROOT, GID_OPERATOR, 0640, name);
305}
306
307/*
308 * Called by main() during pseudo-device attachment.  All we need
309 * to do is allocate enough space for devices to be configured later, and
310 * add devsw entries.
311 */
312static void
313ccdattach()
314{
315	int i;
316	int num = NCCD;
317
318	if (num > 1)
319		printf("ccd0-%d: Concatenated disk drivers\n", num-1);
320	else
321		printf("ccd0: Concatenated disk driver\n");
322
323	ccd_softc = (struct ccd_softc *)malloc(num * sizeof(struct ccd_softc),
324	    M_DEVBUF, M_NOWAIT);
325	ccddevs = (struct ccddevice *)malloc(num * sizeof(struct ccddevice),
326	    M_DEVBUF, M_NOWAIT);
327	if ((ccd_softc == NULL) || (ccddevs == NULL)) {
328		printf("WARNING: no memory for concatenated disks\n");
329		if (ccd_softc != NULL)
330			free(ccd_softc, M_DEVBUF);
331		if (ccddevs != NULL)
332			free(ccddevs, M_DEVBUF);
333		return;
334	}
335	numccd = num;
336	bzero(ccd_softc, num * sizeof(struct ccd_softc));
337	bzero(ccddevs, num * sizeof(struct ccddevice));
338
339	cdevsw_add(&ccd_cdevsw);
340	/* XXX: is this necessary? */
341	for (i = 0; i < numccd; ++i)
342		ccddevs[i].ccd_dk = -1;
343	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
344}
345
346static int
347ccd_modevent(mod, type, data)
348	module_t mod;
349	int type;
350	void *data;
351{
352	int error = 0;
353
354	switch (type) {
355	case MOD_LOAD:
356		ccdattach();
357		break;
358
359	case MOD_UNLOAD:
360		printf("ccd0: Unload not supported!\n");
361		error = EOPNOTSUPP;
362		break;
363
364	default:	/* MOD_SHUTDOWN etc */
365		break;
366	}
367	return (error);
368}
369
370DEV_MODULE(ccd, ccd_modevent, NULL);
371
372static int
373ccdinit(ccd, cpaths, p)
374	struct ccddevice *ccd;
375	char **cpaths;
376	struct proc *p;
377{
378	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
379	struct ccdcinfo *ci = NULL;	/* XXX */
380	size_t size;
381	int ix;
382	struct vnode *vp;
383	size_t minsize;
384	int maxsecsize;
385	struct partinfo dpart;
386	struct ccdgeom *ccg = &cs->sc_geom;
387	char tmppath[MAXPATHLEN];
388	int error = 0;
389
390#ifdef DEBUG
391	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
392		printf("ccdinit: unit %d\n", ccd->ccd_unit);
393#endif
394
395	cs->sc_size = 0;
396	cs->sc_ileave = ccd->ccd_interleave;
397	cs->sc_nccdisks = ccd->ccd_ndev;
398
399	/* Allocate space for the component info. */
400	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
401	    M_DEVBUF, M_WAITOK);
402
403	/*
404	 * Verify that each component piece exists and record
405	 * relevant information about it.
406	 */
407	maxsecsize = 0;
408	minsize = 0;
409	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
410		vp = ccd->ccd_vpp[ix];
411		ci = &cs->sc_cinfo[ix];
412		ci->ci_vp = vp;
413
414		/*
415		 * Copy in the pathname of the component.
416		 */
417		bzero(tmppath, sizeof(tmppath));	/* sanity */
418		if ((error = copyinstr(cpaths[ix], tmppath,
419		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
420#ifdef DEBUG
421			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
422				printf("ccd%d: can't copy path, error = %d\n",
423				    ccd->ccd_unit, error);
424#endif
425			goto fail;
426		}
427		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
428		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
429
430		ci->ci_dev = vn_todev(vp);
431
432		/*
433		 * Get partition information for the component.
434		 */
435		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
436		    FREAD, p->p_ucred, p)) != 0) {
437#ifdef DEBUG
438			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
439				 printf("ccd%d: %s: ioctl failed, error = %d\n",
440				     ccd->ccd_unit, ci->ci_path, error);
441#endif
442			goto fail;
443		}
444		if (dpart.part->p_fstype == FS_BSDFFS) {
445			maxsecsize =
446			    ((dpart.disklab->d_secsize > maxsecsize) ?
447			    dpart.disklab->d_secsize : maxsecsize);
448			size = dpart.part->p_size - CCD_OFFSET;
449		} else {
450#ifdef DEBUG
451			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
452				printf("ccd%d: %s: incorrect partition type\n",
453				    ccd->ccd_unit, ci->ci_path);
454#endif
455			error = EFTYPE;
456			goto fail;
457		}
458
459		/*
460		 * Calculate the size, truncating to an interleave
461		 * boundary if necessary.
462		 */
463
464		if (cs->sc_ileave > 1)
465			size -= size % cs->sc_ileave;
466
467		if (size == 0) {
468#ifdef DEBUG
469			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
470				printf("ccd%d: %s: size == 0\n",
471				    ccd->ccd_unit, ci->ci_path);
472#endif
473			error = ENODEV;
474			goto fail;
475		}
476
477		if (minsize == 0 || size < minsize)
478			minsize = size;
479		ci->ci_size = size;
480		cs->sc_size += size;
481	}
482
483	/*
484	 * Don't allow the interleave to be smaller than
485	 * the biggest component sector.
486	 */
487	if ((cs->sc_ileave > 0) &&
488	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
489#ifdef DEBUG
490		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
491			printf("ccd%d: interleave must be at least %d\n",
492			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
493#endif
494		error = EINVAL;
495		goto fail;
496	}
497
498	/*
499	 * If uniform interleave is desired set all sizes to that of
500	 * the smallest component.  This will guarentee that a single
501	 * interleave table is generated.
502	 *
503	 * Lost space must be taken into account when calculating the
504	 * overall size.  Half the space is lost when CCDF_MIRROR is
505	 * specified.  One disk is lost when CCDF_PARITY is specified.
506	 */
507	if (ccd->ccd_flags & CCDF_UNIFORM) {
508		for (ci = cs->sc_cinfo;
509		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
510			ci->ci_size = minsize;
511		}
512		if (ccd->ccd_flags & CCDF_MIRROR) {
513			/*
514			 * Check to see if an even number of components
515			 * have been specified.  The interleave must also
516			 * be non-zero in order for us to be able to
517			 * guarentee the topology.
518			 */
519			if (cs->sc_nccdisks % 2) {
520				printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
521				error = EINVAL;
522				goto fail;
523			}
524			if (cs->sc_ileave == 0) {
525				printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
526				error = EINVAL;
527				goto fail;
528			}
529			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
530		} else if (ccd->ccd_flags & CCDF_PARITY) {
531			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
532		} else {
533			if (cs->sc_ileave == 0) {
534				printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
535				error = EINVAL;
536				goto fail;
537			}
538			cs->sc_size = cs->sc_nccdisks * minsize;
539		}
540	}
541
542	/*
543	 * Construct the interleave table.
544	 */
545	ccdinterleave(cs, ccd->ccd_unit);
546
547	/*
548	 * Create pseudo-geometry based on 1MB cylinders.  It's
549	 * pretty close.
550	 */
551	ccg->ccg_secsize = maxsecsize;
552	ccg->ccg_ntracks = 1;
553	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
554	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
555
556	/*
557	 * Add an devstat entry for this device.
558	 */
559	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
560			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
561			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
562			  DEVSTAT_PRIORITY_ARRAY);
563
564	cs->sc_flags |= CCDF_INITED;
565	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
566	cs->sc_unit = ccd->ccd_unit;
567	return (0);
568fail:
569	while (ci > cs->sc_cinfo) {
570		ci--;
571		free(ci->ci_path, M_DEVBUF);
572	}
573	free(cs->sc_cinfo, M_DEVBUF);
574	return (error);
575}
576
577static void
578ccdinterleave(cs, unit)
579	struct ccd_softc *cs;
580	int unit;
581{
582	struct ccdcinfo *ci, *smallci;
583	struct ccdiinfo *ii;
584	daddr_t bn, lbn;
585	int ix;
586	u_long size;
587
588#ifdef DEBUG
589	if (ccddebug & CCDB_INIT)
590		printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
591#endif
592
593	/*
594	 * Allocate an interleave table.  The worst case occurs when each
595	 * of N disks is of a different size, resulting in N interleave
596	 * tables.
597	 *
598	 * Chances are this is too big, but we don't care.
599	 */
600	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
601	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
602	    M_WAITOK | M_ZERO);
603
604	/*
605	 * Trivial case: no interleave (actually interleave of disk size).
606	 * Each table entry represents a single component in its entirety.
607	 *
608	 * An interleave of 0 may not be used with a mirror or parity setup.
609	 */
610	if (cs->sc_ileave == 0) {
611		bn = 0;
612		ii = cs->sc_itable;
613
614		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
615			/* Allocate space for ii_index. */
616			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
617			ii->ii_ndisk = 1;
618			ii->ii_startblk = bn;
619			ii->ii_startoff = 0;
620			ii->ii_index[0] = ix;
621			bn += cs->sc_cinfo[ix].ci_size;
622			ii++;
623		}
624		ii->ii_ndisk = 0;
625#ifdef DEBUG
626		if (ccddebug & CCDB_INIT)
627			printiinfo(cs->sc_itable);
628#endif
629		return;
630	}
631
632	/*
633	 * The following isn't fast or pretty; it doesn't have to be.
634	 */
635	size = 0;
636	bn = lbn = 0;
637	for (ii = cs->sc_itable; ; ii++) {
638		/*
639		 * Allocate space for ii_index.  We might allocate more then
640		 * we use.
641		 */
642		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
643		    M_DEVBUF, M_WAITOK);
644
645		/*
646		 * Locate the smallest of the remaining components
647		 */
648		smallci = NULL;
649		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
650		    ci++) {
651			if (ci->ci_size > size &&
652			    (smallci == NULL ||
653			     ci->ci_size < smallci->ci_size)) {
654				smallci = ci;
655			}
656		}
657
658		/*
659		 * Nobody left, all done
660		 */
661		if (smallci == NULL) {
662			ii->ii_ndisk = 0;
663			break;
664		}
665
666		/*
667		 * Record starting logical block using an sc_ileave blocksize.
668		 */
669		ii->ii_startblk = bn / cs->sc_ileave;
670
671		/*
672		 * Record starting comopnent block using an sc_ileave
673		 * blocksize.  This value is relative to the beginning of
674		 * a component disk.
675		 */
676		ii->ii_startoff = lbn;
677
678		/*
679		 * Determine how many disks take part in this interleave
680		 * and record their indices.
681		 */
682		ix = 0;
683		for (ci = cs->sc_cinfo;
684		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
685			if (ci->ci_size >= smallci->ci_size) {
686				ii->ii_index[ix++] = ci - cs->sc_cinfo;
687			}
688		}
689		ii->ii_ndisk = ix;
690		bn += ix * (smallci->ci_size - size);
691		lbn = smallci->ci_size / cs->sc_ileave;
692		size = smallci->ci_size;
693	}
694#ifdef DEBUG
695	if (ccddebug & CCDB_INIT)
696		printiinfo(cs->sc_itable);
697#endif
698}
699
700/* ARGSUSED */
701static int
702ccdopen(dev, flags, fmt, p)
703	dev_t dev;
704	int flags, fmt;
705	struct proc *p;
706{
707	int unit = ccdunit(dev);
708	struct ccd_softc *cs;
709	struct disklabel *lp;
710	int error = 0, part, pmask;
711
712#ifdef DEBUG
713	if (ccddebug & CCDB_FOLLOW)
714		printf("ccdopen(%x, %x)\n", dev, flags);
715#endif
716	if (unit >= numccd)
717		return (ENXIO);
718	cs = &ccd_softc[unit];
719
720	if ((error = ccdlock(cs)) != 0)
721		return (error);
722
723	lp = &cs->sc_label;
724
725	part = ccdpart(dev);
726	pmask = (1 << part);
727
728	/*
729	 * If we're initialized, check to see if there are any other
730	 * open partitions.  If not, then it's safe to update
731	 * the in-core disklabel.
732	 */
733	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
734		ccdgetdisklabel(dev);
735
736	/* Check that the partition exists. */
737	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
738	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
739		error = ENXIO;
740		goto done;
741	}
742
743	cs->sc_openmask |= pmask;
744 done:
745	ccdunlock(cs);
746	return (0);
747}
748
749/* ARGSUSED */
750static int
751ccdclose(dev, flags, fmt, p)
752	dev_t dev;
753	int flags, fmt;
754	struct proc *p;
755{
756	int unit = ccdunit(dev);
757	struct ccd_softc *cs;
758	int error = 0, part;
759
760#ifdef DEBUG
761	if (ccddebug & CCDB_FOLLOW)
762		printf("ccdclose(%x, %x)\n", dev, flags);
763#endif
764
765	if (unit >= numccd)
766		return (ENXIO);
767	cs = &ccd_softc[unit];
768
769	if ((error = ccdlock(cs)) != 0)
770		return (error);
771
772	part = ccdpart(dev);
773
774	/* ...that much closer to allowing unconfiguration... */
775	cs->sc_openmask &= ~(1 << part);
776	ccdunlock(cs);
777	return (0);
778}
779
780static void
781ccdstrategy(bp)
782	struct bio *bp;
783{
784	int unit = ccdunit(bp->bio_dev);
785	struct ccd_softc *cs = &ccd_softc[unit];
786	int s;
787	int wlabel;
788	struct disklabel *lp;
789
790#ifdef DEBUG
791	if (ccddebug & CCDB_FOLLOW)
792		printf("ccdstrategy(%x): unit %d\n", bp, unit);
793#endif
794	if ((cs->sc_flags & CCDF_INITED) == 0) {
795		bp->bio_error = ENXIO;
796		bp->bio_flags |= BIO_ERROR;
797		goto done;
798	}
799
800	/* If it's a nil transfer, wake up the top half now. */
801	if (bp->bio_bcount == 0)
802		goto done;
803
804	lp = &cs->sc_label;
805
806	/*
807	 * Do bounds checking and adjust transfer.  If there's an
808	 * error, the bounds check will flag that for us.
809	 */
810	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
811	if (ccdpart(bp->bio_dev) != RAW_PART) {
812		if (bounds_check_with_label(bp, lp, wlabel) <= 0)
813			goto done;
814	} else {
815		int pbn;        /* in sc_secsize chunks */
816		long sz;        /* in sc_secsize chunks */
817
818		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
819		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
820
821		/*
822		 * If out of bounds return an error. If at the EOF point,
823		 * simply read or write less.
824		 */
825
826		if (pbn < 0 || pbn >= cs->sc_size) {
827			bp->bio_resid = bp->bio_bcount;
828			if (pbn != cs->sc_size) {
829				bp->bio_error = EINVAL;
830				bp->bio_flags |= BIO_ERROR;
831			}
832			goto done;
833		}
834
835		/*
836		 * If the request crosses EOF, truncate the request.
837		 */
838		if (pbn + sz > cs->sc_size) {
839			bp->bio_bcount = (cs->sc_size - pbn) *
840			    cs->sc_geom.ccg_secsize;
841		}
842	}
843
844	bp->bio_resid = bp->bio_bcount;
845
846	/*
847	 * "Start" the unit.
848	 */
849	s = splbio();
850	ccdstart(cs, bp);
851	splx(s);
852	return;
853done:
854	biodone(bp);
855}
856
857static void
858ccdstart(cs, bp)
859	struct ccd_softc *cs;
860	struct bio *bp;
861{
862	long bcount, rcount;
863	struct ccdbuf *cbp[4];
864	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
865	caddr_t addr;
866	daddr_t bn;
867	struct partition *pp;
868
869#ifdef DEBUG
870	if (ccddebug & CCDB_FOLLOW)
871		printf("ccdstart(%x, %x)\n", cs, bp);
872#endif
873
874	/* Record the transaction start  */
875	devstat_start_transaction(&cs->device_stats);
876
877	/*
878	 * Translate the partition-relative block number to an absolute.
879	 */
880	bn = bp->bio_blkno;
881	if (ccdpart(bp->bio_dev) != RAW_PART) {
882		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
883		bn += pp->p_offset;
884	}
885
886	/*
887	 * Allocate component buffers and fire off the requests
888	 */
889	addr = bp->bio_data;
890	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
891		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
892		rcount = cbp[0]->cb_buf.bio_bcount;
893
894		if (cs->sc_cflags & CCDF_MIRROR) {
895			/*
896			 * Mirroring.  Writes go to both disks, reads are
897			 * taken from whichever disk seems most appropriate.
898			 *
899			 * We attempt to localize reads to the disk whos arm
900			 * is nearest the read request.  We ignore seeks due
901			 * to writes when making this determination and we
902			 * also try to avoid hogging.
903			 */
904			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
905				BIO_STRATEGY(&cbp[0]->cb_buf, 0);
906				BIO_STRATEGY(&cbp[1]->cb_buf, 0);
907			} else {
908				int pick = cs->sc_pick;
909				daddr_t range = cs->sc_size / 16;
910
911				if (bn < cs->sc_blk[pick] - range ||
912				    bn > cs->sc_blk[pick] + range
913				) {
914					cs->sc_pick = pick = 1 - pick;
915				}
916				cs->sc_blk[pick] = bn + btodb(rcount);
917				BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
918			}
919		} else {
920			/*
921			 * Not mirroring
922			 */
923			BIO_STRATEGY(&cbp[0]->cb_buf, 0);
924		}
925		bn += btodb(rcount);
926		addr += rcount;
927	}
928}
929
930/*
931 * Build a component buffer header.
932 */
933static void
934ccdbuffer(cb, cs, bp, bn, addr, bcount)
935	struct ccdbuf **cb;
936	struct ccd_softc *cs;
937	struct bio *bp;
938	daddr_t bn;
939	caddr_t addr;
940	long bcount;
941{
942	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
943	struct ccdbuf *cbp;
944	daddr_t cbn, cboff;
945	off_t cbc;
946
947#ifdef DEBUG
948	if (ccddebug & CCDB_IO)
949		printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
950		       cs, bp, bn, addr, bcount);
951#endif
952	/*
953	 * Determine which component bn falls in.
954	 */
955	cbn = bn;
956	cboff = 0;
957
958	if (cs->sc_ileave == 0) {
959		/*
960		 * Serially concatenated and neither a mirror nor a parity
961		 * config.  This is a special case.
962		 */
963		daddr_t sblk;
964
965		sblk = 0;
966		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
967			sblk += ci->ci_size;
968		cbn -= sblk;
969	} else {
970		struct ccdiinfo *ii;
971		int ccdisk, off;
972
973		/*
974		 * Calculate cbn, the logical superblock (sc_ileave chunks),
975		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
976		 * to cbn.
977		 */
978		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
979		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
980
981		/*
982		 * Figure out which interleave table to use.
983		 */
984		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
985			if (ii->ii_startblk > cbn)
986				break;
987		}
988		ii--;
989
990		/*
991		 * off is the logical superblock relative to the beginning
992		 * of this interleave block.
993		 */
994		off = cbn - ii->ii_startblk;
995
996		/*
997		 * We must calculate which disk component to use (ccdisk),
998		 * and recalculate cbn to be the superblock relative to
999		 * the beginning of the component.  This is typically done by
1000		 * adding 'off' and ii->ii_startoff together.  However, 'off'
1001		 * must typically be divided by the number of components in
1002		 * this interleave array to be properly convert it from a
1003		 * CCD-relative logical superblock number to a
1004		 * component-relative superblock number.
1005		 */
1006		if (ii->ii_ndisk == 1) {
1007			/*
1008			 * When we have just one disk, it can't be a mirror
1009			 * or a parity config.
1010			 */
1011			ccdisk = ii->ii_index[0];
1012			cbn = ii->ii_startoff + off;
1013		} else {
1014			if (cs->sc_cflags & CCDF_MIRROR) {
1015				/*
1016				 * We have forced a uniform mapping, resulting
1017				 * in a single interleave array.  We double
1018				 * up on the first half of the available
1019				 * components and our mirror is in the second
1020				 * half.  This only works with a single
1021				 * interleave array because doubling up
1022				 * doubles the number of sectors, so there
1023				 * cannot be another interleave array because
1024				 * the next interleave array's calculations
1025				 * would be off.
1026				 */
1027				int ndisk2 = ii->ii_ndisk / 2;
1028				ccdisk = ii->ii_index[off % ndisk2];
1029				cbn = ii->ii_startoff + off / ndisk2;
1030				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1031			} else if (cs->sc_cflags & CCDF_PARITY) {
1032				/*
1033				 * XXX not implemented yet
1034				 */
1035				int ndisk2 = ii->ii_ndisk - 1;
1036				ccdisk = ii->ii_index[off % ndisk2];
1037				cbn = ii->ii_startoff + off / ndisk2;
1038				if (cbn % ii->ii_ndisk <= ccdisk)
1039					ccdisk++;
1040			} else {
1041				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1042				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1043			}
1044		}
1045
1046		ci = &cs->sc_cinfo[ccdisk];
1047
1048		/*
1049		 * Convert cbn from a superblock to a normal block so it
1050		 * can be used to calculate (along with cboff) the normal
1051		 * block index into this particular disk.
1052		 */
1053		cbn *= cs->sc_ileave;
1054	}
1055
1056	/*
1057	 * Fill in the component buf structure.
1058	 */
1059	cbp = getccdbuf(NULL);
1060	cbp->cb_buf.bio_cmd = bp->bio_cmd;
1061	cbp->cb_buf.bio_done = ccdiodone;
1062	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
1063	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1064	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1065	cbp->cb_buf.bio_data = addr;
1066	if (cs->sc_ileave == 0)
1067              cbc = dbtob((off_t)(ci->ci_size - cbn));
1068	else
1069              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1070	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1071 	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1072
1073	/*
1074	 * context for ccdiodone
1075	 */
1076	cbp->cb_obp = bp;
1077	cbp->cb_unit = cs - ccd_softc;
1078	cbp->cb_comp = ci - cs->sc_cinfo;
1079
1080#ifdef DEBUG
1081	if (ccddebug & CCDB_IO)
1082		printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
1083		       ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.bio_blkno,
1084		       cbp->cb_buf.bio_data, cbp->cb_buf.bio_bcount);
1085#endif
1086	cb[0] = cbp;
1087
1088	/*
1089	 * Note: both I/O's setup when reading from mirror, but only one
1090	 * will be executed.
1091	 */
1092	if (cs->sc_cflags & CCDF_MIRROR) {
1093		/* mirror, setup second I/O */
1094		cbp = getccdbuf(cb[0]);
1095		cbp->cb_buf.bio_dev = ci2->ci_dev;
1096		cbp->cb_comp = ci2 - cs->sc_cinfo;
1097		cb[1] = cbp;
1098		/* link together the ccdbuf's and clear "mirror done" flag */
1099		cb[0]->cb_mirror = cb[1];
1100		cb[1]->cb_mirror = cb[0];
1101		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1102		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1103	}
1104}
1105
1106static void
1107ccdintr(cs, bp)
1108	struct ccd_softc *cs;
1109	struct bio *bp;
1110{
1111#ifdef DEBUG
1112	if (ccddebug & CCDB_FOLLOW)
1113		printf("ccdintr(%x, %x)\n", cs, bp);
1114#endif
1115	/*
1116	 * Request is done for better or worse, wakeup the top half.
1117	 */
1118	if (bp->bio_flags & BIO_ERROR)
1119		bp->bio_resid = bp->bio_bcount;
1120	devstat_end_transaction_bio(&cs->device_stats, bp);
1121	biodone(bp);
1122}
1123
1124/*
1125 * Called at interrupt time.
1126 * Mark the component as done and if all components are done,
1127 * take a ccd interrupt.
1128 */
1129static void
1130ccdiodone(ibp)
1131	struct bio *ibp;
1132{
1133	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1134	struct bio *bp = cbp->cb_obp;
1135	int unit = cbp->cb_unit;
1136	int count, s;
1137
1138	s = splbio();
1139#ifdef DEBUG
1140	if (ccddebug & CCDB_FOLLOW)
1141		printf("ccdiodone(%x)\n", cbp);
1142	if (ccddebug & CCDB_IO) {
1143		printf("ccdiodone: bp %x bcount %d resid %d\n",
1144		       bp, bp->bio_bcount, bp->bio_resid);
1145		printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1146		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1147		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1148		       cbp->cb_buf.bio_bcount);
1149	}
1150#endif
1151	/*
1152	 * If an error occured, report it.  If this is a mirrored
1153	 * configuration and the first of two possible reads, do not
1154	 * set the error in the bp yet because the second read may
1155	 * succeed.
1156	 */
1157
1158	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1159		const char *msg = "";
1160
1161		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1162		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1163		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1164			/*
1165			 * We will try our read on the other disk down
1166			 * below, also reverse the default pick so if we
1167			 * are doing a scan we do not keep hitting the
1168			 * bad disk first.
1169			 */
1170			struct ccd_softc *cs = &ccd_softc[unit];
1171
1172			msg = ", trying other disk";
1173			cs->sc_pick = 1 - cs->sc_pick;
1174			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1175		} else {
1176			bp->bio_flags |= BIO_ERROR;
1177			bp->bio_error = cbp->cb_buf.bio_error ?
1178			    cbp->cb_buf.bio_error : EIO;
1179		}
1180		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1181		       unit, bp->bio_error, cbp->cb_comp,
1182		       (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1183	}
1184
1185	/*
1186	 * Process mirror.  If we are writing, I/O has been initiated on both
1187	 * buffers and we fall through only after both are finished.
1188	 *
1189	 * If we are reading only one I/O is initiated at a time.  If an
1190	 * error occurs we initiate the second I/O and return, otherwise
1191	 * we free the second I/O without initiating it.
1192	 */
1193
1194	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1195		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1196			/*
1197			 * When writing, handshake with the second buffer
1198			 * to determine when both are done.  If both are not
1199			 * done, return here.
1200			 */
1201			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1202				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1203				putccdbuf(cbp);
1204				splx(s);
1205				return;
1206			}
1207		} else {
1208			/*
1209			 * When reading, either dispose of the second buffer
1210			 * or initiate I/O on the second buffer if an error
1211			 * occured with this one.
1212			 */
1213			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1214				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1215					cbp->cb_mirror->cb_pflags |=
1216					    CCDPF_MIRROR_DONE;
1217					BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1218					putccdbuf(cbp);
1219					splx(s);
1220					return;
1221				} else {
1222					putccdbuf(cbp->cb_mirror);
1223					/* fall through */
1224				}
1225			}
1226		}
1227	}
1228
1229	/*
1230	 * use bio_caller1 to determine how big the original request was rather
1231	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1232	 *
1233	 * XXX We check for an error, but we do not test the resid for an
1234	 * aligned EOF condition.  This may result in character & block
1235	 * device access not recognizing EOF properly when read or written
1236	 * sequentially, but will not effect filesystems.
1237	 */
1238	count = (long)cbp->cb_buf.bio_caller1;
1239	putccdbuf(cbp);
1240
1241	/*
1242	 * If all done, "interrupt".
1243	 */
1244	bp->bio_resid -= count;
1245	if (bp->bio_resid < 0)
1246		panic("ccdiodone: count");
1247	if (bp->bio_resid == 0)
1248		ccdintr(&ccd_softc[unit], bp);
1249	splx(s);
1250}
1251
1252static int
1253ccdioctl(dev, cmd, data, flag, p)
1254	dev_t dev;
1255	u_long cmd;
1256	caddr_t data;
1257	int flag;
1258	struct proc *p;
1259{
1260	int unit = ccdunit(dev);
1261	int i, j, lookedup = 0, error = 0;
1262	int part, pmask, s;
1263	struct ccd_softc *cs;
1264	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1265	struct ccddevice ccd;
1266	char **cpp;
1267	struct vnode **vpp;
1268
1269	if (unit >= numccd)
1270		return (ENXIO);
1271	cs = &ccd_softc[unit];
1272
1273	bzero(&ccd, sizeof(ccd));
1274
1275	switch (cmd) {
1276	case CCDIOCSET:
1277		if (cs->sc_flags & CCDF_INITED)
1278			return (EBUSY);
1279
1280		if ((flag & FWRITE) == 0)
1281			return (EBADF);
1282
1283		if ((error = ccdlock(cs)) != 0)
1284			return (error);
1285
1286		/* Fill in some important bits. */
1287		ccd.ccd_unit = unit;
1288		ccd.ccd_interleave = ccio->ccio_ileave;
1289		if (ccd.ccd_interleave == 0 &&
1290		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1291		     (ccio->ccio_flags & CCDF_PARITY))) {
1292			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1293			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1294		}
1295		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1296		    (ccio->ccio_flags & CCDF_PARITY)) {
1297			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1298			ccio->ccio_flags &= ~CCDF_PARITY;
1299		}
1300		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1301		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1302			printf("ccd%d: mirror/parity forces uniform flag\n",
1303			       unit);
1304			ccio->ccio_flags |= CCDF_UNIFORM;
1305		}
1306		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1307
1308		/*
1309		 * Allocate space for and copy in the array of
1310		 * componet pathnames and device numbers.
1311		 */
1312		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1313		    M_DEVBUF, M_WAITOK);
1314		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1315		    M_DEVBUF, M_WAITOK);
1316
1317		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1318		    ccio->ccio_ndisks * sizeof(char **));
1319		if (error) {
1320			free(vpp, M_DEVBUF);
1321			free(cpp, M_DEVBUF);
1322			ccdunlock(cs);
1323			return (error);
1324		}
1325
1326#ifdef DEBUG
1327		if (ccddebug & CCDB_INIT)
1328			for (i = 0; i < ccio->ccio_ndisks; ++i)
1329				printf("ccdioctl: component %d: 0x%x\n",
1330				    i, cpp[i]);
1331#endif
1332
1333		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1334#ifdef DEBUG
1335			if (ccddebug & CCDB_INIT)
1336				printf("ccdioctl: lookedup = %d\n", lookedup);
1337#endif
1338			if ((error = ccdlookup(cpp[i], p, &vpp[i])) != 0) {
1339				for (j = 0; j < lookedup; ++j)
1340					(void)vn_close(vpp[j], FREAD|FWRITE,
1341					    p->p_ucred, p);
1342				free(vpp, M_DEVBUF);
1343				free(cpp, M_DEVBUF);
1344				ccdunlock(cs);
1345				return (error);
1346			}
1347			++lookedup;
1348		}
1349		ccd.ccd_cpp = cpp;
1350		ccd.ccd_vpp = vpp;
1351		ccd.ccd_ndev = ccio->ccio_ndisks;
1352
1353		/*
1354		 * Initialize the ccd.  Fills in the softc for us.
1355		 */
1356		if ((error = ccdinit(&ccd, cpp, p)) != 0) {
1357			for (j = 0; j < lookedup; ++j)
1358				(void)vn_close(vpp[j], FREAD|FWRITE,
1359				    p->p_ucred, p);
1360			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1361			free(vpp, M_DEVBUF);
1362			free(cpp, M_DEVBUF);
1363			ccdunlock(cs);
1364			return (error);
1365		}
1366
1367		/*
1368		 * The ccd has been successfully initialized, so
1369		 * we can place it into the array and read the disklabel.
1370		 */
1371		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1372		ccio->ccio_unit = unit;
1373		ccio->ccio_size = cs->sc_size;
1374		ccdgetdisklabel(dev);
1375
1376		ccdunlock(cs);
1377
1378		break;
1379
1380	case CCDIOCCLR:
1381		if ((cs->sc_flags & CCDF_INITED) == 0)
1382			return (ENXIO);
1383
1384		if ((flag & FWRITE) == 0)
1385			return (EBADF);
1386
1387		if ((error = ccdlock(cs)) != 0)
1388			return (error);
1389
1390		/* Don't unconfigure if any other partitions are open */
1391		part = ccdpart(dev);
1392		pmask = (1 << part);
1393		if ((cs->sc_openmask & ~pmask)) {
1394			ccdunlock(cs);
1395			return (EBUSY);
1396		}
1397
1398		/*
1399		 * Free ccd_softc information and clear entry.
1400		 */
1401
1402		/* Close the components and free their pathnames. */
1403		for (i = 0; i < cs->sc_nccdisks; ++i) {
1404			/*
1405			 * XXX: this close could potentially fail and
1406			 * cause Bad Things.  Maybe we need to force
1407			 * the close to happen?
1408			 */
1409#ifdef DEBUG
1410			if (ccddebug & CCDB_VNODE)
1411				vprint("CCDIOCCLR: vnode info",
1412				    cs->sc_cinfo[i].ci_vp);
1413#endif
1414			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1415			    p->p_ucred, p);
1416			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1417		}
1418
1419		/* Free interleave index. */
1420		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1421			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1422
1423		/* Free component info and interleave table. */
1424		free(cs->sc_cinfo, M_DEVBUF);
1425		free(cs->sc_itable, M_DEVBUF);
1426		cs->sc_flags &= ~CCDF_INITED;
1427
1428		/*
1429		 * Free ccddevice information and clear entry.
1430		 */
1431		free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1432		free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1433		ccd.ccd_dk = -1;
1434		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1435
1436		/*
1437		 * And remove the devstat entry.
1438		 */
1439		devstat_remove_entry(&cs->device_stats);
1440
1441		/* This must be atomic. */
1442		s = splhigh();
1443		ccdunlock(cs);
1444		bzero(cs, sizeof(struct ccd_softc));
1445		splx(s);
1446
1447		break;
1448
1449	case DIOCGDINFO:
1450		if ((cs->sc_flags & CCDF_INITED) == 0)
1451			return (ENXIO);
1452
1453		*(struct disklabel *)data = cs->sc_label;
1454		break;
1455
1456	case DIOCGPART:
1457		if ((cs->sc_flags & CCDF_INITED) == 0)
1458			return (ENXIO);
1459
1460		((struct partinfo *)data)->disklab = &cs->sc_label;
1461		((struct partinfo *)data)->part =
1462		    &cs->sc_label.d_partitions[ccdpart(dev)];
1463		break;
1464
1465	case DIOCWDINFO:
1466	case DIOCSDINFO:
1467		if ((cs->sc_flags & CCDF_INITED) == 0)
1468			return (ENXIO);
1469
1470		if ((flag & FWRITE) == 0)
1471			return (EBADF);
1472
1473		if ((error = ccdlock(cs)) != 0)
1474			return (error);
1475
1476		cs->sc_flags |= CCDF_LABELLING;
1477
1478		error = setdisklabel(&cs->sc_label,
1479		    (struct disklabel *)data, 0);
1480		if (error == 0) {
1481			if (cmd == DIOCWDINFO)
1482				error = writedisklabel(CCDLABELDEV(dev),
1483				    &cs->sc_label);
1484		}
1485
1486		cs->sc_flags &= ~CCDF_LABELLING;
1487
1488		ccdunlock(cs);
1489
1490		if (error)
1491			return (error);
1492		break;
1493
1494	case DIOCWLABEL:
1495		if ((cs->sc_flags & CCDF_INITED) == 0)
1496			return (ENXIO);
1497
1498		if ((flag & FWRITE) == 0)
1499			return (EBADF);
1500		if (*(int *)data != 0)
1501			cs->sc_flags |= CCDF_WLABEL;
1502		else
1503			cs->sc_flags &= ~CCDF_WLABEL;
1504		break;
1505
1506	default:
1507		return (ENOTTY);
1508	}
1509
1510	return (0);
1511}
1512
1513static int
1514ccdsize(dev)
1515	dev_t dev;
1516{
1517	struct ccd_softc *cs;
1518	int part, size;
1519
1520	if (ccdopen(dev, 0, S_IFCHR, curproc))
1521		return (-1);
1522
1523	cs = &ccd_softc[ccdunit(dev)];
1524	part = ccdpart(dev);
1525
1526	if ((cs->sc_flags & CCDF_INITED) == 0)
1527		return (-1);
1528
1529	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1530		size = -1;
1531	else
1532		size = cs->sc_label.d_partitions[part].p_size;
1533
1534	if (ccdclose(dev, 0, S_IFCHR, curproc))
1535		return (-1);
1536
1537	return (size);
1538}
1539
/*
 * Crash dump entry point.  Dumping to a ccd is not implemented, so every
 * call is refused with ENXIO; dev is ignored.
 */
static int
ccddump(dev)
	dev_t dev;
{
	return (ENXIO);
}
1548
1549/*
1550 * Lookup the provided name in the filesystem.  If the file exists,
1551 * is a valid block device, and isn't being used by anyone else,
1552 * set *vpp to the file's vnode.
1553 */
1554static int
1555ccdlookup(path, p, vpp)
1556	char *path;
1557	struct proc *p;
1558	struct vnode **vpp;	/* result */
1559{
1560	struct nameidata nd;
1561	struct vnode *vp;
1562	int error, flags;
1563
1564	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
1565	flags = FREAD | FWRITE;
1566	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1567#ifdef DEBUG
1568		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1569			printf("ccdlookup: vn_open error = %d\n", error);
1570#endif
1571		return (error);
1572	}
1573	vp = nd.ni_vp;
1574
1575	if (vp->v_usecount > 1) {
1576		error = EBUSY;
1577		goto bad;
1578	}
1579
1580	if (!vn_isdisk(vp, &error))
1581		goto bad;
1582
1583#ifdef DEBUG
1584	if (ccddebug & CCDB_VNODE)
1585		vprint("ccdlookup: vnode info", vp);
1586#endif
1587
1588	VOP_UNLOCK(vp, 0, p);
1589	NDFREE(&nd, NDF_ONLY_PNBUF);
1590	*vpp = vp;
1591	return (0);
1592bad:
1593	VOP_UNLOCK(vp, 0, p);
1594	NDFREE(&nd, NDF_ONLY_PNBUF);
1595	/* vn_close does vrele() for vp */
1596	(void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
1597	return (error);
1598}
1599
1600/*
1601 * Read the disklabel from the ccd.  If one is not present, fake one
1602 * up.
1603 */
1604static void
1605ccdgetdisklabel(dev)
1606	dev_t dev;
1607{
1608	int unit = ccdunit(dev);
1609	struct ccd_softc *cs = &ccd_softc[unit];
1610	char *errstring;
1611	struct disklabel *lp = &cs->sc_label;
1612	struct ccdgeom *ccg = &cs->sc_geom;
1613
1614	bzero(lp, sizeof(*lp));
1615
1616	lp->d_secperunit = cs->sc_size;
1617	lp->d_secsize = ccg->ccg_secsize;
1618	lp->d_nsectors = ccg->ccg_nsectors;
1619	lp->d_ntracks = ccg->ccg_ntracks;
1620	lp->d_ncylinders = ccg->ccg_ncylinders;
1621	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1622
1623	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1624	lp->d_type = DTYPE_CCD;
1625	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1626	lp->d_rpm = 3600;
1627	lp->d_interleave = 1;
1628	lp->d_flags = 0;
1629
1630	lp->d_partitions[RAW_PART].p_offset = 0;
1631	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1632	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1633	lp->d_npartitions = RAW_PART + 1;
1634
1635	lp->d_bbsize = BBSIZE;				/* XXX */
1636	lp->d_sbsize = SBSIZE;				/* XXX */
1637
1638	lp->d_magic = DISKMAGIC;
1639	lp->d_magic2 = DISKMAGIC;
1640	lp->d_checksum = dkcksum(&cs->sc_label);
1641
1642	/*
1643	 * Call the generic disklabel extraction routine.
1644	 */
1645	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1646	if (errstring != NULL)
1647		ccdmakedisklabel(cs);
1648
1649#ifdef DEBUG
1650	/* It's actually extremely common to have unlabeled ccds. */
1651	if (ccddebug & CCDB_LABEL)
1652		if (errstring != NULL)
1653			printf("ccd%d: %s\n", unit, errstring);
1654#endif
1655}
1656
1657/*
1658 * Take care of things one might want to take care of in the event
1659 * that a disklabel isn't present.
1660 */
1661static void
1662ccdmakedisklabel(cs)
1663	struct ccd_softc *cs;
1664{
1665	struct disklabel *lp = &cs->sc_label;
1666
1667	/*
1668	 * For historical reasons, if there's no disklabel present
1669	 * the raw partition must be marked FS_BSDFFS.
1670	 */
1671	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1672
1673	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1674}
1675
1676/*
1677 * Wait interruptibly for an exclusive lock.
1678 *
1679 * XXX
1680 * Several drivers do this; it should be abstracted and made MP-safe.
1681 */
1682static int
1683ccdlock(cs)
1684	struct ccd_softc *cs;
1685{
1686	int error;
1687
1688	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1689		cs->sc_flags |= CCDF_WANTED;
1690		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1691			return (error);
1692	}
1693	cs->sc_flags |= CCDF_LOCKED;
1694	return (0);
1695}
1696
1697/*
1698 * Unlock and wake up any waiters.
1699 */
1700static void
1701ccdunlock(cs)
1702	struct ccd_softc *cs;
1703{
1704
1705	cs->sc_flags &= ~CCDF_LOCKED;
1706	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1707		cs->sc_flags &= ~CCDF_WANTED;
1708		wakeup(cs);
1709	}
1710}
1711
#ifdef DEBUG
/*
 * Debug helper: print an interleave table, one line per entry, up to
 * (not including) the terminating entry whose ii_ndisk is zero.  Each
 * line shows the entry number, disk count, start block and start
 * offset, followed by the entry's index values.
 */
static void
printiinfo(ii)
	struct ccdiinfo *ii;
{
	int entry, k;

	entry = 0;
	while (ii->ii_ndisk) {
		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);

		k = 0;
		while (k < ii->ii_ndisk) {
			printf(" %d", ii->ii_index[k]);
			k++;
		}
		printf("\n");

		entry++;
		ii++;
	}
}
#endif
1728