geom_ccd.c revision 58349
1132718Skan/* $FreeBSD: head/sys/geom/geom_ccd.c 58349 2000-03-20 11:29:10Z phk $ */
2169689Skan
3169689Skan/*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4132718Skan
5132718Skan/*
6132718Skan * Copyright (c) 1995 Jason R. Thorpe.
7132718Skan * All rights reserved.
8132718Skan *
9132718Skan * Redistribution and use in source and binary forms, with or without
10132718Skan * modification, are permitted provided that the following conditions
11132718Skan * are met:
12132718Skan * 1. Redistributions of source code must retain the above copyright
13132718Skan *    notice, this list of conditions and the following disclaimer.
14132718Skan * 2. Redistributions in binary form must reproduce the above copyright
15132718Skan *    notice, this list of conditions and the following disclaimer in the
16132718Skan *    documentation and/or other materials provided with the distribution.
17132718Skan * 3. All advertising materials mentioning features or use of this software
18132718Skan *    must display the following acknowledgement:
19132718Skan *	This product includes software developed for the NetBSD Project
20132718Skan *	by Jason R. Thorpe.
21169689Skan * 4. The name of the author may not be used to endorse or promote products
22169689Skan *    derived from this software without specific prior written permission.
23132718Skan *
24132718Skan * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25132718Skan * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26132718Skan * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27132718Skan * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28132718Skan * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29132718Skan * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30132718Skan * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31132718Skan * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32132718Skan * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33132718Skan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34132718Skan * SUCH DAMAGE.
35132718Skan */
36132718Skan
37132718Skan/*
38132718Skan * Copyright (c) 1988 University of Utah.
39132718Skan * Copyright (c) 1990, 1993
40132718Skan *	The Regents of the University of California.  All rights reserved.
41132718Skan *
42132718Skan * This code is derived from software contributed to Berkeley by
43132718Skan * the Systems Programming Group of the University of Utah Computer
44132718Skan * Science Department.
45132718Skan *
46132718Skan * Redistribution and use in source and binary forms, with or without
47132718Skan * modification, are permitted provided that the following conditions
48132718Skan * are met:
49132718Skan * 1. Redistributions of source code must retain the above copyright
50132718Skan *    notice, this list of conditions and the following disclaimer.
51132718Skan * 2. Redistributions in binary form must reproduce the above copyright
52132718Skan *    notice, this list of conditions and the following disclaimer in the
53132718Skan *    documentation and/or other materials provided with the distribution.
54132718Skan * 3. All advertising materials mentioning features or use of this software
55132718Skan *    must display the following acknowledgement:
56132718Skan *	This product includes software developed by the University of
57132718Skan *	California, Berkeley and its contributors.
58132718Skan * 4. Neither the name of the University nor the names of its contributors
59132718Skan *    may be used to endorse or promote products derived from this software
60132718Skan *    without specific prior written permission.
61132718Skan *
62132718Skan * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63132718Skan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64132718Skan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65132718Skan * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66132718Skan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67132718Skan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68132718Skan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69132718Skan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70132718Skan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71132718Skan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72132718Skan * SUCH DAMAGE.
73132718Skan *
74132718Skan * from: Utah $Hdr: cd.c 1.6 90/11/28$
75132718Skan *
76132718Skan *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77169689Skan */
78132718Skan
79132718Skan/*
80132718Skan * "Concatenated" disk driver.
81132718Skan *
82132718Skan * Dynamic configuration and disklabel support by:
83132718Skan *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84132718Skan *	Numerical Aerodynamic Simulation Facility
85132718Skan *	Mail Stop 258-6
86132718Skan *	NASA Ames Research Center
87132718Skan *	Moffett Field, CA 94035
88132718Skan */
89132718Skan
90132718Skan#include "ccd.h"
91132718Skan
92132718Skan#include <sys/param.h>
93132718Skan#include <sys/systm.h>
94132718Skan#include <sys/kernel.h>
95132718Skan#include <sys/module.h>
96132718Skan#include <sys/proc.h>
97132718Skan#include <sys/buf.h>
98132718Skan#include <sys/malloc.h>
99132718Skan#include <sys/namei.h>
100132718Skan#include <sys/conf.h>
101132718Skan#include <sys/stat.h>
102132718Skan#include <sys/sysctl.h>
103132718Skan#include <sys/disklabel.h>
104132718Skan#include <ufs/ffs/fs.h>
105132718Skan#include <sys/devicestat.h>
106132718Skan#include <sys/fcntl.h>
107132718Skan#include <sys/vnode.h>
108132718Skan
109132718Skan#include <sys/ccdvar.h>
110132718Skan
111132718Skan#include <vm/vm_zone.h>
112132718Skan
/* Building with CCDDEBUG switches DEBUG on for the definitions below. */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/* Trace category bits; ccddebug holds the OR of the enabled ones. */
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
/* All categories start enabled; the value is adjustable via sysctl. */
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/*
 * DEBUG is dropped again right here, so the "#ifdef DEBUG" sections
 * further down are compiled out unless DEBUG gets redefined.
 */
#undef DEBUG
#endif
128132718Skan
129132718Skan#define	ccdunit(x)	dkunit(x)
130132718Skan#define ccdpart(x)	dkpart(x)
131132718Skan
132132718Skan/*
133132718Skan   This is how mirroring works (only writes are special):
134132718Skan
135132718Skan   When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
136132718Skan   linked together by the cb_mirror field.  "cb_pflags &
137132718Skan   CCDPF_MIRROR_DONE" is set to 0 on both of them.
138132718Skan
139132718Skan   When a component returns to ccdiodone(), it checks if "cb_pflags &
140132718Skan   CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
141132718Skan   flag and returns.  If it is, it means its partner has already
142132718Skan   returned, so it will go to the regular cleanup.
143132718Skan
144132718Skan */
145132718Skan
146132718Skanstruct ccdbuf {
147132718Skan	struct buf	cb_buf;		/* new I/O buf */
148132718Skan	struct buf	*cb_obp;	/* ptr. to original I/O buf */
149132718Skan	struct ccdbuf	*cb_freenext;	/* free list link */
150132718Skan	int		cb_unit;	/* target unit */
151132718Skan	int		cb_comp;	/* target component */
152132718Skan	int		cb_pflags;	/* mirror/parity status flag */
153132718Skan	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
154132718Skan};
155132718Skan
156132718Skan/* bits in cb_pflags */
157132718Skan#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
158132718Skan
159132718Skan#define CCDLABELDEV(dev)	\
160132718Skan	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
161132718Skan
162132718Skanstatic d_open_t ccdopen;
163132718Skanstatic d_close_t ccdclose;
164132718Skanstatic d_strategy_t ccdstrategy;
165132718Skanstatic d_ioctl_t ccdioctl;
166132718Skanstatic d_dump_t ccddump;
167132718Skanstatic d_psize_t ccdsize;
168132718Skan
169132718Skan#define NCCDFREEHIWAT	16
170132718Skan
171132718Skan#define CDEV_MAJOR 74
172132718Skan#define BDEV_MAJOR 21
173132718Skan
174132718Skanstatic struct cdevsw ccd_cdevsw = {
175132718Skan	/* open */	ccdopen,
176132718Skan	/* close */	ccdclose,
177132718Skan	/* read */	physread,
178132718Skan	/* write */	physwrite,
179132718Skan	/* ioctl */	ccdioctl,
180132718Skan	/* poll */	nopoll,
181132718Skan	/* mmap */	nommap,
182132718Skan	/* strategy */	ccdstrategy,
183132718Skan	/* name */	"ccd",
184132718Skan	/* maj */	CDEV_MAJOR,
185132718Skan	/* dump */	ccddump,
186132718Skan	/* psize */	ccdsize,
187132718Skan	/* flags */	D_DISK,
188132718Skan	/* bmaj */	BDEV_MAJOR
189132718Skan};
190132718Skan
191132718Skan/* called during module initialization */
192132718Skanstatic	void ccdattach __P((void));
193132718Skanstatic	int ccd_modevent __P((module_t, int, void *));
194132718Skan
195132718Skan/* called by biodone() at interrupt time */
196132718Skanstatic	void ccdiodone __P((struct buf *bp));
197132718Skan
198132718Skanstatic	void ccdstart __P((struct ccd_softc *, struct buf *));
199132718Skanstatic	void ccdinterleave __P((struct ccd_softc *, int));
200132718Skanstatic	void ccdintr __P((struct ccd_softc *, struct buf *));
201132718Skanstatic	int ccdinit __P((struct ccddevice *, char **, struct proc *));
202132718Skanstatic	int ccdlookup __P((char *, struct proc *p, struct vnode **));
203132718Skanstatic	void ccdbuffer __P((struct ccdbuf **ret, struct ccd_softc *,
204132718Skan		struct buf *, daddr_t, caddr_t, long));
205132718Skanstatic	void ccdgetdisklabel __P((dev_t));
206132718Skanstatic	void ccdmakedisklabel __P((struct ccd_softc *));
207132718Skanstatic	int ccdlock __P((struct ccd_softc *));
208132718Skanstatic	void ccdunlock __P((struct ccd_softc *));
209132718Skan
210132718Skan#ifdef DEBUG
211132718Skanstatic	void printiinfo __P((struct ccdiinfo *));
212132718Skan#endif
213132718Skan
214132718Skan/* Non-private for the benefit of libkvm. */
215132718Skanstruct	ccd_softc *ccd_softc;
216132718Skanstruct	ccddevice *ccddevs;
217132718Skanstruct	ccdbuf *ccdfreebufs;
218132718Skanstatic	int numccdfreebufs;
219132718Skanstatic	int numccd = 0;
220132718Skan
221132718Skan/*
222132718Skan * getccdbuf() -	Allocate and zero a ccd buffer.
223132718Skan *
224169689Skan *	This routine is called at splbio().
225132718Skan */
226132718Skan
227132718Skanstatic __inline
228132718Skanstruct ccdbuf *
229132718Skangetccdbuf(struct ccdbuf *cpy)
230132718Skan{
231169689Skan	struct ccdbuf *cbp;
232132718Skan
233132718Skan	/*
234132718Skan	 * Allocate from freelist or malloc as necessary
235132718Skan	 */
236132718Skan	if ((cbp = ccdfreebufs) != NULL) {
237132718Skan		ccdfreebufs = cbp->cb_freenext;
238132718Skan		--numccdfreebufs;
239132718Skan	} else {
240132718Skan		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
241132718Skan	}
242132718Skan
243132718Skan	/*
244132718Skan	 * Used by mirroring code
245132718Skan	 */
246132718Skan	if (cpy)
247132718Skan		bcopy(cpy, cbp, sizeof(struct ccdbuf));
248132718Skan	else
249132718Skan		bzero(cbp, sizeof(struct ccdbuf));
250132718Skan
251132718Skan	/*
252132718Skan	 * independant struct buf initialization
253132718Skan	 */
254132718Skan	LIST_INIT(&cbp->cb_buf.b_dep);
255132718Skan	BUF_LOCKINIT(&cbp->cb_buf);
256132718Skan	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
257132718Skan	BUF_KERNPROC(&cbp->cb_buf);
258132718Skan
259132718Skan	return(cbp);
260132718Skan}
261132718Skan
262132718Skan/*
263132718Skan * putccdbuf() -	Free a ccd buffer.
264132718Skan *
265132718Skan *	This routine is called at splbio().
266132718Skan */
267132718Skan
268132718Skanstatic __inline
269132718Skanvoid
270132718Skanputccdbuf(struct ccdbuf *cbp)
271132718Skan{
272132718Skan	BUF_UNLOCK(&cbp->cb_buf);
273132718Skan	BUF_LOCKFREE(&cbp->cb_buf);
274132718Skan
275132718Skan	if (numccdfreebufs < NCCDFREEHIWAT) {
276132718Skan		cbp->cb_freenext = ccdfreebufs;
277132718Skan		ccdfreebufs = cbp;
278132718Skan		++numccdfreebufs;
279132718Skan	} else {
280132718Skan		free((caddr_t)cbp, M_DEVBUF);
281132718Skan	}
282132718Skan}
283132718Skan
284132718Skan
/*
 * Number of blocks to leave untouched in front of a component partition.
 * This is to avoid violating its disklabel area when it starts at the
 * beginning of the slice.
 */
290132718Skan#if !defined(CCD_OFFSET)
291132718Skan#define CCD_OFFSET 16
292132718Skan#endif
293132718Skan
294132718Skan/*
295132718Skan * Called by main() during pseudo-device attachment.  All we need
296132718Skan * to do is allocate enough space for devices to be configured later, and
297132718Skan * add devsw entries.
298132718Skan */
299132718Skanstatic void
300132718Skanccdattach()
301132718Skan{
302132718Skan	int i;
303132718Skan	int num = NCCD;
304132718Skan
305132718Skan	if (num > 1)
306132718Skan		printf("ccd0-%d: Concatenated disk drivers\n", num-1);
307132718Skan	else
308132718Skan		printf("ccd0: Concatenated disk driver\n");
309132718Skan
310132718Skan	ccd_softc = (struct ccd_softc *)malloc(num * sizeof(struct ccd_softc),
311132718Skan	    M_DEVBUF, M_NOWAIT);
312132718Skan	ccddevs = (struct ccddevice *)malloc(num * sizeof(struct ccddevice),
313132718Skan	    M_DEVBUF, M_NOWAIT);
314132718Skan	if ((ccd_softc == NULL) || (ccddevs == NULL)) {
315132718Skan		printf("WARNING: no memory for concatenated disks\n");
316132718Skan		if (ccd_softc != NULL)
317132718Skan			free(ccd_softc, M_DEVBUF);
318132718Skan		if (ccddevs != NULL)
319132718Skan			free(ccddevs, M_DEVBUF);
320132718Skan		return;
321132718Skan	}
322132718Skan	numccd = num;
323132718Skan	bzero(ccd_softc, num * sizeof(struct ccd_softc));
324132718Skan	bzero(ccddevs, num * sizeof(struct ccddevice));
325132718Skan
326169689Skan	cdevsw_add(&ccd_cdevsw);
327169689Skan	/* XXX: is this necessary? */
328169689Skan	for (i = 0; i < numccd; ++i)
329132718Skan		ccddevs[i].ccd_dk = -1;
330132718Skan}
331132718Skan
332132718Skanstatic int
333132718Skanccd_modevent(mod, type, data)
334132718Skan	module_t mod;
335132718Skan	int type;
336132718Skan	void *data;
337132718Skan{
338132718Skan	int error = 0;
339132718Skan
340132718Skan	switch (type) {
341132718Skan	case MOD_LOAD:
342132718Skan		ccdattach();
343132718Skan		break;
344132718Skan
345132718Skan	case MOD_UNLOAD:
346132718Skan		printf("ccd0: Unload not supported!\n");
347132718Skan		error = EOPNOTSUPP;
348132718Skan		break;
349132718Skan
350132718Skan	default:	/* MOD_SHUTDOWN etc */
351132718Skan		break;
352132718Skan	}
353132718Skan	return (error);
354132718Skan}
355132718Skan
356132718SkanDEV_MODULE(ccd, ccd_modevent, NULL);
357132718Skan
358132718Skanstatic int
359132718Skanccdinit(ccd, cpaths, p)
360132718Skan	struct ccddevice *ccd;
361132718Skan	char **cpaths;
362132718Skan	struct proc *p;
363132718Skan{
364132718Skan	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
365132718Skan	struct ccdcinfo *ci = NULL;	/* XXX */
366132718Skan	size_t size;
367132718Skan	int ix;
368132718Skan	struct vnode *vp;
369132718Skan	size_t minsize;
370132718Skan	int maxsecsize;
371132718Skan	struct partinfo dpart;
372132718Skan	struct ccdgeom *ccg = &cs->sc_geom;
373132718Skan	char tmppath[MAXPATHLEN];
374132718Skan	int error = 0;
375132718Skan
376132718Skan#ifdef DEBUG
377132718Skan	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
378132718Skan		printf("ccdinit: unit %d\n", ccd->ccd_unit);
379132718Skan#endif
380132718Skan
381132718Skan	cs->sc_size = 0;
382132718Skan	cs->sc_ileave = ccd->ccd_interleave;
383169689Skan	cs->sc_nccdisks = ccd->ccd_ndev;
384132718Skan
385132718Skan	/* Allocate space for the component info. */
386132718Skan	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
387132718Skan	    M_DEVBUF, M_WAITOK);
388132718Skan
389132718Skan	/*
390169689Skan	 * Verify that each component piece exists and record
391132718Skan	 * relevant information about it.
392132718Skan	 */
393132718Skan	maxsecsize = 0;
394132718Skan	minsize = 0;
395132718Skan	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
396132718Skan		vp = ccd->ccd_vpp[ix];
397132718Skan		ci = &cs->sc_cinfo[ix];
398132718Skan		ci->ci_vp = vp;
399169689Skan
400132718Skan		/*
401132718Skan		 * Copy in the pathname of the component.
402132718Skan		 */
403132718Skan		bzero(tmppath, sizeof(tmppath));	/* sanity */
404132718Skan		if ((error = copyinstr(cpaths[ix], tmppath,
405132718Skan		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
406132718Skan#ifdef DEBUG
407132718Skan			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
408132718Skan				printf("ccd%d: can't copy path, error = %d\n",
409132718Skan				    ccd->ccd_unit, error);
410132718Skan#endif
411132718Skan			goto fail;
412132718Skan		}
413132718Skan		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
414132718Skan		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
415132718Skan
416132718Skan		ci->ci_dev = vn_todev(vp);
417132718Skan
418132718Skan		/*
419132718Skan		 * Get partition information for the component.
420132718Skan		 */
421132718Skan		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
422132718Skan		    FREAD, p->p_ucred, p)) != 0) {
423132718Skan#ifdef DEBUG
424132718Skan			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
425132718Skan				 printf("ccd%d: %s: ioctl failed, error = %d\n",
426132718Skan				     ccd->ccd_unit, ci->ci_path, error);
427132718Skan#endif
428132718Skan			goto fail;
429132718Skan		}
430132718Skan		if (dpart.part->p_fstype == FS_BSDFFS) {
431132718Skan			maxsecsize =
432132718Skan			    ((dpart.disklab->d_secsize > maxsecsize) ?
433132718Skan			    dpart.disklab->d_secsize : maxsecsize);
434132718Skan			size = dpart.part->p_size - CCD_OFFSET;
435132718Skan		} else {
436132718Skan#ifdef DEBUG
437132718Skan			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
438132718Skan				printf("ccd%d: %s: incorrect partition type\n",
439132718Skan				    ccd->ccd_unit, ci->ci_path);
440132718Skan#endif
441132718Skan			error = EFTYPE;
442132718Skan			goto fail;
443132718Skan		}
444132718Skan
445132718Skan		/*
446132718Skan		 * Calculate the size, truncating to an interleave
447132718Skan		 * boundary if necessary.
448132718Skan		 */
449132718Skan
450132718Skan		if (cs->sc_ileave > 1)
451132718Skan			size -= size % cs->sc_ileave;
452169689Skan
453132718Skan		if (size == 0) {
454132718Skan#ifdef DEBUG
455132718Skan			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
456132718Skan				printf("ccd%d: %s: size == 0\n",
457132718Skan				    ccd->ccd_unit, ci->ci_path);
458132718Skan#endif
459132718Skan			error = ENODEV;
460132718Skan			goto fail;
461132718Skan		}
462132718Skan
463132718Skan		if (minsize == 0 || size < minsize)
464132718Skan			minsize = size;
465132718Skan		ci->ci_size = size;
466132718Skan		cs->sc_size += size;
467132718Skan	}
468132718Skan
469132718Skan	/*
470132718Skan	 * Don't allow the interleave to be smaller than
471132718Skan	 * the biggest component sector.
472132718Skan	 */
473132718Skan	if ((cs->sc_ileave > 0) &&
474132718Skan	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
475132718Skan#ifdef DEBUG
476132718Skan		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
477132718Skan			printf("ccd%d: interleave must be at least %d\n",
478132718Skan			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
479132718Skan#endif
480132718Skan		error = EINVAL;
481132718Skan		goto fail;
482132718Skan	}
483132718Skan
484132718Skan	/*
485132718Skan	 * If uniform interleave is desired set all sizes to that of
486132718Skan	 * the smallest component.  This will guarentee that a single
487132718Skan	 * interleave table is generated.
488132718Skan	 *
489132718Skan	 * Lost space must be taken into account when calculating the
490132718Skan	 * overall size.  Half the space is lost when CCDF_MIRROR is
491132718Skan	 * specified.  One disk is lost when CCDF_PARITY is specified.
492132718Skan	 */
493132718Skan	if (ccd->ccd_flags & CCDF_UNIFORM) {
494132718Skan		for (ci = cs->sc_cinfo;
495132718Skan		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
496132718Skan			ci->ci_size = minsize;
497169689Skan		}
498132718Skan		if (ccd->ccd_flags & CCDF_MIRROR) {
499132718Skan			/*
500132718Skan			 * Check to see if an even number of components
501132718Skan			 * have been specified.  The interleave must also
502132718Skan			 * be non-zero in order for us to be able to
503132718Skan			 * guarentee the topology.
504132718Skan			 */
505132718Skan			if (cs->sc_nccdisks % 2) {
506132718Skan				printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
507132718Skan				error = EINVAL;
508132718Skan				goto fail;
509132718Skan			}
510132718Skan			if (cs->sc_ileave == 0) {
511169689Skan				printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
512132718Skan				error = EINVAL;
513132718Skan				goto fail;
514132718Skan			}
515132718Skan			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
516169689Skan		} else if (ccd->ccd_flags & CCDF_PARITY) {
517132718Skan			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
518132718Skan		} else {
519169689Skan			if (cs->sc_ileave == 0) {
520132718Skan				printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
521132718Skan				error = EINVAL;
522132718Skan				goto fail;
523132718Skan			}
524132718Skan			cs->sc_size = cs->sc_nccdisks * minsize;
525132718Skan		}
526132718Skan	}
527132718Skan
528132718Skan	/*
529132718Skan	 * Construct the interleave table.
530132718Skan	 */
531132718Skan	ccdinterleave(cs, ccd->ccd_unit);
532132718Skan
533132718Skan	/*
534132718Skan	 * Create pseudo-geometry based on 1MB cylinders.  It's
535132718Skan	 * pretty close.
536132718Skan	 */
537132718Skan	ccg->ccg_secsize = maxsecsize;
538	ccg->ccg_ntracks = 1;
539	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
540	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
541
542	/*
543	 * Add an devstat entry for this device.
544	 */
545	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
546			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
547			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
548			  DEVSTAT_PRIORITY_ARRAY);
549
550	cs->sc_flags |= CCDF_INITED;
551	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
552	cs->sc_unit = ccd->ccd_unit;
553	return (0);
554fail:
555	while (ci > cs->sc_cinfo) {
556		ci--;
557		free(ci->ci_path, M_DEVBUF);
558	}
559	free(cs->sc_cinfo, M_DEVBUF);
560	return (error);
561}
562
563static void
564ccdinterleave(cs, unit)
565	struct ccd_softc *cs;
566	int unit;
567{
568	struct ccdcinfo *ci, *smallci;
569	struct ccdiinfo *ii;
570	daddr_t bn, lbn;
571	int ix;
572	u_long size;
573
574#ifdef DEBUG
575	if (ccddebug & CCDB_INIT)
576		printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
577#endif
578
579	/*
580	 * Allocate an interleave table.  The worst case occurs when each
581	 * of N disks is of a different size, resulting in N interleave
582	 * tables.
583	 *
584	 * Chances are this is too big, but we don't care.
585	 */
586	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
587	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
588	bzero((caddr_t)cs->sc_itable, size);
589
590	/*
591	 * Trivial case: no interleave (actually interleave of disk size).
592	 * Each table entry represents a single component in its entirety.
593	 *
594	 * An interleave of 0 may not be used with a mirror or parity setup.
595	 */
596	if (cs->sc_ileave == 0) {
597		bn = 0;
598		ii = cs->sc_itable;
599
600		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
601			/* Allocate space for ii_index. */
602			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
603			ii->ii_ndisk = 1;
604			ii->ii_startblk = bn;
605			ii->ii_startoff = 0;
606			ii->ii_index[0] = ix;
607			bn += cs->sc_cinfo[ix].ci_size;
608			ii++;
609		}
610		ii->ii_ndisk = 0;
611#ifdef DEBUG
612		if (ccddebug & CCDB_INIT)
613			printiinfo(cs->sc_itable);
614#endif
615		return;
616	}
617
618	/*
619	 * The following isn't fast or pretty; it doesn't have to be.
620	 */
621	size = 0;
622	bn = lbn = 0;
623	for (ii = cs->sc_itable; ; ii++) {
624		/*
625		 * Allocate space for ii_index.  We might allocate more then
626		 * we use.
627		 */
628		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
629		    M_DEVBUF, M_WAITOK);
630
631		/*
632		 * Locate the smallest of the remaining components
633		 */
634		smallci = NULL;
635		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
636		    ci++) {
637			if (ci->ci_size > size &&
638			    (smallci == NULL ||
639			     ci->ci_size < smallci->ci_size)) {
640				smallci = ci;
641			}
642		}
643
644		/*
645		 * Nobody left, all done
646		 */
647		if (smallci == NULL) {
648			ii->ii_ndisk = 0;
649			break;
650		}
651
652		/*
653		 * Record starting logical block using an sc_ileave blocksize.
654		 */
655		ii->ii_startblk = bn / cs->sc_ileave;
656
657		/*
658		 * Record starting comopnent block using an sc_ileave
659		 * blocksize.  This value is relative to the beginning of
660		 * a component disk.
661		 */
662		ii->ii_startoff = lbn;
663
664		/*
665		 * Determine how many disks take part in this interleave
666		 * and record their indices.
667		 */
668		ix = 0;
669		for (ci = cs->sc_cinfo;
670		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
671			if (ci->ci_size >= smallci->ci_size) {
672				ii->ii_index[ix++] = ci - cs->sc_cinfo;
673			}
674		}
675		ii->ii_ndisk = ix;
676		bn += ix * (smallci->ci_size - size);
677		lbn = smallci->ci_size / cs->sc_ileave;
678		size = smallci->ci_size;
679	}
680#ifdef DEBUG
681	if (ccddebug & CCDB_INIT)
682		printiinfo(cs->sc_itable);
683#endif
684}
685
686/* ARGSUSED */
687static int
688ccdopen(dev, flags, fmt, p)
689	dev_t dev;
690	int flags, fmt;
691	struct proc *p;
692{
693	int unit = ccdunit(dev);
694	struct ccd_softc *cs;
695	struct disklabel *lp;
696	int error = 0, part, pmask;
697
698#ifdef DEBUG
699	if (ccddebug & CCDB_FOLLOW)
700		printf("ccdopen(%x, %x)\n", dev, flags);
701#endif
702	if (unit >= numccd)
703		return (ENXIO);
704	cs = &ccd_softc[unit];
705
706	if ((error = ccdlock(cs)) != 0)
707		return (error);
708
709	lp = &cs->sc_label;
710
711	part = ccdpart(dev);
712	pmask = (1 << part);
713
714	/*
715	 * If we're initialized, check to see if there are any other
716	 * open partitions.  If not, then it's safe to update
717	 * the in-core disklabel.
718	 */
719	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
720		ccdgetdisklabel(dev);
721
722	/* Check that the partition exists. */
723	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
724	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
725		error = ENXIO;
726		goto done;
727	}
728
729	cs->sc_openmask |= pmask;
730 done:
731	ccdunlock(cs);
732	return (0);
733}
734
735/* ARGSUSED */
736static int
737ccdclose(dev, flags, fmt, p)
738	dev_t dev;
739	int flags, fmt;
740	struct proc *p;
741{
742	int unit = ccdunit(dev);
743	struct ccd_softc *cs;
744	int error = 0, part;
745
746#ifdef DEBUG
747	if (ccddebug & CCDB_FOLLOW)
748		printf("ccdclose(%x, %x)\n", dev, flags);
749#endif
750
751	if (unit >= numccd)
752		return (ENXIO);
753	cs = &ccd_softc[unit];
754
755	if ((error = ccdlock(cs)) != 0)
756		return (error);
757
758	part = ccdpart(dev);
759
760	/* ...that much closer to allowing unconfiguration... */
761	cs->sc_openmask &= ~(1 << part);
762	ccdunlock(cs);
763	return (0);
764}
765
766static void
767ccdstrategy(bp)
768	struct buf *bp;
769{
770	int unit = ccdunit(bp->b_dev);
771	struct ccd_softc *cs = &ccd_softc[unit];
772	int s;
773	int wlabel;
774	struct disklabel *lp;
775
776#ifdef DEBUG
777	if (ccddebug & CCDB_FOLLOW)
778		printf("ccdstrategy(%x): unit %d\n", bp, unit);
779#endif
780	if ((cs->sc_flags & CCDF_INITED) == 0) {
781		bp->b_error = ENXIO;
782		bp->b_flags |= B_ERROR;
783		goto done;
784	}
785
786	/* If it's a nil transfer, wake up the top half now. */
787	if (bp->b_bcount == 0)
788		goto done;
789
790	lp = &cs->sc_label;
791
792	/*
793	 * Do bounds checking and adjust transfer.  If there's an
794	 * error, the bounds check will flag that for us.
795	 */
796	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
797	if (ccdpart(bp->b_dev) != RAW_PART) {
798		if (bounds_check_with_label(bp, lp, wlabel) <= 0)
799			goto done;
800	} else {
801		int pbn;        /* in sc_secsize chunks */
802		long sz;        /* in sc_secsize chunks */
803
804		pbn = bp->b_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
805		sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
806
807		/*
808		 * If out of bounds return an error. If at the EOF point,
809		 * simply read or write less.
810		 */
811
812		if (pbn < 0 || pbn >= cs->sc_size) {
813			bp->b_resid = bp->b_bcount;
814			if (pbn != cs->sc_size) {
815				bp->b_error = EINVAL;
816				bp->b_flags |= B_ERROR | B_INVAL;
817			}
818			goto done;
819		}
820
821		/*
822		 * If the request crosses EOF, truncate the request.
823		 */
824		if (pbn + sz > cs->sc_size) {
825			bp->b_bcount = (cs->sc_size - pbn) *
826			    cs->sc_geom.ccg_secsize;
827		}
828	}
829
830	bp->b_resid = bp->b_bcount;
831
832	/*
833	 * "Start" the unit.
834	 */
835	s = splbio();
836	ccdstart(cs, bp);
837	splx(s);
838	return;
839done:
840	biodone(bp);
841}
842
843static void
844ccdstart(cs, bp)
845	struct ccd_softc *cs;
846	struct buf *bp;
847{
848	long bcount, rcount;
849	struct ccdbuf *cbp[4];
850	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
851	caddr_t addr;
852	daddr_t bn;
853	struct partition *pp;
854
855#ifdef DEBUG
856	if (ccddebug & CCDB_FOLLOW)
857		printf("ccdstart(%x, %x)\n", cs, bp);
858#endif
859
860	/* Record the transaction start  */
861	devstat_start_transaction(&cs->device_stats);
862
863	/*
864	 * Translate the partition-relative block number to an absolute.
865	 */
866	bn = bp->b_blkno;
867	if (ccdpart(bp->b_dev) != RAW_PART) {
868		pp = &cs->sc_label.d_partitions[ccdpart(bp->b_dev)];
869		bn += pp->p_offset;
870	}
871
872	/*
873	 * Allocate component buffers and fire off the requests
874	 */
875	addr = bp->b_data;
876	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
877		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
878		rcount = cbp[0]->cb_buf.b_bcount;
879
880		if (cs->sc_cflags & CCDF_MIRROR) {
881			/*
882			 * Mirroring.  Writes go to both disks, reads are
883			 * taken from whichever disk seems most appropriate.
884			 *
885			 * We attempt to localize reads to the disk whos arm
886			 * is nearest the read request.  We ignore seeks due
887			 * to writes when making this determination and we
888			 * also try to avoid hogging.
889			 */
890			if (cbp[0]->cb_buf.b_iocmd == BIO_WRITE) {
891				cbp[0]->cb_buf.b_vp->v_numoutput++;
892				cbp[1]->cb_buf.b_vp->v_numoutput++;
893				BUF_STRATEGY(&cbp[0]->cb_buf);
894				BUF_STRATEGY(&cbp[1]->cb_buf);
895			} else {
896				int pick = cs->sc_pick;
897				daddr_t range = cs->sc_size / 16;
898
899				if (bn < cs->sc_blk[pick] - range ||
900				    bn > cs->sc_blk[pick] + range
901				) {
902					cs->sc_pick = pick = 1 - pick;
903				}
904				cs->sc_blk[pick] = bn + btodb(rcount);
905				BUF_STRATEGY(&cbp[pick]->cb_buf);
906			}
907		} else {
908			/*
909			 * Not mirroring
910			 */
911			if (cbp[0]->cb_buf.b_iocmd == BIO_WRITE)
912				cbp[0]->cb_buf.b_vp->v_numoutput++;
913			BUF_STRATEGY(&cbp[0]->cb_buf);
914		}
915		bn += btodb(rcount);
916		addr += rcount;
917	}
918}
919
920/*
921 * Build a component buffer header.
922 */
923static void
924ccdbuffer(cb, cs, bp, bn, addr, bcount)
925	struct ccdbuf **cb;
926	struct ccd_softc *cs;
927	struct buf *bp;
928	daddr_t bn;
929	caddr_t addr;
930	long bcount;
931{
932	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
933	struct ccdbuf *cbp;
934	daddr_t cbn, cboff;
935	off_t cbc;
936
937#ifdef DEBUG
938	if (ccddebug & CCDB_IO)
939		printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
940		       cs, bp, bn, addr, bcount);
941#endif
942	/*
943	 * Determine which component bn falls in.
944	 */
945	cbn = bn;
946	cboff = 0;
947
948	if (cs->sc_ileave == 0) {
949		/*
950		 * Serially concatenated and neither a mirror nor a parity
951		 * config.  This is a special case.
952		 */
953		daddr_t sblk;
954
955		sblk = 0;
956		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
957			sblk += ci->ci_size;
958		cbn -= sblk;
959	} else {
960		struct ccdiinfo *ii;
961		int ccdisk, off;
962
963		/*
964		 * Calculate cbn, the logical superblock (sc_ileave chunks),
965		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
966		 * to cbn.
967		 */
968		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
969		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
970
971		/*
972		 * Figure out which interleave table to use.
973		 */
974		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
975			if (ii->ii_startblk > cbn)
976				break;
977		}
978		ii--;
979
980		/*
981		 * off is the logical superblock relative to the beginning
982		 * of this interleave block.
983		 */
984		off = cbn - ii->ii_startblk;
985
986		/*
987		 * We must calculate which disk component to use (ccdisk),
988		 * and recalculate cbn to be the superblock relative to
989		 * the beginning of the component.  This is typically done by
990		 * adding 'off' and ii->ii_startoff together.  However, 'off'
991		 * must typically be divided by the number of components in
992		 * this interleave array to be properly convert it from a
993		 * CCD-relative logical superblock number to a
994		 * component-relative superblock number.
995		 */
996		if (ii->ii_ndisk == 1) {
997			/*
998			 * When we have just one disk, it can't be a mirror
999			 * or a parity config.
1000			 */
1001			ccdisk = ii->ii_index[0];
1002			cbn = ii->ii_startoff + off;
1003		} else {
1004			if (cs->sc_cflags & CCDF_MIRROR) {
1005				/*
1006				 * We have forced a uniform mapping, resulting
1007				 * in a single interleave array.  We double
1008				 * up on the first half of the available
1009				 * components and our mirror is in the second
1010				 * half.  This only works with a single
1011				 * interleave array because doubling up
1012				 * doubles the number of sectors, so there
1013				 * cannot be another interleave array because
1014				 * the next interleave array's calculations
1015				 * would be off.
1016				 */
1017				int ndisk2 = ii->ii_ndisk / 2;
1018				ccdisk = ii->ii_index[off % ndisk2];
1019				cbn = ii->ii_startoff + off / ndisk2;
1020				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1021			} else if (cs->sc_cflags & CCDF_PARITY) {
1022				/*
1023				 * XXX not implemented yet
1024				 */
1025				int ndisk2 = ii->ii_ndisk - 1;
1026				ccdisk = ii->ii_index[off % ndisk2];
1027				cbn = ii->ii_startoff + off / ndisk2;
1028				if (cbn % ii->ii_ndisk <= ccdisk)
1029					ccdisk++;
1030			} else {
1031				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1032				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1033			}
1034		}
1035
1036		ci = &cs->sc_cinfo[ccdisk];
1037
1038		/*
1039		 * Convert cbn from a superblock to a normal block so it
1040		 * can be used to calculate (along with cboff) the normal
1041		 * block index into this particular disk.
1042		 */
1043		cbn *= cs->sc_ileave;
1044	}
1045
1046	/*
1047	 * Fill in the component buf structure.
1048	 */
1049	cbp = getccdbuf(NULL);
1050	cbp->cb_buf.b_flags = bp->b_flags;
1051	cbp->cb_buf.b_iocmd = bp->b_iocmd;
1052	cbp->cb_buf.b_iodone = ccdiodone;
1053	cbp->cb_buf.b_dev = ci->ci_dev;		/* XXX */
1054	cbp->cb_buf.b_blkno = cbn + cboff + CCD_OFFSET;
1055	cbp->cb_buf.b_offset = dbtob(cbn + cboff + CCD_OFFSET);
1056	cbp->cb_buf.b_data = addr;
1057	cbp->cb_buf.b_vp = ci->ci_vp;
1058	if (cs->sc_ileave == 0)
1059              cbc = dbtob((off_t)(ci->ci_size - cbn));
1060	else
1061              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1062	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1063 	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1064
1065	/*
1066	 * context for ccdiodone
1067	 */
1068	cbp->cb_obp = bp;
1069	cbp->cb_unit = cs - ccd_softc;
1070	cbp->cb_comp = ci - cs->sc_cinfo;
1071
1072#ifdef DEBUG
1073	if (ccddebug & CCDB_IO)
1074		printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
1075		       ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.b_blkno,
1076		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1077#endif
1078	cb[0] = cbp;
1079
1080	/*
1081	 * Note: both I/O's setup when reading from mirror, but only one
1082	 * will be executed.
1083	 */
1084	if (cs->sc_cflags & CCDF_MIRROR) {
1085		/* mirror, setup second I/O */
1086		cbp = getccdbuf(cb[0]);
1087		cbp->cb_buf.b_dev = ci2->ci_dev;
1088		cbp->cb_buf.b_vp = ci2->ci_vp;
1089		cbp->cb_comp = ci2 - cs->sc_cinfo;
1090		cb[1] = cbp;
1091		/* link together the ccdbuf's and clear "mirror done" flag */
1092		cb[0]->cb_mirror = cb[1];
1093		cb[1]->cb_mirror = cb[0];
1094		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1095		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1096	}
1097}
1098
1099static void
1100ccdintr(cs, bp)
1101	struct ccd_softc *cs;
1102	struct buf *bp;
1103{
1104#ifdef DEBUG
1105	if (ccddebug & CCDB_FOLLOW)
1106		printf("ccdintr(%x, %x)\n", cs, bp);
1107#endif
1108	/*
1109	 * Request is done for better or worse, wakeup the top half.
1110	 */
1111	if (bp->b_flags & B_ERROR)
1112		bp->b_resid = bp->b_bcount;
1113	devstat_end_transaction_buf(&cs->device_stats, bp);
1114	biodone(bp);
1115}
1116
1117/*
1118 * Called at interrupt time.
1119 * Mark the component as done and if all components are done,
1120 * take a ccd interrupt.
1121 */
1122static void
1123ccdiodone(ibp)
1124	struct buf *ibp;
1125{
1126	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1127	struct buf *bp = cbp->cb_obp;
1128	int unit = cbp->cb_unit;
1129	int count, s;
1130
1131	s = splbio();
1132#ifdef DEBUG
1133	if (ccddebug & CCDB_FOLLOW)
1134		printf("ccdiodone(%x)\n", cbp);
1135	if (ccddebug & CCDB_IO) {
1136		printf("ccdiodone: bp %x bcount %d resid %d\n",
1137		       bp, bp->b_bcount, bp->b_resid);
1138		printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1139		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1140		       cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
1141		       cbp->cb_buf.b_bcount);
1142	}
1143#endif
1144	/*
1145	 * If an error occured, report it.  If this is a mirrored
1146	 * configuration and the first of two possible reads, do not
1147	 * set the error in the bp yet because the second read may
1148	 * succeed.
1149	 */
1150
1151	if (cbp->cb_buf.b_flags & B_ERROR) {
1152		const char *msg = "";
1153
1154		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1155		    (cbp->cb_buf.b_iocmd == BIO_READ) &&
1156		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1157			/*
1158			 * We will try our read on the other disk down
1159			 * below, also reverse the default pick so if we
1160			 * are doing a scan we do not keep hitting the
1161			 * bad disk first.
1162			 */
1163			struct ccd_softc *cs = &ccd_softc[unit];
1164
1165			msg = ", trying other disk";
1166			cs->sc_pick = 1 - cs->sc_pick;
1167			cs->sc_blk[cs->sc_pick] = bp->b_blkno;
1168		} else {
1169			bp->b_flags |= B_ERROR;
1170			bp->b_error = cbp->cb_buf.b_error ?
1171			    cbp->cb_buf.b_error : EIO;
1172		}
1173		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1174		       unit, bp->b_error, cbp->cb_comp,
1175		       (int)cbp->cb_buf.b_blkno, bp->b_blkno, msg);
1176	}
1177
1178	/*
1179	 * Process mirror.  If we are writing, I/O has been initiated on both
1180	 * buffers and we fall through only after both are finished.
1181	 *
1182	 * If we are reading only one I/O is initiated at a time.  If an
1183	 * error occurs we initiate the second I/O and return, otherwise
1184	 * we free the second I/O without initiating it.
1185	 */
1186
1187	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1188		if (cbp->cb_buf.b_iocmd == BIO_WRITE) {
1189			/*
1190			 * When writing, handshake with the second buffer
1191			 * to determine when both are done.  If both are not
1192			 * done, return here.
1193			 */
1194			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1195				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1196				putccdbuf(cbp);
1197				splx(s);
1198				return;
1199			}
1200		} else {
1201			/*
1202			 * When reading, either dispose of the second buffer
1203			 * or initiate I/O on the second buffer if an error
1204			 * occured with this one.
1205			 */
1206			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1207				if (cbp->cb_buf.b_flags & B_ERROR) {
1208					cbp->cb_mirror->cb_pflags |=
1209					    CCDPF_MIRROR_DONE;
1210					BUF_STRATEGY(&cbp->cb_mirror->cb_buf);
1211					putccdbuf(cbp);
1212					splx(s);
1213					return;
1214				} else {
1215					putccdbuf(cbp->cb_mirror);
1216					/* fall through */
1217				}
1218			}
1219		}
1220	}
1221
1222	/*
1223	 * use b_bufsize to determine how big the original request was rather
1224	 * then b_bcount, because b_bcount may have been truncated for EOF.
1225	 *
1226	 * XXX We check for an error, but we do not test the resid for an
1227	 * aligned EOF condition.  This may result in character & block
1228	 * device access not recognizing EOF properly when read or written
1229	 * sequentially, but will not effect filesystems.
1230	 */
1231	count = cbp->cb_buf.b_bufsize;
1232	putccdbuf(cbp);
1233
1234	/*
1235	 * If all done, "interrupt".
1236	 */
1237	bp->b_resid -= count;
1238	if (bp->b_resid < 0)
1239		panic("ccdiodone: count");
1240	if (bp->b_resid == 0)
1241		ccdintr(&ccd_softc[unit], bp);
1242	splx(s);
1243}
1244
1245static int
1246ccdioctl(dev, cmd, data, flag, p)
1247	dev_t dev;
1248	u_long cmd;
1249	caddr_t data;
1250	int flag;
1251	struct proc *p;
1252{
1253	int unit = ccdunit(dev);
1254	int i, j, lookedup = 0, error = 0;
1255	int part, pmask, s;
1256	struct ccd_softc *cs;
1257	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1258	struct ccddevice ccd;
1259	char **cpp;
1260	struct vnode **vpp;
1261
1262	if (unit >= numccd)
1263		return (ENXIO);
1264	cs = &ccd_softc[unit];
1265
1266	bzero(&ccd, sizeof(ccd));
1267
1268	switch (cmd) {
1269	case CCDIOCSET:
1270		if (cs->sc_flags & CCDF_INITED)
1271			return (EBUSY);
1272
1273		if ((flag & FWRITE) == 0)
1274			return (EBADF);
1275
1276		if ((error = ccdlock(cs)) != 0)
1277			return (error);
1278
1279		/* Fill in some important bits. */
1280		ccd.ccd_unit = unit;
1281		ccd.ccd_interleave = ccio->ccio_ileave;
1282		if (ccd.ccd_interleave == 0 &&
1283		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1284		     (ccio->ccio_flags & CCDF_PARITY))) {
1285			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1286			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1287		}
1288		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1289		    (ccio->ccio_flags & CCDF_PARITY)) {
1290			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1291			ccio->ccio_flags &= ~CCDF_PARITY;
1292		}
1293		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1294		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1295			printf("ccd%d: mirror/parity forces uniform flag\n",
1296			       unit);
1297			ccio->ccio_flags |= CCDF_UNIFORM;
1298		}
1299		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1300
1301		/*
1302		 * Allocate space for and copy in the array of
1303		 * componet pathnames and device numbers.
1304		 */
1305		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1306		    M_DEVBUF, M_WAITOK);
1307		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1308		    M_DEVBUF, M_WAITOK);
1309
1310		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1311		    ccio->ccio_ndisks * sizeof(char **));
1312		if (error) {
1313			free(vpp, M_DEVBUF);
1314			free(cpp, M_DEVBUF);
1315			ccdunlock(cs);
1316			return (error);
1317		}
1318
1319#ifdef DEBUG
1320		if (ccddebug & CCDB_INIT)
1321			for (i = 0; i < ccio->ccio_ndisks; ++i)
1322				printf("ccdioctl: component %d: 0x%x\n",
1323				    i, cpp[i]);
1324#endif
1325
1326		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1327#ifdef DEBUG
1328			if (ccddebug & CCDB_INIT)
1329				printf("ccdioctl: lookedup = %d\n", lookedup);
1330#endif
1331			if ((error = ccdlookup(cpp[i], p, &vpp[i])) != 0) {
1332				for (j = 0; j < lookedup; ++j)
1333					(void)vn_close(vpp[j], FREAD|FWRITE,
1334					    p->p_ucred, p);
1335				free(vpp, M_DEVBUF);
1336				free(cpp, M_DEVBUF);
1337				ccdunlock(cs);
1338				return (error);
1339			}
1340			++lookedup;
1341		}
1342		ccd.ccd_cpp = cpp;
1343		ccd.ccd_vpp = vpp;
1344		ccd.ccd_ndev = ccio->ccio_ndisks;
1345
1346		/*
1347		 * Initialize the ccd.  Fills in the softc for us.
1348		 */
1349		if ((error = ccdinit(&ccd, cpp, p)) != 0) {
1350			for (j = 0; j < lookedup; ++j)
1351				(void)vn_close(vpp[j], FREAD|FWRITE,
1352				    p->p_ucred, p);
1353			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1354			free(vpp, M_DEVBUF);
1355			free(cpp, M_DEVBUF);
1356			ccdunlock(cs);
1357			return (error);
1358		}
1359
1360		/*
1361		 * The ccd has been successfully initialized, so
1362		 * we can place it into the array and read the disklabel.
1363		 */
1364		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1365		ccio->ccio_unit = unit;
1366		ccio->ccio_size = cs->sc_size;
1367		ccdgetdisklabel(dev);
1368
1369		ccdunlock(cs);
1370
1371		break;
1372
1373	case CCDIOCCLR:
1374		if ((cs->sc_flags & CCDF_INITED) == 0)
1375			return (ENXIO);
1376
1377		if ((flag & FWRITE) == 0)
1378			return (EBADF);
1379
1380		if ((error = ccdlock(cs)) != 0)
1381			return (error);
1382
1383		/* Don't unconfigure if any other partitions are open */
1384		part = ccdpart(dev);
1385		pmask = (1 << part);
1386		if ((cs->sc_openmask & ~pmask)) {
1387			ccdunlock(cs);
1388			return (EBUSY);
1389		}
1390
1391		/*
1392		 * Free ccd_softc information and clear entry.
1393		 */
1394
1395		/* Close the components and free their pathnames. */
1396		for (i = 0; i < cs->sc_nccdisks; ++i) {
1397			/*
1398			 * XXX: this close could potentially fail and
1399			 * cause Bad Things.  Maybe we need to force
1400			 * the close to happen?
1401			 */
1402#ifdef DEBUG
1403			if (ccddebug & CCDB_VNODE)
1404				vprint("CCDIOCCLR: vnode info",
1405				    cs->sc_cinfo[i].ci_vp);
1406#endif
1407			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1408			    p->p_ucred, p);
1409			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1410		}
1411
1412		/* Free interleave index. */
1413		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1414			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1415
1416		/* Free component info and interleave table. */
1417		free(cs->sc_cinfo, M_DEVBUF);
1418		free(cs->sc_itable, M_DEVBUF);
1419		cs->sc_flags &= ~CCDF_INITED;
1420
1421		/*
1422		 * Free ccddevice information and clear entry.
1423		 */
1424		free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1425		free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1426		ccd.ccd_dk = -1;
1427		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1428
1429		/*
1430		 * And remove the devstat entry.
1431		 */
1432		devstat_remove_entry(&cs->device_stats);
1433
1434		/* This must be atomic. */
1435		s = splhigh();
1436		ccdunlock(cs);
1437		bzero(cs, sizeof(struct ccd_softc));
1438		splx(s);
1439
1440		break;
1441
1442	case DIOCGDINFO:
1443		if ((cs->sc_flags & CCDF_INITED) == 0)
1444			return (ENXIO);
1445
1446		*(struct disklabel *)data = cs->sc_label;
1447		break;
1448
1449	case DIOCGPART:
1450		if ((cs->sc_flags & CCDF_INITED) == 0)
1451			return (ENXIO);
1452
1453		((struct partinfo *)data)->disklab = &cs->sc_label;
1454		((struct partinfo *)data)->part =
1455		    &cs->sc_label.d_partitions[ccdpart(dev)];
1456		break;
1457
1458	case DIOCWDINFO:
1459	case DIOCSDINFO:
1460		if ((cs->sc_flags & CCDF_INITED) == 0)
1461			return (ENXIO);
1462
1463		if ((flag & FWRITE) == 0)
1464			return (EBADF);
1465
1466		if ((error = ccdlock(cs)) != 0)
1467			return (error);
1468
1469		cs->sc_flags |= CCDF_LABELLING;
1470
1471		error = setdisklabel(&cs->sc_label,
1472		    (struct disklabel *)data, 0);
1473		if (error == 0) {
1474			if (cmd == DIOCWDINFO)
1475				error = writedisklabel(CCDLABELDEV(dev),
1476				    &cs->sc_label);
1477		}
1478
1479		cs->sc_flags &= ~CCDF_LABELLING;
1480
1481		ccdunlock(cs);
1482
1483		if (error)
1484			return (error);
1485		break;
1486
1487	case DIOCWLABEL:
1488		if ((cs->sc_flags & CCDF_INITED) == 0)
1489			return (ENXIO);
1490
1491		if ((flag & FWRITE) == 0)
1492			return (EBADF);
1493		if (*(int *)data != 0)
1494			cs->sc_flags |= CCDF_WLABEL;
1495		else
1496			cs->sc_flags &= ~CCDF_WLABEL;
1497		break;
1498
1499	default:
1500		return (ENOTTY);
1501	}
1502
1503	return (0);
1504}
1505
1506static int
1507ccdsize(dev)
1508	dev_t dev;
1509{
1510	struct ccd_softc *cs;
1511	int part, size;
1512
1513	if (ccdopen(dev, 0, S_IFCHR, curproc))
1514		return (-1);
1515
1516	cs = &ccd_softc[ccdunit(dev)];
1517	part = ccdpart(dev);
1518
1519	if ((cs->sc_flags & CCDF_INITED) == 0)
1520		return (-1);
1521
1522	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1523		size = -1;
1524	else
1525		size = cs->sc_label.d_partitions[part].p_size;
1526
1527	if (ccdclose(dev, 0, S_IFCHR, curproc))
1528		return (-1);
1529
1530	return (size);
1531}
1532
/*
 * Crash dump entry point.  Dumping to a ccd is not implemented, so
 * every request is refused with ENXIO; dev is ignored.
 */
static int
ccddump(dev)
	dev_t dev;
{
	return (ENXIO);
}
1541
1542/*
1543 * Lookup the provided name in the filesystem.  If the file exists,
1544 * is a valid block device, and isn't being used by anyone else,
1545 * set *vpp to the file's vnode.
1546 */
1547static int
1548ccdlookup(path, p, vpp)
1549	char *path;
1550	struct proc *p;
1551	struct vnode **vpp;	/* result */
1552{
1553	struct nameidata nd;
1554	struct vnode *vp;
1555	int error;
1556
1557	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
1558	if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
1559#ifdef DEBUG
1560		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1561			printf("ccdlookup: vn_open error = %d\n", error);
1562#endif
1563		return (error);
1564	}
1565	vp = nd.ni_vp;
1566
1567	if (vp->v_usecount > 1) {
1568		error = EBUSY;
1569		goto bad;
1570	}
1571
1572	if (!vn_isdisk(vp, &error))
1573		goto bad;
1574
1575#ifdef DEBUG
1576	if (ccddebug & CCDB_VNODE)
1577		vprint("ccdlookup: vnode info", vp);
1578#endif
1579
1580	VOP_UNLOCK(vp, 0, p);
1581	NDFREE(&nd, NDF_ONLY_PNBUF);
1582	*vpp = vp;
1583	return (0);
1584bad:
1585	VOP_UNLOCK(vp, 0, p);
1586	NDFREE(&nd, NDF_ONLY_PNBUF);
1587	/* vn_close does vrele() for vp */
1588	(void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
1589	return (error);
1590}
1591
1592/*
1593 * Read the disklabel from the ccd.  If one is not present, fake one
1594 * up.
1595 */
1596static void
1597ccdgetdisklabel(dev)
1598	dev_t dev;
1599{
1600	int unit = ccdunit(dev);
1601	struct ccd_softc *cs = &ccd_softc[unit];
1602	char *errstring;
1603	struct disklabel *lp = &cs->sc_label;
1604	struct ccdgeom *ccg = &cs->sc_geom;
1605
1606	bzero(lp, sizeof(*lp));
1607
1608	lp->d_secperunit = cs->sc_size;
1609	lp->d_secsize = ccg->ccg_secsize;
1610	lp->d_nsectors = ccg->ccg_nsectors;
1611	lp->d_ntracks = ccg->ccg_ntracks;
1612	lp->d_ncylinders = ccg->ccg_ncylinders;
1613	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1614
1615	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1616	lp->d_type = DTYPE_CCD;
1617	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1618	lp->d_rpm = 3600;
1619	lp->d_interleave = 1;
1620	lp->d_flags = 0;
1621
1622	lp->d_partitions[RAW_PART].p_offset = 0;
1623	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1624	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1625	lp->d_npartitions = RAW_PART + 1;
1626
1627	lp->d_bbsize = BBSIZE;				/* XXX */
1628	lp->d_sbsize = SBSIZE;				/* XXX */
1629
1630	lp->d_magic = DISKMAGIC;
1631	lp->d_magic2 = DISKMAGIC;
1632	lp->d_checksum = dkcksum(&cs->sc_label);
1633
1634	/*
1635	 * Call the generic disklabel extraction routine.
1636	 */
1637	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1638	if (errstring != NULL)
1639		ccdmakedisklabel(cs);
1640
1641#ifdef DEBUG
1642	/* It's actually extremely common to have unlabeled ccds. */
1643	if (ccddebug & CCDB_LABEL)
1644		if (errstring != NULL)
1645			printf("ccd%d: %s\n", unit, errstring);
1646#endif
1647}
1648
1649/*
1650 * Take care of things one might want to take care of in the event
1651 * that a disklabel isn't present.
1652 */
1653static void
1654ccdmakedisklabel(cs)
1655	struct ccd_softc *cs;
1656{
1657	struct disklabel *lp = &cs->sc_label;
1658
1659	/*
1660	 * For historical reasons, if there's no disklabel present
1661	 * the raw partition must be marked FS_BSDFFS.
1662	 */
1663	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1664
1665	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1666}
1667
1668/*
1669 * Wait interruptibly for an exclusive lock.
1670 *
1671 * XXX
1672 * Several drivers do this; it should be abstracted and made MP-safe.
1673 */
1674static int
1675ccdlock(cs)
1676	struct ccd_softc *cs;
1677{
1678	int error;
1679
1680	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1681		cs->sc_flags |= CCDF_WANTED;
1682		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1683			return (error);
1684	}
1685	cs->sc_flags |= CCDF_LOCKED;
1686	return (0);
1687}
1688
1689/*
1690 * Unlock and wake up any waiters.
1691 */
1692static void
1693ccdunlock(cs)
1694	struct ccd_softc *cs;
1695{
1696
1697	cs->sc_flags &= ~CCDF_LOCKED;
1698	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1699		cs->sc_flags &= ~CCDF_WANTED;
1700		wakeup(cs);
1701	}
1702}
1703
#ifdef DEBUG
/*
 * Debugging aid: print every entry of the interleave table starting at
 * ii, one line per entry, up to (not including) the terminating entry
 * whose ii_ndisk is zero.
 */
static void
printiinfo(ii)
	struct ccdiinfo *ii;
{
	int entry, d;

	entry = 0;
	while (ii->ii_ndisk) {
		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);

		/* followed by the component indices of this entry */
		d = 0;
		while (d < ii->ii_ndisk) {
			printf(" %d", ii->ii_index[d]);
			d++;
		}
		printf("\n");

		entry++;
		ii++;
	}
}
#endif
1720
1721
1722/* Local Variables: */
1723/* c-argdecl-indent: 8 */
1724/* c-continued-statement-offset: 8 */
1725/* c-indent-level: 8 */
1726/* End: */
1727