geom_ccd.c revision 112946
1/*
2 * Copyright (c) 2003 Poul-Henning Kamp.
3 * Copyright (c) 1995 Jason R. Thorpe.
4 * Copyright (c) 1990, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * All rights reserved.
7 * Copyright (c) 1988 University of Utah.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed for the NetBSD Project
24 *	by Jason R. Thorpe.
25 * 4. The names of the authors may not be used to endorse or promote products
26 *    derived from this software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * Dynamic configuration and disklabel support by:
41 *	Jason R. Thorpe <thorpej@nas.nasa.gov>
42 *	Numerical Aerodynamic Simulation Facility
43 *	Mail Stop 258-6
44 *	NASA Ames Research Center
45 *	Moffett Field, CA 94035
46 *
47 * from: Utah $Hdr: cd.c 1.6 90/11/28$
48 *
49 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
50 *
51 *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52 *
53 * $FreeBSD: head/sys/geom/geom_ccd.c 112946 2003-04-01 15:06:26Z phk $
54 */
55
56#include <sys/param.h>
57#include <sys/systm.h>
58#include <sys/kernel.h>
59#include <sys/module.h>
60#include <sys/proc.h>
61#include <sys/bio.h>
62#include <sys/malloc.h>
63#include <sys/namei.h>
64#include <sys/conf.h>
65#include <sys/stat.h>
66#include <sys/sysctl.h>
67#include <sys/disk.h>
68#include <sys/fcntl.h>
69#include <sys/vnode.h>
70#include <geom/geom_disk.h>
71
72#include <sys/ccdvar.h>
73
74MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
75
76/*
77   This is how mirroring works (only writes are special):
78
79   When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
80   linked together by the cb_mirror field.  "cb_pflags &
81   CCDPF_MIRROR_DONE" is set to 0 on both of them.
82
83   When a component returns to ccdiodone(), it checks if "cb_pflags &
84   CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
85   flag and returns.  If it is, it means its partner has already
86   returned, so it will go to the regular cleanup.
87
88 */
89
/*
 * Per-component I/O descriptor.  One (two when mirroring a write) is
 * allocated in ccdbuffer() for each chunk of an original request and
 * freed in ccdiodone() when the component I/O completes.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	struct ccd_s	*cb_softc;	/* softc of the owning ccd unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
99
100/* bits in cb_pflags */
101#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
102
/* Convenient macros for often-used statements. */
104#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
105#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
106
107static dev_t	ccdctldev;
108
109static disk_strategy_t ccdstrategy;
110static d_ioctl_t ccdctlioctl;
111
112#define NCCDFREEHIWAT	16
113
114#define CDEV_MAJOR 74
115
116static struct cdevsw ccdctl_cdevsw = {
117	.d_open =	nullopen,
118	.d_close =	nullclose,
119	.d_ioctl =	ccdctlioctl,
120	.d_name =	"ccdctl",
121	.d_maj =	CDEV_MAJOR,
122};
123
124static LIST_HEAD(, ccd_s) ccd_softc_list =
125	LIST_HEAD_INITIALIZER(&ccd_softc_list);
126
127static struct ccd_s *ccdfind(int);
128static struct ccd_s *ccdnew(int);
129static int ccddestroy(struct ccd_s *);
130
131/* called during module initialization */
132static void ccdattach(void);
133static int ccd_modevent(module_t, int, void *);
134
135/* called by biodone() at interrupt time */
136static void ccdiodone(struct bio *bp);
137
138static void ccdstart(struct ccd_s *, struct bio *);
139static void ccdinterleave(struct ccd_s *, int);
140static int ccdinit(struct ccd_s *, char **, struct thread *);
141static int ccdlookup(char *, struct thread *p, struct vnode **);
142static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
143		      struct bio *, daddr_t, caddr_t, long);
144static int ccdlock(struct ccd_s *);
145static void ccdunlock(struct ccd_s *);
146
147
/*
 * Number of blocks to leave untouched in front of a component
 * partition.  This is to avoid violating its disklabel area when it
 * starts at the beginning of the slice.
 */
153#if !defined(CCD_OFFSET)
154#define CCD_OFFSET 16
155#endif
156
157static struct ccd_s *
158ccdfind(int unit)
159{
160	struct ccd_s *sc = NULL;
161
162	/* XXX: LOCK(unique unit numbers) */
163	LIST_FOREACH(sc, &ccd_softc_list, list) {
164		if (sc->sc_unit == unit)
165			break;
166	}
167	/* XXX: UNLOCK(unique unit numbers) */
168	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
169}
170
171static struct ccd_s *
172ccdnew(int unit)
173{
174	struct ccd_s *sc;
175
176	/* XXX: LOCK(unique unit numbers) */
177	if (IS_ALLOCATED(unit) || unit > 32)
178		return (NULL);
179
180	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
181	sc->sc_unit = unit;
182	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
183	/* XXX: UNLOCK(unique unit numbers) */
184	return (sc);
185}
186
/*
 * Unlink a softc from the global unit list and free it.  Caller must
 * have torn down all unit state first.  Always returns 0.
 */
static int
ccddestroy(struct ccd_s *sc)
{

	/* XXX: LOCK(unique unit numbers) */
	LIST_REMOVE(sc, list);
	/* XXX: UNLOCK(unique unit numbers) */
	FREE(sc, M_CCD);
	return (0);
}
197
/*
 * Called at module load.  All we need to do is create the control
 * device node through which the configuration ioctls arrive.
 */
static void
ccdattach()
{

	ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
		UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
	/* Point si_drv1 at the dev itself to mark it as the control node. */
	ccdctldev->si_drv1 = ccdctldev;
}
210
211static int
212ccd_modevent(module_t mod, int type, void *data)
213{
214	int error = 0;
215
216	switch (type) {
217	case MOD_LOAD:
218		ccdattach();
219		break;
220
221	case MOD_UNLOAD:
222		printf("ccd0: Unload not supported!\n");
223		error = EOPNOTSUPP;
224		break;
225
226	case MOD_SHUTDOWN:
227		break;
228
229	default:
230		error = EOPNOTSUPP;
231	}
232	return (error);
233}
234
235DEV_MODULE(ccd, ccd_modevent, NULL);
236
237static int
238ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
239{
240	struct ccdcinfo *ci = NULL;	/* XXX */
241	size_t size;
242	int ix;
243	struct vnode *vp;
244	size_t minsize;
245	int maxsecsize;
246	struct ccdgeom *ccg = &cs->sc_geom;
247	char *tmppath = NULL;
248	int error = 0;
249	off_t mediasize;
250	u_int sectorsize;
251
252
253	cs->sc_size = 0;
254
255	/* Allocate space for the component info. */
256	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
257	    M_CCD, M_WAITOK);
258
259	/*
260	 * Verify that each component piece exists and record
261	 * relevant information about it.
262	 */
263	maxsecsize = 0;
264	minsize = 0;
265	tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
266	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
267		vp = cs->sc_vpp[ix];
268		ci = &cs->sc_cinfo[ix];
269		ci->ci_vp = vp;
270
271		/*
272		 * Copy in the pathname of the component.
273		 */
274		if ((error = copyinstr(cpaths[ix], tmppath,
275		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
276			goto fail;
277		}
278		ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
279		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
280
281		ci->ci_dev = vn_todev(vp);
282
283		/*
284		 * Get partition information for the component.
285		 */
286		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
287		    FREAD, td->td_ucred, td);
288		if (error != 0) {
289			goto fail;
290		}
291		/*
292		 * Get partition information for the component.
293		 */
294		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
295		    FREAD, td->td_ucred, td);
296		if (error != 0) {
297			goto fail;
298		}
299		if (sectorsize > maxsecsize)
300			maxsecsize = sectorsize;
301		size = mediasize / DEV_BSIZE - CCD_OFFSET;
302
303		/*
304		 * Calculate the size, truncating to an interleave
305		 * boundary if necessary.
306		 */
307
308		if (cs->sc_ileave > 1)
309			size -= size % cs->sc_ileave;
310
311		if (size == 0) {
312			error = ENODEV;
313			goto fail;
314		}
315
316		if (minsize == 0 || size < minsize)
317			minsize = size;
318		ci->ci_size = size;
319		cs->sc_size += size;
320	}
321
322	free(tmppath, M_CCD);
323	tmppath = NULL;
324
325	/*
326	 * Don't allow the interleave to be smaller than
327	 * the biggest component sector.
328	 */
329	if ((cs->sc_ileave > 0) &&
330	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
331		error = EINVAL;
332		goto fail;
333	}
334
335	/*
336	 * If uniform interleave is desired set all sizes to that of
337	 * the smallest component.  This will guarentee that a single
338	 * interleave table is generated.
339	 *
340	 * Lost space must be taken into account when calculating the
341	 * overall size.  Half the space is lost when CCDF_MIRROR is
342	 * specified.
343	 */
344	if (cs->sc_flags & CCDF_UNIFORM) {
345		for (ci = cs->sc_cinfo;
346		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
347			ci->ci_size = minsize;
348		}
349		if (cs->sc_flags & CCDF_MIRROR) {
350			/*
351			 * Check to see if an even number of components
352			 * have been specified.  The interleave must also
353			 * be non-zero in order for us to be able to
354			 * guarentee the topology.
355			 */
356			if (cs->sc_nccdisks % 2) {
357				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
358				error = EINVAL;
359				goto fail;
360			}
361			if (cs->sc_ileave == 0) {
362				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
363				error = EINVAL;
364				goto fail;
365			}
366			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
367		} else {
368			if (cs->sc_ileave == 0) {
369				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
370				error = EINVAL;
371				goto fail;
372			}
373			cs->sc_size = cs->sc_nccdisks * minsize;
374		}
375	}
376
377	/*
378	 * Construct the interleave table.
379	 */
380	ccdinterleave(cs, cs->sc_unit);
381
382	/*
383	 * Create pseudo-geometry based on 1MB cylinders.  It's
384	 * pretty close.
385	 */
386	ccg->ccg_secsize = maxsecsize;
387	ccg->ccg_ntracks = 1;
388	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
389	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
390
391	cs->sc_flags |= CCDF_INITED;
392	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
393	return (0);
394fail:
395	while (ci > cs->sc_cinfo) {
396		ci--;
397		free(ci->ci_path, M_CCD);
398	}
399	if (tmppath != NULL)
400		free(tmppath, M_CCD);
401	free(cs->sc_cinfo, M_CCD);
402	ccddestroy(cs);
403	return (error);
404}
405
/*
 * Build the interleave table for a unit.  The table is an array of
 * ccdiinfo entries terminated by one with ii_ndisk == 0; each entry
 * describes a contiguous range of the ccd and the set of components
 * that stripe it.  Called from ccdinit() after component sizes are
 * known; allocates with M_WAITOK so it cannot fail.
 */
static void
ccdinterleave(struct ccd_s *cs, int unit)
{
	struct ccdcinfo *ci, *smallci;
	struct ccdiinfo *ii;
	daddr_t bn, lbn;
	int ix;
	u_long size;


	/*
	 * Allocate an interleave table.  The worst case occurs when each
	 * of N disks is of a different size, resulting in N interleave
	 * tables.
	 *
	 * Chances are this is too big, but we don't care.
	 */
	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
	    M_WAITOK | M_ZERO);

	/*
	 * Trivial case: no interleave (actually interleave of disk size).
	 * Each table entry represents a single component in its entirety.
	 *
	 * An interleave of 0 may not be used with a mirror setup.
	 */
	if (cs->sc_ileave == 0) {
		bn = 0;
		ii = cs->sc_itable;

		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
			/* Allocate space for ii_index. */
			ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
			ii->ii_ndisk = 1;
			ii->ii_startblk = bn;
			ii->ii_startoff = 0;
			ii->ii_index[0] = ix;
			bn += cs->sc_cinfo[ix].ci_size;
			ii++;
		}
		/* Terminate the table. */
		ii->ii_ndisk = 0;
		return;
	}

	/*
	 * The following isn't fast or pretty; it doesn't have to be.
	 * Each pass peels off the smallest remaining component size and
	 * emits one table entry covering the disks at least that large.
	 */
	size = 0;
	bn = lbn = 0;
	for (ii = cs->sc_itable; ; ii++) {
		/*
		 * Allocate space for ii_index.  We might allocate more than
		 * we use.
		 */
		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
		    M_CCD, M_WAITOK);

		/*
		 * Locate the smallest of the remaining components
		 */
		smallci = NULL;
		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
		    ci++) {
			if (ci->ci_size > size &&
			    (smallci == NULL ||
			     ci->ci_size < smallci->ci_size)) {
				smallci = ci;
			}
		}

		/*
		 * Nobody left, all done
		 */
		if (smallci == NULL) {
			ii->ii_ndisk = 0;
			free(ii->ii_index, M_CCD);
			break;
		}

		/*
		 * Record starting logical block using an sc_ileave blocksize.
		 */
		ii->ii_startblk = bn / cs->sc_ileave;

		/*
		 * Record starting component block using an sc_ileave
		 * blocksize.  This value is relative to the beginning of
		 * a component disk.
		 */
		ii->ii_startoff = lbn;

		/*
		 * Determine how many disks take part in this interleave
		 * and record their indices.
		 */
		ix = 0;
		for (ci = cs->sc_cinfo;
		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			if (ci->ci_size >= smallci->ci_size) {
				ii->ii_index[ix++] = ci - cs->sc_cinfo;
			}
		}
		ii->ii_ndisk = ix;
		/* Advance past the blocks this entry covers. */
		bn += ix * (smallci->ci_size - size);
		lbn = smallci->ci_size / cs->sc_ileave;
		size = smallci->ci_size;
	}
}
515
516static void
517ccdstrategy(struct bio *bp)
518{
519	struct ccd_s *cs;
520	int pbn;        /* in sc_secsize chunks */
521	long sz;        /* in sc_secsize chunks */
522
523	cs = bp->bio_disk->d_drv1;
524
525	pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
526	sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
527
528	/*
529	 * If out of bounds return an error. If at the EOF point,
530	 * simply read or write less.
531	 */
532
533	if (pbn < 0 || pbn >= cs->sc_size) {
534		bp->bio_resid = bp->bio_bcount;
535		if (pbn != cs->sc_size)
536			biofinish(bp, NULL, EINVAL);
537		else
538			biodone(bp);
539		return;
540	}
541
542	/*
543	 * If the request crosses EOF, truncate the request.
544	 */
545	if (pbn + sz > cs->sc_size) {
546		bp->bio_bcount = (cs->sc_size - pbn) *
547		    cs->sc_geom.ccg_secsize;
548	}
549
550	bp->bio_resid = bp->bio_bcount;
551
552	/*
553	 * "Start" the unit.
554	 */
555	ccdstart(cs, bp);
556	return;
557}
558
/*
 * Break an original request into component-sized chunks via
 * ccdbuffer() and dispatch each chunk to the underlying device(s).
 * For mirrored writes both component I/Os are started; for mirrored
 * reads only the "nearer" disk is started (the partner is kept for
 * retry by ccdiodone()).
 */
static void
ccdstart(struct ccd_s *cs, struct bio *bp)
{
	long bcount, rcount;
	struct ccdbuf *cbp[2];
	caddr_t addr;
	daddr_t bn;
	int err;

	/*
	 * Translate the partition-relative block number to an absolute.
	 */
	bn = bp->bio_blkno;

	/*
	 * Allocate component buffers and fire off the requests
	 */
	addr = bp->bio_data;
	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
		if (err) {
			printf("ccdbuffer error %d\n", err);
			/* We're screwed */
			/*
			 * NOTE(review): the bio is flagged with an error
			 * here but never biodone()'d before returning —
			 * looks like the request may be abandoned if no
			 * earlier chunk completes it; confirm.
			 */
			bp->bio_resid -= bcount;
			bp->bio_error = ENOMEM;
			bp->bio_flags |= BIO_ERROR;
			return;
		}
		rcount = cbp[0]->cb_buf.bio_bcount;

		if (cs->sc_cflags & CCDF_MIRROR) {
			/*
			 * Mirroring.  Writes go to both disks, reads are
			 * taken from whichever disk seems most appropriate.
			 *
			 * We attempt to localize reads to the disk whose arm
			 * is nearest the read request.  We ignore seeks due
			 * to writes when making this determination and we
			 * also try to avoid hogging.
			 */
			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
				BIO_STRATEGY(&cbp[0]->cb_buf);
				BIO_STRATEGY(&cbp[1]->cb_buf);
			} else {
				int pick = cs->sc_pick;
				daddr_t range = cs->sc_size / 16;

				/* Switch disks if bn is outside the window. */
				if (bn < cs->sc_blk[pick] - range ||
				    bn > cs->sc_blk[pick] + range
				) {
					cs->sc_pick = pick = 1 - pick;
				}
				cs->sc_blk[pick] = bn + btodb(rcount);
				BIO_STRATEGY(&cbp[pick]->cb_buf);
			}
		} else {
			/*
			 * Not mirroring
			 */
			BIO_STRATEGY(&cbp[0]->cb_buf);
		}
		bn += btodb(rcount);
		addr += rcount;
	}
}
624
/*
 * Build a component buffer header.
 *
 * Maps one chunk of an original request onto a single component:
 * computes the target component and component-relative block number,
 * allocates a ccdbuf (two linked ccdbufs when mirroring) and fills in
 * its bio.  The chunk never spans a component or interleave boundary;
 * the caller loops over the remainder.
 *
 * cb     - out: cb[0] (and cb[1] when mirroring) receive the ccdbufs
 * cs     - the ccd unit
 * bp     - original request
 * bn     - absolute block number of this chunk
 * addr   - data address for this chunk
 * bcount - bytes remaining in the original request
 *
 * Returns 0 or ENOMEM (M_NOWAIT allocation failure).
 */
static int
ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
{
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 */
		daddr_t sblk;

		/* Walk components until the one containing cbn is found. */
		sblk = 0;
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.
	 */
	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->cb_buf.bio_cmd = bp->bio_cmd;
	cbp->cb_buf.bio_done = ccdiodone;
	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.bio_data = addr;
	cbp->cb_buf.bio_caller2 = cbp;
	/* Clip the transfer at the component or interleave boundary. */
	if (cs->sc_ileave == 0)
              cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
 	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_softc = cs;
	cbp->cb_comp = ci - cs->sc_cinfo;

	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/* mirror, setup second I/O */
		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
		if (cbp == NULL) {
			free(cb[0], M_CCD);
			cb[0] = NULL;
			return (ENOMEM);
		}
		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
		cbp->cb_buf.bio_caller2 = cbp;
		cbp->cb_buf.bio_dev = ci2->ci_dev;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
	return (0);
}
785
/*
 * Called at interrupt time when a component I/O completes.
 * Handles the mirror handshake (see the comment block at the top of
 * the file), accumulates the completed byte count into the original
 * bio and finishes it with biodone() when everything is in.
 */
static void
ccdiodone(struct bio *ibp)
{
	struct ccdbuf *cbp;
	struct bio *bp;
	struct ccd_s *cs;
	int count;

	/* Recover our context: ccdbuf, softc and the original request. */
	cbp = ibp->bio_caller2;
	cs = cbp->cb_softc;
	bp = cbp->cb_obp;
	/*
	 * If an error occurred, report it.  If this is a mirrored
	 * configuration and the first of two possible reads, do not
	 * set the error in the bp yet because the second read may
	 * succeed.
	 */

	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
		const char *msg = "";

		if ((cs->sc_cflags & CCDF_MIRROR) &&
		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
			/*
			 * We will try our read on the other disk down
			 * below, also reverse the default pick so if we
			 * are doing a scan we do not keep hitting the
			 * bad disk first.
			 */

			msg = ", trying other disk";
			cs->sc_pick = 1 - cs->sc_pick;
			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
		} else {
			bp->bio_flags |= BIO_ERROR;
			bp->bio_error = cbp->cb_buf.bio_error ?
			    cbp->cb_buf.bio_error : EIO;
		}
		printf("ccd%d: error %d on component %d block %jd "
		    "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
		    cbp->cb_comp,
		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
		    msg);
	}

	/*
	 * Process mirror.  If we are writing, I/O has been initiated on both
	 * buffers and we fall through only after both are finished.
	 *
	 * If we are reading only one I/O is initiated at a time.  If an
	 * error occurs we initiate the second I/O and return, otherwise
	 * we free the second I/O without initiating it.
	 */

	if (cs->sc_cflags & CCDF_MIRROR) {
		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
			/*
			 * When writing, handshake with the second buffer
			 * to determine when both are done.  If both are not
			 * done, return here.
			 */
			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
				free(cbp, M_CCD);
				return;
			}
		} else {
			/*
			 * When reading, either dispose of the second buffer
			 * or initiate I/O on the second buffer if an error
			 * occurred with this one.
			 */
			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
					cbp->cb_mirror->cb_pflags |=
					    CCDPF_MIRROR_DONE;
					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
					free(cbp, M_CCD);
					return;
				} else {
					free(cbp->cb_mirror, M_CCD);
				}
			}
		}
	}

	/*
	 * use bio_caller1 to determine how big the original request was rather
	 * than bio_bcount, because bio_bcount may have been truncated for EOF.
	 *
	 * XXX We check for an error, but we do not test the resid for an
	 * aligned EOF condition.  This may result in character & block
	 * device access not recognizing EOF properly when read or written
	 * sequentially, but will not effect filesystems.
	 */
	count = (long)cbp->cb_buf.bio_caller1;
	free(cbp, M_CCD);

	/*
	 * If all done, "interrupt".
	 */
	bp->bio_resid -= count;
	if (bp->bio_resid < 0)
		panic("ccdiodone: count");
	if (bp->bio_resid == 0) {
		if (bp->bio_flags & BIO_ERROR)
			bp->bio_resid = bp->bio_bcount;
		biodone(bp);
	}
}
902
903static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);
904
905static int
906ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
907{
908	struct ccd_ioctl *ccio;
909	u_int unit;
910	dev_t dev2;
911	int error;
912
913	switch (cmd) {
914	case CCDIOCSET:
915	case CCDIOCCLR:
916		ccio = (struct ccd_ioctl *)data;
917		unit = ccio->ccio_size;
918		return (ccdioctltoo(unit, cmd, data, flag, td));
919	case CCDCONFINFO:
920		{
921		int ninit = 0;
922		struct ccdconf *conf = (struct ccdconf *)data;
923		struct ccd_s *tmpcs;
924		struct ccd_s *ubuf = conf->buffer;
925
926		/* XXX: LOCK(unique unit numbers) */
927		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
928			if (IS_INITED(tmpcs))
929				ninit++;
930
931		if (conf->size == 0) {
932			conf->size = sizeof(struct ccd_s) * ninit;
933			return (0);
934		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
935		    (conf->size % sizeof(struct ccd_s) != 0)) {
936			/* XXX: UNLOCK(unique unit numbers) */
937			return (EINVAL);
938		}
939
940		ubuf += ninit;
941		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
942			if (!IS_INITED(tmpcs))
943				continue;
944			error = copyout(tmpcs, --ubuf,
945			    sizeof(struct ccd_s));
946			if (error != 0)
947				/* XXX: UNLOCK(unique unit numbers) */
948				return (error);
949		}
950		/* XXX: UNLOCK(unique unit numbers) */
951		return (0);
952		}
953
954	case CCDCPPINFO:
955		{
956		struct ccdcpps *cpps = (struct ccdcpps *)data;
957		char *ubuf = cpps->buffer;
958		struct ccd_s *cs;
959
960
961		error = copyin(ubuf, &unit, sizeof (unit));
962		if (error)
963			return (error);
964
965		if (!IS_ALLOCATED(unit))
966			return (ENXIO);
967		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
968		cs = ccdfind(unit);
969		if (!IS_INITED(cs))
970			return (ENXIO);
971
972		{
973			int len = 0, i;
974			struct ccdcpps *cpps = (struct ccdcpps *)data;
975			char *ubuf = cpps->buffer;
976
977
978			for (i = 0; i < cs->sc_nccdisks; ++i)
979				len += cs->sc_cinfo[i].ci_pathlen;
980
981			if (cpps->size < len)
982				return (ENOMEM);
983
984			for (i = 0; i < cs->sc_nccdisks; ++i) {
985				len = cs->sc_cinfo[i].ci_pathlen;
986				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
987				    len);
988				if (error != 0)
989					return (error);
990				ubuf += len;
991			}
992			return(copyout("", ubuf, 1));
993		}
994		break;
995		}
996
997	default:
998		return (ENXIO);
999	}
1000}
1001
1002static int
1003ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td)
1004{
1005	int i, j, lookedup = 0, error = 0;
1006	struct ccd_s *cs;
1007	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1008	struct ccdgeom *ccg;
1009	char **cpp;
1010	struct vnode **vpp;
1011
1012	cs = ccdfind(unit);
1013	switch (cmd) {
1014	case CCDIOCSET:
1015		if (cs == NULL)
1016			cs = ccdnew(unit);
1017		if (IS_INITED(cs))
1018			return (EBUSY);
1019
1020		if ((flag & FWRITE) == 0)
1021			return (EBADF);
1022
1023		if ((error = ccdlock(cs)) != 0)
1024			return (error);
1025
1026		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1027			return (EINVAL);
1028
1029		/* Fill in some important bits. */
1030		cs->sc_ileave = ccio->ccio_ileave;
1031		if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1032			printf("ccd%d: disabling mirror, interleave is 0\n",
1033			    unit);
1034			ccio->ccio_flags &= ~(CCDF_MIRROR);
1035		}
1036		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1037		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1038			printf("ccd%d: mirror/parity forces uniform flag\n",
1039			       unit);
1040			ccio->ccio_flags |= CCDF_UNIFORM;
1041		}
1042		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1043
1044		/*
1045		 * Allocate space for and copy in the array of
1046		 * componet pathnames and device numbers.
1047		 */
1048		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1049		    M_CCD, M_WAITOK);
1050		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1051		    M_CCD, M_WAITOK);
1052
1053		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1054		    ccio->ccio_ndisks * sizeof(char **));
1055		if (error) {
1056			free(vpp, M_CCD);
1057			free(cpp, M_CCD);
1058			ccdunlock(cs);
1059			return (error);
1060		}
1061
1062
1063		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1064			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1065				for (j = 0; j < lookedup; ++j)
1066					(void)vn_close(vpp[j], FREAD|FWRITE,
1067					    td->td_ucred, td);
1068				free(vpp, M_CCD);
1069				free(cpp, M_CCD);
1070				ccdunlock(cs);
1071				return (error);
1072			}
1073			++lookedup;
1074		}
1075		cs->sc_vpp = vpp;
1076		cs->sc_nccdisks = ccio->ccio_ndisks;
1077
1078		/*
1079		 * Initialize the ccd.  Fills in the softc for us.
1080		 */
1081		if ((error = ccdinit(cs, cpp, td)) != 0) {
1082			for (j = 0; j < lookedup; ++j)
1083				(void)vn_close(vpp[j], FREAD|FWRITE,
1084				    td->td_ucred, td);
1085			/*
1086			 * We can't ccddestroy() cs just yet, because nothing
1087			 * prevents user-level app to do another ioctl()
1088			 * without closing the device first, therefore
1089			 * declare unit null and void and let ccdclose()
1090			 * destroy it when it is safe to do so.
1091			 */
1092			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1093			free(vpp, M_CCD);
1094			free(cpp, M_CCD);
1095			ccdunlock(cs);
1096			return (error);
1097		}
1098		free(cpp, M_CCD);
1099
1100		/*
1101		 * The ccd has been successfully initialized, so
1102		 * we can place it into the array and read the disklabel.
1103		 */
1104		ccio->ccio_unit = unit;
1105		ccio->ccio_size = cs->sc_size;
1106		ccg = &cs->sc_geom;
1107		cs->sc_disk = malloc(sizeof(struct disk), M_CCD,
1108		    M_ZERO | M_WAITOK);
1109		cs->sc_disk->d_strategy = ccdstrategy;
1110		cs->sc_disk->d_name = "ccd";
1111		cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
1112		cs->sc_disk->d_mediasize =
1113		    cs->sc_size * (off_t)ccg->ccg_secsize;
1114		cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
1115		cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
1116		cs->sc_disk->d_drv1 = cs;
1117		cs->sc_disk->d_maxsize = MAXPHYS;
1118		disk_create(unit, cs->sc_disk, 0, NULL, NULL);
1119
1120		ccdunlock(cs);
1121
1122		break;
1123
1124	case CCDIOCCLR:
1125		if (cs == NULL)
1126			return (ENXIO);
1127
1128		if (!IS_INITED(cs))
1129			return (ENXIO);
1130
1131		if ((flag & FWRITE) == 0)
1132			return (EBADF);
1133
1134		if ((error = ccdlock(cs)) != 0)
1135			return (error);
1136
1137		/* Don't unconfigure if any other partitions are open */
1138		if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
1139			ccdunlock(cs);
1140			return (EBUSY);
1141		}
1142
1143		disk_destroy(cs->sc_disk);
1144		free(cs->sc_disk, M_CCD);
1145		cs->sc_disk = NULL;
1146		/* Declare unit null and void (reset all flags) */
1147		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1148
1149		/* Close the components and free their pathnames. */
1150		for (i = 0; i < cs->sc_nccdisks; ++i) {
1151			/*
1152			 * XXX: this close could potentially fail and
1153			 * cause Bad Things.  Maybe we need to force
1154			 * the close to happen?
1155			 */
1156			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1157			    td->td_ucred, td);
1158			free(cs->sc_cinfo[i].ci_path, M_CCD);
1159		}
1160
1161		/* Free interleave index. */
1162		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1163			free(cs->sc_itable[i].ii_index, M_CCD);
1164
1165		/* Free component info and interleave table. */
1166		free(cs->sc_cinfo, M_CCD);
1167		free(cs->sc_itable, M_CCD);
1168		free(cs->sc_vpp, M_CCD);
1169
1170		/* This must be atomic. */
1171		ccdunlock(cs);
1172		ccddestroy(cs);
1173
1174		break;
1175	}
1176
1177	return (0);
1178}
1179
1180
/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid disk device, and isn't being used by anyone else,
 * open it FREAD|FWRITE and set *vpp to the file's vnode (returned
 * unlocked).  Returns 0 or an errno; on error nothing is held open.
 */
static int
ccdlookup(char *path, struct thread *td, struct vnode **vpp)
{
	struct nameidata nd;
	struct vnode *vp;
	int error, flags;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
	flags = FREAD | FWRITE;
	if ((error = vn_open(&nd, &flags, 0)) != 0) {
		return (error);
	}
	vp = nd.ni_vp;

	/* Refuse components that are already in use elsewhere. */
	if (vrefcnt(vp) > 1) {
		error = EBUSY;
		goto bad;
	}

	/* Only disk devices may be ccd components. */
	if (!vn_isdisk(vp, &error))
		goto bad;


	VOP_UNLOCK(vp, 0, td);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	*vpp = vp;
	return (0);
bad:
	VOP_UNLOCK(vp, 0, td);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* vn_close does vrele() for vp */
	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
	return (error);
}
1220
/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 */
1228static int
1229ccdlock(struct ccd_s *cs)
1230{
1231	int error;
1232
1233	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1234		cs->sc_flags |= CCDF_WANTED;
1235		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1236			return (error);
1237	}
1238	cs->sc_flags |= CCDF_LOCKED;
1239	return (0);
1240}
1241
1242/*
1243 * Unlock and wake up any waiters.
1244 */
1245static void
1246ccdunlock(struct ccd_s *cs)
1247{
1248
1249	cs->sc_flags &= ~CCDF_LOCKED;
1250	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1251		cs->sc_flags &= ~CCDF_WANTED;
1252		wakeup(cs);
1253	}
1254}
1255