geom_ccd.c revision 109534
1/*
2 * Copyright (c) 2003 Poul-Henning Kamp.
3 * Copyright (c) 1995 Jason R. Thorpe.
4 * Copyright (c) 1990, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * All rights reserved.
7 * Copyright (c) 1988 University of Utah.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed for the NetBSD Project
24 *	by Jason R. Thorpe.
25 * 4. The names of the authors may not be used to endorse or promote products
26 *    derived from this software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * Dynamic configuration and disklabel support by:
41 *	Jason R. Thorpe <thorpej@nas.nasa.gov>
42 *	Numerical Aerodynamic Simulation Facility
43 *	Mail Stop 258-6
44 *	NASA Ames Research Center
45 *	Moffett Field, CA 94035
46 *
47 * from: Utah $Hdr: cd.c 1.6 90/11/28$
48 *
49 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
50 *
51 *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52 *
53 * $FreeBSD: head/sys/geom/geom_ccd.c 109534 2003-01-19 14:35:38Z phk $
54 */
55
56#include <sys/param.h>
57#include <sys/systm.h>
58#include <sys/kernel.h>
59#include <sys/module.h>
60#include <sys/proc.h>
61#include <sys/bio.h>
62#include <sys/malloc.h>
63#include <sys/namei.h>
64#include <sys/conf.h>
65#include <sys/stat.h>
66#include <sys/stdint.h>
67#include <sys/sysctl.h>
68#include <sys/disk.h>
69#include <sys/disklabel.h>
70#include <sys/devicestat.h>
71#include <sys/fcntl.h>
72#include <sys/vnode.h>
73
74#include <sys/ccdvar.h>
75
76MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
77
78static u_int
79ccdunit(dev_t dev)
80{
81        return (((minor(dev) >> 16) & 0x1e0) | ((minor(dev) >> 3) & 0x1f));
82}
83
/* Partition index of a ccd device: the low three bits of its minor number. */
#define ccdpart(dev)	(minor(dev) & 7)
85
86/*
87   This is how mirroring works (only writes are special):
88
89   When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
90   linked together by the cb_mirror field.  "cb_pflags &
91   CCDPF_MIRROR_DONE" is set to 0 on both of them.
92
93   When a component returns to ccdiodone(), it checks if "cb_pflags &
94   CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
95   flag and returns.  If it is, it means its partner has already
96   returned, so it will go to the regular cleanup.
97
98 */
99
100struct ccdbuf {
101	struct bio	cb_buf;		/* new I/O buf */
102	struct bio	*cb_obp;	/* ptr. to original I/O buf */
103	struct ccdbuf	*cb_freenext;	/* free list link */
104	int		cb_unit;	/* target unit */
105	int		cb_comp;	/* target component */
106	int		cb_pflags;	/* mirror/parity status flag */
107	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
108};
109
/* Flag bits kept in cb_pflags. */
#define CCDPF_MIRROR_DONE	1	/* set once the mirror counterpart is done */

/* Device number of the raw partition of the ccd unit that d belongs to. */
#define CCDLABELDEV(d)	\
	(makedev(major((d)), dkmakeminor(ccdunit((d)), 0, RAW_PART)))

/* Convenience macros for often-used tests. */
#define IS_ALLOCATED(u)		(ccdfind(u) != NULL)
#define IS_INITED(sc)		(((sc)->sc_flags & CCDF_INITED) != 0)
119
120
121static dev_t	ccdctldev;
122
123
124static d_open_t ccdopen;
125static d_close_t ccdclose;
126static d_strategy_t ccdstrategy;
127static d_ioctl_t ccdioctl;
128static d_ioctl_t ccdioctltoo;
129static d_psize_t ccdsize;
130
131#define NCCDFREEHIWAT	16
132
133#define CDEV_MAJOR 74
134
135static struct cdevsw ccd_cdevsw = {
136	/* open */	ccdopen,
137	/* close */	ccdclose,
138	/* read */	physread,
139	/* write */	physwrite,
140	/* ioctl */	ccdioctl,
141	/* poll */	nopoll,
142	/* mmap */	nommap,
143	/* strategy */	ccdstrategy,
144	/* name */	"ccd",
145	/* maj */	CDEV_MAJOR,
146	/* dump */	nodump,
147	/* psize */	ccdsize,
148	/* flags */	D_DISK,
149};
150static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
151
152static struct ccd_s *ccdfind(int);
153static struct ccd_s *ccdnew(int);
154static int ccddestroy(struct ccd_s *, struct proc *);
155
156/* called during module initialization */
157static void ccdattach(void);
158static int ccd_modevent(module_t, int, void *);
159
160/* called by biodone() at interrupt time */
161static void ccdiodone(struct bio *bp);
162
163static void ccdstart(struct ccd_s *, struct bio *);
164static void ccdinterleave(struct ccd_s *, int);
165static int ccdinit(struct ccd_s *, char **, struct thread *);
166static int ccdlookup(char *, struct thread *p, struct vnode **);
167static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
168		      struct bio *, daddr_t, caddr_t, long);
169static void ccdgetdisklabel(dev_t);
170static void ccdmakedisklabel(struct ccd_s *);
171static int ccdlock(struct ccd_s *);
172static void ccdunlock(struct ccd_s *);
173
174
/*
 * Number of blocks to leave untouched at the front of a component
 * partition.  This avoids overwriting the component's disklabel area
 * when the component starts at the beginning of the slice.
 */
#ifndef CCD_OFFSET
#define CCD_OFFSET 16
#endif
183
184static struct ccd_s *
185ccdfind(int unit)
186{
187	struct ccd_s *sc = NULL;
188
189	/* XXX: LOCK(unique unit numbers) */
190	LIST_FOREACH(sc, &ccd_softc_list, list) {
191		if (sc->sc_unit == unit)
192			break;
193	}
194	/* XXX: UNLOCK(unique unit numbers) */
195	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
196}
197
198static struct ccd_s *
199ccdnew(int unit)
200{
201	struct ccd_s *sc;
202
203	/* XXX: LOCK(unique unit numbers) */
204	if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
205		return (NULL);
206
207	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
208	sc->sc_unit = unit;
209	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
210	/* XXX: UNLOCK(unique unit numbers) */
211	return (sc);
212}
213
214static int
215ccddestroy(struct ccd_s *sc, struct proc *p)
216{
217
218	/* XXX: LOCK(unique unit numbers) */
219	LIST_REMOVE(sc, list);
220	/* XXX: UNLOCK(unique unit numbers) */
221	FREE(sc, M_CCD);
222	return (0);
223}
224
225static void
226ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
227{
228	int i, u;
229	char *s;
230
231	if (*dev != NODEV)
232		return;
233	i = dev_stdclone(name, &s, "ccd", &u);
234	if (i != 2)
235		return;
236	if (*s < 'a' || *s > 'h')
237		return;
238	if (s[1] != '\0')
239		return;
240	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
241		UID_ROOT, GID_OPERATOR, 0640, name);
242}
243
244/*
245 * Called by main() during pseudo-device attachment.  All we need
246 * to do is to add devsw entries.
247 */
248static void
249ccdattach()
250{
251
252	ccdctldev = make_dev(&ccd_cdevsw, 0xffff00ff,
253		UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
254	ccdctldev->si_drv1 = ccdctldev;
255	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
256}
257
258static int
259ccd_modevent(module_t mod, int type, void *data)
260{
261	int error = 0;
262
263	switch (type) {
264	case MOD_LOAD:
265		ccdattach();
266		break;
267
268	case MOD_UNLOAD:
269		printf("ccd0: Unload not supported!\n");
270		error = EOPNOTSUPP;
271		break;
272
273	case MOD_SHUTDOWN:
274		break;
275
276	default:
277		error = EOPNOTSUPP;
278	}
279	return (error);
280}
281
282DEV_MODULE(ccd, ccd_modevent, NULL);
283
284static int
285ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
286{
287	struct ccdcinfo *ci = NULL;	/* XXX */
288	size_t size;
289	int ix;
290	struct vnode *vp;
291	size_t minsize;
292	int maxsecsize;
293	struct ccdgeom *ccg = &cs->sc_geom;
294	char *tmppath = NULL;
295	int error = 0;
296	off_t mediasize;
297	u_int sectorsize;
298
299
300	cs->sc_size = 0;
301
302	/* Allocate space for the component info. */
303	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
304	    M_CCD, M_WAITOK);
305
306	/*
307	 * Verify that each component piece exists and record
308	 * relevant information about it.
309	 */
310	maxsecsize = 0;
311	minsize = 0;
312	tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
313	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
314		vp = cs->sc_vpp[ix];
315		ci = &cs->sc_cinfo[ix];
316		ci->ci_vp = vp;
317
318		/*
319		 * Copy in the pathname of the component.
320		 */
321		if ((error = copyinstr(cpaths[ix], tmppath,
322		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
323			goto fail;
324		}
325		ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
326		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
327
328		ci->ci_dev = vn_todev(vp);
329
330		/*
331		 * Get partition information for the component.
332		 */
333		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
334		    FREAD, td->td_ucred, td);
335		if (error != 0) {
336			goto fail;
337		}
338		/*
339		 * Get partition information for the component.
340		 */
341		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
342		    FREAD, td->td_ucred, td);
343		if (error != 0) {
344			goto fail;
345		}
346		if (sectorsize > maxsecsize)
347			maxsecsize = sectorsize;
348		size = mediasize / DEV_BSIZE - CCD_OFFSET;
349
350		/*
351		 * Calculate the size, truncating to an interleave
352		 * boundary if necessary.
353		 */
354
355		if (cs->sc_ileave > 1)
356			size -= size % cs->sc_ileave;
357
358		if (size == 0) {
359			error = ENODEV;
360			goto fail;
361		}
362
363		if (minsize == 0 || size < minsize)
364			minsize = size;
365		ci->ci_size = size;
366		cs->sc_size += size;
367	}
368
369	free(tmppath, M_CCD);
370	tmppath = NULL;
371
372	/*
373	 * Don't allow the interleave to be smaller than
374	 * the biggest component sector.
375	 */
376	if ((cs->sc_ileave > 0) &&
377	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
378		error = EINVAL;
379		goto fail;
380	}
381
382	/*
383	 * If uniform interleave is desired set all sizes to that of
384	 * the smallest component.  This will guarentee that a single
385	 * interleave table is generated.
386	 *
387	 * Lost space must be taken into account when calculating the
388	 * overall size.  Half the space is lost when CCDF_MIRROR is
389	 * specified.
390	 */
391	if (cs->sc_flags & CCDF_UNIFORM) {
392		for (ci = cs->sc_cinfo;
393		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
394			ci->ci_size = minsize;
395		}
396		if (cs->sc_flags & CCDF_MIRROR) {
397			/*
398			 * Check to see if an even number of components
399			 * have been specified.  The interleave must also
400			 * be non-zero in order for us to be able to
401			 * guarentee the topology.
402			 */
403			if (cs->sc_nccdisks % 2) {
404				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
405				error = EINVAL;
406				goto fail;
407			}
408			if (cs->sc_ileave == 0) {
409				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
410				error = EINVAL;
411				goto fail;
412			}
413			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
414		} else {
415			if (cs->sc_ileave == 0) {
416				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
417				error = EINVAL;
418				goto fail;
419			}
420			cs->sc_size = cs->sc_nccdisks * minsize;
421		}
422	}
423
424	/*
425	 * Construct the interleave table.
426	 */
427	ccdinterleave(cs, cs->sc_unit);
428
429	/*
430	 * Create pseudo-geometry based on 1MB cylinders.  It's
431	 * pretty close.
432	 */
433	ccg->ccg_secsize = maxsecsize;
434	ccg->ccg_ntracks = 1;
435	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
436	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
437
438	/*
439	 * Add a devstat entry for this device.
440	 */
441	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
442			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
443			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
444			  DEVSTAT_PRIORITY_ARRAY);
445
446	cs->sc_flags |= CCDF_INITED;
447	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
448	return (0);
449fail:
450	while (ci > cs->sc_cinfo) {
451		ci--;
452		free(ci->ci_path, M_CCD);
453	}
454	if (tmppath != NULL)
455		free(tmppath, M_CCD);
456	free(cs->sc_cinfo, M_CCD);
457	return (error);
458}
459
460static void
461ccdinterleave(struct ccd_s *cs, int unit)
462{
463	struct ccdcinfo *ci, *smallci;
464	struct ccdiinfo *ii;
465	daddr_t bn, lbn;
466	int ix;
467	u_long size;
468
469
470	/*
471	 * Allocate an interleave table.  The worst case occurs when each
472	 * of N disks is of a different size, resulting in N interleave
473	 * tables.
474	 *
475	 * Chances are this is too big, but we don't care.
476	 */
477	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
478	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
479	    M_WAITOK | M_ZERO);
480
481	/*
482	 * Trivial case: no interleave (actually interleave of disk size).
483	 * Each table entry represents a single component in its entirety.
484	 *
485	 * An interleave of 0 may not be used with a mirror setup.
486	 */
487	if (cs->sc_ileave == 0) {
488		bn = 0;
489		ii = cs->sc_itable;
490
491		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
492			/* Allocate space for ii_index. */
493			ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
494			ii->ii_ndisk = 1;
495			ii->ii_startblk = bn;
496			ii->ii_startoff = 0;
497			ii->ii_index[0] = ix;
498			bn += cs->sc_cinfo[ix].ci_size;
499			ii++;
500		}
501		ii->ii_ndisk = 0;
502		return;
503	}
504
505	/*
506	 * The following isn't fast or pretty; it doesn't have to be.
507	 */
508	size = 0;
509	bn = lbn = 0;
510	for (ii = cs->sc_itable; ; ii++) {
511		/*
512		 * Allocate space for ii_index.  We might allocate more then
513		 * we use.
514		 */
515		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
516		    M_CCD, M_WAITOK);
517
518		/*
519		 * Locate the smallest of the remaining components
520		 */
521		smallci = NULL;
522		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
523		    ci++) {
524			if (ci->ci_size > size &&
525			    (smallci == NULL ||
526			     ci->ci_size < smallci->ci_size)) {
527				smallci = ci;
528			}
529		}
530
531		/*
532		 * Nobody left, all done
533		 */
534		if (smallci == NULL) {
535			ii->ii_ndisk = 0;
536			free(ii->ii_index, M_CCD);
537			break;
538		}
539
540		/*
541		 * Record starting logical block using an sc_ileave blocksize.
542		 */
543		ii->ii_startblk = bn / cs->sc_ileave;
544
545		/*
546		 * Record starting comopnent block using an sc_ileave
547		 * blocksize.  This value is relative to the beginning of
548		 * a component disk.
549		 */
550		ii->ii_startoff = lbn;
551
552		/*
553		 * Determine how many disks take part in this interleave
554		 * and record their indices.
555		 */
556		ix = 0;
557		for (ci = cs->sc_cinfo;
558		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
559			if (ci->ci_size >= smallci->ci_size) {
560				ii->ii_index[ix++] = ci - cs->sc_cinfo;
561			}
562		}
563		ii->ii_ndisk = ix;
564		bn += ix * (smallci->ci_size - size);
565		lbn = smallci->ci_size / cs->sc_ileave;
566		size = smallci->ci_size;
567	}
568}
569
570/* ARGSUSED */
571static int
572ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
573{
574	int unit = ccdunit(dev);
575	struct ccd_s *cs;
576	struct disklabel *lp;
577	int error = 0, part, pmask;
578
579	if (dev->si_drv1 == dev)
580		return (0);
581
582	cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
583
584	if ((error = ccdlock(cs)) != 0)
585		return (error);
586
587	lp = &cs->sc_label;
588
589	part = ccdpart(dev);
590	pmask = (1 << part);
591
592	/*
593	 * If we're initialized, check to see if there are any other
594	 * open partitions.  If not, then it's safe to update
595	 * the in-core disklabel.
596	 */
597	if (IS_INITED(cs) && (cs->sc_openmask == 0))
598		ccdgetdisklabel(dev);
599
600	/* Check that the partition exists. */
601	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
602	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
603		error = ENXIO;
604		goto done;
605	}
606
607	cs->sc_openmask |= pmask;
608 done:
609	ccdunlock(cs);
610	return (0);
611}
612
613/* ARGSUSED */
614static int
615ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
616{
617	int unit = ccdunit(dev);
618	struct ccd_s *cs;
619	int error = 0, part;
620
621	if (dev->si_drv1 == dev)
622		return (0);
623
624	if (!IS_ALLOCATED(unit))
625		return (ENXIO);
626	cs = ccdfind(unit);
627
628	if ((error = ccdlock(cs)) != 0)
629		return (error);
630
631	part = ccdpart(dev);
632
633	/* ...that much closer to allowing unconfiguration... */
634	cs->sc_openmask &= ~(1 << part);
635	/* collect "garbage" if possible */
636	if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
637		ccddestroy(cs, td->td_proc);
638	else
639		ccdunlock(cs);
640	return (0);
641}
642
643static void
644ccdstrategy(struct bio *bp)
645{
646	int unit = ccdunit(bp->bio_dev);
647	struct ccd_s *cs = ccdfind(unit);
648	int s;
649	int wlabel;
650	struct disklabel *lp;
651
652	if (bp->bio_dev->si_drv1 == bp->bio_dev) {
653		biofinish(bp, NULL, ENXIO);
654		return;
655	}
656	if (!IS_INITED(cs)) {
657		biofinish(bp, NULL, ENXIO);
658		return;
659	}
660
661	/* If it's a nil transfer, wake up the top half now. */
662	if (bp->bio_bcount == 0) {
663		biodone(bp);
664		return;
665	}
666
667	lp = &cs->sc_label;
668
669	/*
670	 * Do bounds checking and adjust transfer.  If there's an
671	 * error, the bounds check will flag that for us.
672	 */
673	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
674	if (ccdpart(bp->bio_dev) != RAW_PART) {
675		if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
676			biodone(bp);
677			return;
678		}
679	} else {
680		int pbn;        /* in sc_secsize chunks */
681		long sz;        /* in sc_secsize chunks */
682
683		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
684		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
685
686		/*
687		 * If out of bounds return an error. If at the EOF point,
688		 * simply read or write less.
689		 */
690
691		if (pbn < 0 || pbn >= cs->sc_size) {
692			bp->bio_resid = bp->bio_bcount;
693			if (pbn != cs->sc_size)
694				biofinish(bp, NULL, EINVAL);
695			else
696				biodone(bp);
697			return;
698		}
699
700		/*
701		 * If the request crosses EOF, truncate the request.
702		 */
703		if (pbn + sz > cs->sc_size) {
704			bp->bio_bcount = (cs->sc_size - pbn) *
705			    cs->sc_geom.ccg_secsize;
706		}
707	}
708
709	bp->bio_resid = bp->bio_bcount;
710
711	/*
712	 * "Start" the unit.
713	 */
714	s = splbio();
715	ccdstart(cs, bp);
716	splx(s);
717	return;
718}
719
720static void
721ccdstart(struct ccd_s *cs, struct bio *bp)
722{
723	long bcount, rcount;
724	struct ccdbuf *cbp[4];
725	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
726	caddr_t addr;
727	daddr_t bn;
728	struct partition *pp;
729
730
731	/* Record the transaction start  */
732	devstat_start_transaction(&cs->device_stats);
733
734	/*
735	 * Translate the partition-relative block number to an absolute.
736	 */
737	bn = bp->bio_blkno;
738	if (ccdpart(bp->bio_dev) != RAW_PART) {
739		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
740		bn += pp->p_offset;
741	}
742
743	/*
744	 * Allocate component buffers and fire off the requests
745	 */
746	addr = bp->bio_data;
747	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
748		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
749		rcount = cbp[0]->cb_buf.bio_bcount;
750
751		if (cs->sc_cflags & CCDF_MIRROR) {
752			/*
753			 * Mirroring.  Writes go to both disks, reads are
754			 * taken from whichever disk seems most appropriate.
755			 *
756			 * We attempt to localize reads to the disk whos arm
757			 * is nearest the read request.  We ignore seeks due
758			 * to writes when making this determination and we
759			 * also try to avoid hogging.
760			 */
761			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
762				BIO_STRATEGY(&cbp[0]->cb_buf);
763				BIO_STRATEGY(&cbp[1]->cb_buf);
764			} else {
765				int pick = cs->sc_pick;
766				daddr_t range = cs->sc_size / 16;
767
768				if (bn < cs->sc_blk[pick] - range ||
769				    bn > cs->sc_blk[pick] + range
770				) {
771					cs->sc_pick = pick = 1 - pick;
772				}
773				cs->sc_blk[pick] = bn + btodb(rcount);
774				BIO_STRATEGY(&cbp[pick]->cb_buf);
775			}
776		} else {
777			/*
778			 * Not mirroring
779			 */
780			BIO_STRATEGY(&cbp[0]->cb_buf);
781		}
782		bn += btodb(rcount);
783		addr += rcount;
784	}
785}
786
787/*
788 * Build a component buffer header.
789 */
790static void
791ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
792{
793	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
794	struct ccdbuf *cbp;
795	daddr_t cbn, cboff;
796	off_t cbc;
797
798	/*
799	 * Determine which component bn falls in.
800	 */
801	cbn = bn;
802	cboff = 0;
803
804	if (cs->sc_ileave == 0) {
805		/*
806		 * Serially concatenated and neither a mirror nor a parity
807		 * config.  This is a special case.
808		 */
809		daddr_t sblk;
810
811		sblk = 0;
812		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
813			sblk += ci->ci_size;
814		cbn -= sblk;
815	} else {
816		struct ccdiinfo *ii;
817		int ccdisk, off;
818
819		/*
820		 * Calculate cbn, the logical superblock (sc_ileave chunks),
821		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
822		 * to cbn.
823		 */
824		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
825		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
826
827		/*
828		 * Figure out which interleave table to use.
829		 */
830		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
831			if (ii->ii_startblk > cbn)
832				break;
833		}
834		ii--;
835
836		/*
837		 * off is the logical superblock relative to the beginning
838		 * of this interleave block.
839		 */
840		off = cbn - ii->ii_startblk;
841
842		/*
843		 * We must calculate which disk component to use (ccdisk),
844		 * and recalculate cbn to be the superblock relative to
845		 * the beginning of the component.  This is typically done by
846		 * adding 'off' and ii->ii_startoff together.  However, 'off'
847		 * must typically be divided by the number of components in
848		 * this interleave array to be properly convert it from a
849		 * CCD-relative logical superblock number to a
850		 * component-relative superblock number.
851		 */
852		if (ii->ii_ndisk == 1) {
853			/*
854			 * When we have just one disk, it can't be a mirror
855			 * or a parity config.
856			 */
857			ccdisk = ii->ii_index[0];
858			cbn = ii->ii_startoff + off;
859		} else {
860			if (cs->sc_cflags & CCDF_MIRROR) {
861				/*
862				 * We have forced a uniform mapping, resulting
863				 * in a single interleave array.  We double
864				 * up on the first half of the available
865				 * components and our mirror is in the second
866				 * half.  This only works with a single
867				 * interleave array because doubling up
868				 * doubles the number of sectors, so there
869				 * cannot be another interleave array because
870				 * the next interleave array's calculations
871				 * would be off.
872				 */
873				int ndisk2 = ii->ii_ndisk / 2;
874				ccdisk = ii->ii_index[off % ndisk2];
875				cbn = ii->ii_startoff + off / ndisk2;
876				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
877			} else {
878				ccdisk = ii->ii_index[off % ii->ii_ndisk];
879				cbn = ii->ii_startoff + off / ii->ii_ndisk;
880			}
881		}
882
883		ci = &cs->sc_cinfo[ccdisk];
884
885		/*
886		 * Convert cbn from a superblock to a normal block so it
887		 * can be used to calculate (along with cboff) the normal
888		 * block index into this particular disk.
889		 */
890		cbn *= cs->sc_ileave;
891	}
892
893	/*
894	 * Fill in the component buf structure.
895	 */
896	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_WAITOK | M_ZERO);
897	cbp->cb_buf.bio_cmd = bp->bio_cmd;
898	cbp->cb_buf.bio_done = ccdiodone;
899	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
900	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
901	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
902	cbp->cb_buf.bio_data = addr;
903	if (cs->sc_ileave == 0)
904              cbc = dbtob((off_t)(ci->ci_size - cbn));
905	else
906              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
907	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
908 	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
909
910	/*
911	 * context for ccdiodone
912	 */
913	cbp->cb_obp = bp;
914	cbp->cb_unit = cs->sc_unit;
915	cbp->cb_comp = ci - cs->sc_cinfo;
916
917	cb[0] = cbp;
918
919	/*
920	 * Note: both I/O's setup when reading from mirror, but only one
921	 * will be executed.
922	 */
923	if (cs->sc_cflags & CCDF_MIRROR) {
924		/* mirror, setup second I/O */
925		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_WAITOK);
926		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
927		cbp->cb_buf.bio_dev = ci2->ci_dev;
928		cbp->cb_comp = ci2 - cs->sc_cinfo;
929		cb[1] = cbp;
930		/* link together the ccdbuf's and clear "mirror done" flag */
931		cb[0]->cb_mirror = cb[1];
932		cb[1]->cb_mirror = cb[0];
933		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
934		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
935	}
936}
937
938/*
939 * Called at interrupt time.
940 * Mark the component as done and if all components are done,
941 * take a ccd interrupt.
942 */
943static void
944ccdiodone(struct bio *ibp)
945{
946	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
947	struct bio *bp = cbp->cb_obp;
948	int unit = cbp->cb_unit;
949	struct ccd_s *cs;
950	int count, s;
951
952	cs = ccdfind(unit);
953	s = splbio();
954	/*
955	 * If an error occured, report it.  If this is a mirrored
956	 * configuration and the first of two possible reads, do not
957	 * set the error in the bp yet because the second read may
958	 * succeed.
959	 */
960
961	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
962		const char *msg = "";
963
964		if ((cs->sc_cflags & CCDF_MIRROR) &&
965		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
966		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
967			/*
968			 * We will try our read on the other disk down
969			 * below, also reverse the default pick so if we
970			 * are doing a scan we do not keep hitting the
971			 * bad disk first.
972			 */
973
974			msg = ", trying other disk";
975			cs->sc_pick = 1 - cs->sc_pick;
976			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
977		} else {
978			bp->bio_flags |= BIO_ERROR;
979			bp->bio_error = cbp->cb_buf.bio_error ?
980			    cbp->cb_buf.bio_error : EIO;
981		}
982		printf("ccd%d: error %d on component %d block %jd "
983		    "(ccd block %jd)%s\n", unit, bp->bio_error, cbp->cb_comp,
984		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
985		    msg);
986	}
987
988	/*
989	 * Process mirror.  If we are writing, I/O has been initiated on both
990	 * buffers and we fall through only after both are finished.
991	 *
992	 * If we are reading only one I/O is initiated at a time.  If an
993	 * error occurs we initiate the second I/O and return, otherwise
994	 * we free the second I/O without initiating it.
995	 */
996
997	if (cs->sc_cflags & CCDF_MIRROR) {
998		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
999			/*
1000			 * When writing, handshake with the second buffer
1001			 * to determine when both are done.  If both are not
1002			 * done, return here.
1003			 */
1004			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1005				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1006				free(cbp, M_CCD);
1007				splx(s);
1008				return;
1009			}
1010		} else {
1011			/*
1012			 * When reading, either dispose of the second buffer
1013			 * or initiate I/O on the second buffer if an error
1014			 * occured with this one.
1015			 */
1016			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1017				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1018					cbp->cb_mirror->cb_pflags |=
1019					    CCDPF_MIRROR_DONE;
1020					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
1021					free(cbp, M_CCD);
1022					splx(s);
1023					return;
1024				} else {
1025					free(cbp->cb_mirror, M_CCD);
1026				}
1027			}
1028		}
1029	}
1030
1031	/*
1032	 * use bio_caller1 to determine how big the original request was rather
1033	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1034	 *
1035	 * XXX We check for an error, but we do not test the resid for an
1036	 * aligned EOF condition.  This may result in character & block
1037	 * device access not recognizing EOF properly when read or written
1038	 * sequentially, but will not effect filesystems.
1039	 */
1040	count = (long)cbp->cb_buf.bio_caller1;
1041	free(cbp, M_CCD);
1042
1043	/*
1044	 * If all done, "interrupt".
1045	 */
1046	bp->bio_resid -= count;
1047	if (bp->bio_resid < 0)
1048		panic("ccdiodone: count");
1049	if (bp->bio_resid == 0) {
1050		if (bp->bio_flags & BIO_ERROR)
1051			bp->bio_resid = bp->bio_bcount;
1052		biofinish(bp, &cs->device_stats, 0);
1053	}
1054	splx(s);
1055}
1056
1057static int
1058ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1059{
1060	struct ccd_ioctl *ccio;
1061	u_int unit;
1062	dev_t dev2;
1063	int error;
1064
1065	if (dev->si_drv1 != dev) {
1066		switch (cmd) {
1067		case CCDIOCSET:
1068		case CCDIOCCLR:
1069		case CCDCONFINFO:
1070		case CCDCPPINFO:
1071			printf("*** WARNING: upgrade your ccdconfig(8) binary\n");
1072			printf("*** WARNING: continuing in 30 seconds\n");
1073			tsleep(dev, PRIBIO, "ccdbug", hz * 30);
1074			break;
1075		}
1076		return ccdioctltoo(dev, cmd, data, flag, td);
1077	}
1078	switch (cmd) {
1079	case CCDIOCSET:
1080	case CCDIOCCLR:
1081		ccio = (struct ccd_ioctl *)data;
1082		unit = ccio->ccio_size;
1083		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
1084		if (!(dev2->si_flags & SI_NAMED)) {
1085			dev2 = make_dev(&ccd_cdevsw, unit * 8 + 2,
1086				UID_ROOT, GID_OPERATOR, 0640, "ccd%dc", unit);
1087			ccdnew(unit);
1088		}
1089		return (ccdioctltoo(dev2, cmd, data, flag, td));
1090	case CCDCONFINFO:
1091		{
1092		int ninit = 0;
1093		struct ccdconf *conf = (struct ccdconf *)data;
1094		struct ccd_s *tmpcs;
1095		struct ccd_s *ubuf = conf->buffer;
1096
1097		/* XXX: LOCK(unique unit numbers) */
1098		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1099			if (IS_INITED(tmpcs))
1100				ninit++;
1101
1102		if (conf->size == 0) {
1103			conf->size = sizeof(struct ccd_s) * ninit;
1104			return (0);
1105		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1106		    (conf->size % sizeof(struct ccd_s) != 0)) {
1107			/* XXX: UNLOCK(unique unit numbers) */
1108			return (EINVAL);
1109		}
1110
1111		ubuf += ninit;
1112		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1113			if (!IS_INITED(tmpcs))
1114				continue;
1115			error = copyout(tmpcs, --ubuf,
1116			    sizeof(struct ccd_s));
1117			if (error != 0)
1118				/* XXX: UNLOCK(unique unit numbers) */
1119				return (error);
1120		}
1121		/* XXX: UNLOCK(unique unit numbers) */
1122		return (0);
1123		}
1124
1125	case CCDCPPINFO:
1126		{
1127		struct ccdcpps *cpps = (struct ccdcpps *)data;
1128		char *ubuf = cpps->buffer;
1129
1130
1131		error = copyin(ubuf, &unit, sizeof (unit));
1132		if (error)
1133			return (error);
1134
1135		if (!IS_ALLOCATED(unit))
1136			return (ENXIO);
1137		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
1138		return (ccdioctltoo(dev2, cmd, data, flag, td));
1139		}
1140
1141	default:
1142		return (ENXIO);
1143	}
1144}
1145
1146static int
1147ccdioctltoo(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1148{
1149	int unit;
1150	int i, j, lookedup = 0, error = 0;
1151	int part, pmask, s;
1152	struct ccd_s *cs;
1153	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1154	char **cpp;
1155	struct vnode **vpp;
1156
1157	unit = ccdunit(dev);
1158	if (!IS_ALLOCATED(unit))
1159		return (ENXIO);
1160	cs = ccdfind(unit);
1161
1162	switch (cmd) {
1163	case CCDIOCSET:
1164		if (IS_INITED(cs))
1165			return (EBUSY);
1166
1167		if ((flag & FWRITE) == 0)
1168			return (EBADF);
1169
1170		if ((error = ccdlock(cs)) != 0)
1171			return (error);
1172
1173		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1174			return (EINVAL);
1175
1176		/* Fill in some important bits. */
1177		cs->sc_ileave = ccio->ccio_ileave;
1178		if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1179			printf("ccd%d: disabling mirror, interleave is 0\n",
1180			    unit);
1181			ccio->ccio_flags &= ~(CCDF_MIRROR);
1182		}
1183		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1184		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1185			printf("ccd%d: mirror/parity forces uniform flag\n",
1186			       unit);
1187			ccio->ccio_flags |= CCDF_UNIFORM;
1188		}
1189		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1190
1191		/*
1192		 * Allocate space for and copy in the array of
1193		 * componet pathnames and device numbers.
1194		 */
1195		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1196		    M_CCD, M_WAITOK);
1197		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1198		    M_CCD, M_WAITOK);
1199
1200		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1201		    ccio->ccio_ndisks * sizeof(char **));
1202		if (error) {
1203			free(vpp, M_CCD);
1204			free(cpp, M_CCD);
1205			ccdunlock(cs);
1206			return (error);
1207		}
1208
1209
1210		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1211			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1212				for (j = 0; j < lookedup; ++j)
1213					(void)vn_close(vpp[j], FREAD|FWRITE,
1214					    td->td_ucred, td);
1215				free(vpp, M_CCD);
1216				free(cpp, M_CCD);
1217				ccdunlock(cs);
1218				return (error);
1219			}
1220			++lookedup;
1221		}
1222		cs->sc_vpp = vpp;
1223		cs->sc_nccdisks = ccio->ccio_ndisks;
1224
1225		/*
1226		 * Initialize the ccd.  Fills in the softc for us.
1227		 */
1228		if ((error = ccdinit(cs, cpp, td)) != 0) {
1229			for (j = 0; j < lookedup; ++j)
1230				(void)vn_close(vpp[j], FREAD|FWRITE,
1231				    td->td_ucred, td);
1232			/*
1233			 * We can't ccddestroy() cs just yet, because nothing
1234			 * prevents user-level app to do another ioctl()
1235			 * without closing the device first, therefore
1236			 * declare unit null and void and let ccdclose()
1237			 * destroy it when it is safe to do so.
1238			 */
1239			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1240			free(vpp, M_CCD);
1241			free(cpp, M_CCD);
1242			ccdunlock(cs);
1243			return (error);
1244		}
1245		free(cpp, M_CCD);
1246
1247		/*
1248		 * The ccd has been successfully initialized, so
1249		 * we can place it into the array and read the disklabel.
1250		 */
1251		ccio->ccio_unit = unit;
1252		ccio->ccio_size = cs->sc_size;
1253		ccdgetdisklabel(dev);
1254
1255		ccdunlock(cs);
1256
1257		break;
1258
1259	case CCDIOCCLR:
1260		if (!IS_INITED(cs))
1261			return (ENXIO);
1262
1263		if ((flag & FWRITE) == 0)
1264			return (EBADF);
1265
1266		if ((error = ccdlock(cs)) != 0)
1267			return (error);
1268
1269		/* Don't unconfigure if any other partitions are open */
1270		part = ccdpart(dev);
1271		pmask = (1 << part);
1272		if ((cs->sc_openmask & ~pmask)) {
1273			ccdunlock(cs);
1274			return (EBUSY);
1275		}
1276
1277		/* Declare unit null and void (reset all flags) */
1278		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1279
1280		/* Close the components and free their pathnames. */
1281		for (i = 0; i < cs->sc_nccdisks; ++i) {
1282			/*
1283			 * XXX: this close could potentially fail and
1284			 * cause Bad Things.  Maybe we need to force
1285			 * the close to happen?
1286			 */
1287			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1288			    td->td_ucred, td);
1289			free(cs->sc_cinfo[i].ci_path, M_CCD);
1290		}
1291
1292		/* Free interleave index. */
1293		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1294			free(cs->sc_itable[i].ii_index, M_CCD);
1295
1296		/* Free component info and interleave table. */
1297		free(cs->sc_cinfo, M_CCD);
1298		free(cs->sc_itable, M_CCD);
1299		free(cs->sc_vpp, M_CCD);
1300
1301		/* And remove the devstat entry. */
1302		devstat_remove_entry(&cs->device_stats);
1303
1304		/* This must be atomic. */
1305		s = splhigh();
1306		ccdunlock(cs);
1307		splx(s);
1308
1309		break;
1310
1311	case CCDCONFINFO:
1312		{
1313			int ninit = 0;
1314			struct ccdconf *conf = (struct ccdconf *)data;
1315			struct ccd_s *tmpcs;
1316			struct ccd_s *ubuf = conf->buffer;
1317
1318			/* XXX: LOCK(unique unit numbers) */
1319			LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1320				if (IS_INITED(tmpcs))
1321					ninit++;
1322
1323			if (conf->size == 0) {
1324				conf->size = sizeof(struct ccd_s) * ninit;
1325				break;
1326			} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1327			    (conf->size % sizeof(struct ccd_s) != 0)) {
1328				/* XXX: UNLOCK(unique unit numbers) */
1329				return (EINVAL);
1330			}
1331
1332			ubuf += ninit;
1333			LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1334				if (!IS_INITED(tmpcs))
1335					continue;
1336				error = copyout(tmpcs, --ubuf,
1337				    sizeof(struct ccd_s));
1338				if (error != 0)
1339					/* XXX: UNLOCK(unique unit numbers) */
1340					return (error);
1341			}
1342			/* XXX: UNLOCK(unique unit numbers) */
1343		}
1344		break;
1345
1346	case CCDCPPINFO:
1347		if (!IS_INITED(cs))
1348			return (ENXIO);
1349
1350		{
1351			int len = 0;
1352			struct ccdcpps *cpps = (struct ccdcpps *)data;
1353			char *ubuf = cpps->buffer;
1354
1355
1356			for (i = 0; i < cs->sc_nccdisks; ++i)
1357				len += cs->sc_cinfo[i].ci_pathlen;
1358
1359			if (cpps->size == 0) {
1360				cpps->size = len;
1361				break;
1362			} else if (cpps->size < len) {
1363				return (ENOMEM);
1364			}
1365
1366			for (i = 0; i < cs->sc_nccdisks; ++i) {
1367				len = cs->sc_cinfo[i].ci_pathlen;
1368				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1369				    len);
1370				if (error != 0)
1371					return (error);
1372				ubuf += len;
1373			}
1374			return(copyout("", ubuf, 1));
1375		}
1376		break;
1377
1378	case DIOCGDINFO:
1379		if (!IS_INITED(cs))
1380			return (ENXIO);
1381
1382		*(struct disklabel *)data = cs->sc_label;
1383		break;
1384
1385	case DIOCWDINFO:
1386	case DIOCSDINFO:
1387		if (!IS_INITED(cs))
1388			return (ENXIO);
1389
1390		if ((flag & FWRITE) == 0)
1391			return (EBADF);
1392
1393		if ((error = ccdlock(cs)) != 0)
1394			return (error);
1395
1396		cs->sc_flags |= CCDF_LABELLING;
1397
1398		error = setdisklabel(&cs->sc_label,
1399		    (struct disklabel *)data, 0);
1400		if (error == 0) {
1401			if (cmd == DIOCWDINFO)
1402				error = writedisklabel(CCDLABELDEV(dev),
1403				    &cs->sc_label);
1404		}
1405
1406		cs->sc_flags &= ~CCDF_LABELLING;
1407
1408		ccdunlock(cs);
1409
1410		if (error)
1411			return (error);
1412		break;
1413
1414	case DIOCWLABEL:
1415		if (!IS_INITED(cs))
1416			return (ENXIO);
1417
1418		if ((flag & FWRITE) == 0)
1419			return (EBADF);
1420		if (*(int *)data != 0)
1421			cs->sc_flags |= CCDF_WLABEL;
1422		else
1423			cs->sc_flags &= ~CCDF_WLABEL;
1424		break;
1425
1426	default:
1427		return (ENOTTY);
1428	}
1429
1430	return (0);
1431}
1432
1433static int
1434ccdsize(dev_t dev)
1435{
1436	struct ccd_s *cs;
1437	int part, size;
1438
1439	if (dev->si_drv1 == dev)
1440		return (-1);
1441
1442	if (ccdopen(dev, 0, S_IFCHR, curthread))
1443		return (-1);
1444
1445	cs = ccdfind(ccdunit(dev));
1446	part = ccdpart(dev);
1447
1448	if (!IS_INITED(cs))
1449		return (-1);
1450
1451	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1452		size = -1;
1453	else
1454		size = cs->sc_label.d_partitions[part].p_size;
1455
1456	if (ccdclose(dev, 0, S_IFCHR, curthread))
1457		return (-1);
1458
1459	return (size);
1460}
1461
1462/*
1463 * Lookup the provided name in the filesystem.  If the file exists,
1464 * is a valid block device, and isn't being used by anyone else,
1465 * set *vpp to the file's vnode.
1466 */
1467static int
1468ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1469{
1470	struct nameidata nd;
1471	struct vnode *vp;
1472	int error, flags;
1473
1474	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1475	flags = FREAD | FWRITE;
1476	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1477		return (error);
1478	}
1479	vp = nd.ni_vp;
1480
1481	if (vrefcnt(vp) > 1) {
1482		error = EBUSY;
1483		goto bad;
1484	}
1485
1486	if (!vn_isdisk(vp, &error))
1487		goto bad;
1488
1489
1490	VOP_UNLOCK(vp, 0, td);
1491	NDFREE(&nd, NDF_ONLY_PNBUF);
1492	*vpp = vp;
1493	return (0);
1494bad:
1495	VOP_UNLOCK(vp, 0, td);
1496	NDFREE(&nd, NDF_ONLY_PNBUF);
1497	/* vn_close does vrele() for vp */
1498	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1499	return (error);
1500}
1501
1502/*
1503 * Read the disklabel from the ccd.  If one is not present, fake one
1504 * up.
1505 */
1506static void
1507ccdgetdisklabel(dev_t dev)
1508{
1509	int unit = ccdunit(dev);
1510	struct ccd_s *cs = ccdfind(unit);
1511	char *errstring;
1512	struct disklabel *lp = &cs->sc_label;
1513	struct ccdgeom *ccg = &cs->sc_geom;
1514
1515	bzero(lp, sizeof(*lp));
1516
1517	lp->d_secperunit = cs->sc_size;
1518	lp->d_secsize = ccg->ccg_secsize;
1519	lp->d_nsectors = ccg->ccg_nsectors;
1520	lp->d_ntracks = ccg->ccg_ntracks;
1521	lp->d_ncylinders = ccg->ccg_ncylinders;
1522	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1523
1524	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1525	lp->d_type = DTYPE_CCD;
1526	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1527	lp->d_rpm = 3600;
1528	lp->d_interleave = 1;
1529	lp->d_flags = 0;
1530
1531	lp->d_partitions[RAW_PART].p_offset = 0;
1532	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1533	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1534	lp->d_npartitions = RAW_PART + 1;
1535
1536	lp->d_bbsize = BBSIZE;				/* XXX */
1537	lp->d_sbsize = 0;
1538
1539	lp->d_magic = DISKMAGIC;
1540	lp->d_magic2 = DISKMAGIC;
1541	lp->d_checksum = dkcksum(&cs->sc_label);
1542
1543	/*
1544	 * Call the generic disklabel extraction routine.
1545	 */
1546	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1547	if (errstring != NULL)
1548		ccdmakedisklabel(cs);
1549
1550}
1551
1552/*
1553 * Take care of things one might want to take care of in the event
1554 * that a disklabel isn't present.
1555 */
1556static void
1557ccdmakedisklabel(struct ccd_s *cs)
1558{
1559	struct disklabel *lp = &cs->sc_label;
1560
1561	/*
1562	 * For historical reasons, if there's no disklabel present
1563	 * the raw partition must be marked FS_BSDFFS.
1564	 */
1565	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1566
1567	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1568}
1569
1570/*
1571 * Wait interruptibly for an exclusive lock.
1572 *
1573 * XXX
1574 * Several drivers do this; it should be abstracted and made MP-safe.
1575 */
1576static int
1577ccdlock(struct ccd_s *cs)
1578{
1579	int error;
1580
1581	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1582		cs->sc_flags |= CCDF_WANTED;
1583		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1584			return (error);
1585	}
1586	cs->sc_flags |= CCDF_LOCKED;
1587	return (0);
1588}
1589
1590/*
1591 * Unlock and wake up any waiters.
1592 */
1593static void
1594ccdunlock(struct ccd_s *cs)
1595{
1596
1597	cs->sc_flags &= ~CCDF_LOCKED;
1598	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1599		cs->sc_flags &= ~CCDF_WANTED;
1600		wakeup(cs);
1601	}
1602}
1603
1604