Deleted Added
full compact
geom_ccd.c (111232) geom_ccd.c (111815)
1/*
2 * Copyright (c) 2003 Poul-Henning Kamp.
3 * Copyright (c) 1995 Jason R. Thorpe.
4 * Copyright (c) 1990, 1993
5 * The Regents of the University of California. All rights reserved.
6 * All rights reserved.
7 * Copyright (c) 1988 University of Utah.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 * must display the following acknowledgement:
23 * This product includes software developed for the NetBSD Project
24 * by Jason R. Thorpe.
25 * 4. The names of the authors may not be used to endorse or promote products
26 * derived from this software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * Dynamic configuration and disklabel support by:
41 * Jason R. Thorpe <thorpej@nas.nasa.gov>
42 * Numerical Aerodynamic Simulation Facility
43 * Mail Stop 258-6
44 * NASA Ames Research Center
45 * Moffett Field, CA 94035
46 *
47 * from: Utah $Hdr: cd.c 1.6 90/11/28$
48 *
49 * @(#)cd.c 8.2 (Berkeley) 11/16/93
50 *
51 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52 *
1/*
2 * Copyright (c) 2003 Poul-Henning Kamp.
3 * Copyright (c) 1995 Jason R. Thorpe.
4 * Copyright (c) 1990, 1993
5 * The Regents of the University of California. All rights reserved.
6 * All rights reserved.
7 * Copyright (c) 1988 University of Utah.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 * must display the following acknowledgement:
23 * This product includes software developed for the NetBSD Project
24 * by Jason R. Thorpe.
25 * 4. The names of the authors may not be used to endorse or promote products
26 * derived from this software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * Dynamic configuration and disklabel support by:
41 * Jason R. Thorpe <thorpej@nas.nasa.gov>
42 * Numerical Aerodynamic Simulation Facility
43 * Mail Stop 258-6
44 * NASA Ames Research Center
45 * Moffett Field, CA 94035
46 *
47 * from: Utah $Hdr: cd.c 1.6 90/11/28$
48 *
49 * @(#)cd.c 8.2 (Berkeley) 11/16/93
50 *
51 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52 *
53 * $FreeBSD: head/sys/geom/geom_ccd.c 111232 2003-02-21 23:25:43Z phk $
53 * $FreeBSD: head/sys/geom/geom_ccd.c 111815 2003-03-03 12:15:54Z phk $
54 */
55
56#include <sys/param.h>
57#include <sys/systm.h>
58#include <sys/kernel.h>
59#include <sys/module.h>
60#include <sys/proc.h>
61#include <sys/bio.h>
62#include <sys/malloc.h>
63#include <sys/namei.h>
64#include <sys/conf.h>
65#include <sys/stat.h>
66#include <sys/stdint.h>
67#include <sys/sysctl.h>
68#include <sys/disk.h>
69#include <sys/devicestat.h>
70#include <sys/fcntl.h>
71#include <sys/vnode.h>
72
73#include <sys/ccdvar.h>
74
75MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
76
77/*
78 This is how mirroring works (only writes are special):
79
80 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
81 linked together by the cb_mirror field. "cb_pflags &
82 CCDPF_MIRROR_DONE" is set to 0 on both of them.
83
84 When a component returns to ccdiodone(), it checks if "cb_pflags &
85 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
86 flag and returns. If it is, it means its partner has already
87 returned, so it will go to the regular cleanup.
88
89 */
90
91struct ccdbuf {
92 struct bio cb_buf; /* new I/O buf */
93 struct bio *cb_obp; /* ptr. to original I/O buf */
94 struct ccdbuf *cb_freenext; /* free list link */
95 struct ccd_s *cb_softc;
96 int cb_comp; /* target component */
97 int cb_pflags; /* mirror/parity status flag */
98 struct ccdbuf *cb_mirror; /* mirror counterpart */
99};
100
101/* bits in cb_pflags */
102#define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
103
104/* convenient macros for often-used statements */
105#define IS_ALLOCATED(unit) (ccdfind(unit) != NULL)
106#define IS_INITED(cs) (((cs)->sc_flags & CCDF_INITED) != 0)
107
108static dev_t ccdctldev;
109
110static disk_strategy_t ccdstrategy;
111static d_ioctl_t ccdctlioctl;
112
113#define NCCDFREEHIWAT 16
114
115#define CDEV_MAJOR 74
116
117static struct cdevsw ccdctl_cdevsw = {
54 */
55
56#include <sys/param.h>
57#include <sys/systm.h>
58#include <sys/kernel.h>
59#include <sys/module.h>
60#include <sys/proc.h>
61#include <sys/bio.h>
62#include <sys/malloc.h>
63#include <sys/namei.h>
64#include <sys/conf.h>
65#include <sys/stat.h>
66#include <sys/stdint.h>
67#include <sys/sysctl.h>
68#include <sys/disk.h>
69#include <sys/devicestat.h>
70#include <sys/fcntl.h>
71#include <sys/vnode.h>
72
73#include <sys/ccdvar.h>
74
75MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
76
77/*
78 This is how mirroring works (only writes are special):
79
80 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
81 linked together by the cb_mirror field. "cb_pflags &
82 CCDPF_MIRROR_DONE" is set to 0 on both of them.
83
84 When a component returns to ccdiodone(), it checks if "cb_pflags &
85 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
86 flag and returns. If it is, it means its partner has already
87 returned, so it will go to the regular cleanup.
88
89 */
90
91struct ccdbuf {
92 struct bio cb_buf; /* new I/O buf */
93 struct bio *cb_obp; /* ptr. to original I/O buf */
94 struct ccdbuf *cb_freenext; /* free list link */
95 struct ccd_s *cb_softc;
96 int cb_comp; /* target component */
97 int cb_pflags; /* mirror/parity status flag */
98 struct ccdbuf *cb_mirror; /* mirror counterpart */
99};
100
101/* bits in cb_pflags */
102#define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
103
104/* convenient macros for often-used statements */
105#define IS_ALLOCATED(unit) (ccdfind(unit) != NULL)
106#define IS_INITED(cs) (((cs)->sc_flags & CCDF_INITED) != 0)
107
108static dev_t ccdctldev;
109
110static disk_strategy_t ccdstrategy;
111static d_ioctl_t ccdctlioctl;
112
113#define NCCDFREEHIWAT 16
114
115#define CDEV_MAJOR 74
116
117static struct cdevsw ccdctl_cdevsw = {
118 /* open */ nullopen,
119 /* close */ nullclose,
120 /* read */ noread,
121 /* write */ nowrite,
122 /* ioctl */ ccdctlioctl,
123 /* poll */ nopoll,
124 /* mmap */ nommap,
125 /* strategy */ nostrategy,
126 /* name */ "ccdctl",
127 /* maj */ CDEV_MAJOR,
128 /* dump */ nodump,
129 /* psize */ nopsize,
130 /* flags */ 0
118 .d_open = nullopen,
119 .d_close = nullclose,
120 .d_ioctl = ccdctlioctl,
121 .d_name = "ccdctl",
122 .d_maj = CDEV_MAJOR,
131};
132
133static LIST_HEAD(, ccd_s) ccd_softc_list =
134 LIST_HEAD_INITIALIZER(&ccd_softc_list);
135
136static struct ccd_s *ccdfind(int);
137static struct ccd_s *ccdnew(int);
138static int ccddestroy(struct ccd_s *);
139
140/* called during module initialization */
141static void ccdattach(void);
142static int ccd_modevent(module_t, int, void *);
143
144/* called by biodone() at interrupt time */
145static void ccdiodone(struct bio *bp);
146
147static void ccdstart(struct ccd_s *, struct bio *);
148static void ccdinterleave(struct ccd_s *, int);
149static int ccdinit(struct ccd_s *, char **, struct thread *);
150static int ccdlookup(char *, struct thread *p, struct vnode **);
151static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
152 struct bio *, daddr_t, caddr_t, long);
153static int ccdlock(struct ccd_s *);
154static void ccdunlock(struct ccd_s *);
155
156
157/*
158 * Number of blocks to leave untouched in front of a component partition.
159 * This is to avoid violating its disklabel area when it starts at the
160 * beginning of the slice.
161 */
162#if !defined(CCD_OFFSET)
163#define CCD_OFFSET 16
164#endif
165
 166static struct ccd_s *
 167ccdfind(int unit)
 168{
 169 struct ccd_s *sc = NULL;
 170
 171 /* XXX: LOCK(unique unit numbers) */
 172 LIST_FOREACH(sc, &ccd_softc_list, list) {
 173 if (sc->sc_unit == unit)
 174 break;
 175 }
 176 /* XXX: UNLOCK(unique unit numbers) */
 /* After a full traversal LIST_FOREACH leaves sc == NULL, so the
  * second test below is redundant but harmless. */
 177 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
 178}
179
180static struct ccd_s *
181ccdnew(int unit)
182{
183 struct ccd_s *sc;
184
185 /* XXX: LOCK(unique unit numbers) */
186 if (IS_ALLOCATED(unit) || unit > 32)
187 return (NULL);
188
189 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
190 sc->sc_unit = unit;
191 LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
192 /* XXX: UNLOCK(unique unit numbers) */
193 return (sc);
194}
195
196static int
197ccddestroy(struct ccd_s *sc)
198{
199
200 /* XXX: LOCK(unique unit numbers) */
201 LIST_REMOVE(sc, list);
202 /* XXX: UNLOCK(unique unit numbers) */
203 FREE(sc, M_CCD);
204 return (0);
205}
206
207/*
208 * Called by main() during pseudo-device attachment. All we need
209 * to do is to add devsw entries.
210 */
211static void
212ccdattach()
213{
214
215 ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
216 UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
217 ccdctldev->si_drv1 = ccdctldev;
218}
219
220static int
221ccd_modevent(module_t mod, int type, void *data)
222{
223 int error = 0;
224
225 switch (type) {
226 case MOD_LOAD:
227 ccdattach();
228 break;
229
230 case MOD_UNLOAD:
231 printf("ccd0: Unload not supported!\n");
232 error = EOPNOTSUPP;
233 break;
234
235 case MOD_SHUTDOWN:
236 break;
237
238 default:
239 error = EOPNOTSUPP;
240 }
241 return (error);
242}
243
244DEV_MODULE(ccd, ccd_modevent, NULL);
245
246static int
247ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
248{
249 struct ccdcinfo *ci = NULL; /* XXX */
250 size_t size;
251 int ix;
252 struct vnode *vp;
253 size_t minsize;
254 int maxsecsize;
255 struct ccdgeom *ccg = &cs->sc_geom;
256 char *tmppath = NULL;
257 int error = 0;
258 off_t mediasize;
259 u_int sectorsize;
260
261
262 cs->sc_size = 0;
263
264 /* Allocate space for the component info. */
265 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
266 M_CCD, M_WAITOK);
267
268 /*
269 * Verify that each component piece exists and record
270 * relevant information about it.
271 */
272 maxsecsize = 0;
273 minsize = 0;
274 tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
275 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
276 vp = cs->sc_vpp[ix];
277 ci = &cs->sc_cinfo[ix];
278 ci->ci_vp = vp;
279
280 /*
281 * Copy in the pathname of the component.
282 */
283 if ((error = copyinstr(cpaths[ix], tmppath,
284 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
285 goto fail;
286 }
287 ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
288 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
289
290 ci->ci_dev = vn_todev(vp);
291
292 /*
293 * Get partition information for the component.
294 */
295 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
296 FREAD, td->td_ucred, td);
297 if (error != 0) {
298 goto fail;
299 }
300 /*
301 * Get partition information for the component.
302 */
303 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
304 FREAD, td->td_ucred, td);
305 if (error != 0) {
306 goto fail;
307 }
308 if (sectorsize > maxsecsize)
309 maxsecsize = sectorsize;
310 size = mediasize / DEV_BSIZE - CCD_OFFSET;
311
312 /*
313 * Calculate the size, truncating to an interleave
314 * boundary if necessary.
315 */
316
317 if (cs->sc_ileave > 1)
318 size -= size % cs->sc_ileave;
319
320 if (size == 0) {
321 error = ENODEV;
322 goto fail;
323 }
324
325 if (minsize == 0 || size < minsize)
326 minsize = size;
327 ci->ci_size = size;
328 cs->sc_size += size;
329 }
330
331 free(tmppath, M_CCD);
332 tmppath = NULL;
333
334 /*
335 * Don't allow the interleave to be smaller than
336 * the biggest component sector.
337 */
338 if ((cs->sc_ileave > 0) &&
339 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
340 error = EINVAL;
341 goto fail;
342 }
343
344 /*
345 * If uniform interleave is desired set all sizes to that of
346 * the smallest component. This will guarentee that a single
347 * interleave table is generated.
348 *
349 * Lost space must be taken into account when calculating the
350 * overall size. Half the space is lost when CCDF_MIRROR is
351 * specified.
352 */
353 if (cs->sc_flags & CCDF_UNIFORM) {
354 for (ci = cs->sc_cinfo;
355 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
356 ci->ci_size = minsize;
357 }
358 if (cs->sc_flags & CCDF_MIRROR) {
359 /*
360 * Check to see if an even number of components
361 * have been specified. The interleave must also
362 * be non-zero in order for us to be able to
363 * guarentee the topology.
364 */
365 if (cs->sc_nccdisks % 2) {
366 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
367 error = EINVAL;
368 goto fail;
369 }
370 if (cs->sc_ileave == 0) {
371 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
372 error = EINVAL;
373 goto fail;
374 }
375 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
376 } else {
377 if (cs->sc_ileave == 0) {
378 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
379 error = EINVAL;
380 goto fail;
381 }
382 cs->sc_size = cs->sc_nccdisks * minsize;
383 }
384 }
385
386 /*
387 * Construct the interleave table.
388 */
389 ccdinterleave(cs, cs->sc_unit);
390
391 /*
392 * Create pseudo-geometry based on 1MB cylinders. It's
393 * pretty close.
394 */
395 ccg->ccg_secsize = maxsecsize;
396 ccg->ccg_ntracks = 1;
397 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
398 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
399
400 /*
401 * Add a devstat entry for this device.
402 */
403 devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
404 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
405 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
406 DEVSTAT_PRIORITY_ARRAY);
407
408 cs->sc_flags |= CCDF_INITED;
409 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */
410 return (0);
411fail:
412 while (ci > cs->sc_cinfo) {
413 ci--;
414 free(ci->ci_path, M_CCD);
415 }
416 if (tmppath != NULL)
417 free(tmppath, M_CCD);
418 free(cs->sc_cinfo, M_CCD);
419 ccddestroy(cs);
420 return (error);
421}
422
423static void
424ccdinterleave(struct ccd_s *cs, int unit)
425{
426 struct ccdcinfo *ci, *smallci;
427 struct ccdiinfo *ii;
428 daddr_t bn, lbn;
429 int ix;
430 u_long size;
431
432
433 /*
434 * Allocate an interleave table. The worst case occurs when each
435 * of N disks is of a different size, resulting in N interleave
436 * tables.
437 *
438 * Chances are this is too big, but we don't care.
439 */
440 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
441 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
442 M_WAITOK | M_ZERO);
443
444 /*
445 * Trivial case: no interleave (actually interleave of disk size).
446 * Each table entry represents a single component in its entirety.
447 *
448 * An interleave of 0 may not be used with a mirror setup.
449 */
450 if (cs->sc_ileave == 0) {
451 bn = 0;
452 ii = cs->sc_itable;
453
454 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
455 /* Allocate space for ii_index. */
456 ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
457 ii->ii_ndisk = 1;
458 ii->ii_startblk = bn;
459 ii->ii_startoff = 0;
460 ii->ii_index[0] = ix;
461 bn += cs->sc_cinfo[ix].ci_size;
462 ii++;
463 }
464 ii->ii_ndisk = 0;
465 return;
466 }
467
468 /*
469 * The following isn't fast or pretty; it doesn't have to be.
470 */
471 size = 0;
472 bn = lbn = 0;
473 for (ii = cs->sc_itable; ; ii++) {
474 /*
475 * Allocate space for ii_index. We might allocate more then
476 * we use.
477 */
478 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
479 M_CCD, M_WAITOK);
480
481 /*
482 * Locate the smallest of the remaining components
483 */
484 smallci = NULL;
485 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
486 ci++) {
487 if (ci->ci_size > size &&
488 (smallci == NULL ||
489 ci->ci_size < smallci->ci_size)) {
490 smallci = ci;
491 }
492 }
493
494 /*
495 * Nobody left, all done
496 */
497 if (smallci == NULL) {
498 ii->ii_ndisk = 0;
499 free(ii->ii_index, M_CCD);
500 break;
501 }
502
503 /*
504 * Record starting logical block using an sc_ileave blocksize.
505 */
506 ii->ii_startblk = bn / cs->sc_ileave;
507
508 /*
509 * Record starting comopnent block using an sc_ileave
510 * blocksize. This value is relative to the beginning of
511 * a component disk.
512 */
513 ii->ii_startoff = lbn;
514
515 /*
516 * Determine how many disks take part in this interleave
517 * and record their indices.
518 */
519 ix = 0;
520 for (ci = cs->sc_cinfo;
521 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
522 if (ci->ci_size >= smallci->ci_size) {
523 ii->ii_index[ix++] = ci - cs->sc_cinfo;
524 }
525 }
526 ii->ii_ndisk = ix;
527 bn += ix * (smallci->ci_size - size);
528 lbn = smallci->ci_size / cs->sc_ileave;
529 size = smallci->ci_size;
530 }
531}
532
533static void
534ccdstrategy(struct bio *bp)
535{
536 struct ccd_s *cs;
537 int pbn; /* in sc_secsize chunks */
538 long sz; /* in sc_secsize chunks */
539
540 cs = bp->bio_disk->d_drv1;
541
542 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
543 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
544
545 /*
546 * If out of bounds return an error. If at the EOF point,
547 * simply read or write less.
548 */
549
550 if (pbn < 0 || pbn >= cs->sc_size) {
551 bp->bio_resid = bp->bio_bcount;
552 if (pbn != cs->sc_size)
553 biofinish(bp, NULL, EINVAL);
554 else
555 biodone(bp);
556 return;
557 }
558
559 /*
560 * If the request crosses EOF, truncate the request.
561 */
562 if (pbn + sz > cs->sc_size) {
563 bp->bio_bcount = (cs->sc_size - pbn) *
564 cs->sc_geom.ccg_secsize;
565 }
566
567 bp->bio_resid = bp->bio_bcount;
568
569 /*
570 * "Start" the unit.
571 */
572 ccdstart(cs, bp);
573 return;
574}
575
576static void
577ccdstart(struct ccd_s *cs, struct bio *bp)
578{
579 long bcount, rcount;
580 struct ccdbuf *cbp[2];
581 caddr_t addr;
582 daddr_t bn;
583 int err;
584
585
586 /* Record the transaction start */
587 devstat_start_transaction(&cs->device_stats);
588
589 /*
590 * Translate the partition-relative block number to an absolute.
591 */
592 bn = bp->bio_blkno;
593
594 /*
595 * Allocate component buffers and fire off the requests
596 */
597 addr = bp->bio_data;
598 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
599 err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
600 if (err) {
601 printf("ccdbuffer error %d\n", err);
602 /* We're screwed */
603 bp->bio_resid -= bcount;
604 bp->bio_error = ENOMEM;
605 bp->bio_flags |= BIO_ERROR;
606 return;
607 }
608 rcount = cbp[0]->cb_buf.bio_bcount;
609
610 if (cs->sc_cflags & CCDF_MIRROR) {
611 /*
612 * Mirroring. Writes go to both disks, reads are
613 * taken from whichever disk seems most appropriate.
614 *
615 * We attempt to localize reads to the disk whos arm
616 * is nearest the read request. We ignore seeks due
617 * to writes when making this determination and we
618 * also try to avoid hogging.
619 */
620 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
621 BIO_STRATEGY(&cbp[0]->cb_buf);
622 BIO_STRATEGY(&cbp[1]->cb_buf);
623 } else {
624 int pick = cs->sc_pick;
625 daddr_t range = cs->sc_size / 16;
626
627 if (bn < cs->sc_blk[pick] - range ||
628 bn > cs->sc_blk[pick] + range
629 ) {
630 cs->sc_pick = pick = 1 - pick;
631 }
632 cs->sc_blk[pick] = bn + btodb(rcount);
633 BIO_STRATEGY(&cbp[pick]->cb_buf);
634 }
635 } else {
636 /*
637 * Not mirroring
638 */
639 BIO_STRATEGY(&cbp[0]->cb_buf);
640 }
641 bn += btodb(rcount);
642 addr += rcount;
643 }
644}
645
646/*
647 * Build a component buffer header.
648 */
649static int
650ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
651{
652 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
653 struct ccdbuf *cbp;
654 daddr_t cbn, cboff;
655 off_t cbc;
656
657 /*
658 * Determine which component bn falls in.
659 */
660 cbn = bn;
661 cboff = 0;
662
663 if (cs->sc_ileave == 0) {
664 /*
665 * Serially concatenated and neither a mirror nor a parity
666 * config. This is a special case.
667 */
668 daddr_t sblk;
669
670 sblk = 0;
671 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
672 sblk += ci->ci_size;
673 cbn -= sblk;
674 } else {
675 struct ccdiinfo *ii;
676 int ccdisk, off;
677
678 /*
679 * Calculate cbn, the logical superblock (sc_ileave chunks),
680 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
681 * to cbn.
682 */
683 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
684 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
685
686 /*
687 * Figure out which interleave table to use.
688 */
689 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
690 if (ii->ii_startblk > cbn)
691 break;
692 }
693 ii--;
694
695 /*
696 * off is the logical superblock relative to the beginning
697 * of this interleave block.
698 */
699 off = cbn - ii->ii_startblk;
700
701 /*
702 * We must calculate which disk component to use (ccdisk),
703 * and recalculate cbn to be the superblock relative to
704 * the beginning of the component. This is typically done by
705 * adding 'off' and ii->ii_startoff together. However, 'off'
706 * must typically be divided by the number of components in
707 * this interleave array to be properly convert it from a
708 * CCD-relative logical superblock number to a
709 * component-relative superblock number.
710 */
711 if (ii->ii_ndisk == 1) {
712 /*
713 * When we have just one disk, it can't be a mirror
714 * or a parity config.
715 */
716 ccdisk = ii->ii_index[0];
717 cbn = ii->ii_startoff + off;
718 } else {
719 if (cs->sc_cflags & CCDF_MIRROR) {
720 /*
721 * We have forced a uniform mapping, resulting
722 * in a single interleave array. We double
723 * up on the first half of the available
724 * components and our mirror is in the second
725 * half. This only works with a single
726 * interleave array because doubling up
727 * doubles the number of sectors, so there
728 * cannot be another interleave array because
729 * the next interleave array's calculations
730 * would be off.
731 */
732 int ndisk2 = ii->ii_ndisk / 2;
733 ccdisk = ii->ii_index[off % ndisk2];
734 cbn = ii->ii_startoff + off / ndisk2;
735 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
736 } else {
737 ccdisk = ii->ii_index[off % ii->ii_ndisk];
738 cbn = ii->ii_startoff + off / ii->ii_ndisk;
739 }
740 }
741
742 ci = &cs->sc_cinfo[ccdisk];
743
744 /*
745 * Convert cbn from a superblock to a normal block so it
746 * can be used to calculate (along with cboff) the normal
747 * block index into this particular disk.
748 */
749 cbn *= cs->sc_ileave;
750 }
751
752 /*
753 * Fill in the component buf structure.
754 */
755 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
756 if (cbp == NULL)
757 return (ENOMEM);
758 cbp->cb_buf.bio_cmd = bp->bio_cmd;
759 cbp->cb_buf.bio_done = ccdiodone;
760 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */
761 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
762 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
763 cbp->cb_buf.bio_data = addr;
764 cbp->cb_buf.bio_caller2 = cbp;
765 if (cs->sc_ileave == 0)
766 cbc = dbtob((off_t)(ci->ci_size - cbn));
767 else
768 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
769 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
770 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
771
772 /*
773 * context for ccdiodone
774 */
775 cbp->cb_obp = bp;
776 cbp->cb_softc = cs;
777 cbp->cb_comp = ci - cs->sc_cinfo;
778
779 cb[0] = cbp;
780
781 /*
782 * Note: both I/O's setup when reading from mirror, but only one
783 * will be executed.
784 */
785 if (cs->sc_cflags & CCDF_MIRROR) {
786 /* mirror, setup second I/O */
787 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
788 if (cbp == NULL) {
789 free(cb[0], M_CCD);
790 cb[0] = NULL;
791 return (ENOMEM);
792 }
793 bcopy(cb[0], cbp, sizeof(struct ccdbuf));
794 cbp->cb_buf.bio_dev = ci2->ci_dev;
795 cbp->cb_comp = ci2 - cs->sc_cinfo;
796 cb[1] = cbp;
797 /* link together the ccdbuf's and clear "mirror done" flag */
798 cb[0]->cb_mirror = cb[1];
799 cb[1]->cb_mirror = cb[0];
800 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
801 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
802 }
803 return (0);
804}
805
806/*
807 * Called at interrupt time.
808 * Mark the component as done and if all components are done,
809 * take a ccd interrupt.
810 */
811static void
812ccdiodone(struct bio *ibp)
813{
814 struct ccdbuf *cbp;
815 struct bio *bp;
816 struct ccd_s *cs;
817 int count;
818
819 cbp = ibp->bio_caller2;
820 cs = cbp->cb_softc;
821 bp = cbp->cb_obp;
822 /*
823 * If an error occured, report it. If this is a mirrored
824 * configuration and the first of two possible reads, do not
825 * set the error in the bp yet because the second read may
826 * succeed.
827 */
828
829 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
830 const char *msg = "";
831
832 if ((cs->sc_cflags & CCDF_MIRROR) &&
833 (cbp->cb_buf.bio_cmd == BIO_READ) &&
834 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
835 /*
836 * We will try our read on the other disk down
837 * below, also reverse the default pick so if we
838 * are doing a scan we do not keep hitting the
839 * bad disk first.
840 */
841
842 msg = ", trying other disk";
843 cs->sc_pick = 1 - cs->sc_pick;
844 cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
845 } else {
846 bp->bio_flags |= BIO_ERROR;
847 bp->bio_error = cbp->cb_buf.bio_error ?
848 cbp->cb_buf.bio_error : EIO;
849 }
850 printf("ccd%d: error %d on component %d block %jd "
851 "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
852 cbp->cb_comp,
853 (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
854 msg);
855 }
856
857 /*
858 * Process mirror. If we are writing, I/O has been initiated on both
859 * buffers and we fall through only after both are finished.
860 *
861 * If we are reading only one I/O is initiated at a time. If an
862 * error occurs we initiate the second I/O and return, otherwise
863 * we free the second I/O without initiating it.
864 */
865
866 if (cs->sc_cflags & CCDF_MIRROR) {
867 if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
868 /*
869 * When writing, handshake with the second buffer
870 * to determine when both are done. If both are not
871 * done, return here.
872 */
873 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
874 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
875 free(cbp, M_CCD);
876 return;
877 }
878 } else {
879 /*
880 * When reading, either dispose of the second buffer
881 * or initiate I/O on the second buffer if an error
882 * occured with this one.
883 */
884 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
885 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
886 cbp->cb_mirror->cb_pflags |=
887 CCDPF_MIRROR_DONE;
888 BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
889 free(cbp, M_CCD);
890 return;
891 } else {
892 free(cbp->cb_mirror, M_CCD);
893 }
894 }
895 }
896 }
897
898 /*
899 * use bio_caller1 to determine how big the original request was rather
900 * then bio_bcount, because bio_bcount may have been truncated for EOF.
901 *
902 * XXX We check for an error, but we do not test the resid for an
903 * aligned EOF condition. This may result in character & block
904 * device access not recognizing EOF properly when read or written
905 * sequentially, but will not effect filesystems.
906 */
907 count = (long)cbp->cb_buf.bio_caller1;
908 free(cbp, M_CCD);
909
910 /*
911 * If all done, "interrupt".
912 */
913 bp->bio_resid -= count;
914 if (bp->bio_resid < 0)
915 panic("ccdiodone: count");
916 if (bp->bio_resid == 0) {
917 if (bp->bio_flags & BIO_ERROR)
918 bp->bio_resid = bp->bio_bcount;
919 biofinish(bp, &cs->device_stats, 0);
920 }
921}
922
923static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);
924
925static int
926ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
927{
928 struct ccd_ioctl *ccio;
929 u_int unit;
930 dev_t dev2;
931 int error;
932
933 switch (cmd) {
934 case CCDIOCSET:
935 case CCDIOCCLR:
936 ccio = (struct ccd_ioctl *)data;
937 unit = ccio->ccio_size;
938 return (ccdioctltoo(unit, cmd, data, flag, td));
939 case CCDCONFINFO:
940 {
941 int ninit = 0;
942 struct ccdconf *conf = (struct ccdconf *)data;
943 struct ccd_s *tmpcs;
944 struct ccd_s *ubuf = conf->buffer;
945
946 /* XXX: LOCK(unique unit numbers) */
947 LIST_FOREACH(tmpcs, &ccd_softc_list, list)
948 if (IS_INITED(tmpcs))
949 ninit++;
950
951 if (conf->size == 0) {
952 conf->size = sizeof(struct ccd_s) * ninit;
953 return (0);
954 } else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
955 (conf->size % sizeof(struct ccd_s) != 0)) {
956 /* XXX: UNLOCK(unique unit numbers) */
957 return (EINVAL);
958 }
959
960 ubuf += ninit;
961 LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
962 if (!IS_INITED(tmpcs))
963 continue;
964 error = copyout(tmpcs, --ubuf,
965 sizeof(struct ccd_s));
966 if (error != 0)
967 /* XXX: UNLOCK(unique unit numbers) */
968 return (error);
969 }
970 /* XXX: UNLOCK(unique unit numbers) */
971 return (0);
972 }
973
974 case CCDCPPINFO:
975 {
976 struct ccdcpps *cpps = (struct ccdcpps *)data;
977 char *ubuf = cpps->buffer;
978 struct ccd_s *cs;
979
980
981 error = copyin(ubuf, &unit, sizeof (unit));
982 if (error)
983 return (error);
984
985 if (!IS_ALLOCATED(unit))
986 return (ENXIO);
987 dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
988 cs = ccdfind(unit);
989 if (!IS_INITED(cs))
990 return (ENXIO);
991
992 {
993 int len = 0, i;
994 struct ccdcpps *cpps = (struct ccdcpps *)data;
995 char *ubuf = cpps->buffer;
996
997
998 for (i = 0; i < cs->sc_nccdisks; ++i)
999 len += cs->sc_cinfo[i].ci_pathlen;
1000
1001 if (cpps->size < len)
1002 return (ENOMEM);
1003
1004 for (i = 0; i < cs->sc_nccdisks; ++i) {
1005 len = cs->sc_cinfo[i].ci_pathlen;
1006 error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1007 len);
1008 if (error != 0)
1009 return (error);
1010 ubuf += len;
1011 }
1012 return(copyout("", ubuf, 1));
1013 }
1014 break;
1015 }
1016
1017 default:
1018 return (ENXIO);
1019 }
1020}
1021
1022static int
1023ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td)
1024{
1025 int i, j, lookedup = 0, error = 0;
1026 struct ccd_s *cs;
1027 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1028 struct ccdgeom *ccg;
1029 char **cpp;
1030 struct vnode **vpp;
1031
1032 cs = ccdfind(unit);
1033 switch (cmd) {
1034 case CCDIOCSET:
1035 if (cs == NULL)
1036 cs = ccdnew(unit);
1037 if (IS_INITED(cs))
1038 return (EBUSY);
1039
1040 if ((flag & FWRITE) == 0)
1041 return (EBADF);
1042
1043 if ((error = ccdlock(cs)) != 0)
1044 return (error);
1045
1046 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1047 return (EINVAL);
1048
1049 /* Fill in some important bits. */
1050 cs->sc_ileave = ccio->ccio_ileave;
1051 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1052 printf("ccd%d: disabling mirror, interleave is 0\n",
1053 unit);
1054 ccio->ccio_flags &= ~(CCDF_MIRROR);
1055 }
1056 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1057 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1058 printf("ccd%d: mirror/parity forces uniform flag\n",
1059 unit);
1060 ccio->ccio_flags |= CCDF_UNIFORM;
1061 }
1062 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1063
1064 /*
1065 * Allocate space for and copy in the array of
1066 * componet pathnames and device numbers.
1067 */
1068 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1069 M_CCD, M_WAITOK);
1070 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1071 M_CCD, M_WAITOK);
1072
1073 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1074 ccio->ccio_ndisks * sizeof(char **));
1075 if (error) {
1076 free(vpp, M_CCD);
1077 free(cpp, M_CCD);
1078 ccdunlock(cs);
1079 return (error);
1080 }
1081
1082
1083 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1084 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1085 for (j = 0; j < lookedup; ++j)
1086 (void)vn_close(vpp[j], FREAD|FWRITE,
1087 td->td_ucred, td);
1088 free(vpp, M_CCD);
1089 free(cpp, M_CCD);
1090 ccdunlock(cs);
1091 return (error);
1092 }
1093 ++lookedup;
1094 }
1095 cs->sc_vpp = vpp;
1096 cs->sc_nccdisks = ccio->ccio_ndisks;
1097
1098 /*
1099 * Initialize the ccd. Fills in the softc for us.
1100 */
1101 if ((error = ccdinit(cs, cpp, td)) != 0) {
1102 for (j = 0; j < lookedup; ++j)
1103 (void)vn_close(vpp[j], FREAD|FWRITE,
1104 td->td_ucred, td);
1105 /*
1106 * We can't ccddestroy() cs just yet, because nothing
1107 * prevents user-level app to do another ioctl()
1108 * without closing the device first, therefore
1109 * declare unit null and void and let ccdclose()
1110 * destroy it when it is safe to do so.
1111 */
1112 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1113 free(vpp, M_CCD);
1114 free(cpp, M_CCD);
1115 ccdunlock(cs);
1116 return (error);
1117 }
1118 free(cpp, M_CCD);
1119
1120 /*
1121 * The ccd has been successfully initialized, so
1122 * we can place it into the array and read the disklabel.
1123 */
1124 ccio->ccio_unit = unit;
1125 ccio->ccio_size = cs->sc_size;
1126 ccg = &cs->sc_geom;
1127 cs->sc_disk = malloc(sizeof(struct disk), M_CCD,
1128 M_ZERO | M_WAITOK);
1129 cs->sc_disk->d_strategy = ccdstrategy;
1130 cs->sc_disk->d_name = "ccd";
1131 cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
1132 cs->sc_disk->d_mediasize =
1133 cs->sc_size * (off_t)ccg->ccg_secsize;
1134 cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
1135 cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
1136 cs->sc_disk->d_drv1 = cs;
1137 cs->sc_disk->d_maxsize = MAXPHYS;
1138 disk_create(unit, cs->sc_disk, 0, NULL, NULL);
1139
1140 ccdunlock(cs);
1141
1142 break;
1143
1144 case CCDIOCCLR:
1145 if (cs == NULL)
1146 return (ENXIO);
1147
1148 if (!IS_INITED(cs))
1149 return (ENXIO);
1150
1151 if ((flag & FWRITE) == 0)
1152 return (EBADF);
1153
1154 if ((error = ccdlock(cs)) != 0)
1155 return (error);
1156
1157 /* Don't unconfigure if any other partitions are open */
1158 if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
1159 ccdunlock(cs);
1160 return (EBUSY);
1161 }
1162
1163 disk_destroy(cs->sc_disk);
1164 free(cs->sc_disk, M_CCD);
1165 cs->sc_disk = NULL;
1166 /* Declare unit null and void (reset all flags) */
1167 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1168
1169 /* Close the components and free their pathnames. */
1170 for (i = 0; i < cs->sc_nccdisks; ++i) {
1171 /*
1172 * XXX: this close could potentially fail and
1173 * cause Bad Things. Maybe we need to force
1174 * the close to happen?
1175 */
1176 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1177 td->td_ucred, td);
1178 free(cs->sc_cinfo[i].ci_path, M_CCD);
1179 }
1180
1181 /* Free interleave index. */
1182 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1183 free(cs->sc_itable[i].ii_index, M_CCD);
1184
1185 /* Free component info and interleave table. */
1186 free(cs->sc_cinfo, M_CCD);
1187 free(cs->sc_itable, M_CCD);
1188 free(cs->sc_vpp, M_CCD);
1189
1190 /* And remove the devstat entry. */
1191 devstat_remove_entry(&cs->device_stats);
1192
1193 /* This must be atomic. */
1194 ccdunlock(cs);
1195 ccddestroy(cs);
1196
1197 break;
1198 }
1199
1200 return (0);
1201}
1202
1203
/*
 * Lookup the provided name in the filesystem. If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 *
 * On success the vnode is returned referenced but unlocked; the
 * caller releases it with vn_close().  On failure no reference is
 * held and an errno value is returned.
 */
static int
ccdlookup(char *path, struct thread *td, struct vnode **vpp)
{
	struct nameidata nd;
	struct vnode *vp;
	int error, flags;

	/* "path" is a userland pointer (UIO_USERSPACE). */
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
	flags = FREAD | FWRITE;
	if ((error = vn_open(&nd, &flags, 0)) != 0) {
		return (error);
	}
	vp = nd.ni_vp;

	/* Refuse components somebody else already holds a reference on. */
	if (vrefcnt(vp) > 1) {
		error = EBUSY;
		goto bad;
	}

	/* vn_isdisk() stores the reason in "error" when it returns 0. */
	if (!vn_isdisk(vp, &error))
		goto bad;


	/* Success: hand back the vnode unlocked but referenced. */
	VOP_UNLOCK(vp, 0, td);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	*vpp = vp;
	return (0);
bad:
	VOP_UNLOCK(vp, 0, td);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* vn_close does vrele() for vp */
	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
	return (error);
}
1243
1244/*
1245
1246 * Wait interruptibly for an exclusive lock.
1247 *
1248 * XXX
1249 * Several drivers do this; it should be abstracted and made MP-safe.
1250 */
1251static int
1252ccdlock(struct ccd_s *cs)
1253{
1254 int error;
1255
1256 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1257 cs->sc_flags |= CCDF_WANTED;
1258 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1259 return (error);
1260 }
1261 cs->sc_flags |= CCDF_LOCKED;
1262 return (0);
1263}
1264
1265/*
1266 * Unlock and wake up any waiters.
1267 */
1268static void
1269ccdunlock(struct ccd_s *cs)
1270{
1271
1272 cs->sc_flags &= ~CCDF_LOCKED;
1273 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1274 cs->sc_flags &= ~CCDF_WANTED;
1275 wakeup(cs);
1276 }
1277}
123};
124
125static LIST_HEAD(, ccd_s) ccd_softc_list =
126 LIST_HEAD_INITIALIZER(&ccd_softc_list);
127
128static struct ccd_s *ccdfind(int);
129static struct ccd_s *ccdnew(int);
130static int ccddestroy(struct ccd_s *);
131
132/* called during module initialization */
133static void ccdattach(void);
134static int ccd_modevent(module_t, int, void *);
135
136/* called by biodone() at interrupt time */
137static void ccdiodone(struct bio *bp);
138
139static void ccdstart(struct ccd_s *, struct bio *);
140static void ccdinterleave(struct ccd_s *, int);
141static int ccdinit(struct ccd_s *, char **, struct thread *);
142static int ccdlookup(char *, struct thread *p, struct vnode **);
143static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
144 struct bio *, daddr_t, caddr_t, long);
145static int ccdlock(struct ccd_s *);
146static void ccdunlock(struct ccd_s *);
147
148
149/*
150 * Number of blocks to untouched in front of a component partition.
151 * This is to avoid violating its disklabel area when it starts at the
152 * beginning of the slice.
153 */
154#if !defined(CCD_OFFSET)
155#define CCD_OFFSET 16
156#endif
157
158static struct ccd_s *
159ccdfind(int unit)
160{
161 struct ccd_s *sc = NULL;
162
163 /* XXX: LOCK(unique unit numbers) */
164 LIST_FOREACH(sc, &ccd_softc_list, list) {
165 if (sc->sc_unit == unit)
166 break;
167 }
168 /* XXX: UNLOCK(unique unit numbers) */
169 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
170}
171
172static struct ccd_s *
173ccdnew(int unit)
174{
175 struct ccd_s *sc;
176
177 /* XXX: LOCK(unique unit numbers) */
178 if (IS_ALLOCATED(unit) || unit > 32)
179 return (NULL);
180
181 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
182 sc->sc_unit = unit;
183 LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
184 /* XXX: UNLOCK(unique unit numbers) */
185 return (sc);
186}
187
188static int
189ccddestroy(struct ccd_s *sc)
190{
191
192 /* XXX: LOCK(unique unit numbers) */
193 LIST_REMOVE(sc, list);
194 /* XXX: UNLOCK(unique unit numbers) */
195 FREE(sc, M_CCD);
196 return (0);
197}
198
199/*
200 * Called by main() during pseudo-device attachment. All we need
201 * to do is to add devsw entries.
202 */
203static void
204ccdattach()
205{
206
207 ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
208 UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
209 ccdctldev->si_drv1 = ccdctldev;
210}
211
212static int
213ccd_modevent(module_t mod, int type, void *data)
214{
215 int error = 0;
216
217 switch (type) {
218 case MOD_LOAD:
219 ccdattach();
220 break;
221
222 case MOD_UNLOAD:
223 printf("ccd0: Unload not supported!\n");
224 error = EOPNOTSUPP;
225 break;
226
227 case MOD_SHUTDOWN:
228 break;
229
230 default:
231 error = EOPNOTSUPP;
232 }
233 return (error);
234}
235
236DEV_MODULE(ccd, ccd_modevent, NULL);
237
238static int
239ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
240{
241 struct ccdcinfo *ci = NULL; /* XXX */
242 size_t size;
243 int ix;
244 struct vnode *vp;
245 size_t minsize;
246 int maxsecsize;
247 struct ccdgeom *ccg = &cs->sc_geom;
248 char *tmppath = NULL;
249 int error = 0;
250 off_t mediasize;
251 u_int sectorsize;
252
253
254 cs->sc_size = 0;
255
256 /* Allocate space for the component info. */
257 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
258 M_CCD, M_WAITOK);
259
260 /*
261 * Verify that each component piece exists and record
262 * relevant information about it.
263 */
264 maxsecsize = 0;
265 minsize = 0;
266 tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
267 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
268 vp = cs->sc_vpp[ix];
269 ci = &cs->sc_cinfo[ix];
270 ci->ci_vp = vp;
271
272 /*
273 * Copy in the pathname of the component.
274 */
275 if ((error = copyinstr(cpaths[ix], tmppath,
276 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
277 goto fail;
278 }
279 ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
280 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
281
282 ci->ci_dev = vn_todev(vp);
283
284 /*
285 * Get partition information for the component.
286 */
287 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
288 FREAD, td->td_ucred, td);
289 if (error != 0) {
290 goto fail;
291 }
292 /*
293 * Get partition information for the component.
294 */
295 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
296 FREAD, td->td_ucred, td);
297 if (error != 0) {
298 goto fail;
299 }
300 if (sectorsize > maxsecsize)
301 maxsecsize = sectorsize;
302 size = mediasize / DEV_BSIZE - CCD_OFFSET;
303
304 /*
305 * Calculate the size, truncating to an interleave
306 * boundary if necessary.
307 */
308
309 if (cs->sc_ileave > 1)
310 size -= size % cs->sc_ileave;
311
312 if (size == 0) {
313 error = ENODEV;
314 goto fail;
315 }
316
317 if (minsize == 0 || size < minsize)
318 minsize = size;
319 ci->ci_size = size;
320 cs->sc_size += size;
321 }
322
323 free(tmppath, M_CCD);
324 tmppath = NULL;
325
326 /*
327 * Don't allow the interleave to be smaller than
328 * the biggest component sector.
329 */
330 if ((cs->sc_ileave > 0) &&
331 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
332 error = EINVAL;
333 goto fail;
334 }
335
336 /*
337 * If uniform interleave is desired set all sizes to that of
338 * the smallest component. This will guarentee that a single
339 * interleave table is generated.
340 *
341 * Lost space must be taken into account when calculating the
342 * overall size. Half the space is lost when CCDF_MIRROR is
343 * specified.
344 */
345 if (cs->sc_flags & CCDF_UNIFORM) {
346 for (ci = cs->sc_cinfo;
347 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
348 ci->ci_size = minsize;
349 }
350 if (cs->sc_flags & CCDF_MIRROR) {
351 /*
352 * Check to see if an even number of components
353 * have been specified. The interleave must also
354 * be non-zero in order for us to be able to
355 * guarentee the topology.
356 */
357 if (cs->sc_nccdisks % 2) {
358 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
359 error = EINVAL;
360 goto fail;
361 }
362 if (cs->sc_ileave == 0) {
363 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
364 error = EINVAL;
365 goto fail;
366 }
367 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
368 } else {
369 if (cs->sc_ileave == 0) {
370 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
371 error = EINVAL;
372 goto fail;
373 }
374 cs->sc_size = cs->sc_nccdisks * minsize;
375 }
376 }
377
378 /*
379 * Construct the interleave table.
380 */
381 ccdinterleave(cs, cs->sc_unit);
382
383 /*
384 * Create pseudo-geometry based on 1MB cylinders. It's
385 * pretty close.
386 */
387 ccg->ccg_secsize = maxsecsize;
388 ccg->ccg_ntracks = 1;
389 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
390 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
391
392 /*
393 * Add a devstat entry for this device.
394 */
395 devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
396 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
397 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
398 DEVSTAT_PRIORITY_ARRAY);
399
400 cs->sc_flags |= CCDF_INITED;
401 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */
402 return (0);
403fail:
404 while (ci > cs->sc_cinfo) {
405 ci--;
406 free(ci->ci_path, M_CCD);
407 }
408 if (tmppath != NULL)
409 free(tmppath, M_CCD);
410 free(cs->sc_cinfo, M_CCD);
411 ccddestroy(cs);
412 return (error);
413}
414
/*
 * Build cs->sc_itable, the table used to map ccd block numbers onto
 * component disks.  (The "unit" parameter is unused in this body.)
 */
static void
ccdinterleave(struct ccd_s *cs, int unit)
{
	struct ccdcinfo *ci, *smallci;
	struct ccdiinfo *ii;
	daddr_t bn, lbn;
	int ix;
	u_long size;


	/*
	 * Allocate an interleave table. The worst case occurs when each
	 * of N disks is of a different size, resulting in N interleave
	 * tables.
	 *
	 * Chances are this is too big, but we don't care.
	 */
	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
	    M_WAITOK | M_ZERO);

	/*
	 * Trivial case: no interleave (actually interleave of disk size).
	 * Each table entry represents a single component in its entirety.
	 *
	 * An interleave of 0 may not be used with a mirror setup.
	 */
	if (cs->sc_ileave == 0) {
		bn = 0;
		ii = cs->sc_itable;

		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
			/* Allocate space for ii_index. */
			ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
			ii->ii_ndisk = 1;
			ii->ii_startblk = bn;
			ii->ii_startoff = 0;
			ii->ii_index[0] = ix;
			bn += cs->sc_cinfo[ix].ci_size;
			ii++;
		}
		/* An entry with ii_ndisk == 0 terminates the table. */
		ii->ii_ndisk = 0;
		return;
	}

	/*
	 * The following isn't fast or pretty; it doesn't have to be.
	 * Each pass emits one table entry covering the span up to the
	 * point where the smallest remaining component runs out.
	 */
	size = 0;
	bn = lbn = 0;
	for (ii = cs->sc_itable; ; ii++) {
		/*
		 * Allocate space for ii_index. We might allocate more than
		 * we use.
		 */
		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
		    M_CCD, M_WAITOK);

		/*
		 * Locate the smallest of the remaining components
		 * (strictly larger than the size already consumed).
		 */
		smallci = NULL;
		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
		    ci++) {
			if (ci->ci_size > size &&
			    (smallci == NULL ||
			     ci->ci_size < smallci->ci_size)) {
				smallci = ci;
			}
		}

		/*
		 * Nobody left, all done
		 */
		if (smallci == NULL) {
			ii->ii_ndisk = 0;
			free(ii->ii_index, M_CCD);
			break;
		}

		/*
		 * Record starting logical block using an sc_ileave blocksize.
		 */
		ii->ii_startblk = bn / cs->sc_ileave;

		/*
		 * Record starting component block using an sc_ileave
		 * blocksize. This value is relative to the beginning of
		 * a component disk.
		 */
		ii->ii_startoff = lbn;

		/*
		 * Determine how many disks take part in this interleave
		 * and record their indices.
		 */
		ix = 0;
		for (ci = cs->sc_cinfo;
		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			if (ci->ci_size >= smallci->ci_size) {
				ii->ii_index[ix++] = ci - cs->sc_cinfo;
			}
		}
		ii->ii_ndisk = ix;
		bn += ix * (smallci->ci_size - size);
		lbn = smallci->ci_size / cs->sc_ileave;
		size = smallci->ci_size;
	}
}
524
/*
 * Disk strategy entry point: bounds-check the request against the
 * ccd's size, truncate it at EOF if needed, and hand it to ccdstart().
 */
static void
ccdstrategy(struct bio *bp)
{
	struct ccd_s *cs;
	int pbn;	/* in sc_secsize chunks */
	long sz;	/* in sc_secsize chunks */

	cs = bp->bio_disk->d_drv1;

	pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
	sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);

	/*
	 * If out of bounds return an error. If at the EOF point,
	 * simply read or write less.
	 */

	if (pbn < 0 || pbn >= cs->sc_size) {
		bp->bio_resid = bp->bio_bcount;
		if (pbn != cs->sc_size)
			biofinish(bp, NULL, EINVAL);
		else
			/* Exactly at EOF: complete as a zero-length I/O. */
			biodone(bp);
		return;
	}

	/*
	 * If the request crosses EOF, truncate the request.
	 */
	if (pbn + sz > cs->sc_size) {
		bp->bio_bcount = (cs->sc_size - pbn) *
		    cs->sc_geom.ccg_secsize;
	}

	bp->bio_resid = bp->bio_bcount;

	/*
	 * "Start" the unit.
	 */
	ccdstart(cs, bp);
	return;
}
567
/*
 * Split a bio into per-component requests via ccdbuffer() and dispatch
 * them.  The devstat transaction opened here is closed in ccdiodone()
 * once all pieces have completed.
 */
static void
ccdstart(struct ccd_s *cs, struct bio *bp)
{
	long bcount, rcount;
	struct ccdbuf *cbp[2];
	caddr_t addr;
	daddr_t bn;
	int err;


	/* Record the transaction start */
	devstat_start_transaction(&cs->device_stats);

	/*
	 * Translate the partition-relative block number to an absolute.
	 */
	bn = bp->bio_blkno;

	/*
	 * Allocate component buffers and fire off the requests
	 */
	addr = bp->bio_data;
	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
		if (err) {
			printf("ccdbuffer error %d\n", err);
			/* We're screwed */
			/*
			 * NOTE(review): this path flags the bio with an
			 * error but never calls biodone()/biofinish(); if
			 * no component I/O is already in flight the bio
			 * (and the devstat transaction) appears to never
			 * complete — confirm against later revisions.
			 */
			bp->bio_resid -= bcount;
			bp->bio_error = ENOMEM;
			bp->bio_flags |= BIO_ERROR;
			return;
		}
		rcount = cbp[0]->cb_buf.bio_bcount;

		if (cs->sc_cflags & CCDF_MIRROR) {
			/*
			 * Mirroring. Writes go to both disks, reads are
			 * taken from whichever disk seems most appropriate.
			 *
			 * We attempt to localize reads to the disk whos arm
			 * is nearest the read request. We ignore seeks due
			 * to writes when making this determination and we
			 * also try to avoid hogging.
			 */
			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
				BIO_STRATEGY(&cbp[0]->cb_buf);
				BIO_STRATEGY(&cbp[1]->cb_buf);
			} else {
				int pick = cs->sc_pick;
				daddr_t range = cs->sc_size / 16;

				/* Switch disks if bn is outside the
				 * window around the last pick's arm. */
				if (bn < cs->sc_blk[pick] - range ||
				    bn > cs->sc_blk[pick] + range
				) {
					cs->sc_pick = pick = 1 - pick;
				}
				cs->sc_blk[pick] = bn + btodb(rcount);
				BIO_STRATEGY(&cbp[pick]->cb_buf);
			}
		} else {
			/*
			 * Not mirroring
			 */
			BIO_STRATEGY(&cbp[0]->cb_buf);
		}
		bn += btodb(rcount);
		addr += rcount;
	}
}
637
/*
 * Build a component buffer header.
 *
 * Maps the ccd-relative block "bn" onto a component disk and fills in
 * one (two when mirroring) ccdbuf describing the largest contiguous
 * piece of the request that lands on that component.  Returns 0 on
 * success or ENOMEM; on success cb[0] (and cb[1] when mirroring) are
 * valid.
 */
static int
ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
{
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config. This is a special case.
		 */
		daddr_t sblk;

		sblk = 0;
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component. This is typically done by
		 * adding 'off' and ii->ii_startoff together. However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to be properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array. We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half. This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.
	 */
	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->cb_buf.bio_cmd = bp->bio_cmd;
	cbp->cb_buf.bio_done = ccdiodone;
	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
	/* CCD_OFFSET skips the reserved disklabel area of the component. */
	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.bio_data = addr;
	cbp->cb_buf.bio_caller2 = cbp;
	if (cs->sc_ileave == 0)
		cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
	/* Stash the untruncated count for ccdiodone()'s accounting. */
 	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_softc = cs;
	cbp->cb_comp = ci - cs->sc_cinfo;

	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/* mirror, setup second I/O */
		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
		if (cbp == NULL) {
			free(cb[0], M_CCD);
			cb[0] = NULL;
			return (ENOMEM);
		}
		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
		cbp->cb_buf.bio_dev = ci2->ci_dev;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
	return (0);
}
797
/*
 * Called at interrupt time.
 * Mark the component as done and if all components are done,
 * take a ccd interrupt.
 *
 * For mirrors this also implements the read-retry / write-handshake
 * protocol between the two linked ccdbufs.
 */
static void
ccdiodone(struct bio *ibp)
{
	struct ccdbuf *cbp;
	struct bio *bp;
	struct ccd_s *cs;
	int count;

	/* Recover our context stashed by ccdbuffer(). */
	cbp = ibp->bio_caller2;
	cs = cbp->cb_softc;
	bp = cbp->cb_obp;
	/*
	 * If an error occured, report it. If this is a mirrored
	 * configuration and the first of two possible reads, do not
	 * set the error in the bp yet because the second read may
	 * succeed.
	 */

	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
		const char *msg = "";

		if ((cs->sc_cflags & CCDF_MIRROR) &&
		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
			/*
			 * We will try our read on the other disk down
			 * below, also reverse the default pick so if we
			 * are doing a scan we do not keep hitting the
			 * bad disk first.
			 */

			msg = ", trying other disk";
			cs->sc_pick = 1 - cs->sc_pick;
			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
		} else {
			bp->bio_flags |= BIO_ERROR;
			bp->bio_error = cbp->cb_buf.bio_error ?
			    cbp->cb_buf.bio_error : EIO;
		}
		printf("ccd%d: error %d on component %d block %jd "
		    "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
		    cbp->cb_comp,
		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
		    msg);
	}

	/*
	 * Process mirror. If we are writing, I/O has been initiated on both
	 * buffers and we fall through only after both are finished.
	 *
	 * If we are reading only one I/O is initiated at a time. If an
	 * error occurs we initiate the second I/O and return, otherwise
	 * we free the second I/O without initiating it.
	 */

	if (cs->sc_cflags & CCDF_MIRROR) {
		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
			/*
			 * When writing, handshake with the second buffer
			 * to determine when both are done. If both are not
			 * done, return here.
			 */
			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
				free(cbp, M_CCD);
				return;
			}
		} else {
			/*
			 * When reading, either dispose of the second buffer
			 * or initiate I/O on the second buffer if an error
			 * occured with this one.
			 */
			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
					cbp->cb_mirror->cb_pflags |=
					    CCDPF_MIRROR_DONE;
					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
					free(cbp, M_CCD);
					return;
				} else {
					free(cbp->cb_mirror, M_CCD);
				}
			}
		}
	}

	/*
	 * use bio_caller1 to determine how big the original request was rather
	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
	 *
	 * XXX We check for an error, but we do not test the resid for an
	 * aligned EOF condition. This may result in character & block
	 * device access not recognizing EOF properly when read or written
	 * sequentially, but will not effect filesystems.
	 */
	count = (long)cbp->cb_buf.bio_caller1;
	free(cbp, M_CCD);

	/*
	 * If all done, "interrupt".
	 */
	bp->bio_resid -= count;
	if (bp->bio_resid < 0)
		panic("ccdiodone: count");
	if (bp->bio_resid == 0) {
		/* On error, report the whole request as not transferred. */
		if (bp->bio_flags & BIO_ERROR)
			bp->bio_resid = bp->bio_bcount;
		biofinish(bp, &cs->device_stats, 0);
	}
}
914
915static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);
916
/*
 * ioctl entry point for the ccd control device (/dev/ccd.ctl).
 *
 * CCDIOCSET/CCDIOCCLR are forwarded to ccdioctltoo(); CCDCONFINFO and
 * CCDCPPINFO export configuration and component-path information to
 * userland (used by ccdconfig(8)).
 */
static int
ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
{
	struct ccd_ioctl *ccio;
	u_int unit;
	dev_t dev2;
	int error;

	switch (cmd) {
	case CCDIOCSET:
	case CCDIOCCLR:
		/*
		 * The unit number travels to us in ccio_size;
		 * ccdioctltoo() overwrites it with the real size.
		 */
		ccio = (struct ccd_ioctl *)data;
		unit = ccio->ccio_size;
		return (ccdioctltoo(unit, cmd, data, flag, td));
	case CCDCONFINFO:
		{
		int ninit = 0;
		struct ccdconf *conf = (struct ccdconf *)data;
		struct ccd_s *tmpcs;
		struct ccd_s *ubuf = conf->buffer;

		/* XXX: LOCK(unique unit numbers) */
		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
			if (IS_INITED(tmpcs))
				ninit++;

		if (conf->size == 0) {
			/* Size probe: tell the caller how much room it needs. */
			conf->size = sizeof(struct ccd_s) * ninit;
			return (0);
		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
		    (conf->size % sizeof(struct ccd_s) != 0)) {
			/* XXX: UNLOCK(unique unit numbers) */
			return (EINVAL);
		}

		/* Copy the initialized softcs out, back to front. */
		ubuf += ninit;
		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
			if (!IS_INITED(tmpcs))
				continue;
			error = copyout(tmpcs, --ubuf,
			    sizeof(struct ccd_s));
			if (error != 0)
				/* XXX: UNLOCK(unique unit numbers) */
				return (error);
		}
		/* XXX: UNLOCK(unique unit numbers) */
		return (0);
		}

	case CCDCPPINFO:
		{
		struct ccdcpps *cpps = (struct ccdcpps *)data;
		char *ubuf = cpps->buffer;
		struct ccd_s *cs;

		/* The first word of the user buffer holds the unit number. */
		error = copyin(ubuf, &unit, sizeof (unit));
		if (error)
			return (error);

		if (!IS_ALLOCATED(unit))
			return (ENXIO);
		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
		cs = ccdfind(unit);
		if (!IS_INITED(cs))
			return (ENXIO);

		/*
		 * NOTE(review): this inner block redeclares (shadows)
		 * "cpps" and "ubuf" with the same values as the outer
		 * block — harmless but confusing; candidate for cleanup.
		 */
		{
			int len = 0, i;
			struct ccdcpps *cpps = (struct ccdcpps *)data;
			char *ubuf = cpps->buffer;

			/* Make sure the paths fit the caller's buffer. */
			for (i = 0; i < cs->sc_nccdisks; ++i)
				len += cs->sc_cinfo[i].ci_pathlen;

			if (cpps->size < len)
				return (ENOMEM);

			/* Copy out the NUL-terminated paths back to back. */
			for (i = 0; i < cs->sc_nccdisks; ++i) {
				len = cs->sc_cinfo[i].ci_pathlen;
				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
				    len);
				if (error != 0)
					return (error);
				ubuf += len;
			}
			/* An extra NUL terminates the path list. */
			return(copyout("", ubuf, 1));
		}
		/* NOTREACHED: the block above always returns. */
		break;
		}

	default:
		return (ENXIO);
	}
}
1013
1014static int
1015ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td)
1016{
1017 int i, j, lookedup = 0, error = 0;
1018 struct ccd_s *cs;
1019 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1020 struct ccdgeom *ccg;
1021 char **cpp;
1022 struct vnode **vpp;
1023
1024 cs = ccdfind(unit);
1025 switch (cmd) {
1026 case CCDIOCSET:
1027 if (cs == NULL)
1028 cs = ccdnew(unit);
1029 if (IS_INITED(cs))
1030 return (EBUSY);
1031
1032 if ((flag & FWRITE) == 0)
1033 return (EBADF);
1034
1035 if ((error = ccdlock(cs)) != 0)
1036 return (error);
1037
1038 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1039 return (EINVAL);
1040
1041 /* Fill in some important bits. */
1042 cs->sc_ileave = ccio->ccio_ileave;
1043 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1044 printf("ccd%d: disabling mirror, interleave is 0\n",
1045 unit);
1046 ccio->ccio_flags &= ~(CCDF_MIRROR);
1047 }
1048 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1049 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1050 printf("ccd%d: mirror/parity forces uniform flag\n",
1051 unit);
1052 ccio->ccio_flags |= CCDF_UNIFORM;
1053 }
1054 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1055
1056 /*
1057 * Allocate space for and copy in the array of
1058 * componet pathnames and device numbers.
1059 */
1060 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1061 M_CCD, M_WAITOK);
1062 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1063 M_CCD, M_WAITOK);
1064
1065 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1066 ccio->ccio_ndisks * sizeof(char **));
1067 if (error) {
1068 free(vpp, M_CCD);
1069 free(cpp, M_CCD);
1070 ccdunlock(cs);
1071 return (error);
1072 }
1073
1074
1075 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1076 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1077 for (j = 0; j < lookedup; ++j)
1078 (void)vn_close(vpp[j], FREAD|FWRITE,
1079 td->td_ucred, td);
1080 free(vpp, M_CCD);
1081 free(cpp, M_CCD);
1082 ccdunlock(cs);
1083 return (error);
1084 }
1085 ++lookedup;
1086 }
1087 cs->sc_vpp = vpp;
1088 cs->sc_nccdisks = ccio->ccio_ndisks;
1089
1090 /*
1091 * Initialize the ccd. Fills in the softc for us.
1092 */
1093 if ((error = ccdinit(cs, cpp, td)) != 0) {
1094 for (j = 0; j < lookedup; ++j)
1095 (void)vn_close(vpp[j], FREAD|FWRITE,
1096 td->td_ucred, td);
1097 /*
1098 * We can't ccddestroy() cs just yet, because nothing
1099 * prevents user-level app to do another ioctl()
1100 * without closing the device first, therefore
1101 * declare unit null and void and let ccdclose()
1102 * destroy it when it is safe to do so.
1103 */
1104 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1105 free(vpp, M_CCD);
1106 free(cpp, M_CCD);
1107 ccdunlock(cs);
1108 return (error);
1109 }
1110 free(cpp, M_CCD);
1111
1112 /*
1113 * The ccd has been successfully initialized, so
1114 * we can place it into the array and read the disklabel.
1115 */
1116 ccio->ccio_unit = unit;
1117 ccio->ccio_size = cs->sc_size;
1118 ccg = &cs->sc_geom;
1119 cs->sc_disk = malloc(sizeof(struct disk), M_CCD,
1120 M_ZERO | M_WAITOK);
1121 cs->sc_disk->d_strategy = ccdstrategy;
1122 cs->sc_disk->d_name = "ccd";
1123 cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
1124 cs->sc_disk->d_mediasize =
1125 cs->sc_size * (off_t)ccg->ccg_secsize;
1126 cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
1127 cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
1128 cs->sc_disk->d_drv1 = cs;
1129 cs->sc_disk->d_maxsize = MAXPHYS;
1130 disk_create(unit, cs->sc_disk, 0, NULL, NULL);
1131
1132 ccdunlock(cs);
1133
1134 break;
1135
1136 case CCDIOCCLR:
1137 if (cs == NULL)
1138 return (ENXIO);
1139
1140 if (!IS_INITED(cs))
1141 return (ENXIO);
1142
1143 if ((flag & FWRITE) == 0)
1144 return (EBADF);
1145
1146 if ((error = ccdlock(cs)) != 0)
1147 return (error);
1148
1149 /* Don't unconfigure if any other partitions are open */
1150 if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
1151 ccdunlock(cs);
1152 return (EBUSY);
1153 }
1154
1155 disk_destroy(cs->sc_disk);
1156 free(cs->sc_disk, M_CCD);
1157 cs->sc_disk = NULL;
1158 /* Declare unit null and void (reset all flags) */
1159 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1160
1161 /* Close the components and free their pathnames. */
1162 for (i = 0; i < cs->sc_nccdisks; ++i) {
1163 /*
1164 * XXX: this close could potentially fail and
1165 * cause Bad Things. Maybe we need to force
1166 * the close to happen?
1167 */
1168 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1169 td->td_ucred, td);
1170 free(cs->sc_cinfo[i].ci_path, M_CCD);
1171 }
1172
1173 /* Free interleave index. */
1174 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1175 free(cs->sc_itable[i].ii_index, M_CCD);
1176
1177 /* Free component info and interleave table. */
1178 free(cs->sc_cinfo, M_CCD);
1179 free(cs->sc_itable, M_CCD);
1180 free(cs->sc_vpp, M_CCD);
1181
1182 /* And remove the devstat entry. */
1183 devstat_remove_entry(&cs->device_stats);
1184
1185 /* This must be atomic. */
1186 ccdunlock(cs);
1187 ccddestroy(cs);
1188
1189 break;
1190 }
1191
1192 return (0);
1193}
1194
1195
1196/*
1197 * Lookup the provided name in the filesystem. If the file exists,
1198 * is a valid block device, and isn't being used by anyone else,
1199 * set *vpp to the file's vnode.
1200 */
1201static int
1202ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1203{
1204 struct nameidata nd;
1205 struct vnode *vp;
1206 int error, flags;
1207
1208 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1209 flags = FREAD | FWRITE;
1210 if ((error = vn_open(&nd, &flags, 0)) != 0) {
1211 return (error);
1212 }
1213 vp = nd.ni_vp;
1214
1215 if (vrefcnt(vp) > 1) {
1216 error = EBUSY;
1217 goto bad;
1218 }
1219
1220 if (!vn_isdisk(vp, &error))
1221 goto bad;
1222
1223
1224 VOP_UNLOCK(vp, 0, td);
1225 NDFREE(&nd, NDF_ONLY_PNBUF);
1226 *vpp = vp;
1227 return (0);
1228bad:
1229 VOP_UNLOCK(vp, 0, td);
1230 NDFREE(&nd, NDF_ONLY_PNBUF);
1231 /* vn_close does vrele() for vp */
1232 (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1233 return (error);
1234}
1235
1236/*
1237
1238 * Wait interruptibly for an exclusive lock.
1239 *
1240 * XXX
1241 * Several drivers do this; it should be abstracted and made MP-safe.
1242 */
1243static int
1244ccdlock(struct ccd_s *cs)
1245{
1246 int error;
1247
1248 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1249 cs->sc_flags |= CCDF_WANTED;
1250 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1251 return (error);
1252 }
1253 cs->sc_flags |= CCDF_LOCKED;
1254 return (0);
1255}
1256
1257/*
1258 * Unlock and wake up any waiters.
1259 */
1260static void
1261ccdunlock(struct ccd_s *cs)
1262{
1263
1264 cs->sc_flags &= ~CCDF_LOCKED;
1265 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1266 cs->sc_flags &= ~CCDF_WANTED;
1267 wakeup(cs);
1268 }
1269}