/* geom_ccd.c — FreeBSD revision 112946 */
1/* 2 * Copyright (c) 2003 Poul-Henning Kamp. 3 * Copyright (c) 1995 Jason R. Thorpe. 4 * Copyright (c) 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * All rights reserved. 7 * Copyright (c) 1988 University of Utah. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * the Systems Programming Group of the University of Utah Computer 11 * Science Department. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed for the NetBSD Project 24 * by Jason R. Thorpe. 25 * 4. The names of the authors may not be used to endorse or promote products 26 * derived from this software without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Dynamic configuration and disklabel support by:
 *	Jason R. Thorpe <thorpej@nas.nasa.gov>
 *	Numerical Aerodynamic Simulation Facility
 *	Mail Stop 258-6
 *	NASA Ames Research Center
 *	Moffett Field, CA 94035
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 *
 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
 *
 * $FreeBSD: head/sys/geom/geom_ccd.c 112946 2003-04-01 15:06:26Z phk $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/namei.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <geom/geom_disk.h>

#include <sys/ccdvar.h>

MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");

/*
   This is how mirroring works (only writes are special):

   When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
   linked together by the cb_mirror field.  "cb_pflags &
   CCDPF_MIRROR_DONE" is set to 0 on both of them.

   When a component returns to ccdiodone(), it checks if "cb_pflags &
   CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
   flag and returns.  If it is, it means its partner has already
   returned, so it will go to the regular cleanup.

 */

/*
 * Per-component request state: wraps the struct bio that is sent down
 * to one component disk on behalf of a request against the ccd.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	struct ccd_s	*cb_softc;	/* owning ccd softc */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/* convenient macros for often-used statements */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)

/* Control device node ("ccd.ctl"), created at module load time. */
static dev_t ccdctldev;

static disk_strategy_t ccdstrategy;
static d_ioctl_t ccdctlioctl;

#define NCCDFREEHIWAT	16

#define CDEV_MAJOR 74

static struct cdevsw ccdctl_cdevsw = {
	.d_open =	nullopen,
	.d_close =	nullclose,
	.d_ioctl =	ccdctlioctl,
	.d_name =	"ccdctl",
	.d_maj =	CDEV_MAJOR,
};

/* Global list of all allocated ccd softcs; see ccdfind()/ccdnew(). */
static LIST_HEAD(, ccd_s) ccd_softc_list =
	LIST_HEAD_INITIALIZER(&ccd_softc_list);

static struct ccd_s *ccdfind(int);
static struct ccd_s *ccdnew(int);
static int ccddestroy(struct ccd_s *);

/* called during module initialization */
static void ccdattach(void);
static int ccd_modevent(module_t, int, void *);

/* called by biodone() at interrupt time */
static void ccdiodone(struct bio *bp);

static void ccdstart(struct ccd_s *, struct bio *);
static void ccdinterleave(struct ccd_s *, int);
static int ccdinit(struct ccd_s *, char **, struct thread *);
static int ccdlookup(char *, struct thread *p, struct vnode **);
static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
		struct bio *, daddr_t, caddr_t, long);
static int ccdlock(struct ccd_s *);
static void ccdunlock(struct ccd_s *);


/*
 * Number of blocks to leave untouched in front of a component partition.
 * This is to avoid violating its disklabel area when it starts at the
 * beginning of the slice.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif

/*
 * Look up the softc for a unit number on the global list.
 * Returns NULL if the unit has not been allocated.
 */
static struct ccd_s *
ccdfind(int unit)
{
	struct ccd_s *sc = NULL;

	/* XXX: LOCK(unique unit numbers) */
	LIST_FOREACH(sc, &ccd_softc_list, list) {
		if (sc->sc_unit == unit)
			break;
	}
	/* XXX: UNLOCK(unique unit numbers) */
	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
}

/*
 * Allocate a zeroed softc for the given unit and insert it on the
 * global list.  Returns NULL if the unit is already allocated or the
 * unit number is out of range (> 32).
 */
static struct ccd_s *
ccdnew(int unit)
{
	struct ccd_s *sc;

	/* XXX: LOCK(unique unit numbers) */
	if (IS_ALLOCATED(unit) || unit > 32)
		return (NULL);

	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
	sc->sc_unit = unit;
	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
	/* XXX: UNLOCK(unique unit numbers) */
	return (sc);
}

/*
 * Remove a softc from the global list and free it.
 */
static int
ccddestroy(struct ccd_s *sc)
{

	/* XXX: LOCK(unique unit numbers) */
	LIST_REMOVE(sc, list);
	/* XXX: UNLOCK(unique unit numbers) */
	FREE(sc, M_CCD);
	return (0);
}

/*
 * Called by main() during pseudo-device attachment.  All we need
 * to do is to add devsw entries.
 */
static void
ccdattach()
{

	/* Create the control node; si_drv1 points at itself as a marker. */
	ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
		UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
	ccdctldev->si_drv1 = ccdctldev;
}

/*
 * Module event handler: create the control device on load.
 * Unloading is not supported.
 */
static int
ccd_modevent(module_t mod, int type, void *data)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		ccdattach();
		break;

	case MOD_UNLOAD:
		printf("ccd0: Unload not supported!\n");
		error = EOPNOTSUPP;
		break;

	case MOD_SHUTDOWN:
		break;

	default:
		error = EOPNOTSUPP;
	}
	return (error);
}

DEV_MODULE(ccd, ccd_modevent, NULL);

/*
 * Initialize a configured ccd: validate every component, compute the
 * total size, and build the interleave table.  Called from
 * ccdioctltoo() with the unit locked.  Returns 0 or an errno; on
 * failure the per-component state built so far is released.
 */
static int
ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
{
	struct ccdcinfo *ci = NULL;	/* XXX */
	size_t size;
	int ix;
	struct vnode *vp;
	size_t minsize;
	int maxsecsize;
	struct ccdgeom *ccg = &cs->sc_geom;
	char *tmppath = NULL;
	int error = 0;
	off_t mediasize;
	u_int sectorsize;


	cs->sc_size = 0;

	/* Allocate space for the component info. */
	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
	    M_CCD, M_WAITOK);

	/*
	 * Verify that each component piece exists and record
	 * relevant information about it.
	 */
	maxsecsize = 0;
	minsize = 0;
	tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
		vp = cs->sc_vpp[ix];
		ci = &cs->sc_cinfo[ix];
		ci->ci_vp = vp;

		/*
		 * Copy in the pathname of the component.
		 */
		if ((error = copyinstr(cpaths[ix], tmppath,
		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
			goto fail;
		}
		ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);

		ci->ci_dev = vn_todev(vp);

		/*
		 * Get partition information for the component.
		 */
		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
		    FREAD, td->td_ucred, td);
		if (error != 0) {
			goto fail;
		}
		/*
		 * Get the sector size of the component.
		 */
		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
		    FREAD, td->td_ucred, td);
		if (error != 0) {
			goto fail;
		}
		if (sectorsize > maxsecsize)
			maxsecsize = sectorsize;
		/* Component size in DEV_BSIZE blocks, minus the reserved
		 * CCD_OFFSET blocks at the front. */
		size = mediasize / DEV_BSIZE - CCD_OFFSET;

		/*
		 * Calculate the size, truncating to an interleave
		 * boundary if necessary.
		 */

		if (cs->sc_ileave > 1)
			size -= size % cs->sc_ileave;

		if (size == 0) {
			error = ENODEV;
			goto fail;
		}

		if (minsize == 0 || size < minsize)
			minsize = size;
		ci->ci_size = size;
		cs->sc_size += size;
	}

	free(tmppath, M_CCD);
	tmppath = NULL;

	/*
	 * Don't allow the interleave to be smaller than
	 * the biggest component sector.
	 */
	if ((cs->sc_ileave > 0) &&
	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
		error = EINVAL;
		goto fail;
	}

	/*
	 * If uniform interleave is desired set all sizes to that of
	 * the smallest component.  This will guarantee that a single
	 * interleave table is generated.
	 *
	 * Lost space must be taken into account when calculating the
	 * overall size.  Half the space is lost when CCDF_MIRROR is
	 * specified.
	 */
	if (cs->sc_flags & CCDF_UNIFORM) {
		for (ci = cs->sc_cinfo;
		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			ci->ci_size = minsize;
		}
		if (cs->sc_flags & CCDF_MIRROR) {
			/*
			 * Check to see if an even number of components
			 * have been specified.  The interleave must also
			 * be non-zero in order for us to be able to
			 * guarantee the topology.
355 */ 356 if (cs->sc_nccdisks % 2) { 357 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit ); 358 error = EINVAL; 359 goto fail; 360 } 361 if (cs->sc_ileave == 0) { 362 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit); 363 error = EINVAL; 364 goto fail; 365 } 366 cs->sc_size = (cs->sc_nccdisks/2) * minsize; 367 } else { 368 if (cs->sc_ileave == 0) { 369 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit); 370 error = EINVAL; 371 goto fail; 372 } 373 cs->sc_size = cs->sc_nccdisks * minsize; 374 } 375 } 376 377 /* 378 * Construct the interleave table. 379 */ 380 ccdinterleave(cs, cs->sc_unit); 381 382 /* 383 * Create pseudo-geometry based on 1MB cylinders. It's 384 * pretty close. 385 */ 386 ccg->ccg_secsize = maxsecsize; 387 ccg->ccg_ntracks = 1; 388 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize; 389 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors; 390 391 cs->sc_flags |= CCDF_INITED; 392 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */ 393 return (0); 394fail: 395 while (ci > cs->sc_cinfo) { 396 ci--; 397 free(ci->ci_path, M_CCD); 398 } 399 if (tmppath != NULL) 400 free(tmppath, M_CCD); 401 free(cs->sc_cinfo, M_CCD); 402 ccddestroy(cs); 403 return (error); 404} 405 406static void 407ccdinterleave(struct ccd_s *cs, int unit) 408{ 409 struct ccdcinfo *ci, *smallci; 410 struct ccdiinfo *ii; 411 daddr_t bn, lbn; 412 int ix; 413 u_long size; 414 415 416 /* 417 * Allocate an interleave table. The worst case occurs when each 418 * of N disks is of a different size, resulting in N interleave 419 * tables. 420 * 421 * Chances are this is too big, but we don't care. 422 */ 423 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo); 424 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD, 425 M_WAITOK | M_ZERO); 426 427 /* 428 * Trivial case: no interleave (actually interleave of disk size). 429 * Each table entry represents a single component in its entirety. 
	 *
	 * An interleave of 0 may not be used with a mirror setup.
	 */
	if (cs->sc_ileave == 0) {
		bn = 0;
		ii = cs->sc_itable;

		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
			/* Allocate space for ii_index. */
			ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
			ii->ii_ndisk = 1;
			ii->ii_startblk = bn;
			ii->ii_startoff = 0;
			ii->ii_index[0] = ix;
			bn += cs->sc_cinfo[ix].ci_size;
			ii++;
		}
		/* Terminate the table with a zero-disk sentinel. */
		ii->ii_ndisk = 0;
		return;
	}

	/*
	 * The following isn't fast or pretty; it doesn't have to be.
	 */
	size = 0;
	bn = lbn = 0;
	for (ii = cs->sc_itable; ; ii++) {
		/*
		 * Allocate space for ii_index.  We might allocate more than
		 * we use.
		 */
		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
		    M_CCD, M_WAITOK);

		/*
		 * Locate the smallest of the remaining components
		 * (strictly larger than the size consumed so far).
		 */
		smallci = NULL;
		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
		    ci++) {
			if (ci->ci_size > size &&
			    (smallci == NULL ||
			     ci->ci_size < smallci->ci_size)) {
				smallci = ci;
			}
		}

		/*
		 * Nobody left, all done
		 */
		if (smallci == NULL) {
			ii->ii_ndisk = 0;
			free(ii->ii_index, M_CCD);
			break;
		}

		/*
		 * Record starting logical block using an sc_ileave blocksize.
		 */
		ii->ii_startblk = bn / cs->sc_ileave;

		/*
		 * Record starting component block using an sc_ileave
		 * blocksize.  This value is relative to the beginning of
		 * a component disk.
		 */
		ii->ii_startoff = lbn;

		/*
		 * Determine how many disks take part in this interleave
		 * and record their indices.
		 */
		ix = 0;
		for (ci = cs->sc_cinfo;
		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			if (ci->ci_size >= smallci->ci_size) {
				ii->ii_index[ix++] = ci - cs->sc_cinfo;
			}
		}
		ii->ii_ndisk = ix;
		bn += ix * (smallci->ci_size - size);
		lbn = smallci->ci_size / cs->sc_ileave;
		size = smallci->ci_size;
	}
}

/*
 * Disk strategy entry point: bounds-check the request against the size
 * of the ccd, clip it at EOF if necessary, and hand it to ccdstart().
 */
static void
ccdstrategy(struct bio *bp)
{
	struct ccd_s *cs;
	int pbn;        /* in sc_secsize chunks */
	long sz;        /* in sc_secsize chunks */

	cs = bp->bio_disk->d_drv1;

	pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
	sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);

	/*
	 * If out of bounds return an error.  If at the EOF point,
	 * simply read or write less.
	 */

	if (pbn < 0 || pbn >= cs->sc_size) {
		bp->bio_resid = bp->bio_bcount;
		/* Exactly at EOF is a successful zero-length transfer. */
		if (pbn != cs->sc_size)
			biofinish(bp, NULL, EINVAL);
		else
			biodone(bp);
		return;
	}

	/*
	 * If the request crosses EOF, truncate the request.
	 */
	if (pbn + sz > cs->sc_size) {
		bp->bio_bcount = (cs->sc_size - pbn) *
		    cs->sc_geom.ccg_secsize;
	}

	bp->bio_resid = bp->bio_bcount;

	/*
	 * "Start" the unit.
	 */
	ccdstart(cs, bp);
	return;
}

/*
 * Split a request into per-component pieces via ccdbuffer() and
 * dispatch each piece to its component disk.
 */
static void
ccdstart(struct ccd_s *cs, struct bio *bp)
{
	long bcount, rcount;
	struct ccdbuf *cbp[2];
	caddr_t addr;
	daddr_t bn;
	int err;

	/*
	 * Translate the partition-relative block number to an absolute.
	 */
	bn = bp->bio_blkno;

	/*
	 * Allocate component buffers and fire off the requests
	 */
	addr = bp->bio_data;
	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
		if (err) {
			printf("ccdbuffer error %d\n", err);
			/* We're screwed */
			bp->bio_resid -= bcount;
			bp->bio_error = ENOMEM;
			bp->bio_flags |= BIO_ERROR;
			return;
		}
		rcount = cbp[0]->cb_buf.bio_bcount;

		if (cs->sc_cflags & CCDF_MIRROR) {
			/*
			 * Mirroring.  Writes go to both disks, reads are
			 * taken from whichever disk seems most appropriate.
			 *
			 * We attempt to localize reads to the disk whose arm
			 * is nearest the read request.  We ignore seeks due
			 * to writes when making this determination and we
			 * also try to avoid hogging.
			 */
			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
				BIO_STRATEGY(&cbp[0]->cb_buf);
				BIO_STRATEGY(&cbp[1]->cb_buf);
			} else {
				int pick = cs->sc_pick;
				daddr_t range = cs->sc_size / 16;

				/* Switch disks if the request is outside the
				 * window around the last head position. */
				if (bn < cs->sc_blk[pick] - range ||
				    bn > cs->sc_blk[pick] + range
				) {
					cs->sc_pick = pick = 1 - pick;
				}
				cs->sc_blk[pick] = bn + btodb(rcount);
				BIO_STRATEGY(&cbp[pick]->cb_buf);
			}
		} else {
			/*
			 * Not mirroring
			 */
			BIO_STRATEGY(&cbp[0]->cb_buf);
		}
		bn += btodb(rcount);
		addr += rcount;
	}
}

/*
 * Build a component buffer header.
 */
static int
ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
{
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 */
		daddr_t sblk;

		sblk = 0;
		/* Walk the components until the one containing cbn. */
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				/* ci2 is the mirror partner of ccdisk. */
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.
	 */
	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->cb_buf.bio_cmd = bp->bio_cmd;
	cbp->cb_buf.bio_done = ccdiodone;
	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.bio_data = addr;
	cbp->cb_buf.bio_caller2 = cbp;
	if (cs->sc_ileave == 0)
		cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
	/* Stash the (possibly truncated) transfer size for ccdiodone(). */
	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_softc = cs;
	cbp->cb_comp = ci - cs->sc_cinfo;

	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/* mirror, setup second I/O */
		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
		if (cbp == NULL) {
			free(cb[0], M_CCD);
			cb[0] = NULL;
			return (ENOMEM);
		}
		/* Clone the first buffer, then redirect it at the partner. */
		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
		cbp->cb_buf.bio_caller2 = cbp;
		cbp->cb_buf.bio_dev = ci2->ci_dev;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
	return (0);
}

/*
 * Called at interrupt time.
 * Mark the component as done and if all components are done,
 * take a ccd interrupt.
 */
static void
ccdiodone(struct bio *ibp)
{
	struct ccdbuf *cbp;
	struct bio *bp;
	struct ccd_s *cs;
	int count;

	cbp = ibp->bio_caller2;
	cs = cbp->cb_softc;
	bp = cbp->cb_obp;
	/*
	 * If an error occurred, report it.  If this is a mirrored
	 * configuration and the first of two possible reads, do not
	 * set the error in the bp yet because the second read may
	 * succeed.
	 */

	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
		const char *msg = "";

		if ((cs->sc_cflags & CCDF_MIRROR) &&
		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
			/*
			 * We will try our read on the other disk down
			 * below, also reverse the default pick so if we
			 * are doing a scan we do not keep hitting the
			 * bad disk first.
			 */

			msg = ", trying other disk";
			cs->sc_pick = 1 - cs->sc_pick;
			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
		} else {
			bp->bio_flags |= BIO_ERROR;
			bp->bio_error = cbp->cb_buf.bio_error ?
			    cbp->cb_buf.bio_error : EIO;
		}
		printf("ccd%d: error %d on component %d block %jd "
		       "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
		       cbp->cb_comp,
		       (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
		       msg);
	}

	/*
	 * Process mirror.  If we are writing, I/O has been initiated on both
	 * buffers and we fall through only after both are finished.
	 *
	 * If we are reading only one I/O is initiated at a time.  If an
	 * error occurs we initiate the second I/O and return, otherwise
	 * we free the second I/O without initiating it.
	 */

	if (cs->sc_cflags & CCDF_MIRROR) {
		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
			/*
			 * When writing, handshake with the second buffer
			 * to determine when both are done.  If both are not
			 * done, return here.
			 */
			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
				free(cbp, M_CCD);
				return;
			}
		} else {
			/*
			 * When reading, either dispose of the second buffer
			 * or initiate I/O on the second buffer if an error
			 * occurred with this one.
			 */
			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
					cbp->cb_mirror->cb_pflags |=
					    CCDPF_MIRROR_DONE;
					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
					free(cbp, M_CCD);
					return;
				} else {
					free(cbp->cb_mirror, M_CCD);
				}
			}
		}
	}

	/*
	 * use bio_caller1 to determine how big the original request was rather
	 * than bio_bcount, because bio_bcount may have been truncated for EOF.
	 *
	 * XXX We check for an error, but we do not test the resid for an
	 * aligned EOF condition.  This may result in character & block
	 * device access not recognizing EOF properly when read or written
	 * sequentially, but will not effect filesystems.
	 */
	count = (long)cbp->cb_buf.bio_caller1;
	free(cbp, M_CCD);

	/*
	 * If all done, "interrupt".
	 */
	bp->bio_resid -= count;
	if (bp->bio_resid < 0)
		panic("ccdiodone: count");
	if (bp->bio_resid == 0) {
		/* On error report the whole transfer as unfinished. */
		if (bp->bio_flags & BIO_ERROR)
			bp->bio_resid = bp->bio_bcount;
		biodone(bp);
	}
}

static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);

/*
 * Ioctl entry point for the ccd control device.  Configuration
 * requests (CCDIOCSET/CCDIOCCLR) are delegated to ccdioctltoo();
 * the informational ioctls are handled here.
 */
static int
ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
{
	struct ccd_ioctl *ccio;
	u_int unit;
	dev_t dev2;
	int error;

	switch (cmd) {
	case CCDIOCSET:
	case CCDIOCCLR:
		ccio = (struct ccd_ioctl *)data;
		/* NOTE(review): the unit number is passed in through the
		 * ccio_size field by the userland tool — verify against
		 * ccdconfig(8)'s ioctl usage. */
		unit = ccio->ccio_size;
		return (ccdioctltoo(unit, cmd, data, flag, td));
	case CCDCONFINFO:
		{
		int ninit = 0;
		struct ccdconf *conf = (struct ccdconf *)data;
		struct ccd_s *tmpcs;
		struct ccd_s *ubuf = conf->buffer;

		/* XXX: LOCK(unique unit numbers) */
		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
			if (IS_INITED(tmpcs))
				ninit++;

		/* size == 0 is a probe: report required buffer size. */
		if (conf->size == 0) {
			conf->size = sizeof(struct ccd_s) * ninit;
			return (0);
		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
		    (conf->size % sizeof(struct ccd_s) != 0)) {
			/* XXX: UNLOCK(unique unit numbers) */
			return (EINVAL);
		}

		/* Copy the softcs out back-to-front. */
		ubuf += ninit;
		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
			if (!IS_INITED(tmpcs))
				continue;
			error = copyout(tmpcs, --ubuf,
			    sizeof(struct ccd_s));
			if (error != 0)
				/* XXX: UNLOCK(unique unit numbers) */
				return (error);
		}
		/* XXX: UNLOCK(unique unit numbers) */
		return (0);
		}

	case CCDCPPINFO:
		{
		struct ccdcpps *cpps = (struct ccdcpps *)data;
		char *ubuf = cpps->buffer;
		struct ccd_s *cs;


		/* The first word of the user buffer selects the unit. */
		error = copyin(ubuf, &unit, sizeof (unit));
		if (error)
			return (error);

		if (!IS_ALLOCATED(unit))
			return (ENXIO);
		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
		cs = ccdfind(unit);
		if (!IS_INITED(cs))
			return (ENXIO);

		{
			int len = 0, i;
974 struct ccdcpps *cpps = (struct ccdcpps *)data; 975 char *ubuf = cpps->buffer; 976 977 978 for (i = 0; i < cs->sc_nccdisks; ++i) 979 len += cs->sc_cinfo[i].ci_pathlen; 980 981 if (cpps->size < len) 982 return (ENOMEM); 983 984 for (i = 0; i < cs->sc_nccdisks; ++i) { 985 len = cs->sc_cinfo[i].ci_pathlen; 986 error = copyout(cs->sc_cinfo[i].ci_path, ubuf, 987 len); 988 if (error != 0) 989 return (error); 990 ubuf += len; 991 } 992 return(copyout("", ubuf, 1)); 993 } 994 break; 995 } 996 997 default: 998 return (ENXIO); 999 } 1000} 1001 1002static int 1003ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td) 1004{ 1005 int i, j, lookedup = 0, error = 0; 1006 struct ccd_s *cs; 1007 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data; 1008 struct ccdgeom *ccg; 1009 char **cpp; 1010 struct vnode **vpp; 1011 1012 cs = ccdfind(unit); 1013 switch (cmd) { 1014 case CCDIOCSET: 1015 if (cs == NULL) 1016 cs = ccdnew(unit); 1017 if (IS_INITED(cs)) 1018 return (EBUSY); 1019 1020 if ((flag & FWRITE) == 0) 1021 return (EBADF); 1022 1023 if ((error = ccdlock(cs)) != 0) 1024 return (error); 1025 1026 if (ccio->ccio_ndisks > CCD_MAXNDISKS) 1027 return (EINVAL); 1028 1029 /* Fill in some important bits. */ 1030 cs->sc_ileave = ccio->ccio_ileave; 1031 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) { 1032 printf("ccd%d: disabling mirror, interleave is 0\n", 1033 unit); 1034 ccio->ccio_flags &= ~(CCDF_MIRROR); 1035 } 1036 if ((ccio->ccio_flags & CCDF_MIRROR) && 1037 !(ccio->ccio_flags & CCDF_UNIFORM)) { 1038 printf("ccd%d: mirror/parity forces uniform flag\n", 1039 unit); 1040 ccio->ccio_flags |= CCDF_UNIFORM; 1041 } 1042 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK; 1043 1044 /* 1045 * Allocate space for and copy in the array of 1046 * componet pathnames and device numbers. 
		 */
		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
		    M_CCD, M_WAITOK);
		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
		    M_CCD, M_WAITOK);

		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
		    ccio->ccio_ndisks * sizeof(char **));
		if (error) {
			free(vpp, M_CCD);
			free(cpp, M_CCD);
			ccdunlock(cs);
			return (error);
		}


		/* Open and validate each component device. */
		for (i = 0; i < ccio->ccio_ndisks; ++i) {
			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
				/* Close whatever was opened so far. */
				for (j = 0; j < lookedup; ++j)
					(void)vn_close(vpp[j], FREAD|FWRITE,
					    td->td_ucred, td);
				free(vpp, M_CCD);
				free(cpp, M_CCD);
				ccdunlock(cs);
				return (error);
			}
			++lookedup;
		}
		cs->sc_vpp = vpp;
		cs->sc_nccdisks = ccio->ccio_ndisks;

		/*
		 * Initialize the ccd.  Fills in the softc for us.
		 */
		if ((error = ccdinit(cs, cpp, td)) != 0) {
			for (j = 0; j < lookedup; ++j)
				(void)vn_close(vpp[j], FREAD|FWRITE,
				    td->td_ucred, td);
			/*
			 * We can't ccddestroy() cs just yet, because nothing
			 * prevents user-level app to do another ioctl()
			 * without closing the device first, therefore
			 * declare unit null and void and let ccdclose()
			 * destroy it when it is safe to do so.
			 *
			 * NOTE(review): in this revision ccdinit() itself
			 * calls ccddestroy(cs) on its fail path, so cs may
			 * already have been freed when we get here — the
			 * flag clear and ccdunlock() below would then be a
			 * use-after-free.  Verify ccdinit()'s fail path.
			 */
			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
			free(vpp, M_CCD);
			free(cpp, M_CCD);
			ccdunlock(cs);
			return (error);
		}
		free(cpp, M_CCD);

		/*
		 * The ccd has been successfully initialized, so
		 * we can place it into the array and read the disklabel.
		 */
		ccio->ccio_unit = unit;
		ccio->ccio_size = cs->sc_size;
		ccg = &cs->sc_geom;
		/* Create and publish the GEOM disk for this unit. */
		cs->sc_disk = malloc(sizeof(struct disk), M_CCD,
		    M_ZERO | M_WAITOK);
		cs->sc_disk->d_strategy = ccdstrategy;
		cs->sc_disk->d_name = "ccd";
		cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
		cs->sc_disk->d_mediasize =
		    cs->sc_size * (off_t)ccg->ccg_secsize;
		cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
		cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
		cs->sc_disk->d_drv1 = cs;
		cs->sc_disk->d_maxsize = MAXPHYS;
		disk_create(unit, cs->sc_disk, 0, NULL, NULL);

		ccdunlock(cs);

		break;

	case CCDIOCCLR:
		if (cs == NULL)
			return (ENXIO);

		if (!IS_INITED(cs))
			return (ENXIO);

		if ((flag & FWRITE) == 0)
			return (EBADF);

		if ((error = ccdlock(cs)) != 0)
			return (error);

		/* Don't unconfigure if any other partitions are open */
		if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
			ccdunlock(cs);
			return (EBUSY);
		}

		/* Tear down the GEOM disk first so no new I/O arrives. */
		disk_destroy(cs->sc_disk);
		free(cs->sc_disk, M_CCD);
		cs->sc_disk = NULL;
		/* Declare unit null and void (reset all flags) */
		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);

		/* Close the components and free their pathnames. */
		for (i = 0; i < cs->sc_nccdisks; ++i) {
			/*
			 * XXX: this close could potentially fail and
			 * cause Bad Things.  Maybe we need to force
			 * the close to happen?
			 */
			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
			    td->td_ucred, td);
			free(cs->sc_cinfo[i].ci_path, M_CCD);
		}

		/* Free interleave index. */
		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
			free(cs->sc_itable[i].ii_index, M_CCD);

		/* Free component info and interleave table. */
		free(cs->sc_cinfo, M_CCD);
		free(cs->sc_itable, M_CCD);
		free(cs->sc_vpp, M_CCD);

		/* This must be atomic.
		 */
		ccdunlock(cs);
		ccddestroy(cs);

		break;
	}

	return (0);
}


/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
static int
ccdlookup(char *path, struct thread *td, struct vnode **vpp)
{
	struct nameidata nd;
	struct vnode *vp;
	int error, flags;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
	flags = FREAD | FWRITE;
	if ((error = vn_open(&nd, &flags, 0)) != 0) {
		return (error);
	}
	vp = nd.ni_vp;

	/* Refuse a device that is already open by someone else. */
	if (vrefcnt(vp) > 1) {
		error = EBUSY;
		goto bad;
	}

	if (!vn_isdisk(vp, &error))
		goto bad;


	VOP_UNLOCK(vp, 0, td);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	*vpp = vp;
	return (0);
bad:
	VOP_UNLOCK(vp, 0, td);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* vn_close does vrele() for vp */
	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
	return (error);
}

/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 */
static int
ccdlock(struct ccd_s *cs)
{
	int error;

	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
		cs->sc_flags |= CCDF_WANTED;
		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
			return (error);
	}
	cs->sc_flags |= CCDF_LOCKED;
	return (0);
}

/*
 * Unlock and wake up any waiters.
 */
static void
ccdunlock(struct ccd_s *cs)
{

	cs->sc_flags &= ~CCDF_LOCKED;
	if ((cs->sc_flags & CCDF_WANTED) != 0) {
		cs->sc_flags &= ~CCDF_WANTED;
		wakeup(cs);
	}
}