/* subr_disk.c, revision 105365 */
/*
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.  Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD: head/sys/kern/subr_disk.c 105365 2002-10-17 23:48:29Z sobomax $
 *
 */

#include "opt_geom.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/stdint.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/diskslice.h>
#include <sys/disklabel.h>
#ifdef NO_GEOM
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>	/* XXX(review): duplicate of <sys/sysctl.h> above */
#include <machine/md_var.h>
#include <sys/ctype.h>

static MALLOC_DEFINE(M_DISK, "disk", "disk data");

/* cdevsw entry points interposed in front of every disk driver's own. */
static d_strategy_t diskstrategy;
static d_open_t diskopen;
static d_close_t diskclose;
static d_ioctl_t diskioctl;
static d_psize_t diskpsize;

/* All disks registered through disk_create(), linked via d_list. */
static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);

void disk_dev_synth(dev_t dev);

/*
 * Synthesize the dev_t node for a disk minor on demand.
 *
 * Given a dev_t that names a slice/partition of a registered disk,
 * look the unit up on disklist and make_dev() the matching node with
 * the traditional BSD name ("%s%d", "%s%d%c", "%s%ds%d" or "%s%ds%d%c",
 * per the format strings below), tying its lifetime to the whole-disk
 * dev_t with dev_depends().
 *
 * NOTE(review): exported (prototype above, no header visible here);
 * presumably called from the dev subsystem when a not-yet-created
 * minor is referenced -- confirm against callers.
 */
void
disk_dev_synth(dev_t dev)
{
    struct disk *dp;
    int u, s, p;
    dev_t pdev;

    if (dksparebits(dev))
        return;
    LIST_FOREACH(dp, &disklist, d_list) {
        if (major(dev) != dp->d_devsw->d_maj)
            continue;
        u = dkunit(dev);
        p = RAW_PART;
        s = WHOLE_DISK_SLICE;
        pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
        if (pdev->si_devsw == NULL)
            return;		/* Probably a unit we don't have */
        s = dkslice(dev);
        p = dkpart(dev);
        if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
            /* XXX: actually should not happen */
            dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
                UID_ROOT, GID_OPERATOR, 0640, "%s%d",
                dp->d_devsw->d_name, u);
            dev_depends(pdev, dev);
            return;
        }
        if (s == COMPATIBILITY_SLICE) {
            /* Compatibility slice: plain "ad0a"-style name. */
            dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
                UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
                dp->d_devsw->d_name, u, 'a' + p);
            dev_depends(pdev, dev);
            return;
        }
        if (p != RAW_PART) {
            /* Real slice + partition: "ad0s1a"-style name. */
            dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
                UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
                dp->d_devsw->d_name, u, s - BASE_SLICE + 1,
                'a' + p);
        } else {
            /* Raw partition of a slice, with "...c" alias. */
            dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
                UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
                dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
            make_dev_alias(dev, "%s%ds%dc",
                dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
        }
        dev_depends(pdev, dev);
        return;
    }
}

/*
 * dev_clone event handler (registered in disk_create()).
 *
 * Parse a device name of the form "<driver><unit>[s<slice>][a-h]"
 * (e.g. "ad0", "ad0s1", "ad0s1a") against every registered disk and,
 * on a match, create the corresponding dev_t in *dev.  Leaves *dev
 * untouched (NODEV) when no registered disk matches.
 */
static void
disk_clone(void *arg, char *name, int namelen, dev_t *dev)
{
    struct disk *dp;
    char const *d;
    char *e;
    int j, u, s, p;
    dev_t pdev;

    if (*dev != NODEV)
        return;

    LIST_FOREACH(dp, &disklist, d_list) {
        d = dp->d_devsw->d_name;
        j = dev_stdclone(name, &e, d, &u);
        if (j == 0)
            continue;
        if (u > DKMAXUNIT)
            continue;
        p = RAW_PART;
        s = WHOLE_DISK_SLICE;
        pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
        if (pdev->si_disk == NULL)
            continue;
        if (*e != '\0') {
            /* Optional "s<n>" slice suffix, else compatibility slice. */
            j = dev_stdclone(e, &e, "s", &s);
            if (j == 0)
                s = COMPATIBILITY_SLICE;
            else if (j == 1 || j == 2)
                s += BASE_SLICE - 1;
            if (!*e)
                ;		/* ad0s1 case */
            else if (e[1] != '\0')
                return;		/* can never be a disk name */
            else if (*e < 'a' || *e > 'h')
                return;		/* can never be a disk name */
            else
                p = *e - 'a';
        }
        if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
            /* Whole-disk node already exists; nothing to clone. */
            return;
        } else if (s >= BASE_SLICE && p != RAW_PART) {
            *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
                UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
                pdev->si_devsw->d_name, u, s - BASE_SLICE + 1,
                p + 'a');
        } else if (s >= BASE_SLICE) {
            *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
                UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
                pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
            make_dev_alias(*dev, "%s%ds%dc",
                pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
        } else {
            *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
                UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
                pdev->si_devsw->d_name, u, p + 'a');
        }
        dev_depends(pdev, *dev);
        return;
    }
}

/*
 * Copy the cached per-device fields from the whole/raw dev_t (pdev)
 * into a slice/partition dev_t (dev), so the child node answers
 * queries the same way the parent does.
 */
static void
inherit_raw(dev_t pdev, dev_t dev)
{
    dev->si_disk = pdev->si_disk;
    dev->si_drv1 = pdev->si_drv1;
    dev->si_drv2 = pdev->si_drv2;
    dev->si_iosize_max = pdev->si_iosize_max;
    dev->si_bsize_phys = pdev->si_bsize_phys;
    dev->si_bsize_best = pdev->si_bsize_best;
}

/*
 * Register a disk with this layer and create its whole-disk dev_t.
 *
 * Side effects callers must know about:
 *  - *dp is bzero()'d: any state the caller stored there is clobbered.
 *  - *proto is overwritten (once) with *cdevsw plus this file's
 *    interposed entry points; the driver's own cdevsw is kept in
 *    dp->d_devsw and called through from the disk* wrappers.
 *  - The dev_clone event handler is registered on the first call only.
 *
 * Returns the whole-disk dev_t (RAW_PART of WHOLE_DISK_SLICE).
 */
dev_t
disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto)
{
    static int once;
    dev_t dev;

    if (!once) {
        EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000);
        once++;
    }

    bzero(dp, sizeof(*dp));
    /* Label storage is owned here; freed in disk_destroy(). */
    dp->d_label = malloc(sizeof *dp->d_label, M_DEVBUF, M_WAITOK|M_ZERO);

    if (proto->d_open != diskopen) {
        *proto = *cdevsw;
        proto->d_open = diskopen;
        proto->d_close = diskclose;
        proto->d_ioctl = diskioctl;
        proto->d_strategy = diskstrategy;
        proto->d_psize = diskpsize;
    }

    if (bootverbose)
        printf("Creating DISK %s%d\n", cdevsw->d_name, unit);
    dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART),
        UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit);

    dev->si_disk = dp;
    dp->d_dev = dev;
    dp->d_dsflags = flags;
    dp->d_devsw = cdevsw;
    LIST_INSERT_HEAD(&disklist, dp, d_list);

    return (dev);
}

/*
 * Configure (onoff != 0) or tear down (onoff == 0) kernel crash
 * dumping onto the partition named by dev, via set_dumper().
 * The dump window is the partition's offset/size from the disklabel,
 * scaled to bytes with DEV_BSIZE.
 *
 * Returns 0 on success, ENXIO if there is no label, EINVAL for a
 * zero-sized partition, or whatever set_dumper() returns.
 */
static int
diskdumpconf(u_int onoff, dev_t dev, struct disk *dp)
{
    struct dumperinfo di;
    struct disklabel *dl;

    if (!onoff)
        return(set_dumper(NULL));
    dl = dsgetlabel(dev, dp->d_slice);
    if (!dl)
        return (ENXIO);
    bzero(&di, sizeof di);
    di.dumper = (dumper_t *)dp->d_devsw->d_dump;
    di.priv = dp->d_dev;
    di.blocksize = dl->d_secsize;
    /* Partition offset is relative to its slice; add both. */
    di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset +
        dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE;
    di.mediasize =
        (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE;
    if (di.mediasize == 0)
        return (EINVAL);
    return(set_dumper(&di));
}

/*
 * Invalidate the cached slice/label state (e.g. after media change).
 */
void
disk_invalidate (struct disk *disk)
{
    if (disk->d_slice)
        dsgone(&disk->d_slice);
}

/*
 * Unregister the disk behind the given whole-disk dev_t: unlink it
 * from disklist, free the label allocated in disk_create(), scrub the
 * struct disk, and destroy the dev_t.
 */
void
disk_destroy(dev_t dev)
{
    LIST_REMOVE(dev->si_disk, d_list);
    free(dev->si_disk->d_label, M_DEVBUF);
    bzero(dev->si_disk, sizeof(*dev->si_disk));
    dev->si_disk = NULL;
    destroy_dev(dev);
    return;
}

/*
 * Iterator over disklist: pass NULL to get the first disk, a previous
 * result to get the next; returns NULL at the end.
 */
struct disk *
disk_enumerate(struct disk *disk)
{
    if (!disk)
        return (LIST_FIRST(&disklist));
    else
        return (LIST_NEXT(disk, d_list));
}

/*
 * Sysctl handler for kern.disks: emit the si_name of every registered
 * disk, space-separated, NUL-terminated.
 */
static int
sysctl_disks(SYSCTL_HANDLER_ARGS)
{
    struct disk *disk;
    int error, first;

    disk = NULL;
    first = 1;

    while ((disk = disk_enumerate(disk))) {
        if (!first) {
            error = SYSCTL_OUT(req, " ", 1);
            if (error)
                return error;
        } else {
            first = 0;
        }
        error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name));
        if (error)
            return error;
    }
    error = SYSCTL_OUT(req, "", 1);
    return error;
}

SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
    sysctl_disks, "A", "names of available disks");

/*
 * The cdevsw functions
 */

/*
 * Open entry point.  Serializes opens on the disk with the
 * DISKFLAG_LOCK / DISKFLAG_WANTED tsleep()/wakeup() protocol; on the
 * first open (no slice yet open) calls the driver's d_open and fills
 * in the synthetic label geometry from the driver-reported media
 * parameters, then opens the slice/partition layer with dsopen().
 *
 * NOTE(review): d_mediasize / d_sectorsize below is unchecked; it
 * relies on the driver having set d_sectorsize in d_open -- confirm.
 */
static int
diskopen(dev_t dev, int oflags, int devtype, struct thread *td)
{
    dev_t pdev;
    struct disk *dp;
    int error;

    error = 0;
    pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);

    dp = pdev->si_disk;
    if (!dp)
        return (ENXIO);

    /* Wait for any other opener to drop the per-disk open lock. */
    while (dp->d_flags & DISKFLAG_LOCK) {
        dp->d_flags |= DISKFLAG_WANTED;
        error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz);
        if (error)
            return (error);
    }
    dp->d_flags |= DISKFLAG_LOCK;

    if (!dsisopen(dp->d_slice)) {
        if (!pdev->si_iosize_max)
            pdev->si_iosize_max = dev->si_iosize_max;
        error = dp->d_devsw->d_open(pdev, oflags, devtype, td);
        /* Synthesize label geometry from what the driver reported. */
        dp->d_label->d_secsize = dp->d_sectorsize;
        dp->d_label->d_secperunit = dp->d_mediasize / dp->d_sectorsize;
        dp->d_label->d_nsectors = dp->d_fwsectors;
        dp->d_label->d_ntracks = dp->d_fwheads;
    }

    /* Inherit properties from the whole/raw dev_t */
    inherit_raw(pdev, dev);

    if (error)
        goto out;

    error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, dp->d_label);

    /* If dsopen() left nothing open, undo the driver open. */
    if (!dsisopen(dp->d_slice))
        dp->d_devsw->d_close(pdev, oflags, devtype, td);
out:
    dp->d_flags &= ~DISKFLAG_LOCK;
    if (dp->d_flags & DISKFLAG_WANTED) {
        dp->d_flags &= ~DISKFLAG_WANTED;
        wakeup(dp);
    }

    return(error);
}

/*
 * Close entry point: close the slice layer, and call the driver's
 * d_close only when the last slice reference is gone.
 */
static int
diskclose(dev_t dev, int fflag, int devtype, struct thread *td)
{
    struct disk *dp;
    int error;
    dev_t pdev;

    error = 0;
    pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
    dp = pdev->si_disk;
    if (!dp)
        return (ENXIO);
    dsclose(dev, devtype, dp->d_slice);
    if (!dsisopen(dp->d_slice))
        error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td);
    return (error);
}

/*
 * Strategy entry point: run the bio through the slice/label bounds
 * check (dscheck) and hand surviving requests to the driver.
 * Fails the bio with ENXIO if the disk has gone away.
 */
static void
diskstrategy(struct bio *bp)
{
    dev_t pdev;
    struct disk *dp;

    pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART);
    dp = pdev->si_disk;
    bp->bio_resid = bp->bio_bcount;
    /* Lazily refresh a child dev_t that predates the current disk. */
    if (dp != bp->bio_dev->si_disk)
        inherit_raw(pdev, bp->bio_dev);

    if (!dp) {
        biofinish(bp, NULL, ENXIO);
        return;
    }

    /* dscheck() <= 0: request rejected or fully handled by ds layer. */
    if (dscheck(bp, dp->d_slice) <= 0) {
        biodone(bp);
        return;
    }

    if (bp->bio_bcount == 0) {
        biodone(bp);
        return;
    }

    KASSERT(dp->d_devsw != NULL, ("NULL devsw"));
    KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy"));
    dp->d_devsw->d_strategy(bp);
    return;

}

/*
 * Ioctl entry point.  DIOCSKERNELDUMP and DIOCGFRONTSTUFF are handled
 * here; everything else goes to the slice layer first, then falls
 * through to the driver's d_ioctl on ENOIOCTL.
 */
static int
diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
{
    struct disk *dp;
    int error;
    u_int u;
    dev_t pdev;

    pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
    dp = pdev->si_disk;
    if (!dp)
        return (ENXIO);
    if (cmd == DIOCSKERNELDUMP) {
        u = *(u_int *)data;
        return (diskdumpconf(u, dev, dp));
    }
    if (cmd == DIOCGFRONTSTUFF) {
        *(off_t *)data = 8192;	/* XXX: crude but enough */
        return (0);
    }
    error = dsioctl(dev, cmd, data, fflag, &dp->d_slice);
    if (error == ENOIOCTL)
        error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td);
    return (error);
}

/*
 * Partition-size entry point: report the size of the partition named
 * by dev via the slice layer, or -1 if the disk is gone.
 */
static int
diskpsize(dev_t dev)
{
    struct disk *dp;
    dev_t pdev;

    pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
    dp = pdev->si_disk;
    if (!dp)
        return (-1);
    if (dp != dev->si_disk) {
        dev->si_drv1 = pdev->si_drv1;
        dev->si_drv2 = pdev->si_drv2;
        /* XXX: don't set bp->b_dev->si_disk (?) */
    }
    return (dssize(dev, &dp->d_slice));
}

/* debug.sizeof.* sysctls expose struct sizes for userland tools. */
SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD,
    0, sizeof(struct disklabel), "sizeof(struct disklabel)");

SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
    0, sizeof(struct diskslices), "sizeof(struct diskslices)");

SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
    0, sizeof(struct disk), "sizeof(struct disk)");

#endif /* NO_GEOM */

/*-
 * Disk error is the preface to plaintive error messages
 * about failing disk transfers.  It prints messages of the form
 * "hp0g: BLABLABLA cmd=read fsbn 12345 of 12344-12347"
 * blkdone should be -1 if the position of the error is unknown.
 * The message is printed with printf.
 */
void
disk_err(struct bio *bp, const char *what, int blkdone, int nl)
{
    daddr_t sn;

    printf("%s: %s ", devtoname(bp->bio_dev), what);
    switch(bp->bio_cmd) {
    case BIO_READ:	printf("cmd=read "); break;
    case BIO_WRITE:	printf("cmd=write "); break;
    case BIO_DELETE:	printf("cmd=delete "); break;
    case BIO_GETATTR:	printf("cmd=getattr "); break;
    case BIO_SETATTR:	printf("cmd=setattr "); break;
    default: printf("cmd=%x ", bp->bio_cmd); break;
    }
    sn = bp->bio_blkno;
    /* Single-block transfer: print just the one block number. */
    if (bp->bio_bcount <= DEV_BSIZE) {
        printf("fsbn %jd%s", (intmax_t)sn, nl ? "\n" : "");
        return;
    }
    if (blkdone >= 0) {
        sn += blkdone;
        printf("fsbn %jd of ", (intmax_t)sn);
    }
    /* Print the first-last block range of the whole transfer. */
    printf("%jd-%jd", (intmax_t)bp->bio_blkno,
        (intmax_t)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE));
    if (nl)
        printf("\n");
}

#ifdef notquite
/*
 * Mutex to use when delaying niced I/O bound processes in bioq_disksort().
 * (Disabled: this whole section is compiled out under "notquite".)
 */
static struct mtx dksort_mtx;
static void
dksort_init(void)
{

    mtx_init(&dksort_mtx, "dksort", NULL, MTX_DEF);
}
SYSINIT(dksort, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, dksort_init, NULL)
#endif

/*
 * Seek sort for disks.
 *
 * The buf_queue keep two queues, sorted in ascending block order.  The first
 * queue holds those requests which are positioned after the current block
 * (in the first request); the second, which starts at queue->switch_point,
 * holds requests which came in after their block number was passed.  Thus
 * we implement a one way scan, retracting after reaching the end of the drive
 * to the first request on the second queue, at which time it becomes the
 * first queue.
 *
 * A one-way scan is natural because of the way UNIX read-ahead blocks are
 * allocated.
 */

void
bioq_disksort(bioq, bp)
    struct bio_queue_head *bioq;
    struct bio *bp;
{
    struct bio *bq;		/* insertion scan cursor */
    struct bio *bn;		/* next element during scan */
    struct bio *be;		/* effective end of the relevant queue */

#ifdef notquite
    struct thread *td = curthread;

    /* Throttle niced processes that are flooding the queue (disabled). */
    if (td && td->td_ksegrp->kg_nice > 0) {
        TAILQ_FOREACH(bn, &bioq->queue, bio_queue)
            if (BIOTOBUF(bp)->b_vp != BIOTOBUF(bn)->b_vp)
                break;
        if (bn != NULL) {
            mtx_lock(&dksort_mtx);
            msleep(&dksort_mtx, &dksort_mtx,
                PPAUSE | PCATCH | PDROP, "ioslow",
                td->td_ksegrp->kg_nice);
        }
    }
#endif
    /*
     * Recursion guard, not a lock.
     * NOTE(review): acquire is atomic (cmpset) but release below is a
     * plain store; presumably callers already provide mutual exclusion
     * and this only catches re-entry -- confirm against callers.
     */
    if (!atomic_cmpset_int(&bioq->busy, 0, 1))
        panic("Recursing in bioq_disksort()");
    be = TAILQ_LAST(&bioq->queue, bio_queue);
    /*
     * If the queue is empty or we are an
     * ordered transaction, then it's easy.
     */
    if ((bq = bioq_first(bioq)) == NULL) {
        bioq_insert_tail(bioq, bp);
        bioq->busy = 0;
        return;
    } else if (bioq->insert_point != NULL) {

        /*
         * A certain portion of the list is
         * "locked" to preserve ordering, so
         * we can only insert after the insert
         * point.
         */
        bq = bioq->insert_point;
    } else {

        /*
         * If we lie before the last removed (currently active)
         * request, and are not inserting ourselves into the
         * "locked" portion of the list, then we must add ourselves
         * to the second request list.
         */
        if (bp->bio_pblkno < bioq->last_pblkno) {

            bq = bioq->switch_point;
            /*
             * If we are starting a new secondary list,
             * then it's easy.
             */
            if (bq == NULL) {
                bioq->switch_point = bp;
                bioq_insert_tail(bioq, bp);
                bioq->busy = 0;
                return;
            }
            /*
             * If we lie ahead of the current switch point,
             * insert us before the switch point and move
             * the switch point.
             */
            if (bp->bio_pblkno < bq->bio_pblkno) {
                bioq->switch_point = bp;
                TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
                bioq->busy = 0;
                return;
            }
        } else {
            /*
             * Scanning the first list only: cap the tail search
             * at the element just before the switch point.
             */
            if (bioq->switch_point != NULL)
                be = TAILQ_PREV(bioq->switch_point,
                    bio_queue, bio_queue);
            /*
             * If we lie between last_pblkno and bq,
             * insert before bq.
             */
            if (bp->bio_pblkno < bq->bio_pblkno) {
                TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
                bioq->busy = 0;
                return;
            }
        }
    }

    /*
     * Request is at/after our current position in the list.
     * Optimize for sequential I/O by seeing if we go at the tail.
     */
    if (bp->bio_pblkno > be->bio_pblkno) {
        TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue);
        bioq->busy = 0;
        return;
    }

    /* Otherwise, insertion sort */
    while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) {

        /*
         * We want to go after the current request if it is the end
         * of the first request list, or if the next request is a
         * larger cylinder than our request.
         */
        if (bn == bioq->switch_point
            || bp->bio_pblkno < bn->bio_pblkno)
            break;
        bq = bn;
    }
    TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue);
    bioq->busy = 0;
}