/* md.c revision 97291 */
1/* 2 * ---------------------------------------------------------------------------- 3 * "THE BEER-WARE LICENSE" (Revision 42): 4 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you 5 * can do whatever you want with this stuff. If we meet some day, and you think 6 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp 7 * ---------------------------------------------------------------------------- 8 * 9 * $FreeBSD: head/sys/dev/md/md.c 97291 2002-05-25 20:44:20Z phk $ 10 * 11 */ 12 13/* 14 * The following functions are based in the vn(4) driver: mdstart_swap(), 15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(), 16 * and as such under the following copyright: 17 * 18 * Copyright (c) 1988 University of Utah. 19 * Copyright (c) 1990, 1993 20 * The Regents of the University of California. All rights reserved. 21 * 22 * This code is derived from software contributed to Berkeley by 23 * the Systems Programming Group of the University of Utah Computer 24 * Science Department. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. All advertising materials mentioning features or use of this software 35 * must display the following acknowledgement: 36 * This product includes software developed by the University of 37 * California, Berkeley and its contributors. 38 * 4. 
Neither the name of the University nor the names of its contributors 39 * may be used to endorse or promote products derived from this software 40 * without specific prior written permission. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 53 * 54 * from: Utah Hdr: vn.c 1.13 94/04/02 55 * 56 * from: @(#)vn.c 8.6 (Berkeley) 4/1/94 57 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03 58 */ 59 60#include "opt_md.h" 61 62#include <sys/param.h> 63#include <sys/systm.h> 64#include <sys/bio.h> 65#include <sys/conf.h> 66#include <sys/devicestat.h> 67#include <sys/disk.h> 68#include <sys/fcntl.h> 69#include <sys/kernel.h> 70#include <sys/linker.h> 71#include <sys/lock.h> 72#include <sys/malloc.h> 73#include <sys/mdioctl.h> 74#include <sys/mutex.h> 75#include <sys/namei.h> 76#include <sys/proc.h> 77#include <sys/queue.h> 78#include <sys/sysctl.h> 79#include <sys/vnode.h> 80 81#include <machine/atomic.h> 82 83#include <vm/vm.h> 84#include <vm/vm_object.h> 85#include <vm/vm_page.h> 86#include <vm/vm_pager.h> 87#include <vm/swap_pager.h> 88 89#define MD_MODVER 1 90 91#ifndef MD_NSECT 92#define MD_NSECT (10000 * 2) 93#endif 94 95static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk"); 96static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory 
Disk Sectors"); 97 98static int md_debug; 99SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, ""); 100 101#if defined(MD_ROOT) && defined(MD_ROOT_SIZE) 102/* Image gets put here: */ 103static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here"; 104static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here"; 105#endif 106 107static int mdrootready; 108static int mdunits; 109static dev_t status_dev = 0; 110 111#define CDEV_MAJOR 95 112 113static d_strategy_t mdstrategy; 114static d_open_t mdopen; 115static d_close_t mdclose; 116static d_ioctl_t mdioctl, mdctlioctl; 117 118static struct cdevsw md_cdevsw = { 119 /* open */ mdopen, 120 /* close */ mdclose, 121 /* read */ physread, 122 /* write */ physwrite, 123 /* ioctl */ mdioctl, 124 /* poll */ nopoll, 125 /* mmap */ nommap, 126 /* strategy */ mdstrategy, 127 /* name */ MD_NAME, 128 /* maj */ CDEV_MAJOR, 129 /* dump */ nodump, 130 /* psize */ nopsize, 131 /* flags */ D_DISK | D_CANFREE | D_MEMDISK, 132}; 133 134static struct cdevsw mdctl_cdevsw = { 135 /* open */ nullopen, 136 /* close */ nullclose, 137 /* read */ noread, 138 /* write */ nowrite, 139 /* ioctl */ mdctlioctl, 140 /* poll */ nopoll, 141 /* mmap */ nommap, 142 /* strategy */ nostrategy, 143 /* name */ MD_NAME, 144 /* maj */ CDEV_MAJOR 145}; 146 147static struct cdevsw mddisk_cdevsw; 148 149static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list); 150 151#define NINDIR (PAGE_SIZE / sizeof(uintptr_t)) 152#define NMASK (NINDIR-1) 153static int nshift; 154 155struct indir { 156 uintptr_t *array; 157 uint total; 158 uint used; 159 uint shift; 160}; 161 162struct md_s { 163 int unit; 164 LIST_ENTRY(md_s) list; 165 struct devstat stats; 166 struct bio_queue_head bio_queue; 167 struct disk disk; 168 dev_t dev; 169 int busy; 170 enum md_types type; 171 unsigned nsect; 172 unsigned opencount; 173 unsigned secsize; 174 unsigned flags; 175 176 /* MD_MALLOC related fields */ 177 struct indir *indir; 178 
179 /* MD_PRELOAD related fields */ 180 u_char *pl_ptr; 181 unsigned pl_len; 182 183 /* MD_VNODE related fields */ 184 struct vnode *vnode; 185 struct ucred *cred; 186 187 /* MD_SWAP related fields */ 188 vm_object_t object; 189}; 190 191static int mddestroy(struct md_s *sc, struct thread *td); 192 193static struct indir * 194new_indir(uint shift) 195{ 196 struct indir *ip; 197 198 ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO); 199 if (ip == NULL) 200 return(NULL); 201 ip->array = malloc(sizeof(uintptr_t) * NINDIR, 202 M_MDSECT, M_NOWAIT | M_ZERO); 203 if (ip->array == NULL) { 204 free(ip, M_MD); 205 return(NULL); 206 } 207 ip->total = NINDIR; 208 ip->shift = shift; 209 return(ip); 210} 211 212static void 213del_indir(struct indir *ip) 214{ 215 216 free(ip->array, M_MD); 217 free(ip, M_MD); 218} 219 220/* 221 * This function does the math and alloctes the top level "indir" structure 222 * for a device of "size" sectors. 223 */ 224 225static struct indir * 226dimension(off_t size) 227{ 228 off_t rcnt; 229 struct indir *ip; 230 int i, layer; 231 232 rcnt = size; 233 layer = 0; 234 while (rcnt > NINDIR) { 235 rcnt /= NINDIR; 236 layer++; 237 } 238 /* figure out log2(NINDIR) */ 239 for (i = NINDIR, nshift = -1; i; nshift++) 240 i >>= 1; 241 242 /* 243 * XXX: the top layer is probably not fully populated, so we allocate 244 * too much space for ip->array in new_indir() here. 
245 */ 246 ip = new_indir(layer * nshift); 247 return (ip); 248} 249 250/* 251 * Read a given sector 252 */ 253 254static uintptr_t 255s_read(struct indir *ip, off_t offset) 256{ 257 struct indir *cip; 258 int idx; 259 uintptr_t up; 260 261 if (md_debug > 1) 262 printf("s_read(%lld)\n", offset); 263 up = 0; 264 for (cip = ip; cip != NULL;) { 265 if (cip->shift) { 266 idx = (offset >> cip->shift) & NMASK; 267 up = cip->array[idx]; 268 cip = (struct indir *)up; 269 continue; 270 } 271 idx = offset & NMASK; 272 return(cip->array[idx]); 273 } 274 return (0); 275} 276 277/* 278 * Write a given sector, prune the tree if the value is 0 279 * If the new value is different from the old, return the old value. 280 */ 281 282static int 283s_write(struct indir *ip, off_t offset, uintptr_t ptr, uintptr_t *old) 284{ 285 struct indir *cip, *lip[10]; 286 int idx, li; 287 uintptr_t up; 288 289 if (md_debug > 1) 290 printf("s_write(%lld, %p, %p)\n", offset, (void *)ptr, old); 291 up = 0; 292 li = 0; 293 cip = ip; 294 for (;;) { 295 lip[li++] = cip; 296 if (cip->shift) { 297 idx = (offset >> cip->shift) & NMASK; 298 up = cip->array[idx]; 299 if (up != 0) { 300 cip = (struct indir *)up; 301 continue; 302 } 303 /* Allocate branch */ 304 cip->array[idx] = 305 (uintptr_t)new_indir(cip->shift - nshift); 306 if (cip->array[idx] == 0) 307 return(ENOMEM); 308 cip->used++; 309 up = cip->array[idx]; 310 cip = (struct indir *)up; 311 continue; 312 } 313 /* leafnode */ 314 idx = offset & NMASK; 315 up = cip->array[idx]; 316 if (old != NULL && up != ptr) 317 *old = up; 318 if (up != 0) 319 cip->used--; 320 cip->array[idx] = ptr; 321 if (ptr != 0) 322 cip->used++; 323 break; 324 } 325 if (cip->used != 0 || li == 1) 326 return (0); 327 li--; 328 while (cip->used == 0 && cip != ip) { 329 li--; 330 idx = (offset >> lip[li]->shift) & NMASK; 331 up = lip[li]->array[idx]; 332 KASSERT(up == (uintptr_t)cip, ("md screwed up")); 333 del_indir(cip); 334 lip[li]->array[idx] = NULL; 335 lip[li]->used--; 336 cip 
= lip[li]; 337 } 338 return (0); 339} 340 341static int 342mdopen(dev_t dev, int flag, int fmt, struct thread *td) 343{ 344 struct md_s *sc; 345 struct disklabel *dl; 346 347 if (md_debug) 348 printf("mdopen(%p %x %x %p)\n", 349 devtoname(dev), flag, fmt, td); 350 351 sc = dev->si_drv1; 352 353 dl = &sc->disk.d_label; 354 bzero(dl, sizeof(*dl)); 355 dl->d_secsize = sc->secsize; 356 dl->d_nsectors = sc->nsect > 63 ? 63 : sc->nsect; 357 dl->d_ntracks = 1; 358 dl->d_secpercyl = dl->d_nsectors * dl->d_ntracks; 359 dl->d_secperunit = sc->nsect; 360 dl->d_ncylinders = dl->d_secperunit / dl->d_secpercyl; 361 sc->opencount++; 362 return (0); 363} 364 365static int 366mdclose(dev_t dev, int flags, int fmt, struct thread *td) 367{ 368 struct md_s *sc = dev->si_drv1; 369 370 sc->opencount--; 371 return (0); 372} 373 374static int 375mdioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td) 376{ 377 378 if (md_debug) 379 printf("mdioctl(%s %lx %p %x %p)\n", 380 devtoname(dev), cmd, addr, flags, td); 381 382 return (ENOIOCTL); 383} 384 385static int 386mdstart_malloc(struct md_s *sc, struct bio *bp) 387{ 388 int i, error; 389 u_char *dst; 390 unsigned secno, nsec, uc; 391 uintptr_t sp, osp; 392 393 nsec = bp->bio_bcount / sc->secsize; 394 secno = bp->bio_pblkno; 395 dst = bp->bio_data; 396 error = 0; 397 while (nsec--) { 398 osp = 0; 399 if (bp->bio_cmd == BIO_DELETE) { 400 error = s_write(sc->indir, secno, 0, &osp); 401 } else if (bp->bio_cmd == BIO_READ) { 402 sp = s_read(sc->indir, secno); 403 if (sp == 0) 404 bzero(dst, sc->secsize); 405 else if (sp <= 255) 406 for (i = 0; i < sc->secsize; i++) 407 dst[i] = sp; 408 else 409 bcopy((void *)sp, dst, sc->secsize); 410 } else if (bp->bio_cmd == BIO_WRITE) { 411 if (sc->flags & MD_COMPRESS) { 412 uc = dst[0]; 413 for (i = 1; i < sc->secsize; i++) 414 if (dst[i] != uc) 415 break; 416 } else { 417 i = 0; 418 uc = 0; 419 } 420 if (i == sc->secsize) { 421 error = s_write(sc->indir, secno, uc, &osp); 422 } else { 423 
sp = s_read(sc->indir, secno); 424 if (sp <= 255) 425 sp = (uintptr_t) malloc( 426 sc->secsize, M_MDSECT, M_NOWAIT); 427 if (sp == 0) { 428 error = ENOMEM; 429 } else { 430 bcopy(dst, (void *)sp, sc->secsize); 431 error = s_write(sc->indir, secno, 432 sp, &osp); 433 } 434 } 435 } else { 436 error = EOPNOTSUPP; 437 } 438 if (osp > 255) 439 free((void*)osp, M_MDSECT); 440 if (error) 441 break; 442 secno++; 443 dst += sc->secsize; 444 } 445 bp->bio_resid = 0; 446 return (error); 447} 448 449 450static int 451mdstart_preload(struct md_s *sc, struct bio *bp) 452{ 453 454 if (bp->bio_cmd == BIO_DELETE) { 455 } else if (bp->bio_cmd == BIO_READ) { 456 bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount); 457 } else { 458 bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount); 459 } 460 bp->bio_resid = 0; 461 return (0); 462} 463 464static int 465mdstart_vnode(struct md_s *sc, struct bio *bp) 466{ 467 int error; 468 struct uio auio; 469 struct iovec aiov; 470 struct mount *mp; 471 472 /* 473 * VNODE I/O 474 * 475 * If an error occurs, we set BIO_ERROR but we do not set 476 * B_INVAL because (for a write anyway), the buffer is 477 * still valid. 478 */ 479 480 bzero(&auio, sizeof(auio)); 481 482 aiov.iov_base = bp->bio_data; 483 aiov.iov_len = bp->bio_bcount; 484 auio.uio_iov = &aiov; 485 auio.uio_iovcnt = 1; 486 auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize; 487 auio.uio_segflg = UIO_SYSSPACE; 488 if(bp->bio_cmd == BIO_READ) 489 auio.uio_rw = UIO_READ; 490 else 491 auio.uio_rw = UIO_WRITE; 492 auio.uio_resid = bp->bio_bcount; 493 auio.uio_td = curthread; 494 /* 495 * When reading set IO_DIRECT to try to avoid double-caching 496 * the data. When writing IO_DIRECT is not optimal, but we 497 * must set IO_NOWDRAIN to avoid a wdrain deadlock. 
498 */ 499 if (bp->bio_cmd == BIO_READ) { 500 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread); 501 error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred); 502 } else { 503 (void) vn_start_write(sc->vnode, &mp, V_WAIT); 504 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread); 505 error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred); 506 vn_finished_write(mp); 507 } 508 VOP_UNLOCK(sc->vnode, 0, curthread); 509 bp->bio_resid = auio.uio_resid; 510 return (error); 511} 512 513static int 514mdstart_swap(struct md_s *sc, struct bio *bp) 515{ 516 517 if ((bp->bio_cmd == BIO_DELETE) && (sc->flags & MD_RESERVE)) 518 biodone(bp); 519 else 520 vm_pager_strategy(sc->object, bp); 521 return (-1); 522} 523 524static void 525mdstrategy(struct bio *bp) 526{ 527 struct md_s *sc; 528 int error; 529 530 if (md_debug > 1) 531 printf("mdstrategy(%p) %s %x, %lld, %ld, %p)\n", 532 (void *)bp, devtoname(bp->bio_dev), bp->bio_flags, 533 (long long)bp->bio_blkno, bp->bio_bcount / DEV_BSIZE, 534 (void *)bp->bio_data); 535 536 sc = bp->bio_dev->si_drv1; 537 538 /* XXX: LOCK(sc->lock) */ 539 bioqdisksort(&sc->bio_queue, bp); 540 /* XXX: UNLOCK(sc->lock) */ 541 542 if (atomic_cmpset_int(&sc->busy, 0, 1) == 0) 543 return; 544 545 for (;;) { 546 /* XXX: LOCK(unique unit numbers) */ 547 bp = bioq_first(&sc->bio_queue); 548 if (bp) 549 bioq_remove(&sc->bio_queue, bp); 550 /* XXX: UNLOCK(unique unit numbers) */ 551 if (!bp) 552 break; 553 554 555 switch (sc->type) { 556 case MD_MALLOC: 557 devstat_start_transaction(&sc->stats); 558 error = mdstart_malloc(sc, bp); 559 break; 560 case MD_PRELOAD: 561 devstat_start_transaction(&sc->stats); 562 error = mdstart_preload(sc, bp); 563 break; 564 case MD_VNODE: 565 devstat_start_transaction(&sc->stats); 566 error = mdstart_vnode(sc, bp); 567 break; 568 case MD_SWAP: 569 error = mdstart_swap(sc, bp); 570 break; 571 default: 572 panic("Impossible md(type)"); 573 break; 574 } 575 576 if (error != -1) 577 biofinish(bp, &sc->stats, error); 578 } 
579 sc->busy = 0; 580} 581 582static struct md_s * 583mdfind(int unit) 584{ 585 struct md_s *sc; 586 587 /* XXX: LOCK(unique unit numbers) */ 588 LIST_FOREACH(sc, &md_softc_list, list) { 589 if (sc->unit == unit) 590 break; 591 } 592 /* XXX: UNLOCK(unique unit numbers) */ 593 return (sc); 594} 595 596static struct md_s * 597mdnew(int unit) 598{ 599 struct md_s *sc; 600 int max = -1; 601 602 /* XXX: LOCK(unique unit numbers) */ 603 LIST_FOREACH(sc, &md_softc_list, list) { 604 if (sc->unit == unit) { 605 /* XXX: UNLOCK(unique unit numbers) */ 606 return (NULL); 607 } 608 if (sc->unit > max) 609 max = sc->unit; 610 } 611 if (unit == -1) 612 unit = max + 1; 613 if (unit > DKMAXUNIT) 614 return (NULL); 615 sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO); 616 sc->unit = unit; 617 LIST_INSERT_HEAD(&md_softc_list, sc, list); 618 /* XXX: UNLOCK(unique unit numbers) */ 619 return (sc); 620} 621 622static void 623mdinit(struct md_s *sc) 624{ 625 626 bioq_init(&sc->bio_queue); 627 devstat_add_entry(&sc->stats, MD_NAME, sc->unit, sc->secsize, 628 DEVSTAT_NO_ORDERED_TAGS, 629 DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, 630 DEVSTAT_PRIORITY_OTHER); 631 sc->dev = disk_create(sc->unit, &sc->disk, 0, &md_cdevsw, &mddisk_cdevsw); 632 sc->dev->si_drv1 = sc; 633} 634 635/* 636 * XXX: we should check that the range they feed us is mapped. 637 * XXX: we should implement read-only. 
638 */ 639 640static int 641mdcreate_preload(struct md_ioctl *mdio) 642{ 643 struct md_s *sc; 644 645 if (mdio->md_size == 0) 646 return (EINVAL); 647 if (mdio->md_options & ~(MD_AUTOUNIT)) 648 return (EINVAL); 649 if (mdio->md_options & MD_AUTOUNIT) { 650 sc = mdnew(-1); 651 if (sc == NULL) 652 return (ENOMEM); 653 mdio->md_unit = sc->unit; 654 } else { 655 sc = mdnew(mdio->md_unit); 656 if (sc == NULL) 657 return (EBUSY); 658 } 659 sc->type = MD_PRELOAD; 660 sc->secsize = DEV_BSIZE; 661 sc->nsect = mdio->md_size; 662 sc->flags = mdio->md_options & MD_FORCE; 663 /* Cast to pointer size, then to pointer to avoid warning */ 664 sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base; 665 sc->pl_len = (mdio->md_size << DEV_BSHIFT); 666 mdinit(sc); 667 return (0); 668} 669 670 671static int 672mdcreate_malloc(struct md_ioctl *mdio) 673{ 674 struct md_s *sc; 675 off_t u; 676 uintptr_t sp; 677 int error; 678 679 error = 0; 680 if (mdio->md_size == 0) 681 return (EINVAL); 682 if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE)) 683 return (EINVAL); 684 /* Compression doesn't make sense if we have reserved space */ 685 if (mdio->md_options & MD_RESERVE) 686 mdio->md_options &= ~MD_COMPRESS; 687 if (mdio->md_options & MD_AUTOUNIT) { 688 sc = mdnew(-1); 689 if (sc == NULL) 690 return (ENOMEM); 691 mdio->md_unit = sc->unit; 692 } else { 693 sc = mdnew(mdio->md_unit); 694 if (sc == NULL) 695 return (EBUSY); 696 } 697 sc->type = MD_MALLOC; 698 sc->secsize = DEV_BSIZE; 699 sc->nsect = mdio->md_size; 700 sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE); 701 sc->indir = dimension(sc->nsect); 702 if (mdio->md_options & MD_RESERVE) { 703 for (u = 0; u < sc->nsect; u++) { 704 sp = (uintptr_t) malloc( sc->secsize, 705 M_MDSECT, M_NOWAIT | M_ZERO); 706 if (sp != 0) 707 error = s_write(sc->indir, u, sp, NULL); 708 else 709 error = ENOMEM; 710 if (error) 711 break; 712 } 713 } 714 if (!error) { 715 printf("%s%d: Malloc disk\n", MD_NAME, sc->unit); 716 mdinit(sc); 717 } else 
718 mddestroy(sc, NULL); 719 return (error); 720} 721 722 723static int 724mdsetcred(struct md_s *sc, struct ucred *cred) 725{ 726 char *tmpbuf; 727 int error = 0; 728 729 /* 730 * Set credits in our softc 731 */ 732 733 if (sc->cred) 734 crfree(sc->cred); 735 sc->cred = crhold(cred); 736 737 /* 738 * Horrible kludge to establish credentials for NFS XXX. 739 */ 740 741 if (sc->vnode) { 742 struct uio auio; 743 struct iovec aiov; 744 745 tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK); 746 bzero(&auio, sizeof(auio)); 747 748 aiov.iov_base = tmpbuf; 749 aiov.iov_len = sc->secsize; 750 auio.uio_iov = &aiov; 751 auio.uio_iovcnt = 1; 752 auio.uio_offset = 0; 753 auio.uio_rw = UIO_READ; 754 auio.uio_segflg = UIO_SYSSPACE; 755 auio.uio_resid = aiov.iov_len; 756 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread); 757 error = VOP_READ(sc->vnode, &auio, 0, sc->cred); 758 VOP_UNLOCK(sc->vnode, 0, curthread); 759 free(tmpbuf, M_TEMP); 760 } 761 return (error); 762} 763 764static int 765mdcreate_vnode(struct md_ioctl *mdio, struct thread *td) 766{ 767 struct md_s *sc; 768 struct vattr vattr; 769 struct nameidata nd; 770 int error, flags; 771 772 flags = FREAD|FWRITE; 773 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td); 774 error = vn_open(&nd, &flags, 0); 775 if (error) { 776 if (error != EACCES && error != EPERM && error != EROFS) 777 return (error); 778 flags &= ~FWRITE; 779 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td); 780 error = vn_open(&nd, &flags, 0); 781 if (error) 782 return (error); 783 } 784 NDFREE(&nd, NDF_ONLY_PNBUF); 785 if (nd.ni_vp->v_type != VREG || 786 (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) { 787 VOP_UNLOCK(nd.ni_vp, 0, td); 788 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td); 789 return (error ? 
error : EINVAL); 790 } 791 VOP_UNLOCK(nd.ni_vp, 0, td); 792 793 if (mdio->md_options & MD_AUTOUNIT) { 794 sc = mdnew(-1); 795 mdio->md_unit = sc->unit; 796 } else { 797 sc = mdnew(mdio->md_unit); 798 } 799 if (sc == NULL) { 800 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td); 801 return (EBUSY); 802 } 803 804 sc->type = MD_VNODE; 805 sc->flags = mdio->md_options & MD_FORCE; 806 if (!(flags & FWRITE)) 807 sc->flags |= MD_READONLY; 808 sc->secsize = DEV_BSIZE; 809 sc->vnode = nd.ni_vp; 810 811 /* 812 * If the size is specified, override the file attributes. 813 */ 814 if (mdio->md_size) 815 sc->nsect = mdio->md_size; 816 else 817 sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */ 818 if (sc->nsect == 0) { 819 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td); 820 return (EINVAL); 821 } 822 error = mdsetcred(sc, td->td_ucred); 823 if (error) { 824 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td); 825 return (error); 826 } 827 mdinit(sc); 828 return (0); 829} 830 831static int 832mddestroy(struct md_s *sc, struct thread *td) 833{ 834 835 GIANT_REQUIRED; 836 837 if (sc->dev != NULL) { 838 devstat_remove_entry(&sc->stats); 839 disk_destroy(sc->dev); 840 } 841 if (sc->vnode != NULL) 842 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ? 
843 FREAD : (FREAD|FWRITE), sc->cred, td); 844 if (sc->cred != NULL) 845 crfree(sc->cred); 846 if (sc->object != NULL) { 847 vm_pager_deallocate(sc->object); 848 } 849#if 0 850 if (sc->secp != NULL) { 851 for (u = 0; u < sc->nsect; u++) 852 if ((uintptr_t)sc->secp[u] > 255) 853 free(sc->secp[u], M_MDSECT); 854 free(sc->secp, M_MD); 855 } 856#endif 857 858 /* XXX: LOCK(unique unit numbers) */ 859 LIST_REMOVE(sc, list); 860 /* XXX: UNLOCK(unique unit numbers) */ 861 free(sc, M_MD); 862 return (0); 863} 864 865static int 866mdcreate_swap(struct md_ioctl *mdio, struct thread *td) 867{ 868 int error; 869 struct md_s *sc; 870 871 GIANT_REQUIRED; 872 873 if (mdio->md_options & MD_AUTOUNIT) { 874 sc = mdnew(-1); 875 mdio->md_unit = sc->unit; 876 } else { 877 sc = mdnew(mdio->md_unit); 878 } 879 if (sc == NULL) 880 return (EBUSY); 881 882 sc->type = MD_SWAP; 883 884 /* 885 * Range check. Disallow negative sizes or any size less then the 886 * size of a page. Then round to a page. 887 */ 888 889 if (mdio->md_size == 0) { 890 mddestroy(sc, td); 891 return (EDOM); 892 } 893 894 /* 895 * Allocate an OBJT_SWAP object. 896 * 897 * sc_secsize is PAGE_SIZE'd 898 * 899 * mdio->size is in DEV_BSIZE'd chunks. 900 * Note the truncation. 
901 */ 902 903 sc->secsize = PAGE_SIZE; 904 sc->nsect = mdio->md_size / (PAGE_SIZE / DEV_BSIZE); 905 sc->object = vm_pager_allocate(OBJT_SWAP, NULL, sc->secsize * (vm_offset_t)sc->nsect, VM_PROT_DEFAULT, 0); 906 sc->flags = mdio->md_options & MD_FORCE; 907 if (mdio->md_options & MD_RESERVE) { 908 if (swap_pager_reserve(sc->object, 0, sc->nsect) < 0) { 909 vm_pager_deallocate(sc->object); 910 sc->object = NULL; 911 mddestroy(sc, td); 912 return (EDOM); 913 } 914 } 915 error = mdsetcred(sc, td->td_ucred); 916 if (error) 917 mddestroy(sc, td); 918 else 919 mdinit(sc); 920 return (error); 921} 922 923static int 924mddetach(int unit, struct thread *td) 925{ 926 struct md_s *sc; 927 928 sc = mdfind(unit); 929 if (sc == NULL) 930 return (ENOENT); 931 if (sc->opencount != 0 && !(sc->flags & MD_FORCE)) 932 return (EBUSY); 933 switch(sc->type) { 934 case MD_VNODE: 935 case MD_SWAP: 936 case MD_MALLOC: 937 case MD_PRELOAD: 938 return (mddestroy(sc, td)); 939 default: 940 return (EOPNOTSUPP); 941 } 942} 943 944static int 945mdctlioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td) 946{ 947 struct md_ioctl *mdio; 948 struct md_s *sc; 949 950 if (md_debug) 951 printf("mdctlioctl(%s %lx %p %x %p)\n", 952 devtoname(dev), cmd, addr, flags, td); 953 954 /* 955 * We assert the version number in the individual ioctl 956 * handlers instead of out here because (a) it is possible we 957 * may add another ioctl in the future which doesn't read an 958 * mdio, and (b) the correct return value for an unknown ioctl 959 * is ENOIOCTL, not EINVAL. 
960 */ 961 mdio = (struct md_ioctl *)addr; 962 switch (cmd) { 963 case MDIOCATTACH: 964 if (mdio->md_version != MDIOVERSION) 965 return (EINVAL); 966 switch (mdio->md_type) { 967 case MD_MALLOC: 968 return (mdcreate_malloc(mdio)); 969 case MD_PRELOAD: 970 return (mdcreate_preload(mdio)); 971 case MD_VNODE: 972 return (mdcreate_vnode(mdio, td)); 973 case MD_SWAP: 974 return (mdcreate_swap(mdio, td)); 975 default: 976 return (EINVAL); 977 } 978 case MDIOCDETACH: 979 if (mdio->md_version != MDIOVERSION) 980 return (EINVAL); 981 if (mdio->md_file != NULL || mdio->md_size != 0 || 982 mdio->md_options != 0) 983 return (EINVAL); 984 return (mddetach(mdio->md_unit, td)); 985 case MDIOCQUERY: 986 if (mdio->md_version != MDIOVERSION) 987 return (EINVAL); 988 sc = mdfind(mdio->md_unit); 989 if (sc == NULL) 990 return (ENOENT); 991 mdio->md_type = sc->type; 992 mdio->md_options = sc->flags; 993 switch (sc->type) { 994 case MD_MALLOC: 995 mdio->md_size = sc->nsect; 996 break; 997 case MD_PRELOAD: 998 mdio->md_size = sc->nsect; 999 (u_char *)(uintptr_t)mdio->md_base = sc->pl_ptr; 1000 break; 1001 case MD_SWAP: 1002 mdio->md_size = sc->nsect * (PAGE_SIZE / DEV_BSIZE); 1003 break; 1004 case MD_VNODE: 1005 mdio->md_size = sc->nsect; 1006 /* XXX fill this in */ 1007 mdio->md_file = NULL; 1008 break; 1009 } 1010 return (0); 1011 default: 1012 return (ENOIOCTL); 1013 }; 1014 return (ENOIOCTL); 1015} 1016 1017static void 1018md_preloaded(u_char *image, unsigned length) 1019{ 1020 struct md_s *sc; 1021 1022 sc = mdnew(-1); 1023 if (sc == NULL) 1024 return; 1025 sc->type = MD_PRELOAD; 1026 sc->secsize = DEV_BSIZE; 1027 sc->nsect = length / DEV_BSIZE; 1028 sc->pl_ptr = image; 1029 sc->pl_len = length; 1030 if (sc->unit == 0) 1031 mdrootready = 1; 1032 mdinit(sc); 1033} 1034 1035static void 1036md_drvinit(void *unused) 1037{ 1038 1039 caddr_t mod; 1040 caddr_t c; 1041 u_char *ptr, *name, *type; 1042 unsigned len; 1043 1044#ifdef MD_ROOT_SIZE 1045 md_preloaded(mfs_root, MD_ROOT_SIZE*1024); 
1046#endif 1047 mod = NULL; 1048 while ((mod = preload_search_next_name(mod)) != NULL) { 1049 name = (char *)preload_search_info(mod, MODINFO_NAME); 1050 type = (char *)preload_search_info(mod, MODINFO_TYPE); 1051 if (name == NULL) 1052 continue; 1053 if (type == NULL) 1054 continue; 1055 if (strcmp(type, "md_image") && strcmp(type, "mfs_root")) 1056 continue; 1057 c = preload_search_info(mod, MODINFO_ADDR); 1058 ptr = *(u_char **)c; 1059 c = preload_search_info(mod, MODINFO_SIZE); 1060 len = *(unsigned *)c; 1061 printf("%s%d: Preloaded image <%s> %d bytes at %p\n", 1062 MD_NAME, mdunits, name, len, ptr); 1063 md_preloaded(ptr, len); 1064 } 1065 status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL, 1066 0600, MDCTL_NAME); 1067} 1068 1069static int 1070md_modevent(module_t mod, int type, void *data) 1071{ 1072 int error; 1073 struct md_s *sc; 1074 1075 switch (type) { 1076 case MOD_LOAD: 1077 md_drvinit(NULL); 1078 break; 1079 case MOD_UNLOAD: 1080 LIST_FOREACH(sc, &md_softc_list, list) { 1081 error = mddetach(sc->unit, curthread); 1082 if (error != 0) 1083 return (error); 1084 } 1085 if (status_dev) 1086 destroy_dev(status_dev); 1087 status_dev = 0; 1088 break; 1089 default: 1090 break; 1091 } 1092 return (0); 1093} 1094 1095static moduledata_t md_mod = { 1096 MD_NAME, 1097 md_modevent, 1098 NULL 1099}; 1100DECLARE_MODULE(md, md_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR); 1101MODULE_VERSION(md, MD_MODVER); 1102 1103 1104#ifdef MD_ROOT 1105static void 1106md_takeroot(void *junk) 1107{ 1108 if (mdrootready) 1109 rootdevnames[0] = "ufs:/dev/md0c"; 1110} 1111 1112SYSINIT(md_root, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, md_takeroot, NULL); 1113#endif 1114