md.c revision 104091
/*
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD: head/sys/dev/md/md.c 104091 2002-09-28 14:17:24Z phk $
 *
 */

/*
 * The following functions are based in the vn(4) driver: mdstart_swap(),
 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
 * and as such under the following copyright:
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 * must display the following acknowledgement:
 * This product includes software developed by the University of
 * California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
53 * 54 * from: Utah Hdr: vn.c 1.13 94/04/02 55 * 56 * from: @(#)vn.c 8.6 (Berkeley) 4/1/94 57 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03 58 */ 59 60#include "opt_md.h" 61 62#include <sys/param.h> 63#include <sys/systm.h> 64#include <sys/bio.h> 65#include <sys/conf.h> 66#include <sys/devicestat.h> 67#include <sys/disk.h> 68#include <sys/fcntl.h> 69#include <sys/kernel.h> 70#include <sys/kthread.h> 71#include <sys/linker.h> 72#include <sys/lock.h> 73#include <sys/malloc.h> 74#include <sys/mdioctl.h> 75#include <sys/mutex.h> 76#include <sys/namei.h> 77#include <sys/proc.h> 78#include <sys/queue.h> 79#include <sys/stdint.h> 80#include <sys/sysctl.h> 81#include <sys/vnode.h> 82 83#include <vm/vm.h> 84#include <vm/vm_object.h> 85#include <vm/vm_page.h> 86#include <vm/vm_pager.h> 87#include <vm/swap_pager.h> 88#include <vm/uma.h> 89 90#define MD_MODVER 1 91 92#define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */ 93 94#ifndef MD_NSECT 95#define MD_NSECT (10000 * 2) 96#endif 97 98static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk"); 99static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors"); 100 101static int md_debug; 102SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, ""); 103 104#if defined(MD_ROOT) && defined(MD_ROOT_SIZE) 105/* Image gets put here: */ 106static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here"; 107static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here"; 108#endif 109 110static int mdrootready; 111static int mdunits; 112static dev_t status_dev = 0; 113 114#define CDEV_MAJOR 95 115 116static d_strategy_t mdstrategy; 117static d_open_t mdopen; 118static d_close_t mdclose; 119static d_ioctl_t mdioctl, mdctlioctl; 120 121static struct cdevsw md_cdevsw = { 122 /* open */ mdopen, 123 /* close */ mdclose, 124 /* read */ physread, 125 /* write */ physwrite, 126 /* ioctl */ mdioctl, 127 /* poll */ nopoll, 128 /* mmap */ nommap, 129 /* strategy */ mdstrategy, 130 /* name */ 
MD_NAME, 131 /* maj */ CDEV_MAJOR, 132 /* dump */ nodump, 133 /* psize */ nopsize, 134 /* flags */ D_DISK | D_CANFREE | D_MEMDISK, 135}; 136 137static struct cdevsw mdctl_cdevsw = { 138 /* open */ nullopen, 139 /* close */ nullclose, 140 /* read */ noread, 141 /* write */ nowrite, 142 /* ioctl */ mdctlioctl, 143 /* poll */ nopoll, 144 /* mmap */ nommap, 145 /* strategy */ nostrategy, 146 /* name */ MD_NAME, 147 /* maj */ CDEV_MAJOR 148}; 149 150static struct cdevsw mddisk_cdevsw; 151 152static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list); 153 154#define NINDIR (PAGE_SIZE / sizeof(uintptr_t)) 155#define NMASK (NINDIR-1) 156static int nshift; 157 158struct indir { 159 uintptr_t *array; 160 uint total; 161 uint used; 162 uint shift; 163}; 164 165struct md_s { 166 int unit; 167 LIST_ENTRY(md_s) list; 168 struct devstat stats; 169 struct bio_queue_head bio_queue; 170 struct disk disk; 171 dev_t dev; 172 enum md_types type; 173 unsigned nsect; 174 unsigned opencount; 175 unsigned secsize; 176 unsigned flags; 177 char name[20]; 178 struct proc *procp; 179 180 /* MD_MALLOC related fields */ 181 struct indir *indir; 182 uma_zone_t uma; 183 184 /* MD_PRELOAD related fields */ 185 u_char *pl_ptr; 186 unsigned pl_len; 187 188 /* MD_VNODE related fields */ 189 struct vnode *vnode; 190 struct ucred *cred; 191 192 /* MD_SWAP related fields */ 193 vm_object_t object; 194}; 195 196static int mddestroy(struct md_s *sc, struct thread *td); 197 198static struct indir * 199new_indir(uint shift) 200{ 201 struct indir *ip; 202 203 ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO); 204 if (ip == NULL) 205 return (NULL); 206 ip->array = malloc(sizeof(uintptr_t) * NINDIR, 207 M_MDSECT, M_NOWAIT | M_ZERO); 208 if (ip->array == NULL) { 209 free(ip, M_MD); 210 return (NULL); 211 } 212 ip->total = NINDIR; 213 ip->shift = shift; 214 return (ip); 215} 216 217static void 218del_indir(struct indir *ip) 219{ 220 221 free(ip->array, M_MDSECT); 222 free(ip, M_MD); 223} 224 
225static void 226destroy_indir(struct md_s *sc, struct indir *ip) 227{ 228 int i; 229 230 for (i = 0; i < NINDIR; i++) { 231 if (!ip->array[i]) 232 continue; 233 if (ip->shift) 234 destroy_indir(sc, (struct indir*)(ip->array[i])); 235 else if (ip->array[i] > 255) 236 uma_zfree(sc->uma, (void *)(ip->array[i])); 237 } 238 del_indir(ip); 239} 240 241/* 242 * This function does the math and alloctes the top level "indir" structure 243 * for a device of "size" sectors. 244 */ 245 246static struct indir * 247dimension(off_t size) 248{ 249 off_t rcnt; 250 struct indir *ip; 251 int i, layer; 252 253 rcnt = size; 254 layer = 0; 255 while (rcnt > NINDIR) { 256 rcnt /= NINDIR; 257 layer++; 258 } 259 /* figure out log2(NINDIR) */ 260 for (i = NINDIR, nshift = -1; i; nshift++) 261 i >>= 1; 262 263 /* 264 * XXX: the top layer is probably not fully populated, so we allocate 265 * too much space for ip->array in new_indir() here. 266 */ 267 ip = new_indir(layer * nshift); 268 return (ip); 269} 270 271/* 272 * Read a given sector 273 */ 274 275static uintptr_t 276s_read(struct indir *ip, off_t offset) 277{ 278 struct indir *cip; 279 int idx; 280 uintptr_t up; 281 282 if (md_debug > 1) 283 printf("s_read(%jd)\n", (intmax_t)offset); 284 up = 0; 285 for (cip = ip; cip != NULL;) { 286 if (cip->shift) { 287 idx = (offset >> cip->shift) & NMASK; 288 up = cip->array[idx]; 289 cip = (struct indir *)up; 290 continue; 291 } 292 idx = offset & NMASK; 293 return (cip->array[idx]); 294 } 295 return (0); 296} 297 298/* 299 * Write a given sector, prune the tree if the value is 0 300 */ 301 302static int 303s_write(struct indir *ip, off_t offset, uintptr_t ptr) 304{ 305 struct indir *cip, *lip[10]; 306 int idx, li; 307 uintptr_t up; 308 309 if (md_debug > 1) 310 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr); 311 up = 0; 312 li = 0; 313 cip = ip; 314 for (;;) { 315 lip[li++] = cip; 316 if (cip->shift) { 317 idx = (offset >> cip->shift) & NMASK; 318 up = cip->array[idx]; 319 if (up 
!= 0) { 320 cip = (struct indir *)up; 321 continue; 322 } 323 /* Allocate branch */ 324 cip->array[idx] = 325 (uintptr_t)new_indir(cip->shift - nshift); 326 if (cip->array[idx] == 0) 327 return (ENOMEM); 328 cip->used++; 329 up = cip->array[idx]; 330 cip = (struct indir *)up; 331 continue; 332 } 333 /* leafnode */ 334 idx = offset & NMASK; 335 up = cip->array[idx]; 336 if (up != 0) 337 cip->used--; 338 cip->array[idx] = ptr; 339 if (ptr != 0) 340 cip->used++; 341 break; 342 } 343 if (cip->used != 0 || li == 1) 344 return (0); 345 li--; 346 while (cip->used == 0 && cip != ip) { 347 li--; 348 idx = (offset >> lip[li]->shift) & NMASK; 349 up = lip[li]->array[idx]; 350 KASSERT(up == (uintptr_t)cip, ("md screwed up")); 351 del_indir(cip); 352 lip[li]->array[idx] = 0; 353 lip[li]->used--; 354 cip = lip[li]; 355 } 356 return (0); 357} 358 359static int 360mdopen(dev_t dev, int flag, int fmt, struct thread *td) 361{ 362 struct md_s *sc; 363 364 if (md_debug) 365 printf("mdopen(%s %x %x %p)\n", 366 devtoname(dev), flag, fmt, td); 367 368 sc = dev->si_drv1; 369 370 sc->disk.d_sectorsize = sc->secsize; 371 sc->disk.d_mediasize = (off_t)sc->nsect * sc->secsize; 372 sc->disk.d_fwsectors = sc->nsect > 63 ? 
63 : sc->nsect; 373 sc->disk.d_fwheads = 1; 374 sc->opencount++; 375 return (0); 376} 377 378static int 379mdclose(dev_t dev, int flags, int fmt, struct thread *td) 380{ 381 struct md_s *sc = dev->si_drv1; 382 383 sc->opencount--; 384 return (0); 385} 386 387static int 388mdioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td) 389{ 390 391 if (md_debug) 392 printf("mdioctl(%s %lx %p %x %p)\n", 393 devtoname(dev), cmd, addr, flags, td); 394 395 return (ENOIOCTL); 396} 397 398static int 399mdstart_malloc(struct md_s *sc, struct bio *bp) 400{ 401 int i, error; 402 u_char *dst; 403 unsigned secno, nsec, uc; 404 uintptr_t sp, osp; 405 406 nsec = bp->bio_bcount / sc->secsize; 407 secno = bp->bio_pblkno; 408 dst = bp->bio_data; 409 error = 0; 410 while (nsec--) { 411 osp = s_read(sc->indir, secno); 412 if (bp->bio_cmd == BIO_DELETE) { 413 if (osp != 0) 414 error = s_write(sc->indir, secno, 0); 415 } else if (bp->bio_cmd == BIO_READ) { 416 if (osp == 0) 417 bzero(dst, sc->secsize); 418 else if (osp <= 255) 419 for (i = 0; i < sc->secsize; i++) 420 dst[i] = osp; 421 else 422 bcopy((void *)osp, dst, sc->secsize); 423 osp = 0; 424 } else if (bp->bio_cmd == BIO_WRITE) { 425 if (sc->flags & MD_COMPRESS) { 426 uc = dst[0]; 427 for (i = 1; i < sc->secsize; i++) 428 if (dst[i] != uc) 429 break; 430 } else { 431 i = 0; 432 uc = 0; 433 } 434 if (i == sc->secsize) { 435 if (osp != uc) 436 error = s_write(sc->indir, secno, uc); 437 } else { 438 if (osp <= 255) { 439 sp = (uintptr_t) uma_zalloc( 440 sc->uma, M_NOWAIT); 441 if (sp == 0) { 442 error = ENOSPC; 443 break; 444 } 445 bcopy(dst, (void *)sp, sc->secsize); 446 error = s_write(sc->indir, secno, sp); 447 } else { 448 bcopy(dst, (void *)osp, sc->secsize); 449 osp = 0; 450 } 451 } 452 } else { 453 error = EOPNOTSUPP; 454 } 455 if (osp > 255) 456 uma_zfree(sc->uma, (void*)osp); 457 if (error) 458 break; 459 secno++; 460 dst += sc->secsize; 461 } 462 bp->bio_resid = 0; 463 return (error); 464} 465 466static int 
467mdstart_preload(struct md_s *sc, struct bio *bp) 468{ 469 470 if (bp->bio_cmd == BIO_DELETE) { 471 } else if (bp->bio_cmd == BIO_READ) { 472 bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount); 473 } else { 474 bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount); 475 } 476 bp->bio_resid = 0; 477 return (0); 478} 479 480static int 481mdstart_vnode(struct md_s *sc, struct bio *bp) 482{ 483 int error; 484 struct uio auio; 485 struct iovec aiov; 486 struct mount *mp; 487 488 /* 489 * VNODE I/O 490 * 491 * If an error occurs, we set BIO_ERROR but we do not set 492 * B_INVAL because (for a write anyway), the buffer is 493 * still valid. 494 */ 495 496 bzero(&auio, sizeof(auio)); 497 498 aiov.iov_base = bp->bio_data; 499 aiov.iov_len = bp->bio_bcount; 500 auio.uio_iov = &aiov; 501 auio.uio_iovcnt = 1; 502 auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize; 503 auio.uio_segflg = UIO_SYSSPACE; 504 if(bp->bio_cmd == BIO_READ) 505 auio.uio_rw = UIO_READ; 506 else 507 auio.uio_rw = UIO_WRITE; 508 auio.uio_resid = bp->bio_bcount; 509 auio.uio_td = curthread; 510 /* 511 * When reading set IO_DIRECT to try to avoid double-caching 512 * the data. When writing IO_DIRECT is not optimal, but we 513 * must set IO_NOWDRAIN to avoid a wdrain deadlock. 
514 */ 515 if (bp->bio_cmd == BIO_READ) { 516 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread); 517 error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred); 518 } else { 519 (void) vn_start_write(sc->vnode, &mp, V_WAIT); 520 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread); 521 error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred); 522 vn_finished_write(mp); 523 } 524 VOP_UNLOCK(sc->vnode, 0, curthread); 525 bp->bio_resid = auio.uio_resid; 526 return (error); 527} 528 529static int 530mdstart_swap(struct md_s *sc, struct bio *bp) 531{ 532 533 if ((bp->bio_cmd == BIO_DELETE) && (sc->flags & MD_RESERVE)) 534 biodone(bp); 535 else 536 vm_pager_strategy(sc->object, bp); 537 return (-1); 538} 539 540static void 541mdstrategy(struct bio *bp) 542{ 543 struct md_s *sc; 544 545 if (md_debug > 1) 546 printf("mdstrategy(%p) %s %x, %lld, %ld, %p)\n", 547 (void *)bp, devtoname(bp->bio_dev), bp->bio_flags, 548 (long long)bp->bio_blkno, bp->bio_bcount / DEV_BSIZE, 549 (void *)bp->bio_data); 550 551 sc = bp->bio_dev->si_drv1; 552 553 /* XXX: LOCK(sc->lock) */ 554 bioqdisksort(&sc->bio_queue, bp); 555 /* XXX: UNLOCK(sc->lock) */ 556 557 wakeup(sc); 558} 559 560static void 561md_kthread(void *arg) 562{ 563 struct md_s *sc; 564 struct bio *bp; 565 int error; 566 567 sc = arg; 568 curthread->td_base_pri = PRIBIO; 569 570 mtx_lock(&Giant); 571 for (;;) { 572 /* XXX: LOCK(unique unit numbers) */ 573 bp = bioq_first(&sc->bio_queue); 574 if (bp) 575 bioq_remove(&sc->bio_queue, bp); 576 /* XXX: UNLOCK(unique unit numbers) */ 577 if (!bp) { 578 tsleep(sc, PRIBIO, "mdwait", 0); 579 if (sc->flags & MD_SHUTDOWN) { 580 sc->procp = NULL; 581 wakeup(&sc->procp); 582 kthread_exit(0); 583 } 584 continue; 585 } 586 587 switch (sc->type) { 588 case MD_MALLOC: 589 devstat_start_transaction(&sc->stats); 590 error = mdstart_malloc(sc, bp); 591 break; 592 case MD_PRELOAD: 593 devstat_start_transaction(&sc->stats); 594 error = mdstart_preload(sc, bp); 595 break; 596 case MD_VNODE: 597 
devstat_start_transaction(&sc->stats); 598 error = mdstart_vnode(sc, bp); 599 break; 600 case MD_SWAP: 601 error = mdstart_swap(sc, bp); 602 break; 603 default: 604 panic("Impossible md(type)"); 605 break; 606 } 607 608 if (error != -1) 609 biofinish(bp, &sc->stats, error); 610 } 611} 612 613static struct md_s * 614mdfind(int unit) 615{ 616 struct md_s *sc; 617 618 /* XXX: LOCK(unique unit numbers) */ 619 LIST_FOREACH(sc, &md_softc_list, list) { 620 if (sc->unit == unit) 621 break; 622 } 623 /* XXX: UNLOCK(unique unit numbers) */ 624 return (sc); 625} 626 627static struct md_s * 628mdnew(int unit) 629{ 630 struct md_s *sc; 631 int error, max = -1; 632 633 /* XXX: LOCK(unique unit numbers) */ 634 LIST_FOREACH(sc, &md_softc_list, list) { 635 if (sc->unit == unit) { 636 /* XXX: UNLOCK(unique unit numbers) */ 637 return (NULL); 638 } 639 if (sc->unit > max) 640 max = sc->unit; 641 } 642 if (unit == -1) 643 unit = max + 1; 644 if (unit > 255) 645 return (NULL); 646 sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO); 647 sc->unit = unit; 648 sprintf(sc->name, "md%d", unit); 649 error = kthread_create(md_kthread, sc, &sc->procp, 0, "%s", sc->name); 650 if (error) { 651 free(sc, M_MD); 652 return (NULL); 653 } 654 LIST_INSERT_HEAD(&md_softc_list, sc, list); 655 /* XXX: UNLOCK(unique unit numbers) */ 656 return (sc); 657} 658 659static void 660mdinit(struct md_s *sc) 661{ 662 663 bioq_init(&sc->bio_queue); 664 devstat_add_entry(&sc->stats, MD_NAME, sc->unit, sc->secsize, 665 DEVSTAT_NO_ORDERED_TAGS, 666 DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, 667 DEVSTAT_PRIORITY_OTHER); 668 sc->dev = disk_create(sc->unit, &sc->disk, 0, &md_cdevsw, &mddisk_cdevsw); 669 sc->dev->si_drv1 = sc; 670} 671 672/* 673 * XXX: we should check that the range they feed us is mapped. 674 * XXX: we should implement read-only. 
675 */ 676 677static int 678mdcreate_preload(struct md_ioctl *mdio) 679{ 680 struct md_s *sc; 681 682 if (mdio->md_size == 0) 683 return (EINVAL); 684 if (mdio->md_options & ~(MD_AUTOUNIT)) 685 return (EINVAL); 686 if (mdio->md_options & MD_AUTOUNIT) { 687 sc = mdnew(-1); 688 if (sc == NULL) 689 return (ENOMEM); 690 mdio->md_unit = sc->unit; 691 } else { 692 sc = mdnew(mdio->md_unit); 693 if (sc == NULL) 694 return (EBUSY); 695 } 696 sc->type = MD_PRELOAD; 697 sc->secsize = DEV_BSIZE; 698 sc->nsect = mdio->md_size; 699 sc->flags = mdio->md_options & MD_FORCE; 700 /* Cast to pointer size, then to pointer to avoid warning */ 701 sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base; 702 sc->pl_len = (mdio->md_size << DEV_BSHIFT); 703 mdinit(sc); 704 return (0); 705} 706 707 708static int 709mdcreate_malloc(struct md_ioctl *mdio) 710{ 711 struct md_s *sc; 712 off_t u; 713 uintptr_t sp; 714 int error; 715 716 error = 0; 717 if (mdio->md_size == 0) 718 return (EINVAL); 719 if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE)) 720 return (EINVAL); 721 /* Compression doesn't make sense if we have reserved space */ 722 if (mdio->md_options & MD_RESERVE) 723 mdio->md_options &= ~MD_COMPRESS; 724 if (mdio->md_options & MD_AUTOUNIT) { 725 sc = mdnew(-1); 726 if (sc == NULL) 727 return (ENOMEM); 728 mdio->md_unit = sc->unit; 729 } else { 730 sc = mdnew(mdio->md_unit); 731 if (sc == NULL) 732 return (EBUSY); 733 } 734 sc->type = MD_MALLOC; 735 sc->secsize = DEV_BSIZE; 736 sc->nsect = mdio->md_size; 737 sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE); 738 sc->indir = dimension(sc->nsect); 739 sc->uma = uma_zcreate(sc->name, sc->secsize, 740 NULL, NULL, NULL, NULL, 0x1ff, 0); 741 if (mdio->md_options & MD_RESERVE) { 742 for (u = 0; u < sc->nsect; u++) { 743 sp = (uintptr_t) uma_zalloc(sc->uma, M_NOWAIT | M_ZERO); 744 if (sp != 0) 745 error = s_write(sc->indir, u, sp); 746 else 747 error = ENOMEM; 748 if (error) 749 break; 750 } 751 } 752 if (!error) { 753 
printf("%s%d: Malloc disk\n", MD_NAME, sc->unit); 754 mdinit(sc); 755 } else 756 mddestroy(sc, NULL); 757 return (error); 758} 759 760 761static int 762mdsetcred(struct md_s *sc, struct ucred *cred) 763{ 764 char *tmpbuf; 765 int error = 0; 766 767 /* 768 * Set credits in our softc 769 */ 770 771 if (sc->cred) 772 crfree(sc->cred); 773 sc->cred = crhold(cred); 774 775 /* 776 * Horrible kludge to establish credentials for NFS XXX. 777 */ 778 779 if (sc->vnode) { 780 struct uio auio; 781 struct iovec aiov; 782 783 tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK); 784 bzero(&auio, sizeof(auio)); 785 786 aiov.iov_base = tmpbuf; 787 aiov.iov_len = sc->secsize; 788 auio.uio_iov = &aiov; 789 auio.uio_iovcnt = 1; 790 auio.uio_offset = 0; 791 auio.uio_rw = UIO_READ; 792 auio.uio_segflg = UIO_SYSSPACE; 793 auio.uio_resid = aiov.iov_len; 794 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread); 795 error = VOP_READ(sc->vnode, &auio, 0, sc->cred); 796 VOP_UNLOCK(sc->vnode, 0, curthread); 797 free(tmpbuf, M_TEMP); 798 } 799 return (error); 800} 801 802static int 803mdcreate_vnode(struct md_ioctl *mdio, struct thread *td) 804{ 805 struct md_s *sc; 806 struct vattr vattr; 807 struct nameidata nd; 808 int error, flags; 809 810 flags = FREAD|FWRITE; 811 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td); 812 error = vn_open(&nd, &flags, 0); 813 if (error) { 814 if (error != EACCES && error != EPERM && error != EROFS) 815 return (error); 816 flags &= ~FWRITE; 817 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td); 818 error = vn_open(&nd, &flags, 0); 819 if (error) 820 return (error); 821 } 822 NDFREE(&nd, NDF_ONLY_PNBUF); 823 if (nd.ni_vp->v_type != VREG || 824 (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) { 825 VOP_UNLOCK(nd.ni_vp, 0, td); 826 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td); 827 return (error ? 
error : EINVAL); 828 } 829 VOP_UNLOCK(nd.ni_vp, 0, td); 830 831 if (mdio->md_options & MD_AUTOUNIT) { 832 sc = mdnew(-1); 833 mdio->md_unit = sc->unit; 834 } else { 835 sc = mdnew(mdio->md_unit); 836 } 837 if (sc == NULL) { 838 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td); 839 return (EBUSY); 840 } 841 842 sc->type = MD_VNODE; 843 sc->flags = mdio->md_options & MD_FORCE; 844 if (!(flags & FWRITE)) 845 sc->flags |= MD_READONLY; 846 sc->secsize = DEV_BSIZE; 847 sc->vnode = nd.ni_vp; 848 849 /* 850 * If the size is specified, override the file attributes. 851 */ 852 if (mdio->md_size) 853 sc->nsect = mdio->md_size; 854 else 855 sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */ 856 if (sc->nsect == 0) { 857 mddestroy(sc, td); 858 return (EINVAL); 859 } 860 error = mdsetcred(sc, td->td_ucred); 861 if (error) { 862 mddestroy(sc, td); 863 return (error); 864 } 865 mdinit(sc); 866 return (0); 867} 868 869static int 870mddestroy(struct md_s *sc, struct thread *td) 871{ 872 873 GIANT_REQUIRED; 874 875 if (sc->dev != NULL) { 876 devstat_remove_entry(&sc->stats); 877 disk_destroy(sc->dev); 878 } 879 sc->flags |= MD_SHUTDOWN; 880 wakeup(sc); 881 while (sc->procp != NULL) 882 tsleep(&sc->procp, PRIBIO, "mddestroy", hz / 10); 883 if (sc->vnode != NULL) 884 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ? 
885 FREAD : (FREAD|FWRITE), sc->cred, td); 886 if (sc->cred != NULL) 887 crfree(sc->cred); 888 if (sc->object != NULL) { 889 vm_pager_deallocate(sc->object); 890 } 891 if (sc->indir) 892 destroy_indir(sc, sc->indir); 893 if (sc->uma) 894 uma_zdestroy(sc->uma); 895 896 /* XXX: LOCK(unique unit numbers) */ 897 LIST_REMOVE(sc, list); 898 /* XXX: UNLOCK(unique unit numbers) */ 899 free(sc, M_MD); 900 return (0); 901} 902 903static int 904mdcreate_swap(struct md_ioctl *mdio, struct thread *td) 905{ 906 int error; 907 struct md_s *sc; 908 909 GIANT_REQUIRED; 910 911 if (mdio->md_options & MD_AUTOUNIT) { 912 sc = mdnew(-1); 913 mdio->md_unit = sc->unit; 914 } else { 915 sc = mdnew(mdio->md_unit); 916 } 917 if (sc == NULL) 918 return (EBUSY); 919 920 sc->type = MD_SWAP; 921 922 /* 923 * Range check. Disallow negative sizes or any size less then the 924 * size of a page. Then round to a page. 925 */ 926 927 if (mdio->md_size == 0) { 928 mddestroy(sc, td); 929 return (EDOM); 930 } 931 932 /* 933 * Allocate an OBJT_SWAP object. 934 * 935 * sc_secsize is PAGE_SIZE'd 936 * 937 * mdio->size is in DEV_BSIZE'd chunks. 938 * Note the truncation. 
939 */ 940 941 sc->secsize = PAGE_SIZE; 942 sc->nsect = mdio->md_size / (PAGE_SIZE / DEV_BSIZE); 943 sc->object = vm_pager_allocate(OBJT_SWAP, NULL, sc->secsize * (vm_offset_t)sc->nsect, VM_PROT_DEFAULT, 0); 944 sc->flags = mdio->md_options & MD_FORCE; 945 if (mdio->md_options & MD_RESERVE) { 946 if (swap_pager_reserve(sc->object, 0, sc->nsect) < 0) { 947 vm_pager_deallocate(sc->object); 948 sc->object = NULL; 949 mddestroy(sc, td); 950 return (EDOM); 951 } 952 } 953 error = mdsetcred(sc, td->td_ucred); 954 if (error) 955 mddestroy(sc, td); 956 else 957 mdinit(sc); 958 return (error); 959} 960 961static int 962mddetach(int unit, struct thread *td) 963{ 964 struct md_s *sc; 965 966 sc = mdfind(unit); 967 if (sc == NULL) 968 return (ENOENT); 969 if (sc->opencount != 0 && !(sc->flags & MD_FORCE)) 970 return (EBUSY); 971 switch(sc->type) { 972 case MD_VNODE: 973 case MD_SWAP: 974 case MD_MALLOC: 975 case MD_PRELOAD: 976 return (mddestroy(sc, td)); 977 default: 978 return (EOPNOTSUPP); 979 } 980} 981 982static int 983mdctlioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td) 984{ 985 struct md_ioctl *mdio; 986 struct md_s *sc; 987 988 if (md_debug) 989 printf("mdctlioctl(%s %lx %p %x %p)\n", 990 devtoname(dev), cmd, addr, flags, td); 991 992 /* 993 * We assert the version number in the individual ioctl 994 * handlers instead of out here because (a) it is possible we 995 * may add another ioctl in the future which doesn't read an 996 * mdio, and (b) the correct return value for an unknown ioctl 997 * is ENOIOCTL, not EINVAL. 
998 */ 999 mdio = (struct md_ioctl *)addr; 1000 switch (cmd) { 1001 case MDIOCATTACH: 1002 if (mdio->md_version != MDIOVERSION) 1003 return (EINVAL); 1004 switch (mdio->md_type) { 1005 case MD_MALLOC: 1006 return (mdcreate_malloc(mdio)); 1007 case MD_PRELOAD: 1008 return (mdcreate_preload(mdio)); 1009 case MD_VNODE: 1010 return (mdcreate_vnode(mdio, td)); 1011 case MD_SWAP: 1012 return (mdcreate_swap(mdio, td)); 1013 default: 1014 return (EINVAL); 1015 } 1016 case MDIOCDETACH: 1017 if (mdio->md_version != MDIOVERSION) 1018 return (EINVAL); 1019 if (mdio->md_file != NULL || mdio->md_size != 0 || 1020 mdio->md_options != 0) 1021 return (EINVAL); 1022 return (mddetach(mdio->md_unit, td)); 1023 case MDIOCQUERY: 1024 if (mdio->md_version != MDIOVERSION) 1025 return (EINVAL); 1026 sc = mdfind(mdio->md_unit); 1027 if (sc == NULL) 1028 return (ENOENT); 1029 mdio->md_type = sc->type; 1030 mdio->md_options = sc->flags; 1031 switch (sc->type) { 1032 case MD_MALLOC: 1033 mdio->md_size = sc->nsect; 1034 break; 1035 case MD_PRELOAD: 1036 mdio->md_size = sc->nsect; 1037 mdio->md_base = (uint64_t)(intptr_t)sc->pl_ptr; 1038 break; 1039 case MD_SWAP: 1040 mdio->md_size = sc->nsect * (PAGE_SIZE / DEV_BSIZE); 1041 break; 1042 case MD_VNODE: 1043 mdio->md_size = sc->nsect; 1044 /* XXX fill this in */ 1045 mdio->md_file = NULL; 1046 break; 1047 } 1048 return (0); 1049 default: 1050 return (ENOIOCTL); 1051 }; 1052 return (ENOIOCTL); 1053} 1054 1055static void 1056md_preloaded(u_char *image, unsigned length) 1057{ 1058 struct md_s *sc; 1059 1060 sc = mdnew(-1); 1061 if (sc == NULL) 1062 return; 1063 sc->type = MD_PRELOAD; 1064 sc->secsize = DEV_BSIZE; 1065 sc->nsect = length / DEV_BSIZE; 1066 sc->pl_ptr = image; 1067 sc->pl_len = length; 1068 if (sc->unit == 0) 1069 mdrootready = 1; 1070 mdinit(sc); 1071} 1072 1073static void 1074md_drvinit(void *unused) 1075{ 1076 1077 caddr_t mod; 1078 caddr_t c; 1079 u_char *ptr, *name, *type; 1080 unsigned len; 1081 1082#ifdef MD_ROOT_SIZE 1083 
md_preloaded(mfs_root, MD_ROOT_SIZE*1024); 1084#endif 1085 mod = NULL; 1086 while ((mod = preload_search_next_name(mod)) != NULL) { 1087 name = (char *)preload_search_info(mod, MODINFO_NAME); 1088 type = (char *)preload_search_info(mod, MODINFO_TYPE); 1089 if (name == NULL) 1090 continue; 1091 if (type == NULL) 1092 continue; 1093 if (strcmp(type, "md_image") && strcmp(type, "mfs_root")) 1094 continue; 1095 c = preload_search_info(mod, MODINFO_ADDR); 1096 ptr = *(u_char **)c; 1097 c = preload_search_info(mod, MODINFO_SIZE); 1098 len = *(unsigned *)c; 1099 printf("%s%d: Preloaded image <%s> %d bytes at %p\n", 1100 MD_NAME, mdunits, name, len, ptr); 1101 md_preloaded(ptr, len); 1102 } 1103 status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL, 1104 0600, MDCTL_NAME); 1105} 1106 1107static int 1108md_modevent(module_t mod, int type, void *data) 1109{ 1110 int error; 1111 struct md_s *sc; 1112 1113 switch (type) { 1114 case MOD_LOAD: 1115 md_drvinit(NULL); 1116 break; 1117 case MOD_UNLOAD: 1118 LIST_FOREACH(sc, &md_softc_list, list) { 1119 error = mddetach(sc->unit, curthread); 1120 if (error != 0) 1121 return (error); 1122 } 1123 if (status_dev) 1124 destroy_dev(status_dev); 1125 status_dev = 0; 1126 break; 1127 default: 1128 break; 1129 } 1130 return (0); 1131} 1132 1133static moduledata_t md_mod = { 1134 MD_NAME, 1135 md_modevent, 1136 NULL 1137}; 1138DECLARE_MODULE(md, md_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR); 1139MODULE_VERSION(md, MD_MODVER); 1140 1141 1142#ifdef MD_ROOT 1143static void 1144md_takeroot(void *junk) 1145{ 1146 if (mdrootready) 1147 rootdevnames[0] = "ufs:/dev/md0c"; 1148} 1149 1150SYSINIT(md_root, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, md_takeroot, NULL); 1151#endif 1152