md.c revision 286720
1/*- 2 * ---------------------------------------------------------------------------- 3 * "THE BEER-WARE LICENSE" (Revision 42): 4 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you 5 * can do whatever you want with this stuff. If we meet some day, and you think 6 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp 7 * ---------------------------------------------------------------------------- 8 * 9 * $FreeBSD: head/sys/dev/md/md.c 286720 2015-08-13 13:20:29Z ae $ 10 * 11 */ 12 13/*- 14 * The following functions are based in the vn(4) driver: mdstart_swap(), 15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(), 16 * and as such under the following copyright: 17 * 18 * Copyright (c) 1988 University of Utah. 19 * Copyright (c) 1990, 1993 20 * The Regents of the University of California. All rights reserved. 21 * Copyright (c) 2013 The FreeBSD Foundation 22 * All rights reserved. 23 * 24 * This code is derived from software contributed to Berkeley by 25 * the Systems Programming Group of the University of Utah Computer 26 * Science Department. 27 * 28 * Portions of this software were developed by Konstantin Belousov 29 * under sponsorship from the FreeBSD Foundation. 30 * 31 * Redistribution and use in source and binary forms, with or without 32 * modification, are permitted provided that the following conditions 33 * are met: 34 * 1. Redistributions of source code must retain the above copyright 35 * notice, this list of conditions and the following disclaimer. 36 * 2. Redistributions in binary form must reproduce the above copyright 37 * notice, this list of conditions and the following disclaimer in the 38 * documentation and/or other materials provided with the distribution. 39 * 4. Neither the name of the University nor the names of its contributors 40 * may be used to endorse or promote products derived from this software 41 * without specific prior written permission. 42 * 43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 53 * SUCH DAMAGE. 54 * 55 * from: Utah Hdr: vn.c 1.13 94/04/02 56 * 57 * from: @(#)vn.c 8.6 (Berkeley) 4/1/94 58 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03 59 */ 60 61#include "opt_geom.h" 62#include "opt_md.h" 63 64#include <sys/param.h> 65#include <sys/systm.h> 66#include <sys/bio.h> 67#include <sys/buf.h> 68#include <sys/conf.h> 69#include <sys/devicestat.h> 70#include <sys/fcntl.h> 71#include <sys/kernel.h> 72#include <sys/kthread.h> 73#include <sys/limits.h> 74#include <sys/linker.h> 75#include <sys/lock.h> 76#include <sys/malloc.h> 77#include <sys/mdioctl.h> 78#include <sys/mount.h> 79#include <sys/mutex.h> 80#include <sys/sx.h> 81#include <sys/namei.h> 82#include <sys/proc.h> 83#include <sys/queue.h> 84#include <sys/rwlock.h> 85#include <sys/sbuf.h> 86#include <sys/sched.h> 87#include <sys/sf_buf.h> 88#include <sys/sysctl.h> 89#include <sys/vnode.h> 90 91#include <geom/geom.h> 92#include <geom/geom_int.h> 93 94#include <vm/vm.h> 95#include <vm/vm_param.h> 96#include <vm/vm_object.h> 97#include <vm/vm_page.h> 98#include <vm/vm_pager.h> 99#include <vm/swap_pager.h> 100#include <vm/uma.h> 101 102#define MD_MODVER 1 103 104#define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */ 105#define MD_EXITING 0x20000 /* Worker thread is exiting. */ 106 107#ifndef MD_NSECT 108#define MD_NSECT (10000 * 2) 109#endif 110 111static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk"); 112static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors"); 113 114static int md_debug; 115SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, 116 "Enable md(4) debug messages"); 117static int md_malloc_wait; 118SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0, 119 "Allow malloc to wait for memory allocations"); 120 121#if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE) 122#define MD_ROOT_FSTYPE "ufs" 123#endif 124 125#if defined(MD_ROOT) && defined(MD_ROOT_SIZE) 126/* 127 * Preloaded image gets put here. 128 * Applications that patch the object with the image can determine 129 * the size looking at the start and end markers (strings), 130 * so we want them contiguous. 131 */ 132static struct { 133 u_char start[MD_ROOT_SIZE*1024]; 134 u_char end[128]; 135} mfs_root = { 136 .start = "MFS Filesystem goes here", 137 .end = "MFS Filesystem had better STOP here", 138}; 139#endif 140 141static g_init_t g_md_init; 142static g_fini_t g_md_fini; 143static g_start_t g_md_start; 144static g_access_t g_md_access; 145static void g_md_dumpconf(struct sbuf *sb, const char *indent, 146 struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp); 147 148static struct cdev *status_dev = 0; 149static struct sx md_sx; 150static struct unrhdr *md_uh; 151 152static d_ioctl_t mdctlioctl; 153 154static struct cdevsw mdctl_cdevsw = { 155 .d_version = D_VERSION, 156 .d_ioctl = mdctlioctl, 157 .d_name = MD_NAME, 158}; 159 160struct g_class g_md_class = { 161 .name = "MD", 162 .version = G_VERSION, 163 .init = g_md_init, 164 .fini = g_md_fini, 165 .start = g_md_start, 166 .access = g_md_access, 167 .dumpconf = g_md_dumpconf, 168}; 169 170DECLARE_GEOM_CLASS(g_md_class, g_md); 171 172 173static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list); 174 175#define NINDIR (PAGE_SIZE / sizeof(uintptr_t)) 176#define NMASK (NINDIR-1) 177static int nshift; 178 179static int md_vnode_pbuf_freecnt; 180 181struct indir { 182 uintptr_t *array; 183 u_int total; 184 u_int used; 185 u_int shift; 186}; 187 188struct md_s { 189 int unit; 190 LIST_ENTRY(md_s) list; 191 struct bio_queue_head bio_queue; 192 struct mtx queue_mtx; 193 struct mtx stat_mtx; 194 struct cdev *dev; 195 enum md_types type; 196 off_t mediasize; 197 unsigned sectorsize; 198 unsigned opencount; 199 unsigned fwheads; 200 unsigned fwsectors; 201 unsigned flags; 202 char name[20]; 203 struct proc *procp; 204 struct g_geom *gp; 205 struct g_provider *pp; 206 int (*start)(struct md_s *sc, struct bio *bp); 207 struct devstat *devstat; 208 209 /* MD_MALLOC related fields */ 210 struct indir *indir; 211 uma_zone_t uma; 212 213 /* MD_PRELOAD related fields */ 214 u_char *pl_ptr; 215 size_t pl_len; 216 217 /* MD_VNODE related fields */ 218 struct vnode *vnode; 219 char file[PATH_MAX]; 220 struct ucred *cred; 221 222 /* MD_SWAP related fields */ 223 vm_object_t object; 224}; 225 226static struct indir * 227new_indir(u_int shift) 228{ 229 struct indir *ip; 230 231 ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT) 232 | M_ZERO); 233 if (ip == NULL) 234 return (NULL); 235 ip->array = malloc(sizeof(uintptr_t) * NINDIR, 236 M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO); 237 if (ip->array == NULL) { 238 free(ip, M_MD); 239 return (NULL); 240 } 241 ip->total = NINDIR; 242 ip->shift = shift; 243 return (ip); 244} 245 246static void 247del_indir(struct indir *ip) 248{ 249 250 free(ip->array, M_MDSECT); 251 free(ip, M_MD); 252} 253 254static void 255destroy_indir(struct md_s *sc, struct indir *ip) 256{ 257 int i; 258 259 for (i = 0; i < NINDIR; i++) { 260 if (!ip->array[i]) 261 continue; 262 if (ip->shift) 263 destroy_indir(sc, (struct indir*)(ip->array[i])); 264 else if (ip->array[i] > 255) 265 uma_zfree(sc->uma, (void *)(ip->array[i])); 266 } 267 del_indir(ip); 268} 269 270/* 271 * This function does the math and allocates the top level "indir" structure 272 * for a device of "size" sectors. 273 */ 274 275static struct indir * 276dimension(off_t size) 277{ 278 off_t rcnt; 279 struct indir *ip; 280 int layer; 281 282 rcnt = size; 283 layer = 0; 284 while (rcnt > NINDIR) { 285 rcnt /= NINDIR; 286 layer++; 287 } 288 289 /* 290 * XXX: the top layer is probably not fully populated, so we allocate 291 * too much space for ip->array in here. 292 */ 293 ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO); 294 ip->array = malloc(sizeof(uintptr_t) * NINDIR, 295 M_MDSECT, M_WAITOK | M_ZERO); 296 ip->total = NINDIR; 297 ip->shift = layer * nshift; 298 return (ip); 299} 300 301/* 302 * Read a given sector 303 */ 304 305static uintptr_t 306s_read(struct indir *ip, off_t offset) 307{ 308 struct indir *cip; 309 int idx; 310 uintptr_t up; 311 312 if (md_debug > 1) 313 printf("s_read(%jd)\n", (intmax_t)offset); 314 up = 0; 315 for (cip = ip; cip != NULL;) { 316 if (cip->shift) { 317 idx = (offset >> cip->shift) & NMASK; 318 up = cip->array[idx]; 319 cip = (struct indir *)up; 320 continue; 321 } 322 idx = offset & NMASK; 323 return (cip->array[idx]); 324 } 325 return (0); 326} 327 328/* 329 * Write a given sector, prune the tree if the value is 0 330 */ 331 332static int 333s_write(struct indir *ip, off_t offset, uintptr_t ptr) 334{ 335 struct indir *cip, *lip[10]; 336 int idx, li; 337 uintptr_t up; 338 339 if (md_debug > 1) 340 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr); 341 up = 0; 342 li = 0; 343 cip = ip; 344 for (;;) { 345 lip[li++] = cip; 346 if (cip->shift) { 347 idx = (offset >> cip->shift) & NMASK; 348 up = cip->array[idx]; 349 if (up != 0) { 350 cip = (struct indir *)up; 351 continue; 352 } 353 /* Allocate branch */ 354 cip->array[idx] = 355 (uintptr_t)new_indir(cip->shift - nshift); 356 if (cip->array[idx] == 0) 357 return (ENOSPC); 358 cip->used++; 359 up = cip->array[idx]; 360 cip = (struct indir *)up; 361 continue; 362 } 363 /* leafnode */ 364 idx = offset & NMASK; 365 up = cip->array[idx]; 366 if (up != 0) 367 cip->used--; 368 cip->array[idx] = ptr; 369 if (ptr != 0) 370 cip->used++; 371 break; 372 } 373 if (cip->used != 0 || li == 1) 374 return (0); 375 li--; 376 while (cip->used == 0 && cip != ip) { 377 li--; 378 idx = (offset >> lip[li]->shift) & NMASK; 379 up = lip[li]->array[idx]; 380 KASSERT(up == (uintptr_t)cip, ("md screwed up")); 381 del_indir(cip); 382 lip[li]->array[idx] = 0; 383 lip[li]->used--; 384 cip = lip[li]; 385 } 386 return (0); 387} 388 389 390static int 391g_md_access(struct g_provider *pp, int r, int w, int e) 392{ 393 struct md_s *sc; 394 395 sc = pp->geom->softc; 396 if (sc == NULL) { 397 if (r <= 0 && w <= 0 && e <= 0) 398 return (0); 399 return (ENXIO); 400 } 401 r += pp->acr; 402 w += pp->acw; 403 e += pp->ace; 404 if ((sc->flags & MD_READONLY) != 0 && w > 0) 405 return (EROFS); 406 if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { 407 sc->opencount = 1; 408 } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { 409 sc->opencount = 0; 410 } 411 return (0); 412} 413 414static void 415g_md_start(struct bio *bp) 416{ 417 struct md_s *sc; 418 419 sc = bp->bio_to->geom->softc; 420 if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) { 421 mtx_lock(&sc->stat_mtx); 422 devstat_start_transaction_bio(sc->devstat, bp); 423 mtx_unlock(&sc->stat_mtx); 424 } 425 mtx_lock(&sc->queue_mtx); 426 bioq_disksort(&sc->bio_queue, bp); 427 mtx_unlock(&sc->queue_mtx); 428 wakeup(sc); 429} 430 431#define MD_MALLOC_MOVE_ZERO 1 432#define MD_MALLOC_MOVE_FILL 2 433#define MD_MALLOC_MOVE_READ 3 434#define MD_MALLOC_MOVE_WRITE 4 435#define MD_MALLOC_MOVE_CMP 5 436 437static int 438md_malloc_move(vm_page_t **mp, int *ma_offs, unsigned sectorsize, 439 void *ptr, u_char fill, int op) 440{ 441 struct sf_buf *sf; 442 vm_page_t m, *mp1; 443 char *p, first; 444 off_t *uc; 445 unsigned n; 446 int error, i, ma_offs1, sz, first_read; 447 448 m = NULL; 449 error = 0; 450 sf = NULL; 451 /* if (op == MD_MALLOC_MOVE_CMP) { gcc */ 452 first = 0; 453 first_read = 0; 454 uc = ptr; 455 mp1 = *mp; 456 ma_offs1 = *ma_offs; 457 /* } */ 458 sched_pin(); 459 for (n = sectorsize; n != 0; n -= sz) { 460 sz = imin(PAGE_SIZE - *ma_offs, n); 461 if (m != **mp) { 462 if (sf != NULL) 463 sf_buf_free(sf); 464 m = **mp; 465 sf = sf_buf_alloc(m, SFB_CPUPRIVATE | 466 (md_malloc_wait ? 0 : SFB_NOWAIT)); 467 if (sf == NULL) { 468 error = ENOMEM; 469 break; 470 } 471 } 472 p = (char *)sf_buf_kva(sf) + *ma_offs; 473 switch (op) { 474 case MD_MALLOC_MOVE_ZERO: 475 bzero(p, sz); 476 break; 477 case MD_MALLOC_MOVE_FILL: 478 memset(p, fill, sz); 479 break; 480 case MD_MALLOC_MOVE_READ: 481 bcopy(ptr, p, sz); 482 cpu_flush_dcache(p, sz); 483 break; 484 case MD_MALLOC_MOVE_WRITE: 485 bcopy(p, ptr, sz); 486 break; 487 case MD_MALLOC_MOVE_CMP: 488 for (i = 0; i < sz; i++, p++) { 489 if (!first_read) { 490 *uc = (u_char)*p; 491 first = *p; 492 first_read = 1; 493 } else if (*p != first) { 494 error = EDOOFUS; 495 break; 496 } 497 } 498 break; 499 default: 500 KASSERT(0, ("md_malloc_move unknown op %d\n", op)); 501 break; 502 } 503 if (error != 0) 504 break; 505 *ma_offs += sz; 506 *ma_offs %= PAGE_SIZE; 507 if (*ma_offs == 0) 508 (*mp)++; 509 ptr = (char *)ptr + sz; 510 } 511 512 if (sf != NULL) 513 sf_buf_free(sf); 514 sched_unpin(); 515 if (op == MD_MALLOC_MOVE_CMP && error != 0) { 516 *mp = mp1; 517 *ma_offs = ma_offs1; 518 } 519 return (error); 520} 521 522static int 523mdstart_malloc(struct md_s *sc, struct bio *bp) 524{ 525 u_char *dst; 526 vm_page_t *m; 527 int i, error, error1, ma_offs, notmapped; 528 off_t secno, nsec, uc; 529 uintptr_t sp, osp; 530 531 switch (bp->bio_cmd) { 532 case BIO_READ: 533 case BIO_WRITE: 534 case BIO_DELETE: 535 break; 536 default: 537 return (EOPNOTSUPP); 538 } 539 540 notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0; 541 if (notmapped) { 542 m = bp->bio_ma; 543 ma_offs = bp->bio_ma_offset; 544 dst = NULL; 545 } else { 546 dst = bp->bio_data; 547 } 548 549 nsec = bp->bio_length / sc->sectorsize; 550 secno = bp->bio_offset / sc->sectorsize; 551 error = 0; 552 while (nsec--) { 553 osp = s_read(sc->indir, secno); 554 if (bp->bio_cmd == BIO_DELETE) { 555 if (osp != 0) 556 error = s_write(sc->indir, secno, 0); 557 } else if (bp->bio_cmd == BIO_READ) { 558 if (osp == 0) { 559 if (notmapped) { 560 error = md_malloc_move(&m, &ma_offs, 561 sc->sectorsize, NULL, 0, 562 MD_MALLOC_MOVE_ZERO); 563 } else 564 bzero(dst, sc->sectorsize); 565 } else if (osp <= 255) { 566 if (notmapped) { 567 error = md_malloc_move(&m, &ma_offs, 568 sc->sectorsize, NULL, osp, 569 MD_MALLOC_MOVE_FILL); 570 } else 571 memset(dst, osp, sc->sectorsize); 572 } else { 573 if (notmapped) { 574 error = md_malloc_move(&m, &ma_offs, 575 sc->sectorsize, (void *)osp, 0, 576 MD_MALLOC_MOVE_READ); 577 } else { 578 bcopy((void *)osp, dst, sc->sectorsize); 579 cpu_flush_dcache(dst, sc->sectorsize); 580 } 581 } 582 osp = 0; 583 } else if (bp->bio_cmd == BIO_WRITE) { 584 if (sc->flags & MD_COMPRESS) { 585 if (notmapped) { 586 error1 = md_malloc_move(&m, &ma_offs, 587 sc->sectorsize, &uc, 0, 588 MD_MALLOC_MOVE_CMP); 589 i = error1 == 0 ? sc->sectorsize : 0; 590 } else { 591 uc = dst[0]; 592 for (i = 1; i < sc->sectorsize; i++) { 593 if (dst[i] != uc) 594 break; 595 } 596 } 597 } else { 598 i = 0; 599 uc = 0; 600 } 601 if (i == sc->sectorsize) { 602 if (osp != uc) 603 error = s_write(sc->indir, secno, uc); 604 } else { 605 if (osp <= 255) { 606 sp = (uintptr_t)uma_zalloc(sc->uma, 607 md_malloc_wait ? M_WAITOK : 608 M_NOWAIT); 609 if (sp == 0) { 610 error = ENOSPC; 611 break; 612 } 613 if (notmapped) { 614 error = md_malloc_move(&m, 615 &ma_offs, sc->sectorsize, 616 (void *)sp, 0, 617 MD_MALLOC_MOVE_WRITE); 618 } else { 619 bcopy(dst, (void *)sp, 620 sc->sectorsize); 621 } 622 error = s_write(sc->indir, secno, sp); 623 } else { 624 if (notmapped) { 625 error = md_malloc_move(&m, 626 &ma_offs, sc->sectorsize, 627 (void *)osp, 0, 628 MD_MALLOC_MOVE_WRITE); 629 } else { 630 bcopy(dst, (void *)osp, 631 sc->sectorsize); 632 } 633 osp = 0; 634 } 635 } 636 } else { 637 error = EOPNOTSUPP; 638 } 639 if (osp > 255) 640 uma_zfree(sc->uma, (void*)osp); 641 if (error != 0) 642 break; 643 secno++; 644 if (!notmapped) 645 dst += sc->sectorsize; 646 } 647 bp->bio_resid = 0; 648 return (error); 649} 650 651static int 652mdstart_preload(struct md_s *sc, struct bio *bp) 653{ 654 655 switch (bp->bio_cmd) { 656 case BIO_READ: 657 bcopy(sc->pl_ptr + bp->bio_offset, bp->bio_data, 658 bp->bio_length); 659 cpu_flush_dcache(bp->bio_data, bp->bio_length); 660 break; 661 case BIO_WRITE: 662 bcopy(bp->bio_data, sc->pl_ptr + bp->bio_offset, 663 bp->bio_length); 664 break; 665 } 666 bp->bio_resid = 0; 667 return (0); 668} 669 670static int 671mdstart_vnode(struct md_s *sc, struct bio *bp) 672{ 673 int error; 674 struct uio auio; 675 struct iovec aiov; 676 struct mount *mp; 677 struct vnode *vp; 678 struct buf *pb; 679 struct thread *td; 680 off_t end, zerosize; 681 682 switch (bp->bio_cmd) { 683 case BIO_READ: 684 case BIO_WRITE: 685 case BIO_DELETE: 686 case BIO_FLUSH: 687 break; 688 default: 689 return (EOPNOTSUPP); 690 } 691 692 td = curthread; 693 vp = sc->vnode; 694 695 /* 696 * VNODE I/O 697 * 698 * If an error occurs, we set BIO_ERROR but we do not set 699 * B_INVAL because (for a write anyway), the buffer is 700 * still valid. 701 */ 702 703 if (bp->bio_cmd == BIO_FLUSH) { 704 (void) vn_start_write(vp, &mp, V_WAIT); 705 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 706 error = VOP_FSYNC(vp, MNT_WAIT, td); 707 VOP_UNLOCK(vp, 0); 708 vn_finished_write(mp); 709 return (error); 710 } 711 712 bzero(&auio, sizeof(auio)); 713 714 /* 715 * Special case for BIO_DELETE. On the surface, this is very 716 * similar to BIO_WRITE, except that we write from our own 717 * fixed-length buffer, so we have to loop. The net result is 718 * that the two cases end up having very little in common. 719 */ 720 if (bp->bio_cmd == BIO_DELETE) { 721 zerosize = ZERO_REGION_SIZE - 722 (ZERO_REGION_SIZE % sc->sectorsize); 723 auio.uio_iov = &aiov; 724 auio.uio_iovcnt = 1; 725 auio.uio_offset = (vm_ooffset_t)bp->bio_offset; 726 auio.uio_segflg = UIO_SYSSPACE; 727 auio.uio_rw = UIO_WRITE; 728 auio.uio_td = td; 729 end = bp->bio_offset + bp->bio_length; 730 (void) vn_start_write(vp, &mp, V_WAIT); 731 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 732 error = 0; 733 while (auio.uio_offset < end) { 734 aiov.iov_base = __DECONST(void *, zero_region); 735 aiov.iov_len = end - auio.uio_offset; 736 if (aiov.iov_len > zerosize) 737 aiov.iov_len = zerosize; 738 auio.uio_resid = aiov.iov_len; 739 error = VOP_WRITE(vp, &auio, 740 sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred); 741 if (error != 0) 742 break; 743 } 744 VOP_UNLOCK(vp, 0); 745 vn_finished_write(mp); 746 bp->bio_resid = end - auio.uio_offset; 747 return (error); 748 } 749 750 if ((bp->bio_flags & BIO_UNMAPPED) == 0) { 751 pb = NULL; 752 aiov.iov_base = bp->bio_data; 753 } else { 754 KASSERT(bp->bio_length <= MAXPHYS, ("bio_length %jd", 755 (uintmax_t)bp->bio_length)); 756 pb = getpbuf(&md_vnode_pbuf_freecnt); 757 pmap_qenter((vm_offset_t)pb->b_data, bp->bio_ma, bp->bio_ma_n); 758 aiov.iov_base = (void *)((vm_offset_t)pb->b_data + 759 bp->bio_ma_offset); 760 } 761 aiov.iov_len = bp->bio_length; 762 auio.uio_iov = &aiov; 763 auio.uio_iovcnt = 1; 764 auio.uio_offset = (vm_ooffset_t)bp->bio_offset; 765 auio.uio_segflg = UIO_SYSSPACE; 766 if (bp->bio_cmd == BIO_READ) 767 auio.uio_rw = UIO_READ; 768 else if (bp->bio_cmd == BIO_WRITE) 769 auio.uio_rw = UIO_WRITE; 770 else 771 panic("wrong BIO_OP in mdstart_vnode"); 772 auio.uio_resid = bp->bio_length; 773 auio.uio_td = td; 774 /* 775 * When reading set IO_DIRECT to try to avoid double-caching 776 * the data. When writing IO_DIRECT is not optimal. 777 */ 778 if (bp->bio_cmd == BIO_READ) { 779 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 780 error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred); 781 VOP_UNLOCK(vp, 0); 782 } else { 783 (void) vn_start_write(vp, &mp, V_WAIT); 784 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 785 error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC, 786 sc->cred); 787 VOP_UNLOCK(vp, 0); 788 vn_finished_write(mp); 789 } 790 if ((bp->bio_flags & BIO_UNMAPPED) != 0) { 791 pmap_qremove((vm_offset_t)pb->b_data, bp->bio_ma_n); 792 relpbuf(pb, &md_vnode_pbuf_freecnt); 793 } 794 bp->bio_resid = auio.uio_resid; 795 return (error); 796} 797 798static int 799mdstart_swap(struct md_s *sc, struct bio *bp) 800{ 801 vm_page_t m; 802 u_char *p; 803 vm_pindex_t i, lastp; 804 int rv, ma_offs, offs, len, lastend; 805 806 switch (bp->bio_cmd) { 807 case BIO_READ: 808 case BIO_WRITE: 809 case BIO_DELETE: 810 break; 811 default: 812 return (EOPNOTSUPP); 813 } 814 815 p = bp->bio_data; 816 ma_offs = (bp->bio_flags & BIO_UNMAPPED) == 0 ? 0 : bp->bio_ma_offset; 817 818 /* 819 * offs is the offset at which to start operating on the 820 * next (ie, first) page. lastp is the last page on 821 * which we're going to operate. lastend is the ending 822 * position within that last page (ie, PAGE_SIZE if 823 * we're operating on complete aligned pages). 824 */ 825 offs = bp->bio_offset % PAGE_SIZE; 826 lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE; 827 lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1; 828 829 rv = VM_PAGER_OK; 830 VM_OBJECT_WLOCK(sc->object); 831 vm_object_pip_add(sc->object, 1); 832 for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) { 833 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs; 834 m = vm_page_grab(sc->object, i, VM_ALLOC_SYSTEM); 835 if (bp->bio_cmd == BIO_READ) { 836 if (m->valid == VM_PAGE_BITS_ALL) 837 rv = VM_PAGER_OK; 838 else 839 rv = vm_pager_get_pages(sc->object, &m, 1, 0); 840 if (rv == VM_PAGER_ERROR) { 841 vm_page_xunbusy(m); 842 break; 843 } else if (rv == VM_PAGER_FAIL) { 844 /* 845 * Pager does not have the page. Zero 846 * the allocated page, and mark it as 847 * valid. Do not set dirty, the page 848 * can be recreated if thrown out. 849 */ 850 pmap_zero_page(m); 851 m->valid = VM_PAGE_BITS_ALL; 852 } 853 if ((bp->bio_flags & BIO_UNMAPPED) != 0) { 854 pmap_copy_pages(&m, offs, bp->bio_ma, 855 ma_offs, len); 856 } else { 857 physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len); 858 cpu_flush_dcache(p, len); 859 } 860 } else if (bp->bio_cmd == BIO_WRITE) { 861 if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL) 862 rv = vm_pager_get_pages(sc->object, &m, 1, 0); 863 else 864 rv = VM_PAGER_OK; 865 if (rv == VM_PAGER_ERROR) { 866 vm_page_xunbusy(m); 867 break; 868 } 869 if ((bp->bio_flags & BIO_UNMAPPED) != 0) { 870 pmap_copy_pages(bp->bio_ma, ma_offs, &m, 871 offs, len); 872 } else { 873 physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len); 874 } 875 m->valid = VM_PAGE_BITS_ALL; 876 } else if (bp->bio_cmd == BIO_DELETE) { 877 if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL) 878 rv = vm_pager_get_pages(sc->object, &m, 1, 0); 879 else 880 rv = VM_PAGER_OK; 881 if (rv == VM_PAGER_ERROR) { 882 vm_page_xunbusy(m); 883 break; 884 } 885 if (len != PAGE_SIZE) { 886 pmap_zero_page_area(m, offs, len); 887 vm_page_clear_dirty(m, offs, len); 888 m->valid = VM_PAGE_BITS_ALL; 889 } else 890 vm_pager_page_unswapped(m); 891 } 892 vm_page_xunbusy(m); 893 vm_page_lock(m); 894 if (bp->bio_cmd == BIO_DELETE && len == PAGE_SIZE) 895 vm_page_free(m); 896 else 897 vm_page_activate(m); 898 vm_page_unlock(m); 899 if (bp->bio_cmd == BIO_WRITE) { 900 vm_page_dirty(m); 901 vm_pager_page_unswapped(m); 902 } 903 904 /* Actions on further pages start at offset 0 */ 905 p += PAGE_SIZE - offs; 906 offs = 0; 907 ma_offs += len; 908 } 909 vm_object_pip_wakeup(sc->object); 910 VM_OBJECT_WUNLOCK(sc->object); 911 return (rv != VM_PAGER_ERROR ? 0 : ENOSPC); 912} 913 914static int 915mdstart_null(struct md_s *sc, struct bio *bp) 916{ 917 918 switch (bp->bio_cmd) { 919 case BIO_READ: 920 bzero(bp->bio_data, bp->bio_length); 921 cpu_flush_dcache(bp->bio_data, bp->bio_length); 922 break; 923 case BIO_WRITE: 924 break; 925 } 926 bp->bio_resid = 0; 927 return (0); 928} 929 930static void 931md_kthread(void *arg) 932{ 933 struct md_s *sc; 934 struct bio *bp; 935 int error; 936 937 sc = arg; 938 thread_lock(curthread); 939 sched_prio(curthread, PRIBIO); 940 thread_unlock(curthread); 941 if (sc->type == MD_VNODE) 942 curthread->td_pflags |= TDP_NORUNNINGBUF; 943 944 for (;;) { 945 mtx_lock(&sc->queue_mtx); 946 if (sc->flags & MD_SHUTDOWN) { 947 sc->flags |= MD_EXITING; 948 mtx_unlock(&sc->queue_mtx); 949 kproc_exit(0); 950 } 951 bp = bioq_takefirst(&sc->bio_queue); 952 if (!bp) { 953 msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0); 954 continue; 955 } 956 mtx_unlock(&sc->queue_mtx); 957 if (bp->bio_cmd == BIO_GETATTR) { 958 if ((sc->fwsectors && sc->fwheads && 959 (g_handleattr_int(bp, "GEOM::fwsectors", 960 sc->fwsectors) || 961 g_handleattr_int(bp, "GEOM::fwheads", 962 sc->fwheads))) || 963 g_handleattr_int(bp, "GEOM::candelete", 1)) 964 error = -1; 965 else 966 error = EOPNOTSUPP; 967 } else { 968 error = sc->start(sc, bp); 969 } 970 971 if (error != -1) { 972 bp->bio_completed = bp->bio_length; 973 if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) 974 devstat_end_transaction_bio(sc->devstat, bp); 975 g_io_deliver(bp, error); 976 } 977 } 978} 979 980static struct md_s * 981mdfind(int unit) 982{ 983 struct md_s *sc; 984 985 LIST_FOREACH(sc, &md_softc_list, list) { 986 if (sc->unit == unit) 987 break; 988 } 989 return (sc); 990} 991 992static struct md_s * 993mdnew(int unit, int *errp, enum md_types type) 994{ 995 struct md_s *sc; 996 int error; 997 998 *errp = 0; 999 if (unit == -1) 1000 unit = alloc_unr(md_uh); 1001 else 1002 unit = alloc_unr_specific(md_uh, unit); 1003 1004 if (unit == -1) { 1005 *errp = EBUSY; 1006 return (NULL); 1007 } 1008 1009 sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO); 1010 sc->type = type; 1011 bioq_init(&sc->bio_queue); 1012 mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF); 1013 mtx_init(&sc->stat_mtx, "md stat", NULL, MTX_DEF); 1014 sc->unit = unit; 1015 sprintf(sc->name, "md%d", unit); 1016 LIST_INSERT_HEAD(&md_softc_list, sc, list); 1017 error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name); 1018 if (error == 0) 1019 return (sc); 1020 LIST_REMOVE(sc, list); 1021 mtx_destroy(&sc->stat_mtx); 1022 mtx_destroy(&sc->queue_mtx); 1023 free_unr(md_uh, sc->unit); 1024 free(sc, M_MD); 1025 *errp = error; 1026 return (NULL); 1027} 1028 1029static void 1030mdinit(struct md_s *sc) 1031{ 1032 struct g_geom *gp; 1033 struct g_provider *pp; 1034 1035 g_topology_lock(); 1036 gp = g_new_geomf(&g_md_class, "md%d", sc->unit); 1037 gp->softc = sc; 1038 pp = g_new_providerf(gp, "md%d", sc->unit); 1039 pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; 1040 pp->mediasize = sc->mediasize; 1041 pp->sectorsize = sc->sectorsize; 1042 switch (sc->type) { 1043 case MD_MALLOC: 1044 case MD_VNODE: 1045 case MD_SWAP: 1046 pp->flags |= G_PF_ACCEPT_UNMAPPED; 1047 break; 1048 case MD_PRELOAD: 1049 case MD_NULL: 1050 break; 1051 } 1052 sc->gp = gp; 1053 sc->pp = pp; 1054 g_error_provider(pp, 0); 1055 g_topology_unlock(); 1056 sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize, 1057 DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); 1058} 1059 1060static int 1061mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio) 1062{ 1063 uintptr_t sp; 1064 int error; 1065 off_t u; 1066 1067 error = 0; 1068 if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE)) 1069 return (EINVAL); 1070 if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize)) 1071 return (EINVAL); 1072 /* Compression doesn't make sense if we have reserved space */ 1073 if (mdio->md_options & MD_RESERVE) 1074 mdio->md_options &= ~MD_COMPRESS; 1075 if (mdio->md_fwsectors != 0) 1076 sc->fwsectors = mdio->md_fwsectors; 1077 if (mdio->md_fwheads != 0) 1078 sc->fwheads = mdio->md_fwheads; 1079 sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE); 1080 sc->indir = dimension(sc->mediasize / sc->sectorsize); 1081 sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL, 1082 0x1ff, 0); 1083 if (mdio->md_options & MD_RESERVE) { 1084 off_t nsectors; 1085 1086 nsectors = sc->mediasize / sc->sectorsize; 1087 for (u = 0; u < nsectors; u++) { 1088 sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ? 1089 M_WAITOK : M_NOWAIT) | M_ZERO); 1090 if (sp != 0) 1091 error = s_write(sc->indir, u, sp); 1092 else 1093 error = ENOMEM; 1094 if (error != 0) 1095 break; 1096 } 1097 } 1098 return (error); 1099} 1100 1101 1102static int 1103mdsetcred(struct md_s *sc, struct ucred *cred) 1104{ 1105 char *tmpbuf; 1106 int error = 0; 1107 1108 /* 1109 * Set credits in our softc 1110 */ 1111 1112 if (sc->cred) 1113 crfree(sc->cred); 1114 sc->cred = crhold(cred); 1115 1116 /* 1117 * Horrible kludge to establish credentials for NFS XXX. 1118 */ 1119 1120 if (sc->vnode) { 1121 struct uio auio; 1122 struct iovec aiov; 1123 1124 tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK); 1125 bzero(&auio, sizeof(auio)); 1126 1127 aiov.iov_base = tmpbuf; 1128 aiov.iov_len = sc->sectorsize; 1129 auio.uio_iov = &aiov; 1130 auio.uio_iovcnt = 1; 1131 auio.uio_offset = 0; 1132 auio.uio_rw = UIO_READ; 1133 auio.uio_segflg = UIO_SYSSPACE; 1134 auio.uio_resid = aiov.iov_len; 1135 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY); 1136 error = VOP_READ(sc->vnode, &auio, 0, sc->cred); 1137 VOP_UNLOCK(sc->vnode, 0); 1138 free(tmpbuf, M_TEMP); 1139 } 1140 return (error); 1141} 1142 1143static int 1144mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td) 1145{ 1146 struct vattr vattr; 1147 struct nameidata nd; 1148 char *fname; 1149 int error, flags; 1150 1151 /* 1152 * Kernel-originated requests must have the filename appended 1153 * to the mdio structure to protect against malicious software. 1154 */ 1155 fname = mdio->md_file; 1156 if ((void *)fname != (void *)(mdio + 1)) { 1157 error = copyinstr(fname, sc->file, sizeof(sc->file), NULL); 1158 if (error != 0) 1159 return (error); 1160 } else 1161 strlcpy(sc->file, fname, sizeof(sc->file)); 1162 1163 /* 1164 * If the user specified that this is a read only device, don't 1165 * set the FWRITE mask before trying to open the backing store. 1166 */ 1167 flags = FREAD | ((mdio->md_options & MD_READONLY) ? 0 : FWRITE); 1168 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td); 1169 error = vn_open(&nd, &flags, 0, NULL); 1170 if (error != 0) 1171 return (error); 1172 NDFREE(&nd, NDF_ONLY_PNBUF); 1173 if (nd.ni_vp->v_type != VREG) { 1174 error = EINVAL; 1175 goto bad; 1176 } 1177 error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred); 1178 if (error != 0) 1179 goto bad; 1180 if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) { 1181 vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY); 1182 if (nd.ni_vp->v_iflag & VI_DOOMED) { 1183 /* Forced unmount. */ 1184 error = EBADF; 1185 goto bad; 1186 } 1187 } 1188 nd.ni_vp->v_vflag |= VV_MD; 1189 VOP_UNLOCK(nd.ni_vp, 0); 1190 1191 if (mdio->md_fwsectors != 0) 1192 sc->fwsectors = mdio->md_fwsectors; 1193 if (mdio->md_fwheads != 0) 1194 sc->fwheads = mdio->md_fwheads; 1195 sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC); 1196 if (!(flags & FWRITE)) 1197 sc->flags |= MD_READONLY; 1198 sc->vnode = nd.ni_vp; 1199 1200 error = mdsetcred(sc, td->td_ucred); 1201 if (error != 0) { 1202 sc->vnode = NULL; 1203 vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); 1204 nd.ni_vp->v_vflag &= ~VV_MD; 1205 goto bad; 1206 } 1207 return (0); 1208bad: 1209 VOP_UNLOCK(nd.ni_vp, 0); 1210 (void)vn_close(nd.ni_vp, flags, td->td_ucred, td); 1211 return (error); 1212} 1213 1214static int 1215mddestroy(struct md_s *sc, struct thread *td) 1216{ 1217 1218 if (sc->gp) { 1219 sc->gp->softc = NULL; 1220 g_topology_lock(); 1221 g_wither_geom(sc->gp, ENXIO); 1222 g_topology_unlock(); 1223 sc->gp = NULL; 1224 sc->pp = NULL; 1225 } 1226 if (sc->devstat) { 1227 devstat_remove_entry(sc->devstat); 1228 sc->devstat = NULL; 1229 } 1230 mtx_lock(&sc->queue_mtx); 1231 sc->flags |= MD_SHUTDOWN; 1232 wakeup(sc); 1233 while (!(sc->flags & MD_EXITING)) 1234 msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10); 1235 mtx_unlock(&sc->queue_mtx); 1236 mtx_destroy(&sc->stat_mtx); 1237 mtx_destroy(&sc->queue_mtx); 1238 if (sc->vnode != NULL) { 1239 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY); 1240 sc->vnode->v_vflag &= ~VV_MD; 1241 VOP_UNLOCK(sc->vnode, 0); 1242 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ? 1243 FREAD : (FREAD|FWRITE), sc->cred, td); 1244 } 1245 if (sc->cred != NULL) 1246 crfree(sc->cred); 1247 if (sc->object != NULL) 1248 vm_object_deallocate(sc->object); 1249 if (sc->indir) 1250 destroy_indir(sc, sc->indir); 1251 if (sc->uma) 1252 uma_zdestroy(sc->uma); 1253 1254 LIST_REMOVE(sc, list); 1255 free_unr(md_uh, sc->unit); 1256 free(sc, M_MD); 1257 return (0); 1258} 1259 1260static int 1261mdresize(struct md_s *sc, struct md_ioctl *mdio) 1262{ 1263 int error, res; 1264 vm_pindex_t oldpages, newpages; 1265 1266 switch (sc->type) { 1267 case MD_VNODE: 1268 case MD_NULL: 1269 break; 1270 case MD_SWAP: 1271 if (mdio->md_mediasize <= 0 || 1272 (mdio->md_mediasize % PAGE_SIZE) != 0) 1273 return (EDOM); 1274 oldpages = OFF_TO_IDX(round_page(sc->mediasize)); 1275 newpages = OFF_TO_IDX(round_page(mdio->md_mediasize)); 1276 if (newpages < oldpages) { 1277 VM_OBJECT_WLOCK(sc->object); 1278 vm_object_page_remove(sc->object, newpages, 0, 0); 1279 swap_pager_freespace(sc->object, newpages, 1280 oldpages - newpages); 1281 swap_release_by_cred(IDX_TO_OFF(oldpages - 1282 newpages), sc->cred); 1283 sc->object->charge = IDX_TO_OFF(newpages); 1284 sc->object->size = newpages; 1285 VM_OBJECT_WUNLOCK(sc->object); 1286 } else if (newpages > oldpages) { 1287 res = swap_reserve_by_cred(IDX_TO_OFF(newpages - 1288 oldpages), sc->cred); 1289 if (!res) 1290 return (ENOMEM); 1291 if ((mdio->md_options & MD_RESERVE) || 1292 (sc->flags & MD_RESERVE)) { 1293 error = swap_pager_reserve(sc->object, 1294 oldpages, newpages - oldpages); 1295 if (error < 0) { 1296 swap_release_by_cred( 1297 IDX_TO_OFF(newpages - oldpages), 1298 sc->cred); 1299 return (EDOM); 1300 } 1301 } 1302 VM_OBJECT_WLOCK(sc->object); 1303 sc->object->charge = IDX_TO_OFF(newpages); 1304 sc->object->size = newpages; 1305 VM_OBJECT_WUNLOCK(sc->object); 1306 } 1307 break; 1308 default: 1309 return (EOPNOTSUPP); 1310 } 1311 1312 sc->mediasize = mdio->md_mediasize; 1313 g_topology_lock(); 1314 g_resize_provider(sc->pp, sc->mediasize); 1315 g_topology_unlock(); 1316 return (0); 1317} 1318 1319static int 1320mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td) 1321{ 1322 vm_ooffset_t npage; 1323 int error; 1324 1325 /* 1326 * Range check. Disallow negative sizes and sizes not being 1327 * multiple of page size. 1328 */ 1329 if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0) 1330 return (EDOM); 1331 1332 /* 1333 * Allocate an OBJT_SWAP object. 1334 * 1335 * Note the truncation. 1336 */ 1337 1338 npage = mdio->md_mediasize / PAGE_SIZE; 1339 if (mdio->md_fwsectors != 0) 1340 sc->fwsectors = mdio->md_fwsectors; 1341 if (mdio->md_fwheads != 0) 1342 sc->fwheads = mdio->md_fwheads; 1343 sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage, 1344 VM_PROT_DEFAULT, 0, td->td_ucred); 1345 if (sc->object == NULL) 1346 return (ENOMEM); 1347 sc->flags = mdio->md_options & (MD_FORCE | MD_RESERVE); 1348 if (mdio->md_options & MD_RESERVE) { 1349 if (swap_pager_reserve(sc->object, 0, npage) < 0) { 1350 error = EDOM; 1351 goto finish; 1352 } 1353 } 1354 error = mdsetcred(sc, td->td_ucred); 1355 finish: 1356 if (error != 0) { 1357 vm_object_deallocate(sc->object); 1358 sc->object = NULL; 1359 } 1360 return (error); 1361} 1362 1363static int 1364mdcreate_null(struct md_s *sc, struct md_ioctl *mdio, struct thread *td) 1365{ 1366 1367 /* 1368 * Range check. Disallow negative sizes and sizes not being 1369 * multiple of page size. 1370 */ 1371 if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0) 1372 return (EDOM); 1373 1374 return (0); 1375} 1376 1377static int 1378xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) 1379{ 1380 struct md_ioctl *mdio; 1381 struct md_s *sc; 1382 int error, i; 1383 unsigned sectsize; 1384 1385 if (md_debug) 1386 printf("mdctlioctl(%s %lx %p %x %p)\n", 1387 devtoname(dev), cmd, addr, flags, td); 1388 1389 mdio = (struct md_ioctl *)addr; 1390 if (mdio->md_version != MDIOVERSION) 1391 return (EINVAL); 1392 1393 /* 1394 * We assert the version number in the individual ioctl 1395 * handlers instead of out here because (a) it is possible we 1396 * may add another ioctl in the future which doesn't read an 1397 * mdio, and (b) the correct return value for an unknown ioctl 1398 * is ENOIOCTL, not EINVAL. 1399 */ 1400 error = 0; 1401 switch (cmd) { 1402 case MDIOCATTACH: 1403 switch (mdio->md_type) { 1404 case MD_MALLOC: 1405 case MD_PRELOAD: 1406 case MD_VNODE: 1407 case MD_SWAP: 1408 case MD_NULL: 1409 break; 1410 default: 1411 return (EINVAL); 1412 } 1413 if (mdio->md_sectorsize == 0) 1414 sectsize = DEV_BSIZE; 1415 else 1416 sectsize = mdio->md_sectorsize; 1417 if (sectsize > MAXPHYS || mdio->md_mediasize < sectsize) 1418 return (EINVAL); 1419 if (mdio->md_options & MD_AUTOUNIT) 1420 sc = mdnew(-1, &error, mdio->md_type); 1421 else { 1422 if (mdio->md_unit > INT_MAX) 1423 return (EINVAL); 1424 sc = mdnew(mdio->md_unit, &error, mdio->md_type); 1425 } 1426 if (sc == NULL) 1427 return (error); 1428 if (mdio->md_options & MD_AUTOUNIT) 1429 mdio->md_unit = sc->unit; 1430 sc->mediasize = mdio->md_mediasize; 1431 sc->sectorsize = sectsize; 1432 error = EDOOFUS; 1433 switch (sc->type) { 1434 case MD_MALLOC: 1435 sc->start = mdstart_malloc; 1436 error = mdcreate_malloc(sc, mdio); 1437 break; 1438 case MD_PRELOAD: 1439 /* 1440 * We disallow attaching preloaded memory disks via 1441 * ioctl. Preloaded memory disks are automatically 1442 * attached in g_md_init(). 1443 */ 1444 error = EOPNOTSUPP; 1445 break; 1446 case MD_VNODE: 1447 sc->start = mdstart_vnode; 1448 error = mdcreate_vnode(sc, mdio, td); 1449 break; 1450 case MD_SWAP: 1451 sc->start = mdstart_swap; 1452 error = mdcreate_swap(sc, mdio, td); 1453 break; 1454 case MD_NULL: 1455 sc->start = mdstart_null; 1456 error = mdcreate_null(sc, mdio, td); 1457 break; 1458 } 1459 if (error != 0) { 1460 mddestroy(sc, td); 1461 return (error); 1462 } 1463 1464 /* Prune off any residual fractional sector */ 1465 i = sc->mediasize % sc->sectorsize; 1466 sc->mediasize -= i; 1467 1468 mdinit(sc); 1469 return (0); 1470 case MDIOCDETACH: 1471 if (mdio->md_mediasize != 0 || 1472 (mdio->md_options & ~MD_FORCE) != 0) 1473 return (EINVAL); 1474 1475 sc = mdfind(mdio->md_unit); 1476 if (sc == NULL) 1477 return (ENOENT); 1478 if (sc->opencount != 0 && !(sc->flags & MD_FORCE) && 1479 !(mdio->md_options & MD_FORCE)) 1480 return (EBUSY); 1481 return (mddestroy(sc, td)); 1482 case MDIOCRESIZE: 1483 if ((mdio->md_options & ~(MD_FORCE | MD_RESERVE)) != 0) 1484 return (EINVAL); 1485 1486 sc = mdfind(mdio->md_unit); 1487 if (sc == NULL) 1488 return (ENOENT); 1489 if (mdio->md_mediasize < sc->sectorsize) 1490 return (EINVAL); 1491 if (mdio->md_mediasize < sc->mediasize && 1492 !(sc->flags & MD_FORCE) && 1493 !(mdio->md_options & MD_FORCE)) 1494 return (EBUSY); 1495 return (mdresize(sc, mdio)); 1496 case MDIOCQUERY: 1497 sc = mdfind(mdio->md_unit); 1498 if (sc == NULL) 1499 return (ENOENT); 1500 mdio->md_type = sc->type; 1501 mdio->md_options = sc->flags; 1502 mdio->md_mediasize = sc->mediasize; 1503 mdio->md_sectorsize = sc->sectorsize; 1504 if (sc->type == MD_VNODE) 1505 error = copyout(sc->file, mdio->md_file, 1506 strlen(sc->file) + 1); 1507 return (error); 1508 case MDIOCLIST: 1509 i = 1; 1510 LIST_FOREACH(sc, &md_softc_list, list) { 1511 if (i == MDNPAD - 1) 1512 mdio->md_pad[i] = -1; 1513 else 1514 mdio->md_pad[i++] = sc->unit; 1515 } 1516 mdio->md_pad[0] = i - 1; 1517 return (0); 1518 default: 1519 return (ENOIOCTL); 1520 }; 1521} 1522 1523static int 1524mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) 1525{ 1526 int error; 1527 1528 sx_xlock(&md_sx); 1529 error = xmdctlioctl(dev, cmd, addr, flags, td); 1530 sx_xunlock(&md_sx); 1531 return (error); 1532} 1533 1534static void 1535md_preloaded(u_char *image, size_t length, const char *name) 1536{ 1537 struct md_s *sc; 1538 int error; 1539 1540 sc = mdnew(-1, &error, MD_PRELOAD); 1541 if (sc == NULL) 1542 return; 1543 sc->mediasize = length; 1544 sc->sectorsize = DEV_BSIZE; 1545 sc->pl_ptr = image; 1546 sc->pl_len = length; 1547 sc->start = mdstart_preload; 1548#ifdef MD_ROOT 1549 if (sc->unit == 0) 1550 rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0"; 1551#endif 1552 mdinit(sc); 1553 if (name != NULL) { 1554 printf("%s%d: Preloaded image <%s> %zd bytes at %p\n", 1555 MD_NAME, sc->unit, name, length, image); 1556 } 1557} 1558 1559static void 1560g_md_init(struct g_class *mp __unused) 1561{ 1562 caddr_t mod; 1563 u_char *ptr, *name, *type; 1564 unsigned len; 1565 int i; 1566 1567 /* figure out log2(NINDIR) */ 1568 for (i = NINDIR, nshift = -1; i; nshift++) 1569 i >>= 1; 1570 1571 mod = NULL; 1572 sx_init(&md_sx, "MD config lock"); 1573 g_topology_unlock(); 1574 md_uh = new_unrhdr(0, INT_MAX, NULL); 1575#ifdef MD_ROOT_SIZE 1576 sx_xlock(&md_sx); 1577 md_preloaded(mfs_root.start, sizeof(mfs_root.start), NULL); 1578 sx_xunlock(&md_sx); 1579#endif 1580 /* XXX: are preload_* static or do they need Giant ? */ 1581 while ((mod = preload_search_next_name(mod)) != NULL) { 1582 name = (char *)preload_search_info(mod, MODINFO_NAME); 1583 if (name == NULL) 1584 continue; 1585 type = (char *)preload_search_info(mod, MODINFO_TYPE); 1586 if (type == NULL) 1587 continue; 1588 if (strcmp(type, "md_image") && strcmp(type, "mfs_root")) 1589 continue; 1590 ptr = preload_fetch_addr(mod); 1591 len = preload_fetch_size(mod); 1592 if (ptr != NULL && len != 0) { 1593 sx_xlock(&md_sx); 1594 md_preloaded(ptr, len, name); 1595 sx_xunlock(&md_sx); 1596 } 1597 } 1598 md_vnode_pbuf_freecnt = nswbuf / 10; 1599 status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL, 1600 0600, MDCTL_NAME); 1601 g_topology_lock(); 1602} 1603 1604static void 1605g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 1606 struct g_consumer *cp __unused, struct g_provider *pp) 1607{ 1608 struct md_s *mp; 1609 char *type; 1610 1611 mp = gp->softc; 1612 if (mp == NULL) 1613 return; 1614 1615 switch (mp->type) { 1616 case MD_MALLOC: 1617 type = "malloc"; 1618 break; 1619 case MD_PRELOAD: 1620 type = "preload"; 1621 break; 1622 case MD_VNODE: 1623 type = "vnode"; 1624 break; 1625 case MD_SWAP: 1626 type = "swap"; 1627 break; 1628 case MD_NULL: 1629 type = "null"; 1630 break; 1631 default: 1632 type = "unknown"; 1633 break; 1634 } 1635 1636 if (pp != NULL) { 1637 if (indent == NULL) { 1638 sbuf_printf(sb, " u %d", mp->unit); 1639 sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize); 1640 sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads); 1641 sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors); 1642 sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize); 1643 sbuf_printf(sb, " t %s", type); 1644 if (mp->type == MD_VNODE && mp->vnode != NULL) 1645 sbuf_printf(sb, " file %s", mp->file); 1646 } else { 1647 sbuf_printf(sb, "%s<unit>%d</unit>\n", indent, 1648 mp->unit); 1649 sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n", 1650 indent, (uintmax_t) mp->sectorsize); 1651 sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n", 1652 indent, (uintmax_t) mp->fwheads); 1653 sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n", 1654 indent, (uintmax_t) mp->fwsectors); 1655 sbuf_printf(sb, "%s<length>%ju</length>\n", 1656 indent, (uintmax_t) mp->mediasize); 1657 sbuf_printf(sb, "%s<compression>%s</compression>\n", indent, 1658 (mp->flags & MD_COMPRESS) == 0 ? "off": "on"); 1659 sbuf_printf(sb, "%s<access>%s</access>\n", indent, 1660 (mp->flags & MD_READONLY) == 0 ? "read-write": 1661 "read-only"); 1662 sbuf_printf(sb, "%s<type>%s</type>\n", indent, 1663 type); 1664 if (mp->type == MD_VNODE && mp->vnode != NULL) { 1665 sbuf_printf(sb, "%s<file>", indent); 1666 g_conf_printf_escaped(sb, "%s", mp->file); 1667 sbuf_printf(sb, "</file>\n"); 1668 } 1669 } 1670 } 1671} 1672 1673static void 1674g_md_fini(struct g_class *mp __unused) 1675{ 1676 1677 sx_destroy(&md_sx); 1678 if (status_dev != NULL) 1679 destroy_dev(status_dev); 1680 delete_unrhdr(md_uh); 1681} 1682