/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD: stable/10/sys/dev/md/md.c 320154 2017-06-20 17:03:06Z markj $
 *
 */

/*-
 * The following functions are based in the vn(4) driver: mdstart_swap(),
 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
 * and as such under the following copyright:
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah Hdr: vn.c 1.13 94/04/02
 *
 * from: @(#)vn.c	8.6 (Berkeley) 4/1/94
 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
 */

#include "opt_geom.h"
#include "opt_md.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <geom/geom.h>
#include <geom/geom_int.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

#include <machine/bus.h>

#define MD_MODVER 1

#define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
#define	MD_EXITING	0x20000		/* Worker thread is exiting. */

#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif

static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");

static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
    "Enable md(4) debug messages");
static int md_malloc_wait;
SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
    "Allow malloc to wait for memory allocations");

#if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
#define	MD_ROOT_FSTYPE	"ufs"
#endif

#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
/*
 * Preloaded image gets put here.
 * Applications that patch the object with the image can determine
 * the size by looking at the start and end markers (strings),
 * so we want them contiguous.
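 *
 * (A build tool would typically locate the "MFS Filesystem goes here"
 * string in the kernel object, copy the image in starting there, and
 * verify that the image ends before the "STOP" end marker.)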
 */
static struct {
	u_char start[MD_ROOT_SIZE*1024];
	u_char end[128];
} mfs_root = {
	.start = "MFS Filesystem goes here",
	.end = "MFS Filesystem had better STOP here",
};
#endif

static g_init_t g_md_init;
static g_fini_t g_md_fini;
static g_start_t g_md_start;
static g_access_t g_md_access;
static void g_md_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);

static struct cdev *status_dev = 0;
static struct sx md_sx;
static struct unrhdr *md_uh;

static d_ioctl_t mdctlioctl;

static struct cdevsw mdctl_cdevsw = {
	.d_version =	D_VERSION,
	.d_ioctl =	mdctlioctl,
	.d_name =	MD_NAME,
};

struct g_class g_md_class = {
	.name = "MD",
	.version = G_VERSION,
	.init = g_md_init,
	.fini = g_md_fini,
	.start = g_md_start,
	.access = g_md_access,
	.dumpconf = g_md_dumpconf,
};

DECLARE_GEOM_CLASS(g_md_class, g_md);


static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);

#define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
#define NMASK	(NINDIR-1)
static int nshift;

static int md_vnode_pbuf_freecnt;

struct indir {
	uintptr_t	*array;
	u_int		total;
	u_int		used;
	u_int		shift;
};

struct md_s {
	int unit;
	LIST_ENTRY(md_s) list;
	struct bio_queue_head bio_queue;
	struct mtx queue_mtx;
	struct mtx stat_mtx;
	struct cdev *dev;
	enum md_types type;
	off_t mediasize;
	unsigned sectorsize;
	unsigned opencount;
	unsigned fwheads;
	unsigned fwsectors;
	unsigned flags;
	char name[20];
	struct proc *procp;
	struct g_geom *gp;
	struct g_provider *pp;
	int (*start)(struct md_s *sc, struct bio *bp);
	struct devstat *devstat;

	/* MD_MALLOC related fields */
	struct indir *indir;
	uma_zone_t uma;

	/* MD_PRELOAD related fields */
	u_char *pl_ptr;
	size_t pl_len;

	/* MD_VNODE related fields */
	struct vnode *vnode;
	char file[PATH_MAX];
	struct ucred *cred;

	/* MD_SWAP related fields */
	vm_object_t object;
};

static struct indir *
new_indir(u_int shift)
{
	struct indir *ip;

	ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
	    | M_ZERO);
	if (ip == NULL)
		return (NULL);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (ip->array == NULL) {
		free(ip, M_MD);
		return (NULL);
	}
	ip->total = NINDIR;
	ip->shift = shift;
	return (ip);
}

static void
del_indir(struct indir *ip)
{

	free(ip->array, M_MDSECT);
	free(ip, M_MD);
}

static void
destroy_indir(struct md_s *sc, struct indir *ip)
{
	int i;

	for (i = 0; i < NINDIR; i++) {
		if (!ip->array[i])
			continue;
		if (ip->shift)
			destroy_indir(sc, (struct indir*)(ip->array[i]));
		else if (ip->array[i] > 255)
			uma_zfree(sc->uma, (void *)(ip->array[i]));
	}
	del_indir(ip);
}

/*
 * This function does the math and allocates the top level "indir" structure
 * for a device of "size" sectors.
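 *
 * A worked example, assuming a 64-bit machine with 4 KB pages: NINDIR
 * is 4096 / 8 = 512 entries per node and nshift is 9.  A device of
 * 2^21 sectors then needs two reductions (2^21 -> 2^12 -> 2^3), so
 * layer becomes 2 and the top node gets shift 18: a three-level tree.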
 */

static struct indir *
dimension(off_t size)
{
	off_t rcnt;
	struct indir *ip;
	int layer;

	rcnt = size;
	layer = 0;
	while (rcnt > NINDIR) {
		rcnt /= NINDIR;
		layer++;
	}

	/*
	 * XXX: the top layer is probably not fully populated, so we allocate
	 * too much space for ip->array in here.
	 */
	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, M_WAITOK | M_ZERO);
	ip->total = NINDIR;
	ip->shift = layer * nshift;
	return (ip);
}

/*
 * Read a given sector
 */

static uintptr_t
s_read(struct indir *ip, off_t offset)
{
	struct indir *cip;
	int idx;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_read(%jd)\n", (intmax_t)offset);
	up = 0;
	for (cip = ip; cip != NULL;) {
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		idx = offset & NMASK;
		return (cip->array[idx]);
	}
	return (0);
}

/*
 * Write a given sector, prune the tree if the value is 0
 */

static int
s_write(struct indir *ip, off_t offset, uintptr_t ptr)
{
	struct indir *cip, *lip[10];
	int idx, li;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
	up = 0;
	li = 0;
	cip = ip;
	for (;;) {
		lip[li++] = cip;
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			if (up != 0) {
				cip = (struct indir *)up;
				continue;
			}
			/* Allocate branch */
			cip->array[idx] =
			    (uintptr_t)new_indir(cip->shift - nshift);
			if (cip->array[idx] == 0)
				return (ENOSPC);
			cip->used++;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		/* leafnode */
		idx = offset & NMASK;
		up = cip->array[idx];
		if (up != 0)
			cip->used--;
		cip->array[idx] = ptr;
		if (ptr != 0)
			cip->used++;
		break;
	}
	if (cip->used != 0 || li == 1)
		return (0);
	li--;
	while (cip->used == 0 && cip != ip) {
		li--;
		idx = (offset >> lip[li]->shift) & NMASK;
		up = lip[li]->array[idx];
		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
		del_indir(cip);
		lip[li]->array[idx] = 0;
		lip[li]->used--;
		cip = lip[li];
	}
	return (0);
}


static int
g_md_access(struct g_provider *pp, int r, int w, int e)
{
	struct md_s *sc;

	sc = pp->geom->softc;
	if (sc == NULL) {
		if (r <= 0 && w <= 0 && e <= 0)
			return (0);
		return (ENXIO);
	}
	r += pp->acr;
	w += pp->acw;
	e += pp->ace;
	if ((sc->flags & MD_READONLY) != 0 && w > 0)
		return (EROFS);
	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
		sc->opencount = 1;
	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
		sc->opencount = 0;
	}
	return (0);
}

static void
g_md_start(struct bio *bp)
{
	struct md_s *sc;

	sc = bp->bio_to->geom->softc;
	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
		mtx_lock(&sc->stat_mtx);
		devstat_start_transaction_bio(sc->devstat, bp);
		mtx_unlock(&sc->stat_mtx);
	}
	mtx_lock(&sc->queue_mtx);
	bioq_disksort(&sc->bio_queue, bp);
	mtx_unlock(&sc->queue_mtx);
	wakeup(sc);
}

#define	MD_MALLOC_MOVE_ZERO	1
#define	MD_MALLOC_MOVE_FILL	2
#define	MD_MALLOC_MOVE_READ	3
#define	MD_MALLOC_MOVE_WRITE	4
#define	MD_MALLOC_MOVE_CMP	5

static int
md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
    void *ptr, u_char fill, int op)
{
	struct sf_buf *sf;
	vm_page_t m, *mp1;
	char *p, first;
	off_t *uc;
	unsigned n;
	int error, i, ma_offs1, sz, first_read;

	m = NULL;
	error = 0;
	sf = NULL;
	/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
	first = 0;
	first_read = 0;
	uc = ptr;
	mp1 = *mp;
	ma_offs1 = *ma_offs;
	/* } */
	sched_pin();
	for (n = sectorsize; n != 0; n -= sz) {
		sz = imin(PAGE_SIZE - *ma_offs, n);
		if (m != **mp) {
			if (sf != NULL)
				sf_buf_free(sf);
			m = **mp;
			sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
			    (md_malloc_wait ? 0 : SFB_NOWAIT));
			if (sf == NULL) {
				error = ENOMEM;
				break;
			}
		}
		p = (char *)sf_buf_kva(sf) + *ma_offs;
		switch (op) {
		case MD_MALLOC_MOVE_ZERO:
			bzero(p, sz);
			break;
		case MD_MALLOC_MOVE_FILL:
			memset(p, fill, sz);
			break;
		case MD_MALLOC_MOVE_READ:
			bcopy(ptr, p, sz);
			cpu_flush_dcache(p, sz);
			break;
		case MD_MALLOC_MOVE_WRITE:
			bcopy(p, ptr, sz);
			break;
		case MD_MALLOC_MOVE_CMP:
			for (i = 0; i < sz; i++, p++) {
				if (!first_read) {
					*uc = (u_char)*p;
					first = *p;
					first_read = 1;
				} else if (*p != first) {
					error = EDOOFUS;
					break;
				}
			}
			break;
		default:
			KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
			break;
		}
		if (error != 0)
			break;
		*ma_offs += sz;
		*ma_offs %= PAGE_SIZE;
		if (*ma_offs == 0)
			(*mp)++;
		ptr = (char *)ptr + sz;
	}

	if (sf != NULL)
		sf_buf_free(sf);
	sched_unpin();
	if (op == MD_MALLOC_MOVE_CMP && error != 0) {
		*mp = mp1;
		*ma_offs = ma_offs1;
	}
	return (error);
}

static int
md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
    unsigned len, void *ptr, u_char fill, int op)
{
	bus_dma_segment_t *vlist;
	uint8_t *p, *end, first;
	off_t *uc;
	int ma_offs, seg_len;

	vlist = *pvlist;
	ma_offs = *pma_offs;
	uc = ptr;

	for (; len != 0; len -= seg_len) {
		seg_len = imin(vlist->ds_len - ma_offs, len);
		p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
		switch (op) {
		case MD_MALLOC_MOVE_ZERO:
			bzero(p, seg_len);
			break;
		case MD_MALLOC_MOVE_FILL:
			memset(p, fill, seg_len);
			break;
		case MD_MALLOC_MOVE_READ:
			bcopy(ptr, p, seg_len);
			cpu_flush_dcache(p, seg_len);
			break;
		case MD_MALLOC_MOVE_WRITE:
			bcopy(p, ptr, seg_len);
			break;
		case MD_MALLOC_MOVE_CMP:
			end = p + seg_len;
			first = *uc = *p;
			/* Confirm all following bytes match the first */
			while (++p < end) {
				if (*p != first)
					return (EDOOFUS);
			}
			break;
		default:
			KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
			break;
		}

		ma_offs += seg_len;
		if (ma_offs == vlist->ds_len) {
			ma_offs = 0;
			vlist++;
		}
		ptr = (uint8_t *)ptr + seg_len;
	}
	*pvlist = vlist;
	*pma_offs = ma_offs;

	return (0);
}

static int
mdstart_malloc(struct md_s *sc, struct bio *bp)
{
	u_char *dst;
	vm_page_t *m;
	bus_dma_segment_t *vlist;
	int i, error, error1, ma_offs, notmapped;
	off_t secno, nsec, uc;
	uintptr_t sp, osp;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
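	/*
	 * A bio can present its data in one of three ways: as unmapped
	 * vm_page_t's (BIO_UNMAPPED), as a bus_dma_segment_t list
	 * (BIO_VLIST), or as an ordinary mapped kernel buffer.
	 */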
	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
	    (bus_dma_segment_t *)bp->bio_data : NULL;
	if (notmapped) {
		m = bp->bio_ma;
		ma_offs = bp->bio_ma_offset;
		dst = NULL;
		KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
	} else if (vlist != NULL) {
		ma_offs = bp->bio_ma_offset;
		dst = NULL;
	} else {
		dst = bp->bio_data;
	}

	nsec = bp->bio_length / sc->sectorsize;
	secno = bp->bio_offset / sc->sectorsize;
	error = 0;
	while (nsec--) {
		osp = s_read(sc->indir, secno);
		if (bp->bio_cmd == BIO_DELETE) {
			if (osp != 0)
				error = s_write(sc->indir, secno, 0);
		} else if (bp->bio_cmd == BIO_READ) {
			if (osp == 0) {
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, NULL, 0,
					    MD_MALLOC_MOVE_ZERO);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, NULL, 0,
					    MD_MALLOC_MOVE_ZERO);
				} else
					bzero(dst, sc->sectorsize);
			} else if (osp <= 255) {
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, NULL, osp,
					    MD_MALLOC_MOVE_FILL);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, NULL, osp,
					    MD_MALLOC_MOVE_FILL);
				} else
					memset(dst, osp, sc->sectorsize);
			} else {
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, (void *)osp, 0,
					    MD_MALLOC_MOVE_READ);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize,
					    (void *)osp, 0,
					    MD_MALLOC_MOVE_READ);
				} else {
					bcopy((void *)osp, dst, sc->sectorsize);
					cpu_flush_dcache(dst, sc->sectorsize);
				}
			}
			osp = 0;
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (sc->flags & MD_COMPRESS) {
				if (notmapped) {
					error1 = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, &uc, 0,
					    MD_MALLOC_MOVE_CMP);
					i = error1 == 0 ? sc->sectorsize : 0;
				} else if (vlist != NULL) {
					error1 = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, &uc, 0,
					    MD_MALLOC_MOVE_CMP);
					i = error1 == 0 ? sc->sectorsize : 0;
				} else {
					uc = dst[0];
					for (i = 1; i < sc->sectorsize; i++) {
						if (dst[i] != uc)
							break;
					}
				}
			} else {
				i = 0;
				uc = 0;
			}
			if (i == sc->sectorsize) {
				if (osp != uc)
					error = s_write(sc->indir, secno, uc);
			} else {
				if (osp <= 255) {
					sp = (uintptr_t)uma_zalloc(sc->uma,
					    md_malloc_wait ? M_WAITOK :
					    M_NOWAIT);
					if (sp == 0) {
						error = ENOSPC;
						break;
					}
					if (notmapped) {
						error = md_malloc_move_ma(&m,
						    &ma_offs, sc->sectorsize,
						    (void *)sp, 0,
						    MD_MALLOC_MOVE_WRITE);
					} else if (vlist != NULL) {
						error = md_malloc_move_vlist(
						    &vlist, &ma_offs,
						    sc->sectorsize, (void *)sp,
						    0, MD_MALLOC_MOVE_WRITE);
					} else {
						bcopy(dst, (void *)sp,
						    sc->sectorsize);
					}
					error = s_write(sc->indir, secno, sp);
				} else {
					if (notmapped) {
						error = md_malloc_move_ma(&m,
						    &ma_offs, sc->sectorsize,
						    (void *)osp, 0,
						    MD_MALLOC_MOVE_WRITE);
					} else if (vlist != NULL) {
						error = md_malloc_move_vlist(
						    &vlist, &ma_offs,
						    sc->sectorsize, (void *)osp,
						    0, MD_MALLOC_MOVE_WRITE);
					} else {
						bcopy(dst, (void *)osp,
						    sc->sectorsize);
					}
					osp = 0;
				}
			}
		} else {
			error = EOPNOTSUPP;
		}
		if (osp > 255)
			uma_zfree(sc->uma, (void*)osp);
		if (error != 0)
			break;
		secno++;
		if (!notmapped && vlist == NULL)
			dst += sc->sectorsize;
	}
	bp->bio_resid = 0;
	return (error);
}

static void
mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
{
	off_t seg_len;

	while (offset >= vlist->ds_len) {
		offset -= vlist->ds_len;
		vlist++;
	}

	while (len != 0) {
		seg_len = omin(len, vlist->ds_len - offset);
		bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
		    seg_len);
		offset = 0;
		src = (uint8_t *)src + seg_len;
		len -= seg_len;
		vlist++;
	}
}

static void
mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
{
	off_t seg_len;

	while (offset >= vlist->ds_len) {
		offset -= vlist->ds_len;
		vlist++;
	}

	while (len != 0) {
		seg_len = omin(len, vlist->ds_len - offset);
		bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
		    seg_len);
		offset = 0;
		dst = (uint8_t *)dst + seg_len;
		len -= seg_len;
		vlist++;
	}
}

static int
mdstart_preload(struct md_s *sc, struct bio *bp)
{
	uint8_t *p;

	p = sc->pl_ptr + bp->bio_offset;
	switch (bp->bio_cmd) {
	case BIO_READ:
		if ((bp->bio_flags & BIO_VLIST) != 0) {
			mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
			    bp->bio_ma_offset, bp->bio_length);
		} else {
			bcopy(p, bp->bio_data, bp->bio_length);
		}
		cpu_flush_dcache(bp->bio_data, bp->bio_length);
		break;
	case BIO_WRITE:
		if ((bp->bio_flags & BIO_VLIST) != 0) {
			mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
			    bp->bio_ma_offset, p, bp->bio_length);
		} else {
			bcopy(bp->bio_data, p, bp->bio_length);
		}
		break;
	}
	bp->bio_resid = 0;
	return (0);
}

static int
mdstart_vnode(struct md_s *sc, struct bio *bp)
{
	int error;
	struct uio auio;
	struct iovec aiov;
	struct iovec *piov;
	struct mount *mp;
	struct vnode *vp;
	struct buf *pb;
	bus_dma_segment_t *vlist;
	struct thread *td;
	off_t iolen, len, zerosize;
	int ma_offs, npages;

	switch (bp->bio_cmd) {
	case BIO_READ:
		auio.uio_rw = UIO_READ;
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		auio.uio_rw = UIO_WRITE;
		break;
	case BIO_FLUSH:
		break;
	default:
		return (EOPNOTSUPP);
	}

	td = curthread;
	vp = sc->vnode;
	pb = NULL;
	piov = NULL;
	ma_offs = bp->bio_ma_offset;
	len = bp->bio_length;

	/*
	 * VNODE I/O
	 *
	 * If an error occurs, we set BIO_ERROR but we do not set
	 * B_INVAL because (for a write anyway), the buffer is
	 * still valid.
	 */

	if (bp->bio_cmd == BIO_FLUSH) {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(vp, MNT_WAIT, td);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
		return (error);
	}

	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
	auio.uio_resid = bp->bio_length;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;

	if (bp->bio_cmd == BIO_DELETE) {
		/*
		 * Emulate BIO_DELETE by writing zeros.
		 */
		zerosize = ZERO_REGION_SIZE -
		    (ZERO_REGION_SIZE % sc->sectorsize);
		auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
		piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
		auio.uio_iov = piov;
		while (len > 0) {
			piov->iov_base = __DECONST(void *, zero_region);
			piov->iov_len = len;
			if (len > zerosize)
				piov->iov_len = zerosize;
			len -= piov->iov_len;
			piov++;
		}
		piov = auio.uio_iov;
	} else if ((bp->bio_flags & BIO_VLIST) != 0) {
		piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
		auio.uio_iov = piov;
		vlist = (bus_dma_segment_t *)bp->bio_data;
		while (len > 0) {
			piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
			    ma_offs);
			piov->iov_len = vlist->ds_len - ma_offs;
			if (piov->iov_len > len)
				piov->iov_len = len;
			len -= piov->iov_len;
			ma_offs = 0;
			vlist++;
			piov++;
		}
		auio.uio_iovcnt = piov - auio.uio_iov;
		piov = auio.uio_iov;
	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
		pb = getpbuf(&md_vnode_pbuf_freecnt);
		bp->bio_resid = len;
unmapped_step:
		npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
		    PAGE_MASK))));
		iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
		KASSERT(iolen > 0, ("zero iolen"));
		pmap_qenter((vm_offset_t)pb->b_data,
		    &bp->bio_ma[atop(ma_offs)], npages);
		aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
		    (ma_offs & PAGE_MASK));
		aiov.iov_len = iolen;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_resid = iolen;
	} else {
		aiov.iov_base = bp->bio_data;
		aiov.iov_len = bp->bio_length;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
	}
	/*
	 * When reading set IO_DIRECT to try to avoid double-caching
	 * the data.  When writing IO_DIRECT is not optimal.
	 */
	if (auio.uio_rw == UIO_READ) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred);
		VOP_UNLOCK(vp, 0);
	} else {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
		    sc->cred);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
	}

	if (pb != NULL) {
		pmap_qremove((vm_offset_t)pb->b_data, npages);
		if (error == 0) {
			len -= iolen;
			bp->bio_resid -= iolen;
			ma_offs += iolen;
			if (len > 0)
				goto unmapped_step;
		}
		relpbuf(pb, &md_vnode_pbuf_freecnt);
	}

	free(piov, M_MD);
	if (pb == NULL)
		bp->bio_resid = auio.uio_resid;
	return (error);
}

static void
md_swap_page_free(vm_page_t m)
{

	vm_page_xunbusy(m);
	vm_page_lock(m);
	vm_page_free(m);
	vm_page_unlock(m);
}

static int
mdstart_swap(struct md_s *sc, struct bio *bp)
{
	vm_page_t m;
	u_char *p;
	vm_pindex_t i, lastp;
	bus_dma_segment_t *vlist;
	int rv, ma_offs, offs, len, lastend;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	p = bp->bio_data;
	ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
	    bp->bio_ma_offset : 0;
	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
	    (bus_dma_segment_t *)bp->bio_data : NULL;

	/*
	 * offs is the offset at which to start operating on the
	 * next (ie, first) page.  lastp is the last page on
	 * which we're going to operate.  lastend is the ending
	 * position within that last page (ie, PAGE_SIZE if
	 * we're operating on complete aligned pages).
	 */
	offs = bp->bio_offset % PAGE_SIZE;
	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;

	rv = VM_PAGER_OK;
	VM_OBJECT_WLOCK(sc->object);
	vm_object_pip_add(sc->object, 1);
	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
		m = vm_page_grab(sc->object, i, VM_ALLOC_SYSTEM);
		if (bp->bio_cmd == BIO_READ) {
			if (m->valid == VM_PAGE_BITS_ALL)
				rv = VM_PAGER_OK;
			else
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				vm_page_xunbusy(m);
				break;
			} else if (rv == VM_PAGER_FAIL) {
				/*
				 * Pager does not have the page.  Zero
				 * the allocated page, and mark it as
				 * valid. Do not set dirty, the page
				 * can be recreated if thrown out.
				 */
				pmap_zero_page(m);
				m->valid = VM_PAGE_BITS_ALL;
			}
			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
				pmap_copy_pages(&m, offs, bp->bio_ma,
				    ma_offs, len);
			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
				physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
				    vlist, ma_offs, len);
				cpu_flush_dcache(p, len);
			} else {
				physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
				cpu_flush_dcache(p, len);
			}
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (len == PAGE_SIZE || m->valid == VM_PAGE_BITS_ALL)
				rv = VM_PAGER_OK;
			else
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				vm_page_xunbusy(m);
				break;
			} else if (rv == VM_PAGER_FAIL)
				pmap_zero_page(m);

			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
				pmap_copy_pages(bp->bio_ma, ma_offs, &m,
				    offs, len);
			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
				physcopyin_vlist(vlist, ma_offs,
				    VM_PAGE_TO_PHYS(m) + offs, len);
			} else {
				physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
			}

			m->valid = VM_PAGE_BITS_ALL;
			vm_page_dirty(m);
			vm_pager_page_unswapped(m);
		} else if (bp->bio_cmd == BIO_DELETE) {
			if (len == PAGE_SIZE || m->valid == VM_PAGE_BITS_ALL)
				rv = VM_PAGER_OK;
			else
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				vm_page_xunbusy(m);
				break;
			} else if (rv == VM_PAGER_FAIL) {
				md_swap_page_free(m);
				m = NULL;
			} else {
				/* Page is valid. */
				if (len != PAGE_SIZE) {
					pmap_zero_page_area(m, offs, len);
					vm_page_dirty(m);
				}
				vm_pager_page_unswapped(m);
				if (len == PAGE_SIZE) {
					md_swap_page_free(m);
					m = NULL;
				}
			}
		}
		if (m != NULL) {
			vm_page_xunbusy(m);
			vm_page_lock(m);
			vm_page_activate(m);
			vm_page_unlock(m);
		}

		/* Actions on further pages start at offset 0 */
		p += PAGE_SIZE - offs;
		offs = 0;
		ma_offs += len;
	}
	vm_object_pip_subtract(sc->object, 1);
	VM_OBJECT_WUNLOCK(sc->object);
	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
}

static int
mdstart_null(struct md_s *sc, struct bio *bp)
{

	switch (bp->bio_cmd) {
	case BIO_READ:
		bzero(bp->bio_data, bp->bio_length);
		cpu_flush_dcache(bp->bio_data, bp->bio_length);
		break;
	case BIO_WRITE:
		break;
	}
	bp->bio_resid = 0;
	return (0);
}

static void
md_kthread(void *arg)
{
	struct md_s *sc;
	struct bio *bp;
	int error;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);
	if (sc->type == MD_VNODE)
		curthread->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		mtx_lock(&sc->queue_mtx);
		if (sc->flags & MD_SHUTDOWN) {
			sc->flags |= MD_EXITING;
			mtx_unlock(&sc->queue_mtx);
			kproc_exit(0);
		}
		bp = bioq_takefirst(&sc->bio_queue);
		if (!bp) {
			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
			continue;
		}
		mtx_unlock(&sc->queue_mtx);
		if (bp->bio_cmd == BIO_GETATTR) {
			if ((sc->fwsectors && sc->fwheads &&
			    (g_handleattr_int(bp, "GEOM::fwsectors",
			    sc->fwsectors) ||
			    g_handleattr_int(bp, "GEOM::fwheads",
			    sc->fwheads))) ||
			    g_handleattr_int(bp, "GEOM::candelete", 1))
				error = -1;
			else
				error = EOPNOTSUPP;
		} else {
			error = sc->start(sc, bp);
		}

		if (error != -1) {
			bp->bio_completed = bp->bio_length;
			if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE))
				devstat_end_transaction_bio(sc->devstat, bp);
			g_io_deliver(bp, error);
		}
	}
}

static struct md_s *
mdfind(int unit)
{
	struct md_s *sc;

	LIST_FOREACH(sc, &md_softc_list, list) {
		if (sc->unit == unit)
			break;
	}
	return (sc);
}

static struct md_s *
mdnew(int unit, int *errp, enum md_types type)
{
	struct md_s *sc;
	int error;

	*errp = 0;
	if (unit == -1)
		unit = alloc_unr(md_uh);
	else
		unit = alloc_unr_specific(md_uh, unit);

	if (unit == -1) {
		*errp = EBUSY;
		return (NULL);
	}

	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
	sc->type = type;
	bioq_init(&sc->bio_queue);
	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
	mtx_init(&sc->stat_mtx, "md stat", NULL, MTX_DEF);
	sc->unit = unit;
	sprintf(sc->name, "md%d", unit);
	LIST_INSERT_HEAD(&md_softc_list, sc, list);
	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
	if (error == 0)
		return (sc);
	LIST_REMOVE(sc, list);
	mtx_destroy(&sc->stat_mtx);
	mtx_destroy(&sc->queue_mtx);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	*errp = error;
	return (NULL);
}

static void
mdinit(struct md_s *sc)
{
	struct g_geom *gp;
	struct g_provider *pp;

	g_topology_lock();
	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
	gp->softc = sc;
	pp = g_new_providerf(gp, "md%d", sc->unit);
	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
	pp->mediasize = sc->mediasize;
	pp->sectorsize = sc->sectorsize;
	switch (sc->type) {
	case MD_MALLOC:
	case MD_VNODE:
	case MD_SWAP:
		pp->flags |= G_PF_ACCEPT_UNMAPPED;
		break;
	case MD_PRELOAD:
	case MD_NULL:
		break;
	}
	sc->gp = gp;
	sc->pp = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
}

static int
mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio)
{
	uintptr_t sp;
	int error;
	off_t u;

	error = 0;
	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
		return (EINVAL);
	if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize))
		return (EINVAL);
	/* Compression doesn't make sense if we have reserved space */
	if (mdio->md_options & MD_RESERVE)
		mdio->md_options &= ~MD_COMPRESS;
	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
	sc->indir = dimension(sc->mediasize / sc->sectorsize);
	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
	    0x1ff, 0);
	if (mdio->md_options & MD_RESERVE) {
		off_t nsectors;

		nsectors = sc->mediasize / sc->sectorsize;
		for (u = 0; u < nsectors; u++) {
			sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
			    M_WAITOK : M_NOWAIT) | M_ZERO);
			if (sp != 0)
				error = s_write(sc->indir, u, sp);
			else
				error = ENOMEM;
			if (error != 0)
				break;
		}
	}
	return (error);
}


static int
mdsetcred(struct md_s *sc, struct ucred *cred)
{
	char *tmpbuf;
	int error = 0;

	/*
	 * Set credentials in our softc
	 */

	if (sc->cred)
		crfree(sc->cred);
	sc->cred = crhold(cred);

	/*
	 * Horrible kludge to establish credentials for NFS  XXX.
	 */

	if (sc->vnode) {
		struct uio auio;
		struct iovec aiov;

		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
		bzero(&auio, sizeof(auio));

		aiov.iov_base = tmpbuf;
		aiov.iov_len = sc->sectorsize;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = 0;
		auio.uio_rw = UIO_READ;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_resid = aiov.iov_len;
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
		VOP_UNLOCK(sc->vnode, 0);
		free(tmpbuf, M_TEMP);
	}
	return (error);
}

static int
mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{
	struct vattr vattr;
	struct nameidata nd;
	char *fname;
	int error, flags;

	/*
	 * Kernel-originated requests must have the filename appended
	 * to the mdio structure to protect against malicious software.
	 */
	fname = mdio->md_file;
	if ((void *)fname != (void *)(mdio + 1)) {
		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
		if (error != 0)
			return (error);
	} else
		strlcpy(sc->file, fname, sizeof(sc->file));

	/*
	 * If the user specified that this is a read only device, don't
	 * set the FWRITE mask before trying to open the backing store.
	 */
	flags = FREAD | ((mdio->md_options & MD_READONLY) ? 0 : FWRITE);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (nd.ni_vp->v_type != VREG) {
		error = EINVAL;
		goto bad;
	}
	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
	if (error != 0)
		goto bad;
	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
		if (nd.ni_vp->v_iflag & VI_DOOMED) {
			/* Forced unmount. */
			error = EBADF;
			goto bad;
		}
	}
	nd.ni_vp->v_vflag |= VV_MD;
	VOP_UNLOCK(nd.ni_vp, 0);

	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
	if (!(flags & FWRITE))
		sc->flags |= MD_READONLY;
	sc->vnode = nd.ni_vp;

	error = mdsetcred(sc, td->td_ucred);
	if (error != 0) {
		sc->vnode = NULL;
		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
		nd.ni_vp->v_vflag &= ~VV_MD;
		goto bad;
	}
	return (0);
bad:
	VOP_UNLOCK(nd.ni_vp, 0);
	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
	return (error);
}

static int
mddestroy(struct md_s *sc, struct thread *td)
{

	if (sc->gp) {
		sc->gp->softc = NULL;
		g_topology_lock();
		g_wither_geom(sc->gp, ENXIO);
		g_topology_unlock();
		sc->gp = NULL;
		sc->pp = NULL;
	}
	if (sc->devstat) {
		devstat_remove_entry(sc->devstat);
		sc->devstat = NULL;
	}
	mtx_lock(&sc->queue_mtx);
	sc->flags |= MD_SHUTDOWN;
	wakeup(sc);
	while (!(sc->flags & MD_EXITING))
		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
	mtx_unlock(&sc->queue_mtx);
	mtx_destroy(&sc->stat_mtx);
	mtx_destroy(&sc->queue_mtx);
	if (sc->vnode != NULL) {
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		sc->vnode->v_vflag &= ~VV_MD;
		VOP_UNLOCK(sc->vnode, 0);
		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
		    FREAD : (FREAD|FWRITE), sc->cred, td);
	}
	if (sc->cred != NULL)
		crfree(sc->cred);
	if (sc->object != NULL)
		vm_object_deallocate(sc->object);
	if (sc->indir)
		destroy_indir(sc, sc->indir);
	if (sc->uma)
		uma_zdestroy(sc->uma);

	LIST_REMOVE(sc, list);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	return (0);
}

static int
mdresize(struct md_s *sc, struct md_ioctl *mdio)
{
	int error, res;
	vm_pindex_t oldpages, newpages;

	switch (sc->type) {
	case MD_VNODE:
	case MD_NULL:
		break;
	case MD_SWAP:
		if (mdio->md_mediasize <= 0 ||
		    (mdio->md_mediasize % PAGE_SIZE) != 0)
			return (EDOM);
		oldpages = OFF_TO_IDX(round_page(sc->mediasize));
		newpages = OFF_TO_IDX(round_page(mdio->md_mediasize));
		if (newpages < oldpages) {
			VM_OBJECT_WLOCK(sc->object);
			vm_object_page_remove(sc->object, newpages, 0, 0);
			swap_pager_freespace(sc->object, newpages,
			    oldpages - newpages);
			swap_release_by_cred(IDX_TO_OFF(oldpages -
			    newpages), sc->cred);
			sc->object->charge = IDX_TO_OFF(newpages);
			sc->object->size = newpages;
			VM_OBJECT_WUNLOCK(sc->object);
		} else if (newpages > oldpages) {
			res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
			    oldpages), sc->cred);
			if (!res)
				return (ENOMEM);
			if ((mdio->md_options & MD_RESERVE) ||
			    (sc->flags & MD_RESERVE)) {
				error = swap_pager_reserve(sc->object,
				    oldpages, newpages - oldpages);
				if (error < 0) {
					swap_release_by_cred(
					    IDX_TO_OFF(newpages - oldpages),
					    sc->cred);
					return (EDOM);
				}
			}
			VM_OBJECT_WLOCK(sc->object);
			sc->object->charge = IDX_TO_OFF(newpages);
			sc->object->size = newpages;
			VM_OBJECT_WUNLOCK(sc->object);
		}
		break;
	default:
		return (EOPNOTSUPP);
	}

	sc->mediasize = mdio->md_mediasize;
	g_topology_lock();
	g_resize_provider(sc->pp, sc->mediasize);
	g_topology_unlock();
	return (0);
}

static int
mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{
	vm_ooffset_t npage;
	int error;

	/*
	 * Range check.  Disallow negative sizes or any size less than the
	 * size of a page.  Then round to a page.
	 */
	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	/*
	 * Allocate an OBJT_SWAP object.
	 *
	 * Note the truncation.
	 */

	npage = mdio->md_mediasize / PAGE_SIZE;
	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
	    VM_PROT_DEFAULT, 0, td->td_ucred);
	if (sc->object == NULL)
		return (ENOMEM);
	sc->flags = mdio->md_options & (MD_FORCE | MD_RESERVE);
	if (mdio->md_options & MD_RESERVE) {
		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
			error = EDOM;
			goto finish;
		}
	}
	error = mdsetcred(sc, td->td_ucred);
finish:
	if (error != 0) {
		vm_object_deallocate(sc->object);
		sc->object = NULL;
	}
	return (error);
}

static int
mdcreate_null(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{

	/*
	 * Range check.  Disallow negative sizes or any size less than the
	 * size of a page.  Then round to a page.
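	 * (No rounding actually happens below: sizes that are not a
	 * multiple of PAGE_SIZE are rejected with EDOM.)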
	 */
	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	return (0);
}

static int
xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
	struct md_ioctl *mdio;
	struct md_s *sc;
	int error, i;
	unsigned sectsize;

	if (md_debug)
		printf("mdctlioctl(%s %lx %p %x %p)\n",
			devtoname(dev), cmd, addr, flags, td);

	mdio = (struct md_ioctl *)addr;
	if (mdio->md_version != MDIOVERSION)
		return (EINVAL);

	/*
	 * We assert the version number in the individual ioctl
	 * handlers instead of out here because (a) it is possible we
	 * may add another ioctl in the future which doesn't read an
	 * mdio, and (b) the correct return value for an unknown ioctl
	 * is ENOIOCTL, not EINVAL.
	 */
	error = 0;
	switch (cmd) {
	case MDIOCATTACH:
		switch (mdio->md_type) {
		case MD_MALLOC:
		case MD_PRELOAD:
		case MD_VNODE:
		case MD_SWAP:
		case MD_NULL:
			break;
		default:
			return (EINVAL);
		}
		if (mdio->md_sectorsize == 0)
			sectsize = DEV_BSIZE;
		else
			sectsize = mdio->md_sectorsize;
		if (sectsize > MAXPHYS || mdio->md_mediasize < sectsize)
			return (EINVAL);
		if (mdio->md_options & MD_AUTOUNIT)
			sc = mdnew(-1, &error, mdio->md_type);
		else {
			if (mdio->md_unit > INT_MAX)
				return (EINVAL);
			sc = mdnew(mdio->md_unit, &error, mdio->md_type);
		}
		if (sc == NULL)
			return (error);
		if (mdio->md_options & MD_AUTOUNIT)
			mdio->md_unit = sc->unit;
		sc->mediasize = mdio->md_mediasize;
		sc->sectorsize = sectsize;
		error = EDOOFUS;
		switch (sc->type) {
		case MD_MALLOC:
			sc->start = mdstart_malloc;
			error = mdcreate_malloc(sc, mdio);
			break;
		case MD_PRELOAD:
			/*
			 * We disallow attaching preloaded memory disks via
			 * ioctl.  Preloaded memory disks are automatically
			 * attached in g_md_init().
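			 * (g_md_init() scans the loader's preload
			 * metadata for modules of type "md_image" or
			 * "mfs_root" and attaches each one it finds.)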
			 */
			error = EOPNOTSUPP;
			break;
		case MD_VNODE:
			sc->start = mdstart_vnode;
			error = mdcreate_vnode(sc, mdio, td);
			break;
		case MD_SWAP:
			sc->start = mdstart_swap;
			error = mdcreate_swap(sc, mdio, td);
			break;
		case MD_NULL:
			sc->start = mdstart_null;
			error = mdcreate_null(sc, mdio, td);
			break;
		}
		if (error != 0) {
			mddestroy(sc, td);
			return (error);
		}

		/* Prune off any residual fractional sector */
		i = sc->mediasize % sc->sectorsize;
		sc->mediasize -= i;

		mdinit(sc);
		return (0);
	case MDIOCDETACH:
		if (mdio->md_mediasize != 0 ||
		    (mdio->md_options & ~MD_FORCE) != 0)
			return (EINVAL);

		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
		    !(mdio->md_options & MD_FORCE))
			return (EBUSY);
		return (mddestroy(sc, td));
	case MDIOCRESIZE:
		if ((mdio->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
			return (EINVAL);

		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		if (mdio->md_mediasize < sc->sectorsize)
			return (EINVAL);
		if (mdio->md_mediasize < sc->mediasize &&
		    !(sc->flags & MD_FORCE) &&
		    !(mdio->md_options & MD_FORCE))
			return (EBUSY);
		return (mdresize(sc, mdio));
	case MDIOCQUERY:
		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		mdio->md_type = sc->type;
		mdio->md_options = sc->flags;
		mdio->md_mediasize = sc->mediasize;
		mdio->md_sectorsize = sc->sectorsize;
		if (sc->type == MD_VNODE)
			error = copyout(sc->file, mdio->md_file,
			    strlen(sc->file) + 1);
		return (error);
	case MDIOCLIST:
		i = 1;
		LIST_FOREACH(sc, &md_softc_list, list) {
			if (i == MDNPAD - 1)
				mdio->md_pad[i] = -1;
			else
				mdio->md_pad[i++] = sc->unit;
		}
		mdio->md_pad[0] = i - 1;
		return (0);
	default:
		return (ENOIOCTL);
	};
}

static int
mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
	int error;

	sx_xlock(&md_sx);
	error = xmdctlioctl(dev, cmd, addr, flags, td);
	sx_xunlock(&md_sx);
	return (error);
}

static void
md_preloaded(u_char *image, size_t length, const char *name)
{
	struct md_s *sc;
	int error;

	sc = mdnew(-1, &error, MD_PRELOAD);
	if (sc == NULL)
		return;
	sc->mediasize = length;
	sc->sectorsize = DEV_BSIZE;
	sc->pl_ptr = image;
	sc->pl_len = length;
	sc->start = mdstart_preload;
#ifdef MD_ROOT
	if (sc->unit == 0)
		rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
#endif
	mdinit(sc);
	if (name != NULL) {
		printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
		    MD_NAME, sc->unit, name, length, image);
	}
}

static void
g_md_init(struct g_class *mp __unused)
{
	caddr_t mod;
	u_char *ptr, *name, *type;
	unsigned len;
	int i;

	/* figure out log2(NINDIR) */
	for (i = NINDIR, nshift = -1; i; nshift++)
		i >>= 1;

	mod = NULL;
	sx_init(&md_sx, "MD config lock");
	g_topology_unlock();
	md_uh = new_unrhdr(0, INT_MAX, NULL);
#ifdef MD_ROOT_SIZE
	sx_xlock(&md_sx);
	md_preloaded(mfs_root.start, sizeof(mfs_root.start), NULL);
	sx_xunlock(&md_sx);
#endif
	/* XXX: are preload_* static or do they need Giant ? */
	while ((mod = preload_search_next_name(mod)) != NULL) {
		name = (char *)preload_search_info(mod, MODINFO_NAME);
		if (name == NULL)
			continue;
		type = (char *)preload_search_info(mod, MODINFO_TYPE);
		if (type == NULL)
			continue;
		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
			continue;
		ptr = preload_fetch_addr(mod);
		len = preload_fetch_size(mod);
		if (ptr != NULL && len != 0) {
			sx_xlock(&md_sx);
			md_preloaded(ptr, len, name);
			sx_xunlock(&md_sx);
		}
	}
	md_vnode_pbuf_freecnt = nswbuf / 10;
	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
	    0600, MDCTL_NAME);
	g_topology_lock();
}

static void
g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp __unused, struct g_provider *pp)
{
	struct md_s *mp;
	char *type;

	mp = gp->softc;
	if (mp == NULL)
		return;

	switch (mp->type) {
	case MD_MALLOC:
		type = "malloc";
		break;
	case MD_PRELOAD:
		type = "preload";
		break;
	case MD_VNODE:
		type = "vnode";
		break;
	case MD_SWAP:
		type = "swap";
		break;
	case MD_NULL:
		type = "null";
		break;
	default:
		type = "unknown";
		break;
	}

	if (pp != NULL) {
		if (indent == NULL) {
			sbuf_printf(sb, " u %d", mp->unit);
			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
			sbuf_printf(sb, " t %s", type);
			if (mp->type == MD_VNODE && mp->vnode != NULL)
				sbuf_printf(sb, " file %s", mp->file);
		} else {
			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
			    mp->unit);
			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
			    indent, (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
			    indent, (uintmax_t) mp->fwheads);
			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
			    indent, (uintmax_t) mp->fwsectors);
			sbuf_printf(sb, "%s<length>%ju</length>\n",
			    indent, (uintmax_t) mp->mediasize);
			sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
			    (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
			sbuf_printf(sb, "%s<access>%s</access>\n", indent,
			    (mp->flags & MD_READONLY) == 0 ? "read-write":
			    "read-only");
			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
			    type);
			if (mp->type == MD_VNODE && mp->vnode != NULL) {
				sbuf_printf(sb, "%s<file>", indent);
				g_conf_printf_escaped(sb, "%s", mp->file);
				sbuf_printf(sb, "</file>\n");
			}
		}
	}
}

static void
g_md_fini(struct g_class *mp __unused)
{

	sx_destroy(&md_sx);
	if (status_dev != NULL)
		destroy_dev(status_dev);
	delete_unrhdr(md_uh);
}
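
/*
 * Userland reaches this driver through /dev/mdctl.  A minimal sketch of
 * the MDIOCATTACH path (error handling omitted, the 64 MB size is only
 * an example), showing roughly what mdconfig(8) does on the user's
 * behalf:
 *
 *	struct md_ioctl mdio;
 *	int fd;
 *
 *	memset(&mdio, 0, sizeof(mdio));
 *	mdio.md_version = MDIOVERSION;
 *	mdio.md_type = MD_SWAP;
 *	mdio.md_mediasize = 64 * 1024 * 1024;
 *	mdio.md_options = MD_AUTOUNIT;
 *	fd = open("/dev/" MDCTL_NAME, O_RDWR);
 *	ioctl(fd, MDIOCATTACH, &mdio);
 *
 * On success the assigned unit number comes back in mdio.md_unit.
 */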