1/* $NetBSD: kernel.c,v 1.5 2010/12/28 13:36:09 haad Exp $ */ 2 3/* 4 * CDDL HEADER START 5 * 6 * The contents of this file are subject to the terms of the 7 * Common Development and Distribution License (the "License"). 8 * You may not use this file except in compliance with the License. 9 * 10 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 11 * or http://www.opensolaris.org/os/licensing. 12 * See the License for the specific language governing permissions 13 * and limitations under the License. 14 * 15 * When distributing Covered Code, include this CDDL HEADER in each 16 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 17 * If applicable, add the following below this CDDL HEADER, with the 18 * fields enclosed by brackets "[]" replaced with your own identifying 19 * information: Portions Copyright [yyyy] [name of copyright owner] 20 * 21 * CDDL HEADER END 22 */ 23 24/* 25 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29#pragma ident "%Z%%M% %I% %E% SMI" 30 31#include <sys/cdefs.h> 32__RCSID("$NetBSD: kernel.c,v 1.5 2010/12/28 13:36:09 haad Exp $"); 33 34#include <sys/zfs_context.h> 35#include <sys/sysctl.h> 36#include <assert.h> 37#include <fcntl.h> 38#include <poll.h> 39#include <stdio.h> 40#include <stdlib.h> 41#include <errno.h> 42#include <string.h> 43#include <zlib.h> 44#include <sys/spa.h> 45#include <sys/stat.h> 46#include <sys/processor.h> 47#include <sys/zmod.h> 48#include <sys/utsname.h> 49 50/* 51 * Emulation of kernel services in userland. 52 */ 53 54#ifdef XXXNETBSD 55int hz = 119; /* frequency when using gethrtime() >> 23 for lbolt */ 56#endif 57int aok; 58uint64_t physmem; 59vnode_t *rootdir = (vnode_t *)0xabcd1234; 60char hw_serial[11]; 61size_t pgsize; 62 63struct utsname utsname = { 64 "userland" 65}; 66 67/* this only exists to have its address taken */ 68struct proc p0; 69 70/* 71 * ========================================================================= 72 * threads 73 * ========================================================================= 74 */ 75/*ARGSUSED*/ 76kthread_t * 77zk_thread_create(void (*func)(), void *arg) 78{ 79 thread_t tid; 80 81 VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED, 82 &tid) == 0); 83 84 return ((void *)(uintptr_t)tid); 85} 86 87/* 88 * ========================================================================= 89 * kstats 90 * ========================================================================= 91 */ 92/*ARGSUSED*/ 93kstat_t * 94kstat_create(char *module, int instance, char *name, char *class, 95 uchar_t type, ulong_t ndata, uchar_t ks_flag) 96{ 97 return (NULL); 98} 99 100/*ARGSUSED*/ 101void 102kstat_install(kstat_t *ksp) 103{} 104 105/*ARGSUSED*/ 106void 107kstat_delete(kstat_t *ksp) 108{} 109 110/* 111 * ========================================================================= 112 * vnode operations 113 * ========================================================================= 114 */ 115/* 116 * Note: for the xxxat() versions of these functions, we assume that the 117 * starting vp is always rootdir (which is true for spa_directory.c, the only 118 * ZFS consumer of these interfaces). We assert this is true, and then emulate 119 * them by adding '/' in front of the path. 120 */ 121 122/*ARGSUSED*/ 123int 124vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) 125{ 126 int fd; 127 vnode_t *vp; 128 int old_umask; 129 char realpath[MAXPATHLEN]; 130 struct stat64 st; 131 132 /* 133 * If we're accessing a real disk from userland, we need to use 134 * the character interface to avoid caching. This is particularly 135 * important if we're trying to look at a real in-kernel storage 136 * pool from userland, e.g. via zdb, because otherwise we won't 137 * see the changes occurring under the segmap cache. 138 * On the other hand, the stupid character device returns zero 139 * for its size. So -- gag -- we open the block device to get 140 * its size, and remember it for subsequent VOP_GETATTR(). 141 */ 142 if (strncmp(path, "/dev/", 5) == 0) { 143 char *dsk; 144 fd = open64(path, O_RDONLY); 145 if (fd == -1) 146 return (errno); 147 if (fstat64(fd, &st) == -1) { 148 close(fd); 149 return (errno); 150 } 151 close(fd); 152 (void) sprintf(realpath, "%s", path); 153 dsk = strstr(path, "/dsk/"); 154 if (dsk != NULL) 155 (void) sprintf(realpath + (dsk - path) + 1, "r%s", 156 dsk + 1); 157 } else { 158 (void) sprintf(realpath, "%s", path); 159 if (!(flags & FCREAT) && stat64(realpath, &st) == -1) 160 return (errno); 161 } 162 163 if (flags & FCREAT) 164 old_umask = umask(0); 165 166 /* 167 * The construct 'flags - FREAD' conveniently maps combinations of 168 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. 169 */ 170 fd = open64(realpath, flags - FREAD, mode); 171 172 if (flags & FCREAT) 173 (void) umask(old_umask); 174 175 if (fd == -1) 176 return (errno); 177 178 if (fstat64(fd, &st) == -1) { 179 close(fd); 180 return (errno); 181 } 182 183 (void) fcntl(fd, F_SETFD, FD_CLOEXEC); 184 185 *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); 186 187 vp->v_fd = fd; 188 if (S_ISCHR(st.st_mode)) { 189#ifdef XXXAD 190 ioctl(fd, DIOCGMEDIASIZE, &vp->v_size); 191#endif 192 } else 193 vp->v_size = st.st_size; 194 vp->v_path = spa_strdup(path); 195 196 return (0); 197} 198 199int 200vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, 201 int x3, vnode_t *startvp, int fd) 202{ 203 char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); 204 int ret; 205 206 ASSERT(startvp == rootdir); 207 (void) sprintf(realpath, "/%s", path); 208 209 ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); 210 211 umem_free(realpath, strlen(path) + 2); 212 213 return (ret); 214} 215 216int 217vn_getattr(vnode_t *vp, vattr_t *va) 218{ 219 int fd; 220 struct stat64 st; 221 222 fd = vp->v_fd; 223 224 if (fstat64(fd, &st) == -1) 225 return (errno); 226 227 vp->v_size = st.st_size; 228 va->va_size = st.st_size; 229 230 return 0; 231} 232 233 234/*ARGSUSED*/ 235int 236vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, 237 int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) 238{ 239 ssize_t iolen, split; 240 241 if (uio == UIO_READ) { 242 iolen = pread64(vp->v_fd, addr, len, offset); 243 } else { 244 /* 245 * To simulate partial disk writes, we split writes into two 246 * system calls so that the process can be killed in between. 247 */ 248 split = (len > 0 ? rand() % len : 0); 249 iolen = pwrite64(vp->v_fd, addr, split, offset); 250 iolen += pwrite64(vp->v_fd, (char *)addr + split, 251 len - split, offset + split); 252 } 253 254 if (iolen == -1) 255 return (errno); 256 if (residp) 257 *residp = len - iolen; 258 else if (iolen != len) 259 return (EIO); 260 return (0); 261} 262 263void 264vn_close(vnode_t *vp) 265{ 266 close(vp->v_fd); 267 spa_strfree(vp->v_path); 268 umem_free(vp, sizeof (vnode_t)); 269} 270 271#ifdef ZFS_DEBUG 272 273/* 274 * ========================================================================= 275 * Figure out which debugging statements to print 276 * ========================================================================= 277 */ 278 279static char *dprintf_string; 280static int dprintf_print_all; 281 282int 283dprintf_find_string(const char *string) 284{ 285 char *tmp_str = dprintf_string; 286 int len = strlen(string); 287 288 /* 289 * Find out if this is a string we want to print. 290 * String format: file1.c,function_name1,file2.c,file3.c 291 */ 292 293 while (tmp_str != NULL) { 294 if (strncmp(tmp_str, string, len) == 0 && 295 (tmp_str[len] == ',' || tmp_str[len] == '\0')) 296 return (1); 297 tmp_str = strchr(tmp_str, ','); 298 if (tmp_str != NULL) 299 tmp_str++; /* Get rid of , */ 300 } 301 return (0); 302} 303 304void 305dprintf_setup(int *argc, char **argv) 306{ 307 int i, j; 308 309 /* 310 * Debugging can be specified two ways: by setting the 311 * environment variable ZFS_DEBUG, or by including a 312 * "debug=..." argument on the command line. The command 313 * line setting overrides the environment variable. 314 */ 315 316 for (i = 1; i < *argc; i++) { 317 int len = strlen("debug="); 318 /* First look for a command line argument */ 319 if (strncmp("debug=", argv[i], len) == 0) { 320 dprintf_string = argv[i] + len; 321 /* Remove from args */ 322 for (j = i; j < *argc; j++) 323 argv[j] = argv[j+1]; 324 argv[j] = NULL; 325 (*argc)--; 326 } 327 } 328 329 if (dprintf_string == NULL) { 330 /* Look for ZFS_DEBUG environment variable */ 331 dprintf_string = getenv("ZFS_DEBUG"); 332 } 333 334 /* 335 * Are we just turning on all debugging? 336 */ 337 if (dprintf_find_string("on")) 338 dprintf_print_all = 1; 339} 340 341/* 342 * ========================================================================= 343 * debug printfs 344 * ========================================================================= 345 */ 346void 347__dprintf(const char *file, const char *func, int line, const char *fmt, ...) 348{ 349 const char *newfile; 350 va_list adx; 351 352 /* 353 * Get rid of annoying "../common/" prefix to filename. 354 */ 355 newfile = strrchr(file, '/'); 356 if (newfile != NULL) { 357 newfile = newfile + 1; /* Get rid of leading / */ 358 } else { 359 newfile = file; 360 } 361 362 if (dprintf_print_all || 363 dprintf_find_string(newfile) || 364 dprintf_find_string(func)) { 365 /* Print out just the function name if requested */ 366 flockfile(stdout); 367 if (dprintf_find_string("pid")) 368 (void) printf("%d ", getpid()); 369 if (dprintf_find_string("tid")) 370 (void) printf("%u ", thr_self()); 371#if 0 372 if (dprintf_find_string("cpu")) 373 (void) printf("%u ", getcpuid()); 374#endif 375 if (dprintf_find_string("time")) 376 (void) printf("%llu ", gethrtime()); 377 if (dprintf_find_string("long")) 378 (void) printf("%s, line %d: ", newfile, line); 379 (void) printf("%s: ", func); 380 va_start(adx, fmt); 381 (void) vprintf(fmt, adx); 382 va_end(adx); 383 funlockfile(stdout); 384 } 385} 386 387#endif /* ZFS_DEBUG */ 388 389/* 390 * ========================================================================= 391 * cmn_err() and panic() 392 * ========================================================================= 393 */ 394static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; 395static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; 396 397void 398vpanic(const char *fmt, va_list adx) 399{ 400 (void) fprintf(stderr, "error: "); 401 (void) vfprintf(stderr, fmt, adx); 402 (void) fprintf(stderr, "\n"); 403 404 abort(); /* think of it as a "user-level crash dump" */ 405} 406 407void 408panic(const char *fmt, ...) 409{ 410 va_list adx; 411 412 va_start(adx, fmt); 413 vpanic(fmt, adx); 414 va_end(adx); 415} 416 417void 418vcmn_err(int ce, const char *fmt, va_list adx) 419{ 420 if (ce == CE_PANIC) 421 vpanic(fmt, adx); 422 if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ 423 (void) fprintf(stderr, "%s", ce_prefix[ce]); 424 (void) vfprintf(stderr, fmt, adx); 425 (void) fprintf(stderr, "%s", ce_suffix[ce]); 426 } 427} 428 429/*PRINTFLIKE2*/ 430void 431cmn_err(int ce, const char *fmt, ...) 432{ 433 va_list adx; 434 435 va_start(adx, fmt); 436 vcmn_err(ce, fmt, adx); 437 va_end(adx); 438} 439 440/* 441 * ========================================================================= 442 * kobj interfaces 443 * ========================================================================= 444 */ 445struct _buf * 446kobj_open_file(char *name) 447{ 448 struct _buf *file; 449 vnode_t *vp; 450 451 /* set vp as the _fd field of the file */ 452 if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, 0) != 0) 453 return ((void *)-1UL); 454 455 file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); 456 file->_fd = (intptr_t)vp; 457 return (file); 458} 459 460int 461kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) 462{ 463 ssize_t resid; 464 465 vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, 466 UIO_SYSSPACE, 0, 0, 0, &resid); 467 468 return (size - resid); 469} 470 471void 472kobj_close_file(struct _buf *file) 473{ 474 vn_close((vnode_t *)file->_fd); 475 umem_free(file, sizeof (struct _buf)); 476} 477 478int 479kobj_get_filesize(struct _buf *file, uint64_t *size) 480{ 481 struct stat64 st; 482 vnode_t *vp = (vnode_t *)file->_fd; 483 484 if (fstat64(vp->v_fd, &st) == -1) { 485 vn_close(vp); 486 return (errno); 487 } 488 *size = st.st_size; 489 return (0); 490} 491 492/* 493 * ========================================================================= 494 * misc routines 495 * ========================================================================= 496 */ 497 498void 499delay(clock_t ticks) 500{ 501 poll(0, 0, ticks * (1000 / hz)); 502} 503 504#if 0 505/* 506 * Find highest one bit set. 507 * Returns bit number + 1 of highest bit that is set, otherwise returns 0. 508 * High order bit is 31 (or 63 in _LP64 kernel). 509 */ 510int 511highbit(ulong_t i) 512{ 513 register int h = 1; 514 515 if (i == 0) 516 return (0); 517#ifdef _LP64 518 if (i & 0xffffffff00000000ul) { 519 h += 32; i >>= 32; 520 } 521#endif 522 if (i & 0xffff0000) { 523 h += 16; i >>= 16; 524 } 525 if (i & 0xff00) { 526 h += 8; i >>= 8; 527 } 528 if (i & 0xf0) { 529 h += 4; i >>= 4; 530 } 531 if (i & 0xc) { 532 h += 2; i >>= 2; 533 } 534 if (i & 0x2) { 535 h += 1; 536 } 537 return (h); 538} 539#endif 540 541static int 542random_get_bytes_common(uint8_t *ptr, size_t len, char *devname) 543{ 544 int fd = open(devname, O_RDONLY); 545 size_t resid = len; 546 ssize_t bytes; 547 548 ASSERT(fd != -1); 549 550 while (resid != 0) { 551 bytes = read(fd, ptr, resid); 552 ASSERT(bytes >= 0); 553 ptr += bytes; 554 resid -= bytes; 555 } 556 557 close(fd); 558 559 return (0); 560} 561 562int 563random_get_bytes(uint8_t *ptr, size_t len) 564{ 565 return (random_get_bytes_common(ptr, len, "/dev/random")); 566} 567 568int 569random_get_pseudo_bytes(uint8_t *ptr, size_t len) 570{ 571 return (random_get_bytes_common(ptr, len, "/dev/urandom")); 572} 573 574int 575ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) 576{ 577 char *end; 578 579 *result = strtoul(hw_serial, &end, base); 580 if (*result == 0) 581 return (errno); 582 return (0); 583} 584 585/* 586 * ========================================================================= 587 * kernel emulation setup & teardown 588 * ========================================================================= 589 */ 590static int 591umem_out_of_memory(void) 592{ 593 char errmsg[] = "out of memory -- generating core dump\n"; 594 595 write(fileno(stderr), errmsg, sizeof (errmsg)); 596 abort(); 597 return (0); 598} 599 600void 601kernel_init(int mode) 602{ 603 umem_nofail_callback(umem_out_of_memory); 604 uint64_t physmem; 605 size_t len = sizeof(physmem); 606 static int mib[2] = { CTL_HW, HW_USERMEM64 }; 607 608 if (sysctl(mib, sizeof(mib), &physmem, &len, NULL, 0) != 0) { 609 len = 1048576 * 128; 610 } 611 612 pgsize = sysconf(_SC_PAGE_SIZE); 613 dprintf("physmem = %llu pages (%.2f GB)\n", 614 physmem / pgsize, (double)physmem / (1ULL << 30)); 615 616 snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid()); 617 618 system_taskq_init(); 619 620 spa_init(mode); 621} 622 623void 624kernel_fini(void) 625{ 626 spa_fini(); 627} 628 629int 630z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) 631{ 632 int ret; 633 uLongf len = *dstlen; 634 635 if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK) 636 *dstlen = (size_t)len; 637 638 return (ret); 639} 640 641int 642z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, 643 int level) 644{ 645 int ret; 646 uLongf len = *dstlen; 647 648 if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK) 649 *dstlen = (size_t)len; 650 651 return (ret); 652} 653 654uid_t 655crgetuid(cred_t *cr) 656{ 657 return (0); 658} 659 660gid_t 661crgetgid(cred_t *cr) 662{ 663 return (0); 664} 665 666int 667crgetngroups(cred_t *cr) 668{ 669 return (0); 670} 671 672gid_t * 673crgetgroups(cred_t *cr) 674{ 675 return (NULL); 676} 677 678int 679zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) 680{ 681 return (0); 682} 683 684int 685zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) 686{ 687 return (0); 688} 689 690int 691zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) 692{ 693 return (0); 694} 695 696ksiddomain_t * 697ksid_lookupdomain(const char *dom) 698{ 699 ksiddomain_t *kd; 700 701 kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); 702 kd->kd_name = spa_strdup(dom); 703 return (kd); 704} 705 706void 707ksiddomain_rele(ksiddomain_t *ksid) 708{ 709 spa_strfree(ksid->kd_name); 710 umem_free(ksid, sizeof (ksiddomain_t)); 711} 712 713size_t 714ptob(size_t npg) 715{ 716 717 return npg * pgsize; 718} 719 720void 721print_timestamp(int fmt) 722{ 723 724 return; 725} 726 727/* 728 * Do not change the length of the returned string; it must be freed 729 * with strfree(). 730 */ 731char * 732kmem_asprintf(const char *fmt, ...) 733{ 734 int size; 735 va_list adx; 736 char *buf; 737 738 va_start(adx, fmt); 739 size = vsnprintf(NULL, 0, fmt, adx) + 1; 740 va_end(adx); 741 742 buf = kmem_alloc(size, KM_SLEEP); 743 744 va_start(adx, fmt); 745 size = vsnprintf(buf, size, fmt, adx); 746 va_end(adx); 747 748 return (buf); 749} 750