ztest.c revision 219089
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25/* 26 * The objective of this program is to provide a DMU/ZAP/SPA stress test 27 * that runs entirely in userland, is easy to use, and easy to extend. 28 * 29 * The overall design of the ztest program is as follows: 30 * 31 * (1) For each major functional area (e.g. adding vdevs to a pool, 32 * creating and destroying datasets, reading and writing objects, etc) 33 * we have a simple routine to test that functionality. These 34 * individual routines do not have to do anything "stressful". 35 * 36 * (2) We turn these simple functionality tests into a stress test by 37 * running them all in parallel, with as many threads as desired, 38 * and spread across as many datasets, objects, and vdevs as desired. 39 * 40 * (3) While all this is happening, we inject faults into the pool to 41 * verify that self-healing data really works. 42 * 43 * (4) Every time we open a dataset, we change its checksum and compression 44 * functions. 
Thus even individual objects vary from block to block 45 * in which checksum they use and whether they're compressed. 46 * 47 * (5) To verify that we never lose on-disk consistency after a crash, 48 * we run the entire test in a child of the main process. 49 * At random times, the child self-immolates with a SIGKILL. 50 * This is the software equivalent of pulling the power cord. 51 * The parent then runs the test again, using the existing 52 * storage pool, as many times as desired. 53 * 54 * (6) To verify that we don't have future leaks or temporal incursions, 55 * many of the functional tests record the transaction group number 56 * as part of their data. When reading old data, they verify that 57 * the transaction group number is less than the current, open txg. 58 * If you add a new test, please do this if applicable. 59 * 60 * When run with no arguments, ztest runs for about five minutes and 61 * produces no output if successful. To get a little bit of information, 62 * specify -V. To get more information, specify -VV, and so on. 63 * 64 * To turn this into an overnight stress test, use -T to specify run time. 65 * 66 * You can ask for more vdevs [-v], datasets [-d], or threads [-t] 67 * to increase the pool capacity, fanout, and overall stress level. 68 * 69 * The -N(okill) option will suppress kills, so each child runs to completion. 70 * This can be useful when you're trying to distinguish temporal incursions 71 * from plain old race conditions. 
72 */ 73 74#include <sys/zfs_context.h> 75#include <sys/spa.h> 76#include <sys/dmu.h> 77#include <sys/txg.h> 78#include <sys/dbuf.h> 79#include <sys/zap.h> 80#include <sys/dmu_objset.h> 81#include <sys/poll.h> 82#include <sys/stat.h> 83#include <sys/time.h> 84#include <sys/wait.h> 85#include <sys/mman.h> 86#include <sys/resource.h> 87#include <sys/zio.h> 88#include <sys/zil.h> 89#include <sys/zil_impl.h> 90#include <sys/vdev_impl.h> 91#include <sys/vdev_file.h> 92#include <sys/spa_impl.h> 93#include <sys/metaslab_impl.h> 94#include <sys/dsl_prop.h> 95#include <sys/dsl_dataset.h> 96#include <sys/dsl_scan.h> 97#include <sys/zio_checksum.h> 98#include <sys/refcount.h> 99#include <stdio.h> 100#include <stdio_ext.h> 101#include <stdlib.h> 102#include <unistd.h> 103#include <signal.h> 104#include <umem.h> 105#include <dlfcn.h> 106#include <ctype.h> 107#include <math.h> 108#include <errno.h> 109#include <sys/fs/zfs.h> 110#include <libnvpair.h> 111 112static char cmdname[] = "ztest"; 113static char *zopt_pool = cmdname; 114static char *progname; 115 116static uint64_t zopt_vdevs = 5; 117static uint64_t zopt_vdevtime; 118static int zopt_ashift = SPA_MINBLOCKSHIFT; 119static int zopt_mirrors = 2; 120static int zopt_raidz = 4; 121static int zopt_raidz_parity = 1; 122static size_t zopt_vdev_size = SPA_MINDEVSIZE; 123static int zopt_datasets = 7; 124static int zopt_threads = 23; 125static uint64_t zopt_passtime = 60; /* 60 seconds */ 126static uint64_t zopt_killrate = 70; /* 70% kill rate */ 127static int zopt_verbose = 0; 128static int zopt_init = 1; 129static char *zopt_dir = "/tmp"; 130static uint64_t zopt_time = 300; /* 5 minutes */ 131static uint64_t zopt_maxloops = 50; /* max loops during spa_freeze() */ 132 133#define BT_MAGIC 0x123456789abcdefULL 134#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1) 135 136enum ztest_io_type { 137 ZTEST_IO_WRITE_TAG, 138 ZTEST_IO_WRITE_PATTERN, 139 ZTEST_IO_WRITE_ZEROES, 140 ZTEST_IO_TRUNCATE, 141 
ZTEST_IO_SETATTR, 142 ZTEST_IO_TYPES 143}; 144 145typedef struct ztest_block_tag { 146 uint64_t bt_magic; 147 uint64_t bt_objset; 148 uint64_t bt_object; 149 uint64_t bt_offset; 150 uint64_t bt_gen; 151 uint64_t bt_txg; 152 uint64_t bt_crtxg; 153} ztest_block_tag_t; 154 155typedef struct bufwad { 156 uint64_t bw_index; 157 uint64_t bw_txg; 158 uint64_t bw_data; 159} bufwad_t; 160 161/* 162 * XXX -- fix zfs range locks to be generic so we can use them here. 163 */ 164typedef enum { 165 RL_READER, 166 RL_WRITER, 167 RL_APPEND 168} rl_type_t; 169 170typedef struct rll { 171 void *rll_writer; 172 int rll_readers; 173 mutex_t rll_lock; 174 cond_t rll_cv; 175} rll_t; 176 177typedef struct rl { 178 uint64_t rl_object; 179 uint64_t rl_offset; 180 uint64_t rl_size; 181 rll_t *rl_lock; 182} rl_t; 183 184#define ZTEST_RANGE_LOCKS 64 185#define ZTEST_OBJECT_LOCKS 64 186 187/* 188 * Object descriptor. Used as a template for object lookup/create/remove. 189 */ 190typedef struct ztest_od { 191 uint64_t od_dir; 192 uint64_t od_object; 193 dmu_object_type_t od_type; 194 dmu_object_type_t od_crtype; 195 uint64_t od_blocksize; 196 uint64_t od_crblocksize; 197 uint64_t od_gen; 198 uint64_t od_crgen; 199 char od_name[MAXNAMELEN]; 200} ztest_od_t; 201 202/* 203 * Per-dataset state. 204 */ 205typedef struct ztest_ds { 206 objset_t *zd_os; 207 zilog_t *zd_zilog; 208 uint64_t zd_seq; 209 ztest_od_t *zd_od; /* debugging aid */ 210 char zd_name[MAXNAMELEN]; 211 mutex_t zd_dirobj_lock; 212 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; 213 rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; 214} ztest_ds_t; 215 216/* 217 * Per-iteration state. 
218 */ 219typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); 220 221typedef struct ztest_info { 222 ztest_func_t *zi_func; /* test function */ 223 uint64_t zi_iters; /* iterations per execution */ 224 uint64_t *zi_interval; /* execute every <interval> seconds */ 225 uint64_t zi_call_count; /* per-pass count */ 226 uint64_t zi_call_time; /* per-pass time */ 227 uint64_t zi_call_next; /* next time to call this function */ 228} ztest_info_t; 229 230/* 231 * Note: these aren't static because we want dladdr() to work. 232 */ 233ztest_func_t ztest_dmu_read_write; 234ztest_func_t ztest_dmu_write_parallel; 235ztest_func_t ztest_dmu_object_alloc_free; 236ztest_func_t ztest_dmu_commit_callbacks; 237ztest_func_t ztest_zap; 238ztest_func_t ztest_zap_parallel; 239ztest_func_t ztest_zil_commit; 240ztest_func_t ztest_dmu_read_write_zcopy; 241ztest_func_t ztest_dmu_objset_create_destroy; 242ztest_func_t ztest_dmu_prealloc; 243ztest_func_t ztest_fzap; 244ztest_func_t ztest_dmu_snapshot_create_destroy; 245ztest_func_t ztest_dsl_prop_get_set; 246ztest_func_t ztest_spa_prop_get_set; 247ztest_func_t ztest_spa_create_destroy; 248ztest_func_t ztest_fault_inject; 249ztest_func_t ztest_ddt_repair; 250ztest_func_t ztest_dmu_snapshot_hold; 251ztest_func_t ztest_spa_rename; 252ztest_func_t ztest_scrub; 253ztest_func_t ztest_dsl_dataset_promote_busy; 254ztest_func_t ztest_vdev_attach_detach; 255ztest_func_t ztest_vdev_LUN_growth; 256ztest_func_t ztest_vdev_add_remove; 257ztest_func_t ztest_vdev_aux_add_remove; 258ztest_func_t ztest_split_pool; 259 260uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 261uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ 262uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ 263uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ 264uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ 265 266ztest_info_t ztest_info[] = { 267 { ztest_dmu_read_write, 1, &zopt_always }, 268 { ztest_dmu_write_parallel, 10, 
&zopt_always }, 269 { ztest_dmu_object_alloc_free, 1, &zopt_always }, 270 { ztest_dmu_commit_callbacks, 1, &zopt_always }, 271 { ztest_zap, 30, &zopt_always }, 272 { ztest_zap_parallel, 100, &zopt_always }, 273 { ztest_split_pool, 1, &zopt_always }, 274 { ztest_zil_commit, 1, &zopt_incessant }, 275 { ztest_dmu_read_write_zcopy, 1, &zopt_often }, 276 { ztest_dmu_objset_create_destroy, 1, &zopt_often }, 277 { ztest_dsl_prop_get_set, 1, &zopt_often }, 278 { ztest_spa_prop_get_set, 1, &zopt_sometimes }, 279#if 0 280 { ztest_dmu_prealloc, 1, &zopt_sometimes }, 281#endif 282 { ztest_fzap, 1, &zopt_sometimes }, 283 { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, 284 { ztest_spa_create_destroy, 1, &zopt_sometimes }, 285 { ztest_fault_inject, 1, &zopt_sometimes }, 286 { ztest_ddt_repair, 1, &zopt_sometimes }, 287 { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, 288 { ztest_spa_rename, 1, &zopt_rarely }, 289 { ztest_scrub, 1, &zopt_rarely }, 290 { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, 291 { ztest_vdev_attach_detach, 1, &zopt_rarely }, 292 { ztest_vdev_LUN_growth, 1, &zopt_rarely }, 293 { ztest_vdev_add_remove, 1, &zopt_vdevtime }, 294 { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, 295}; 296 297#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) 298 299/* 300 * The following struct is used to hold a list of uncalled commit callbacks. 301 * The callbacks are ordered by txg number. 302 */ 303typedef struct ztest_cb_list { 304 mutex_t zcl_callbacks_lock; 305 list_t zcl_callbacks; 306} ztest_cb_list_t; 307 308/* 309 * Stuff we need to share writably between parent and child. 
310 */ 311typedef struct ztest_shared { 312 char *zs_pool; 313 spa_t *zs_spa; 314 hrtime_t zs_proc_start; 315 hrtime_t zs_proc_stop; 316 hrtime_t zs_thread_start; 317 hrtime_t zs_thread_stop; 318 hrtime_t zs_thread_kill; 319 uint64_t zs_enospc_count; 320 uint64_t zs_vdev_next_leaf; 321 uint64_t zs_vdev_aux; 322 uint64_t zs_alloc; 323 uint64_t zs_space; 324 mutex_t zs_vdev_lock; 325 rwlock_t zs_name_lock; 326 ztest_info_t zs_info[ZTEST_FUNCS]; 327 uint64_t zs_splits; 328 uint64_t zs_mirrors; 329 ztest_ds_t zs_zd[]; 330} ztest_shared_t; 331 332#define ID_PARALLEL -1ULL 333 334static char ztest_dev_template[] = "%s/%s.%llua"; 335static char ztest_aux_template[] = "%s/%s.%s.%llu"; 336ztest_shared_t *ztest_shared; 337uint64_t *ztest_seq; 338 339static int ztest_random_fd; 340static int ztest_dump_core = 1; 341 342static boolean_t ztest_exiting; 343 344/* Global commit callback list */ 345static ztest_cb_list_t zcl; 346 347extern uint64_t metaslab_gang_bang; 348extern uint64_t metaslab_df_alloc_threshold; 349static uint64_t metaslab_sz; 350 351enum ztest_object { 352 ZTEST_META_DNODE = 0, 353 ZTEST_DIROBJ, 354 ZTEST_OBJECTS 355}; 356 357static void usage(boolean_t) __NORETURN; 358 359/* 360 * These libumem hooks provide a reasonable set of defaults for the allocator's 361 * debugging facilities. 362 */ 363const char * 364_umem_debug_init() 365{ 366 return ("default,verbose"); /* $UMEM_DEBUG setting */ 367} 368 369const char * 370_umem_logging_init(void) 371{ 372 return ("fail,contents"); /* $UMEM_LOGGING setting */ 373} 374 375#define FATAL_MSG_SZ 1024 376 377char *fatal_msg; 378 379static void 380fatal(int do_perror, char *message, ...) 
381{ 382 va_list args; 383 int save_errno = errno; 384 char buf[FATAL_MSG_SZ]; 385 386 (void) fflush(stdout); 387 388 va_start(args, message); 389 (void) sprintf(buf, "ztest: "); 390 /* LINTED */ 391 (void) vsprintf(buf + strlen(buf), message, args); 392 va_end(args); 393 if (do_perror) { 394 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 395 ": %s", strerror(save_errno)); 396 } 397 (void) fprintf(stderr, "%s\n", buf); 398 fatal_msg = buf; /* to ease debugging */ 399 if (ztest_dump_core) 400 abort(); 401 exit(3); 402} 403 404static int 405str2shift(const char *buf) 406{ 407 const char *ends = "BKMGTPEZ"; 408 int i; 409 410 if (buf[0] == '\0') 411 return (0); 412 for (i = 0; i < strlen(ends); i++) { 413 if (toupper(buf[0]) == ends[i]) 414 break; 415 } 416 if (i == strlen(ends)) { 417 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 418 buf); 419 usage(B_FALSE); 420 } 421 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 422 return (10*i); 423 } 424 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 425 usage(B_FALSE); 426 /* NOTREACHED */ 427} 428 429static uint64_t 430nicenumtoull(const char *buf) 431{ 432 char *end; 433 uint64_t val; 434 435 val = strtoull(buf, &end, 0); 436 if (end == buf) { 437 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 438 usage(B_FALSE); 439 } else if (end[0] == '.') { 440 double fval = strtod(buf, &end); 441 fval *= pow(2, str2shift(end)); 442 if (fval > UINT64_MAX) { 443 (void) fprintf(stderr, "ztest: value too large: %s\n", 444 buf); 445 usage(B_FALSE); 446 } 447 val = (uint64_t)fval; 448 } else { 449 int shift = str2shift(end); 450 if (shift >= 64 || (val << shift) >> shift != val) { 451 (void) fprintf(stderr, "ztest: value too large: %s\n", 452 buf); 453 usage(B_FALSE); 454 } 455 val <<= shift; 456 } 457 return (val); 458} 459 460static void 461usage(boolean_t requested) 462{ 463 char nice_vdev_size[10]; 464 char nice_gang_bang[10]; 465 FILE *fp = requested ? 
stdout : stderr; 466 467 nicenum(zopt_vdev_size, nice_vdev_size); 468 nicenum(metaslab_gang_bang, nice_gang_bang); 469 470 (void) fprintf(fp, "Usage: %s\n" 471 "\t[-v vdevs (default: %llu)]\n" 472 "\t[-s size_of_each_vdev (default: %s)]\n" 473 "\t[-a alignment_shift (default: %d)] use 0 for random\n" 474 "\t[-m mirror_copies (default: %d)]\n" 475 "\t[-r raidz_disks (default: %d)]\n" 476 "\t[-R raidz_parity (default: %d)]\n" 477 "\t[-d datasets (default: %d)]\n" 478 "\t[-t threads (default: %d)]\n" 479 "\t[-g gang_block_threshold (default: %s)]\n" 480 "\t[-i init_count (default: %d)] initialize pool i times\n" 481 "\t[-k kill_percentage (default: %llu%%)]\n" 482 "\t[-p pool_name (default: %s)]\n" 483 "\t[-f dir (default: %s)] file directory for vdev files\n" 484 "\t[-V] verbose (use multiple times for ever more blather)\n" 485 "\t[-E] use existing pool instead of creating new one\n" 486 "\t[-T time (default: %llu sec)] total run time\n" 487 "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" 488 "\t[-P passtime (default: %llu sec)] time per pass\n" 489 "\t[-h] (print help)\n" 490 "", 491 cmdname, 492 (u_longlong_t)zopt_vdevs, /* -v */ 493 nice_vdev_size, /* -s */ 494 zopt_ashift, /* -a */ 495 zopt_mirrors, /* -m */ 496 zopt_raidz, /* -r */ 497 zopt_raidz_parity, /* -R */ 498 zopt_datasets, /* -d */ 499 zopt_threads, /* -t */ 500 nice_gang_bang, /* -g */ 501 zopt_init, /* -i */ 502 (u_longlong_t)zopt_killrate, /* -k */ 503 zopt_pool, /* -p */ 504 zopt_dir, /* -f */ 505 (u_longlong_t)zopt_time, /* -T */ 506 (u_longlong_t)zopt_maxloops, /* -F */ 507 (u_longlong_t)zopt_passtime); /* -P */ 508 exit(requested ? 0 : 1); 509} 510 511static void 512process_options(int argc, char **argv) 513{ 514 int opt; 515 uint64_t value; 516 517 /* Remember program name. 
*/ 518 progname = argv[0]; 519 520 /* By default, test gang blocks for blocks 32K and greater */ 521 metaslab_gang_bang = 32 << 10; 522 523 while ((opt = getopt(argc, argv, 524 "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:")) != EOF) { 525 value = 0; 526 switch (opt) { 527 case 'v': 528 case 's': 529 case 'a': 530 case 'm': 531 case 'r': 532 case 'R': 533 case 'd': 534 case 't': 535 case 'g': 536 case 'i': 537 case 'k': 538 case 'T': 539 case 'P': 540 case 'F': 541 value = nicenumtoull(optarg); 542 } 543 switch (opt) { 544 case 'v': 545 zopt_vdevs = value; 546 break; 547 case 's': 548 zopt_vdev_size = MAX(SPA_MINDEVSIZE, value); 549 break; 550 case 'a': 551 zopt_ashift = value; 552 break; 553 case 'm': 554 zopt_mirrors = value; 555 break; 556 case 'r': 557 zopt_raidz = MAX(1, value); 558 break; 559 case 'R': 560 zopt_raidz_parity = MIN(MAX(value, 1), 3); 561 break; 562 case 'd': 563 zopt_datasets = MAX(1, value); 564 break; 565 case 't': 566 zopt_threads = MAX(1, value); 567 break; 568 case 'g': 569 metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value); 570 break; 571 case 'i': 572 zopt_init = value; 573 break; 574 case 'k': 575 zopt_killrate = value; 576 break; 577 case 'p': 578 zopt_pool = strdup(optarg); 579 break; 580 case 'f': 581 zopt_dir = strdup(optarg); 582 break; 583 case 'V': 584 zopt_verbose++; 585 break; 586 case 'E': 587 zopt_init = 0; 588 break; 589 case 'T': 590 zopt_time = value; 591 break; 592 case 'P': 593 zopt_passtime = MAX(1, value); 594 break; 595 case 'F': 596 zopt_maxloops = MAX(1, value); 597 break; 598 case 'h': 599 usage(B_TRUE); 600 break; 601 case '?': 602 default: 603 usage(B_FALSE); 604 break; 605 } 606 } 607 608 zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1); 609 610 zopt_vdevtime = (zopt_vdevs > 0 ? 
zopt_time * NANOSEC / zopt_vdevs : 611 UINT64_MAX >> 2); 612} 613 614static void 615ztest_kill(ztest_shared_t *zs) 616{ 617 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa)); 618 zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa)); 619 (void) kill(getpid(), SIGKILL); 620} 621 622static uint64_t 623ztest_random(uint64_t range) 624{ 625 uint64_t r; 626 627 if (range == 0) 628 return (0); 629 630 if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r)) 631 fatal(1, "short read from /dev/urandom"); 632 633 return (r % range); 634} 635 636/* ARGSUSED */ 637static void 638ztest_record_enospc(const char *s) 639{ 640 ztest_shared->zs_enospc_count++; 641} 642 643static uint64_t 644ztest_get_ashift(void) 645{ 646 if (zopt_ashift == 0) 647 return (SPA_MINBLOCKSHIFT + ztest_random(3)); 648 return (zopt_ashift); 649} 650 651static nvlist_t * 652make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) 653{ 654 char pathbuf[MAXPATHLEN]; 655 uint64_t vdev; 656 nvlist_t *file; 657 658 if (ashift == 0) 659 ashift = ztest_get_ashift(); 660 661 if (path == NULL) { 662 path = pathbuf; 663 664 if (aux != NULL) { 665 vdev = ztest_shared->zs_vdev_aux; 666 (void) sprintf(path, ztest_aux_template, 667 zopt_dir, zopt_pool, aux, vdev); 668 } else { 669 vdev = ztest_shared->zs_vdev_next_leaf++; 670 (void) sprintf(path, ztest_dev_template, 671 zopt_dir, zopt_pool, vdev); 672 } 673 } 674 675 if (size != 0) { 676 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 677 if (fd == -1) 678 fatal(1, "can't open %s", path); 679 if (ftruncate(fd, size) != 0) 680 fatal(1, "can't ftruncate %s", path); 681 (void) close(fd); 682 } 683 684 VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); 685 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); 686 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); 687 VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); 688 689 return (file); 690} 691 692static nvlist_t * 
693make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r) 694{ 695 nvlist_t *raidz, **child; 696 int c; 697 698 if (r < 2) 699 return (make_vdev_file(path, aux, size, ashift)); 700 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 701 702 for (c = 0; c < r; c++) 703 child[c] = make_vdev_file(path, aux, size, ashift); 704 705 VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); 706 VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, 707 VDEV_TYPE_RAIDZ) == 0); 708 VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, 709 zopt_raidz_parity) == 0); 710 VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, 711 child, r) == 0); 712 713 for (c = 0; c < r; c++) 714 nvlist_free(child[c]); 715 716 umem_free(child, r * sizeof (nvlist_t *)); 717 718 return (raidz); 719} 720 721static nvlist_t * 722make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift, 723 int r, int m) 724{ 725 nvlist_t *mirror, **child; 726 int c; 727 728 if (m < 1) 729 return (make_vdev_raidz(path, aux, size, ashift, r)); 730 731 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 732 733 for (c = 0; c < m; c++) 734 child[c] = make_vdev_raidz(path, aux, size, ashift, r); 735 736 VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); 737 VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, 738 VDEV_TYPE_MIRROR) == 0); 739 VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 740 child, m) == 0); 741 742 for (c = 0; c < m; c++) 743 nvlist_free(child[c]); 744 745 umem_free(child, m * sizeof (nvlist_t *)); 746 747 return (mirror); 748} 749 750static nvlist_t * 751make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift, 752 int log, int r, int m, int t) 753{ 754 nvlist_t *root, **child; 755 int c; 756 757 ASSERT(t > 0); 758 759 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 760 761 for (c = 0; c < t; c++) { 762 child[c] = make_vdev_mirror(path, aux, size, ashift, r, m); 763 VERIFY(nvlist_add_uint64(child[c], 
ZPOOL_CONFIG_IS_LOG, 764 log) == 0); 765 } 766 767 VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); 768 VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); 769 VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 770 child, t) == 0); 771 772 for (c = 0; c < t; c++) 773 nvlist_free(child[c]); 774 775 umem_free(child, t * sizeof (nvlist_t *)); 776 777 return (root); 778} 779 780static int 781ztest_random_blocksize(void) 782{ 783 return (1 << (SPA_MINBLOCKSHIFT + 784 ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); 785} 786 787static int 788ztest_random_ibshift(void) 789{ 790 return (DN_MIN_INDBLKSHIFT + 791 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 792} 793 794static uint64_t 795ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 796{ 797 uint64_t top; 798 vdev_t *rvd = spa->spa_root_vdev; 799 vdev_t *tvd; 800 801 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 802 803 do { 804 top = ztest_random(rvd->vdev_children); 805 tvd = rvd->vdev_child[top]; 806 } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) || 807 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 808 809 return (top); 810} 811 812static uint64_t 813ztest_random_dsl_prop(zfs_prop_t prop) 814{ 815 uint64_t value; 816 817 do { 818 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 819 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 820 821 return (value); 822} 823 824static int 825ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 826 boolean_t inherit) 827{ 828 const char *propname = zfs_prop_to_name(prop); 829 const char *valname; 830 char setpoint[MAXPATHLEN]; 831 uint64_t curval; 832 int error; 833 834 error = dsl_prop_set(osname, propname, 835 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), 836 sizeof (value), 1, &value); 837 838 if (error == ENOSPC) { 839 ztest_record_enospc(FTAG); 840 return (error); 841 } 842 ASSERT3U(error, ==, 0); 843 844 VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval), 845 1, &curval, setpoint), ==, 0); 846 847 if (zopt_verbose >= 6) { 848 VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); 849 (void) printf("%s %s = %s at '%s'\n", 850 osname, propname, valname, setpoint); 851 } 852 853 return (error); 854} 855 856static int 857ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value) 858{ 859 spa_t *spa = zs->zs_spa; 860 nvlist_t *props = NULL; 861 int error; 862 863 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); 864 VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); 865 866 error = spa_prop_set(spa, props); 867 868 nvlist_free(props); 869 870 if (error == ENOSPC) { 871 ztest_record_enospc(FTAG); 872 return (error); 873 } 874 ASSERT3U(error, ==, 0); 875 876 return (error); 877} 878 879static void 880ztest_rll_init(rll_t *rll) 881{ 882 rll->rll_writer = NULL; 883 rll->rll_readers = 0; 884 VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0); 885 VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0); 886} 887 888static void 889ztest_rll_destroy(rll_t *rll) 890{ 891 ASSERT(rll->rll_writer == NULL); 892 ASSERT(rll->rll_readers == 0); 893 VERIFY(_mutex_destroy(&rll->rll_lock) == 0); 894 VERIFY(cond_destroy(&rll->rll_cv) == 0); 895} 896 897static void 898ztest_rll_lock(rll_t *rll, rl_type_t type) 899{ 900 VERIFY(mutex_lock(&rll->rll_lock) == 0); 901 902 if (type == RL_READER) { 903 while (rll->rll_writer != NULL) 904 (void) cond_wait(&rll->rll_cv, &rll->rll_lock); 905 rll->rll_readers++; 906 } else { 907 while (rll->rll_writer != NULL || rll->rll_readers) 908 (void) cond_wait(&rll->rll_cv, &rll->rll_lock); 909 rll->rll_writer = curthread; 910 } 911 912 VERIFY(mutex_unlock(&rll->rll_lock) == 0); 913} 914 915static void 
916ztest_rll_unlock(rll_t *rll) 917{ 918 VERIFY(mutex_lock(&rll->rll_lock) == 0); 919 920 if (rll->rll_writer) { 921 ASSERT(rll->rll_readers == 0); 922 rll->rll_writer = NULL; 923 } else { 924 ASSERT(rll->rll_readers != 0); 925 ASSERT(rll->rll_writer == NULL); 926 rll->rll_readers--; 927 } 928 929 if (rll->rll_writer == NULL && rll->rll_readers == 0) 930 VERIFY(cond_broadcast(&rll->rll_cv) == 0); 931 932 VERIFY(mutex_unlock(&rll->rll_lock) == 0); 933} 934 935static void 936ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 937{ 938 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 939 940 ztest_rll_lock(rll, type); 941} 942 943static void 944ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 945{ 946 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 947 948 ztest_rll_unlock(rll); 949} 950 951static rl_t * 952ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 953 uint64_t size, rl_type_t type) 954{ 955 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 956 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 957 rl_t *rl; 958 959 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 960 rl->rl_object = object; 961 rl->rl_offset = offset; 962 rl->rl_size = size; 963 rl->rl_lock = rll; 964 965 ztest_rll_lock(rll, type); 966 967 return (rl); 968} 969 970static void 971ztest_range_unlock(rl_t *rl) 972{ 973 rll_t *rll = rl->rl_lock; 974 975 ztest_rll_unlock(rll); 976 977 umem_free(rl, sizeof (*rl)); 978} 979 980static void 981ztest_zd_init(ztest_ds_t *zd, objset_t *os) 982{ 983 zd->zd_os = os; 984 zd->zd_zilog = dmu_objset_zil(os); 985 zd->zd_seq = 0; 986 dmu_objset_name(os, zd->zd_name); 987 988 VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0); 989 990 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) 991 ztest_rll_init(&zd->zd_object_lock[l]); 992 993 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) 994 ztest_rll_init(&zd->zd_range_lock[l]); 995} 996 997static void 
998ztest_zd_fini(ztest_ds_t *zd) 999{ 1000 VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0); 1001 1002 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1003 ztest_rll_destroy(&zd->zd_object_lock[l]); 1004 1005 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) 1006 ztest_rll_destroy(&zd->zd_range_lock[l]); 1007} 1008 1009#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) 1010 1011static uint64_t 1012ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1013{ 1014 uint64_t txg; 1015 int error; 1016 1017 /* 1018 * Attempt to assign tx to some transaction group. 1019 */ 1020 error = dmu_tx_assign(tx, txg_how); 1021 if (error) { 1022 if (error == ERESTART) { 1023 ASSERT(txg_how == TXG_NOWAIT); 1024 dmu_tx_wait(tx); 1025 } else { 1026 ASSERT3U(error, ==, ENOSPC); 1027 ztest_record_enospc(tag); 1028 } 1029 dmu_tx_abort(tx); 1030 return (0); 1031 } 1032 txg = dmu_tx_get_txg(tx); 1033 ASSERT(txg != 0); 1034 return (txg); 1035} 1036 1037static void 1038ztest_pattern_set(void *buf, uint64_t size, uint64_t value) 1039{ 1040 uint64_t *ip = buf; 1041 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); 1042 1043 while (ip < ip_end) 1044 *ip++ = value; 1045} 1046 1047static boolean_t 1048ztest_pattern_match(void *buf, uint64_t size, uint64_t value) 1049{ 1050 uint64_t *ip = buf; 1051 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); 1052 uint64_t diff = 0; 1053 1054 while (ip < ip_end) 1055 diff |= (value - *ip++); 1056 1057 return (diff == 0); 1058} 1059 1060static void 1061ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1062 uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) 1063{ 1064 bt->bt_magic = BT_MAGIC; 1065 bt->bt_objset = dmu_objset_id(os); 1066 bt->bt_object = object; 1067 bt->bt_offset = offset; 1068 bt->bt_gen = gen; 1069 bt->bt_txg = txg; 1070 bt->bt_crtxg = crtxg; 1071} 1072 1073static void 1074ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1075 uint64_t 
offset, uint64_t gen, uint64_t txg, uint64_t crtxg) 1076{ 1077 ASSERT(bt->bt_magic == BT_MAGIC); 1078 ASSERT(bt->bt_objset == dmu_objset_id(os)); 1079 ASSERT(bt->bt_object == object); 1080 ASSERT(bt->bt_offset == offset); 1081 ASSERT(bt->bt_gen <= gen); 1082 ASSERT(bt->bt_txg <= txg); 1083 ASSERT(bt->bt_crtxg == crtxg); 1084} 1085 1086static ztest_block_tag_t * 1087ztest_bt_bonus(dmu_buf_t *db) 1088{ 1089 dmu_object_info_t doi; 1090 ztest_block_tag_t *bt; 1091 1092 dmu_object_info_from_db(db, &doi); 1093 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1094 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1095 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1096 1097 return (bt); 1098} 1099 1100/* 1101 * ZIL logging ops 1102 */ 1103 1104#define lrz_type lr_mode 1105#define lrz_blocksize lr_uid 1106#define lrz_ibshift lr_gid 1107#define lrz_bonustype lr_rdev 1108#define lrz_bonuslen lr_crtime[1] 1109 1110static void 1111ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1112{ 1113 char *name = (void *)(lr + 1); /* name follows lr */ 1114 size_t namesize = strlen(name) + 1; 1115 itx_t *itx; 1116 1117 if (zil_replaying(zd->zd_zilog, tx)) 1118 return; 1119 1120 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1121 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1122 sizeof (*lr) + namesize - sizeof (lr_t)); 1123 1124 zil_itx_assign(zd->zd_zilog, itx, tx); 1125} 1126 1127static void 1128ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1129{ 1130 char *name = (void *)(lr + 1); /* name follows lr */ 1131 size_t namesize = strlen(name) + 1; 1132 itx_t *itx; 1133 1134 if (zil_replaying(zd->zd_zilog, tx)) 1135 return; 1136 1137 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1138 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1139 sizeof (*lr) + namesize - sizeof (lr_t)); 1140 1141 itx->itx_oid = object; 1142 zil_itx_assign(zd->zd_zilog, itx, tx); 1143} 1144 1145static void 
1146ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1147{ 1148 itx_t *itx; 1149 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1150 1151 if (zil_replaying(zd->zd_zilog, tx)) 1152 return; 1153 1154 if (lr->lr_length > ZIL_MAX_LOG_DATA) 1155 write_state = WR_INDIRECT; 1156 1157 itx = zil_itx_create(TX_WRITE, 1158 sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); 1159 1160 if (write_state == WR_COPIED && 1161 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1162 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1163 zil_itx_destroy(itx); 1164 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1165 write_state = WR_NEED_COPY; 1166 } 1167 itx->itx_private = zd; 1168 itx->itx_wr_state = write_state; 1169 itx->itx_sync = (ztest_random(8) == 0); 1170 itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0); 1171 1172 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1173 sizeof (*lr) - sizeof (lr_t)); 1174 1175 zil_itx_assign(zd->zd_zilog, itx, tx); 1176} 1177 1178static void 1179ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 1180{ 1181 itx_t *itx; 1182 1183 if (zil_replaying(zd->zd_zilog, tx)) 1184 return; 1185 1186 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 1187 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1188 sizeof (*lr) - sizeof (lr_t)); 1189 1190 itx->itx_sync = B_FALSE; 1191 zil_itx_assign(zd->zd_zilog, itx, tx); 1192} 1193 1194static void 1195ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 1196{ 1197 itx_t *itx; 1198 1199 if (zil_replaying(zd->zd_zilog, tx)) 1200 return; 1201 1202 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 1203 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1204 sizeof (*lr) - sizeof (lr_t)); 1205 1206 itx->itx_sync = B_FALSE; 1207 zil_itx_assign(zd->zd_zilog, itx, tx); 1208} 1209 1210/* 1211 * ZIL replay ops 1212 */ 1213static int 1214ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap) 1215{ 1216 char *name = (void 
*)(lr + 1); /* name follows lr */ 1217 objset_t *os = zd->zd_os; 1218 ztest_block_tag_t *bbt; 1219 dmu_buf_t *db; 1220 dmu_tx_t *tx; 1221 uint64_t txg; 1222 int error = 0; 1223 1224 if (byteswap) 1225 byteswap_uint64_array(lr, sizeof (*lr)); 1226 1227 ASSERT(lr->lr_doid == ZTEST_DIROBJ); 1228 ASSERT(name[0] != '\0'); 1229 1230 tx = dmu_tx_create(os); 1231 1232 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 1233 1234 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1235 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1236 } else { 1237 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1238 } 1239 1240 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1241 if (txg == 0) 1242 return (ENOSPC); 1243 1244 ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); 1245 1246 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1247 if (lr->lr_foid == 0) { 1248 lr->lr_foid = zap_create(os, 1249 lr->lrz_type, lr->lrz_bonustype, 1250 lr->lrz_bonuslen, tx); 1251 } else { 1252 error = zap_create_claim(os, lr->lr_foid, 1253 lr->lrz_type, lr->lrz_bonustype, 1254 lr->lrz_bonuslen, tx); 1255 } 1256 } else { 1257 if (lr->lr_foid == 0) { 1258 lr->lr_foid = dmu_object_alloc(os, 1259 lr->lrz_type, 0, lr->lrz_bonustype, 1260 lr->lrz_bonuslen, tx); 1261 } else { 1262 error = dmu_object_claim(os, lr->lr_foid, 1263 lr->lrz_type, 0, lr->lrz_bonustype, 1264 lr->lrz_bonuslen, tx); 1265 } 1266 } 1267 1268 if (error) { 1269 ASSERT3U(error, ==, EEXIST); 1270 ASSERT(zd->zd_zilog->zl_replay); 1271 dmu_tx_commit(tx); 1272 return (error); 1273 } 1274 1275 ASSERT(lr->lr_foid != 0); 1276 1277 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 1278 VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, 1279 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 1280 1281 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1282 bbt = ztest_bt_bonus(db); 1283 dmu_buf_will_dirty(db, tx); 1284 ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg); 1285 dmu_buf_rele(db, FTAG); 1286 1287 VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof 
(uint64_t), 1, 1288 &lr->lr_foid, tx)); 1289 1290 (void) ztest_log_create(zd, tx, lr); 1291 1292 dmu_tx_commit(tx); 1293 1294 return (0); 1295} 1296 1297static int 1298ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap) 1299{ 1300 char *name = (void *)(lr + 1); /* name follows lr */ 1301 objset_t *os = zd->zd_os; 1302 dmu_object_info_t doi; 1303 dmu_tx_t *tx; 1304 uint64_t object, txg; 1305 1306 if (byteswap) 1307 byteswap_uint64_array(lr, sizeof (*lr)); 1308 1309 ASSERT(lr->lr_doid == ZTEST_DIROBJ); 1310 ASSERT(name[0] != '\0'); 1311 1312 VERIFY3U(0, ==, 1313 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 1314 ASSERT(object != 0); 1315 1316 ztest_object_lock(zd, object, RL_WRITER); 1317 1318 VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); 1319 1320 tx = dmu_tx_create(os); 1321 1322 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 1323 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 1324 1325 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1326 if (txg == 0) { 1327 ztest_object_unlock(zd, object); 1328 return (ENOSPC); 1329 } 1330 1331 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 1332 VERIFY3U(0, ==, zap_destroy(os, object, tx)); 1333 } else { 1334 VERIFY3U(0, ==, dmu_object_free(os, object, tx)); 1335 } 1336 1337 VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); 1338 1339 (void) ztest_log_remove(zd, tx, lr, object); 1340 1341 dmu_tx_commit(tx); 1342 1343 ztest_object_unlock(zd, object); 1344 1345 return (0); 1346} 1347 1348static int 1349ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) 1350{ 1351 objset_t *os = zd->zd_os; 1352 void *data = lr + 1; /* data follows lr */ 1353 uint64_t offset, length; 1354 ztest_block_tag_t *bt = data; 1355 ztest_block_tag_t *bbt; 1356 uint64_t gen, txg, lrtxg, crtxg; 1357 dmu_object_info_t doi; 1358 dmu_tx_t *tx; 1359 dmu_buf_t *db; 1360 arc_buf_t *abuf = NULL; 1361 rl_t *rl; 1362 1363 if (byteswap) 1364 byteswap_uint64_array(lr, sizeof (*lr)); 1365 1366 offset = 
lr->lr_offset; 1367 length = lr->lr_length; 1368 1369 /* If it's a dmu_sync() block, write the whole block */ 1370 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 1371 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 1372 if (length < blocksize) { 1373 offset -= offset % blocksize; 1374 length = blocksize; 1375 } 1376 } 1377 1378 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 1379 byteswap_uint64_array(bt, sizeof (*bt)); 1380 1381 if (bt->bt_magic != BT_MAGIC) 1382 bt = NULL; 1383 1384 ztest_object_lock(zd, lr->lr_foid, RL_READER); 1385 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); 1386 1387 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1388 1389 dmu_object_info_from_db(db, &doi); 1390 1391 bbt = ztest_bt_bonus(db); 1392 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1393 gen = bbt->bt_gen; 1394 crtxg = bbt->bt_crtxg; 1395 lrtxg = lr->lr_common.lrc_txg; 1396 1397 tx = dmu_tx_create(os); 1398 1399 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 1400 1401 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 1402 P2PHASE(offset, length) == 0) 1403 abuf = dmu_request_arcbuf(db, length); 1404 1405 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1406 if (txg == 0) { 1407 if (abuf != NULL) 1408 dmu_return_arcbuf(abuf); 1409 dmu_buf_rele(db, FTAG); 1410 ztest_range_unlock(rl); 1411 ztest_object_unlock(zd, lr->lr_foid); 1412 return (ENOSPC); 1413 } 1414 1415 if (bt != NULL) { 1416 /* 1417 * Usually, verify the old data before writing new data -- 1418 * but not always, because we also want to verify correct 1419 * behavior when the data was not recently read into cache. 1420 */ 1421 ASSERT(offset % doi.doi_data_block_size == 0); 1422 if (ztest_random(4) != 0) { 1423 int prefetch = ztest_random(2) ? 
1424 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 1425 ztest_block_tag_t rbt; 1426 1427 VERIFY(dmu_read(os, lr->lr_foid, offset, 1428 sizeof (rbt), &rbt, prefetch) == 0); 1429 if (rbt.bt_magic == BT_MAGIC) { 1430 ztest_bt_verify(&rbt, os, lr->lr_foid, 1431 offset, gen, txg, crtxg); 1432 } 1433 } 1434 1435 /* 1436 * Writes can appear to be newer than the bonus buffer because 1437 * the ztest_get_data() callback does a dmu_read() of the 1438 * open-context data, which may be different than the data 1439 * as it was when the write was generated. 1440 */ 1441 if (zd->zd_zilog->zl_replay) { 1442 ztest_bt_verify(bt, os, lr->lr_foid, offset, 1443 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 1444 bt->bt_crtxg); 1445 } 1446 1447 /* 1448 * Set the bt's gen/txg to the bonus buffer's gen/txg 1449 * so that all of the usual ASSERTs will work. 1450 */ 1451 ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg); 1452 } 1453 1454 if (abuf == NULL) { 1455 dmu_write(os, lr->lr_foid, offset, length, data, tx); 1456 } else { 1457 bcopy(data, abuf->b_data, length); 1458 dmu_assign_arcbuf(db, offset, abuf, tx); 1459 } 1460 1461 (void) ztest_log_write(zd, tx, lr); 1462 1463 dmu_buf_rele(db, FTAG); 1464 1465 dmu_tx_commit(tx); 1466 1467 ztest_range_unlock(rl); 1468 ztest_object_unlock(zd, lr->lr_foid); 1469 1470 return (0); 1471} 1472 1473static int 1474ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap) 1475{ 1476 objset_t *os = zd->zd_os; 1477 dmu_tx_t *tx; 1478 uint64_t txg; 1479 rl_t *rl; 1480 1481 if (byteswap) 1482 byteswap_uint64_array(lr, sizeof (*lr)); 1483 1484 ztest_object_lock(zd, lr->lr_foid, RL_READER); 1485 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 1486 RL_WRITER); 1487 1488 tx = dmu_tx_create(os); 1489 1490 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 1491 1492 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1493 if (txg == 0) { 1494 ztest_range_unlock(rl); 1495 ztest_object_unlock(zd, lr->lr_foid); 1496 
return (ENOSPC); 1497 } 1498 1499 VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 1500 lr->lr_length, tx) == 0); 1501 1502 (void) ztest_log_truncate(zd, tx, lr); 1503 1504 dmu_tx_commit(tx); 1505 1506 ztest_range_unlock(rl); 1507 ztest_object_unlock(zd, lr->lr_foid); 1508 1509 return (0); 1510} 1511 1512static int 1513ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap) 1514{ 1515 objset_t *os = zd->zd_os; 1516 dmu_tx_t *tx; 1517 dmu_buf_t *db; 1518 ztest_block_tag_t *bbt; 1519 uint64_t txg, lrtxg, crtxg; 1520 1521 if (byteswap) 1522 byteswap_uint64_array(lr, sizeof (*lr)); 1523 1524 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 1525 1526 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1527 1528 tx = dmu_tx_create(os); 1529 dmu_tx_hold_bonus(tx, lr->lr_foid); 1530 1531 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1532 if (txg == 0) { 1533 dmu_buf_rele(db, FTAG); 1534 ztest_object_unlock(zd, lr->lr_foid); 1535 return (ENOSPC); 1536 } 1537 1538 bbt = ztest_bt_bonus(db); 1539 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1540 crtxg = bbt->bt_crtxg; 1541 lrtxg = lr->lr_common.lrc_txg; 1542 1543 if (zd->zd_zilog->zl_replay) { 1544 ASSERT(lr->lr_size != 0); 1545 ASSERT(lr->lr_mode != 0); 1546 ASSERT(lrtxg != 0); 1547 } else { 1548 /* 1549 * Randomly change the size and increment the generation. 1550 */ 1551 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 1552 sizeof (*bbt); 1553 lr->lr_mode = bbt->bt_gen + 1; 1554 ASSERT(lrtxg == 0); 1555 } 1556 1557 /* 1558 * Verify that the current bonus buffer is not newer than our txg. 
1559 */ 1560 ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, 1561 MAX(txg, lrtxg), crtxg); 1562 1563 dmu_buf_will_dirty(db, tx); 1564 1565 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 1566 ASSERT3U(lr->lr_size, <=, db->db_size); 1567 VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0); 1568 bbt = ztest_bt_bonus(db); 1569 1570 ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg); 1571 1572 dmu_buf_rele(db, FTAG); 1573 1574 (void) ztest_log_setattr(zd, tx, lr); 1575 1576 dmu_tx_commit(tx); 1577 1578 ztest_object_unlock(zd, lr->lr_foid); 1579 1580 return (0); 1581} 1582 1583zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 1584 NULL, /* 0 no such transaction type */ 1585 ztest_replay_create, /* TX_CREATE */ 1586 NULL, /* TX_MKDIR */ 1587 NULL, /* TX_MKXATTR */ 1588 NULL, /* TX_SYMLINK */ 1589 ztest_replay_remove, /* TX_REMOVE */ 1590 NULL, /* TX_RMDIR */ 1591 NULL, /* TX_LINK */ 1592 NULL, /* TX_RENAME */ 1593 ztest_replay_write, /* TX_WRITE */ 1594 ztest_replay_truncate, /* TX_TRUNCATE */ 1595 ztest_replay_setattr, /* TX_SETATTR */ 1596 NULL, /* TX_ACL */ 1597 NULL, /* TX_CREATE_ACL */ 1598 NULL, /* TX_CREATE_ATTR */ 1599 NULL, /* TX_CREATE_ACL_ATTR */ 1600 NULL, /* TX_MKDIR_ACL */ 1601 NULL, /* TX_MKDIR_ATTR */ 1602 NULL, /* TX_MKDIR_ACL_ATTR */ 1603 NULL, /* TX_WRITE2 */ 1604}; 1605 1606/* 1607 * ZIL get_data callbacks 1608 */ 1609 1610static void 1611ztest_get_done(zgd_t *zgd, int error) 1612{ 1613 ztest_ds_t *zd = zgd->zgd_private; 1614 uint64_t object = zgd->zgd_rl->rl_object; 1615 1616 if (zgd->zgd_db) 1617 dmu_buf_rele(zgd->zgd_db, zgd); 1618 1619 ztest_range_unlock(zgd->zgd_rl); 1620 ztest_object_unlock(zd, object); 1621 1622 if (error == 0 && zgd->zgd_bp) 1623 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 1624 1625 umem_free(zgd, sizeof (*zgd)); 1626} 1627 1628static int 1629ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 1630{ 1631 ztest_ds_t *zd = arg; 1632 objset_t *os = zd->zd_os; 1633 uint64_t object = 
lr->lr_foid; 1634 uint64_t offset = lr->lr_offset; 1635 uint64_t size = lr->lr_length; 1636 blkptr_t *bp = &lr->lr_blkptr; 1637 uint64_t txg = lr->lr_common.lrc_txg; 1638 uint64_t crtxg; 1639 dmu_object_info_t doi; 1640 dmu_buf_t *db; 1641 zgd_t *zgd; 1642 int error; 1643 1644 ztest_object_lock(zd, object, RL_READER); 1645 error = dmu_bonus_hold(os, object, FTAG, &db); 1646 if (error) { 1647 ztest_object_unlock(zd, object); 1648 return (error); 1649 } 1650 1651 crtxg = ztest_bt_bonus(db)->bt_crtxg; 1652 1653 if (crtxg == 0 || crtxg > txg) { 1654 dmu_buf_rele(db, FTAG); 1655 ztest_object_unlock(zd, object); 1656 return (ENOENT); 1657 } 1658 1659 dmu_object_info_from_db(db, &doi); 1660 dmu_buf_rele(db, FTAG); 1661 db = NULL; 1662 1663 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 1664 zgd->zgd_zilog = zd->zd_zilog; 1665 zgd->zgd_private = zd; 1666 1667 if (buf != NULL) { /* immediate write */ 1668 zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, 1669 RL_READER); 1670 1671 error = dmu_read(os, object, offset, size, buf, 1672 DMU_READ_NO_PREFETCH); 1673 ASSERT(error == 0); 1674 } else { 1675 size = doi.doi_data_block_size; 1676 if (ISP2(size)) { 1677 offset = P2ALIGN(offset, size); 1678 } else { 1679 ASSERT(offset < size); 1680 offset = 0; 1681 } 1682 1683 zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, 1684 RL_READER); 1685 1686 error = dmu_buf_hold(os, object, offset, zgd, &db, 1687 DMU_READ_NO_PREFETCH); 1688 1689 if (error == 0) { 1690 zgd->zgd_db = db; 1691 zgd->zgd_bp = bp; 1692 1693 ASSERT(db->db_offset == offset); 1694 ASSERT(db->db_size == size); 1695 1696 error = dmu_sync(zio, lr->lr_common.lrc_txg, 1697 ztest_get_done, zgd); 1698 1699 if (error == 0) 1700 return (0); 1701 } 1702 } 1703 1704 ztest_get_done(zgd, error); 1705 1706 return (error); 1707} 1708 1709static void * 1710ztest_lr_alloc(size_t lrsize, char *name) 1711{ 1712 char *lr; 1713 size_t namesize = name ? 
strlen(name) + 1 : 0; 1714 1715 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 1716 1717 if (name) 1718 bcopy(name, lr + lrsize, namesize); 1719 1720 return (lr); 1721} 1722 1723void 1724ztest_lr_free(void *lr, size_t lrsize, char *name) 1725{ 1726 size_t namesize = name ? strlen(name) + 1 : 0; 1727 1728 umem_free(lr, lrsize + namesize); 1729} 1730 1731/* 1732 * Lookup a bunch of objects. Returns the number of objects not found. 1733 */ 1734static int 1735ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 1736{ 1737 int missing = 0; 1738 int error; 1739 1740 ASSERT(_mutex_held(&zd->zd_dirobj_lock)); 1741 1742 for (int i = 0; i < count; i++, od++) { 1743 od->od_object = 0; 1744 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 1745 sizeof (uint64_t), 1, &od->od_object); 1746 if (error) { 1747 ASSERT(error == ENOENT); 1748 ASSERT(od->od_object == 0); 1749 missing++; 1750 } else { 1751 dmu_buf_t *db; 1752 ztest_block_tag_t *bbt; 1753 dmu_object_info_t doi; 1754 1755 ASSERT(od->od_object != 0); 1756 ASSERT(missing == 0); /* there should be no gaps */ 1757 1758 ztest_object_lock(zd, od->od_object, RL_READER); 1759 VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, 1760 od->od_object, FTAG, &db)); 1761 dmu_object_info_from_db(db, &doi); 1762 bbt = ztest_bt_bonus(db); 1763 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1764 od->od_type = doi.doi_type; 1765 od->od_blocksize = doi.doi_data_block_size; 1766 od->od_gen = bbt->bt_gen; 1767 dmu_buf_rele(db, FTAG); 1768 ztest_object_unlock(zd, od->od_object); 1769 } 1770 } 1771 1772 return (missing); 1773} 1774 1775static int 1776ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 1777{ 1778 int missing = 0; 1779 1780 ASSERT(_mutex_held(&zd->zd_dirobj_lock)); 1781 1782 for (int i = 0; i < count; i++, od++) { 1783 if (missing) { 1784 od->od_object = 0; 1785 missing++; 1786 continue; 1787 } 1788 1789 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 1790 1791 lr->lr_doid = od->od_dir; 1792 lr->lr_foid = 0; /* 0 to 
allocate, > 0 to claim */ 1793 lr->lrz_type = od->od_crtype; 1794 lr->lrz_blocksize = od->od_crblocksize; 1795 lr->lrz_ibshift = ztest_random_ibshift(); 1796 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 1797 lr->lrz_bonuslen = dmu_bonus_max(); 1798 lr->lr_gen = od->od_crgen; 1799 lr->lr_crtime[0] = time(NULL); 1800 1801 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 1802 ASSERT(missing == 0); 1803 od->od_object = 0; 1804 missing++; 1805 } else { 1806 od->od_object = lr->lr_foid; 1807 od->od_type = od->od_crtype; 1808 od->od_blocksize = od->od_crblocksize; 1809 od->od_gen = od->od_crgen; 1810 ASSERT(od->od_object != 0); 1811 } 1812 1813 ztest_lr_free(lr, sizeof (*lr), od->od_name); 1814 } 1815 1816 return (missing); 1817} 1818 1819static int 1820ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 1821{ 1822 int missing = 0; 1823 int error; 1824 1825 ASSERT(_mutex_held(&zd->zd_dirobj_lock)); 1826 1827 od += count - 1; 1828 1829 for (int i = count - 1; i >= 0; i--, od--) { 1830 if (missing) { 1831 missing++; 1832 continue; 1833 } 1834 1835 if (od->od_object == 0) 1836 continue; 1837 1838 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 1839 1840 lr->lr_doid = od->od_dir; 1841 1842 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 1843 ASSERT3U(error, ==, ENOSPC); 1844 missing++; 1845 } else { 1846 od->od_object = 0; 1847 } 1848 ztest_lr_free(lr, sizeof (*lr), od->od_name); 1849 } 1850 1851 return (missing); 1852} 1853 1854static int 1855ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 1856 void *data) 1857{ 1858 lr_write_t *lr; 1859 int error; 1860 1861 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 1862 1863 lr->lr_foid = object; 1864 lr->lr_offset = offset; 1865 lr->lr_length = size; 1866 lr->lr_blkoff = 0; 1867 BP_ZERO(&lr->lr_blkptr); 1868 1869 bcopy(data, lr + 1, size); 1870 1871 error = ztest_replay_write(zd, lr, B_FALSE); 1872 1873 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 1874 1875 return (error); 1876} 
1877 1878static int 1879ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 1880{ 1881 lr_truncate_t *lr; 1882 int error; 1883 1884 lr = ztest_lr_alloc(sizeof (*lr), NULL); 1885 1886 lr->lr_foid = object; 1887 lr->lr_offset = offset; 1888 lr->lr_length = size; 1889 1890 error = ztest_replay_truncate(zd, lr, B_FALSE); 1891 1892 ztest_lr_free(lr, sizeof (*lr), NULL); 1893 1894 return (error); 1895} 1896 1897static int 1898ztest_setattr(ztest_ds_t *zd, uint64_t object) 1899{ 1900 lr_setattr_t *lr; 1901 int error; 1902 1903 lr = ztest_lr_alloc(sizeof (*lr), NULL); 1904 1905 lr->lr_foid = object; 1906 lr->lr_size = 0; 1907 lr->lr_mode = 0; 1908 1909 error = ztest_replay_setattr(zd, lr, B_FALSE); 1910 1911 ztest_lr_free(lr, sizeof (*lr), NULL); 1912 1913 return (error); 1914} 1915 1916static void 1917ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 1918{ 1919 objset_t *os = zd->zd_os; 1920 dmu_tx_t *tx; 1921 uint64_t txg; 1922 rl_t *rl; 1923 1924 txg_wait_synced(dmu_objset_pool(os), 0); 1925 1926 ztest_object_lock(zd, object, RL_READER); 1927 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 1928 1929 tx = dmu_tx_create(os); 1930 1931 dmu_tx_hold_write(tx, object, offset, size); 1932 1933 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1934 1935 if (txg != 0) { 1936 dmu_prealloc(os, object, offset, size, tx); 1937 dmu_tx_commit(tx); 1938 txg_wait_synced(dmu_objset_pool(os), txg); 1939 } else { 1940 (void) dmu_free_long_range(os, object, offset, size); 1941 } 1942 1943 ztest_range_unlock(rl); 1944 ztest_object_unlock(zd, object); 1945} 1946 1947static void 1948ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 1949{ 1950 ztest_block_tag_t wbt; 1951 dmu_object_info_t doi; 1952 enum ztest_io_type io_type; 1953 uint64_t blocksize; 1954 void *data; 1955 1956 VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); 1957 blocksize = doi.doi_data_block_size; 1958 data = umem_alloc(blocksize, UMEM_NOFAIL); 1959 
1960 /* 1961 * Pick an i/o type at random, biased toward writing block tags. 1962 */ 1963 io_type = ztest_random(ZTEST_IO_TYPES); 1964 if (ztest_random(2) == 0) 1965 io_type = ZTEST_IO_WRITE_TAG; 1966 1967 switch (io_type) { 1968 1969 case ZTEST_IO_WRITE_TAG: 1970 ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0); 1971 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 1972 break; 1973 1974 case ZTEST_IO_WRITE_PATTERN: 1975 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 1976 if (ztest_random(2) == 0) { 1977 /* 1978 * Induce fletcher2 collisions to ensure that 1979 * zio_ddt_collision() detects and resolves them 1980 * when using fletcher2-verify for deduplication. 1981 */ 1982 ((uint64_t *)data)[0] ^= 1ULL << 63; 1983 ((uint64_t *)data)[4] ^= 1ULL << 63; 1984 } 1985 (void) ztest_write(zd, object, offset, blocksize, data); 1986 break; 1987 1988 case ZTEST_IO_WRITE_ZEROES: 1989 bzero(data, blocksize); 1990 (void) ztest_write(zd, object, offset, blocksize, data); 1991 break; 1992 1993 case ZTEST_IO_TRUNCATE: 1994 (void) ztest_truncate(zd, object, offset, blocksize); 1995 break; 1996 1997 case ZTEST_IO_SETATTR: 1998 (void) ztest_setattr(zd, object); 1999 break; 2000 } 2001 2002 umem_free(data, blocksize); 2003} 2004 2005/* 2006 * Initialize an object description template. 2007 */ 2008static void 2009ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, 2010 dmu_object_type_t type, uint64_t blocksize, uint64_t gen) 2011{ 2012 od->od_dir = ZTEST_DIROBJ; 2013 od->od_object = 0; 2014 2015 od->od_crtype = type; 2016 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2017 od->od_crgen = gen; 2018 2019 od->od_type = DMU_OT_NONE; 2020 od->od_blocksize = 0; 2021 od->od_gen = 0; 2022 2023 (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", 2024 tag, (int64_t)id, index); 2025} 2026 2027/* 2028 * Lookup or create the objects for a test using the od template. 
2029 * If the objects do not all exist, or if 'remove' is specified, 2030 * remove any existing objects and create new ones. Otherwise, 2031 * use the existing objects. 2032 */ 2033static int 2034ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2035{ 2036 int count = size / sizeof (*od); 2037 int rv = 0; 2038 2039 VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0); 2040 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2041 (ztest_remove(zd, od, count) != 0 || 2042 ztest_create(zd, od, count) != 0)) 2043 rv = -1; 2044 zd->zd_od = od; 2045 VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0); 2046 2047 return (rv); 2048} 2049 2050/* ARGSUSED */ 2051void 2052ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2053{ 2054 zilog_t *zilog = zd->zd_zilog; 2055 2056 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2057 2058 /* 2059 * Remember the committed values in zd, which is in parent/child 2060 * shared memory. If we die, the next iteration of ztest_run() 2061 * will verify that the log really does contain this record. 2062 */ 2063 mutex_enter(&zilog->zl_lock); 2064 ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq); 2065 zd->zd_seq = zilog->zl_commit_lr_seq; 2066 mutex_exit(&zilog->zl_lock); 2067} 2068 2069/* 2070 * Verify that we can't destroy an active pool, create an existing pool, 2071 * or create a pool with a bad vdev spec. 2072 */ 2073/* ARGSUSED */ 2074void 2075ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2076{ 2077 ztest_shared_t *zs = ztest_shared; 2078 spa_t *spa; 2079 nvlist_t *nvroot; 2080 2081 /* 2082 * Attempt to create using a bad file. 2083 */ 2084 nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); 2085 VERIFY3U(ENOENT, ==, 2086 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 2087 nvlist_free(nvroot); 2088 2089 /* 2090 * Attempt to create using a bad mirror. 
2091 */ 2092 nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); 2093 VERIFY3U(ENOENT, ==, 2094 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 2095 nvlist_free(nvroot); 2096 2097 /* 2098 * Attempt to create an existing pool. It shouldn't matter 2099 * what's in the nvroot; we should fail with EEXIST. 2100 */ 2101 (void) rw_rdlock(&zs->zs_name_lock); 2102 nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); 2103 VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL)); 2104 nvlist_free(nvroot); 2105 VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); 2106 VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool)); 2107 spa_close(spa, FTAG); 2108 2109 (void) rw_unlock(&zs->zs_name_lock); 2110} 2111 2112static vdev_t * 2113vdev_lookup_by_path(vdev_t *vd, const char *path) 2114{ 2115 vdev_t *mvd; 2116 2117 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 2118 return (vd); 2119 2120 for (int c = 0; c < vd->vdev_children; c++) 2121 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 2122 NULL) 2123 return (mvd); 2124 2125 return (NULL); 2126} 2127 2128/* 2129 * Find the first available hole which can be used as a top-level. 2130 */ 2131int 2132find_vdev_hole(spa_t *spa) 2133{ 2134 vdev_t *rvd = spa->spa_root_vdev; 2135 int c; 2136 2137 ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); 2138 2139 for (c = 0; c < rvd->vdev_children; c++) { 2140 vdev_t *cvd = rvd->vdev_child[c]; 2141 2142 if (cvd->vdev_ishole) 2143 break; 2144 } 2145 return (c); 2146} 2147 2148/* 2149 * Verify that vdev_add() works as expected. 
2150 */ 2151/* ARGSUSED */ 2152void 2153ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 2154{ 2155 ztest_shared_t *zs = ztest_shared; 2156 spa_t *spa = zs->zs_spa; 2157 uint64_t leaves; 2158 uint64_t guid; 2159 nvlist_t *nvroot; 2160 int error; 2161 2162 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 2163 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz; 2164 2165 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2166 2167 ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; 2168 2169 /* 2170 * If we have slogs then remove them 1/4 of the time. 2171 */ 2172 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 2173 /* 2174 * Grab the guid from the head of the log class rotor. 2175 */ 2176 guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; 2177 2178 spa_config_exit(spa, SCL_VDEV, FTAG); 2179 2180 /* 2181 * We have to grab the zs_name_lock as writer to 2182 * prevent a race between removing a slog (dmu_objset_find) 2183 * and destroying a dataset. Removing the slog will 2184 * grab a reference on the dataset which may cause 2185 * dmu_objset_destroy() to fail with EBUSY thus 2186 * leaving the dataset in an inconsistent state. 2187 */ 2188 VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0); 2189 error = spa_vdev_remove(spa, guid, B_FALSE); 2190 VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0); 2191 2192 if (error && error != EEXIST) 2193 fatal(0, "spa_vdev_remove() = %d", error); 2194 } else { 2195 spa_config_exit(spa, SCL_VDEV, FTAG); 2196 2197 /* 2198 * Make 1/4 of the devices be log devices. 
2199 */ 2200 nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, 2201 ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1); 2202 2203 error = spa_vdev_add(spa, nvroot); 2204 nvlist_free(nvroot); 2205 2206 if (error == ENOSPC) 2207 ztest_record_enospc("spa_vdev_add"); 2208 else if (error != 0) 2209 fatal(0, "spa_vdev_add() = %d", error); 2210 } 2211 2212 VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0); 2213} 2214 2215/* 2216 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 2217 */ 2218/* ARGSUSED */ 2219void 2220ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 2221{ 2222 ztest_shared_t *zs = ztest_shared; 2223 spa_t *spa = zs->zs_spa; 2224 vdev_t *rvd = spa->spa_root_vdev; 2225 spa_aux_vdev_t *sav; 2226 char *aux; 2227 uint64_t guid = 0; 2228 int error; 2229 2230 if (ztest_random(2) == 0) { 2231 sav = &spa->spa_spares; 2232 aux = ZPOOL_CONFIG_SPARES; 2233 } else { 2234 sav = &spa->spa_l2cache; 2235 aux = ZPOOL_CONFIG_L2CACHE; 2236 } 2237 2238 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 2239 2240 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2241 2242 if (sav->sav_count != 0 && ztest_random(4) == 0) { 2243 /* 2244 * Pick a random device to remove. 2245 */ 2246 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; 2247 } else { 2248 /* 2249 * Find an unused device we can add. 2250 */ 2251 zs->zs_vdev_aux = 0; 2252 for (;;) { 2253 char path[MAXPATHLEN]; 2254 int c; 2255 (void) sprintf(path, ztest_aux_template, zopt_dir, 2256 zopt_pool, aux, zs->zs_vdev_aux); 2257 for (c = 0; c < sav->sav_count; c++) 2258 if (strcmp(sav->sav_vdevs[c]->vdev_path, 2259 path) == 0) 2260 break; 2261 if (c == sav->sav_count && 2262 vdev_lookup_by_path(rvd, path) == NULL) 2263 break; 2264 zs->zs_vdev_aux++; 2265 } 2266 } 2267 2268 spa_config_exit(spa, SCL_VDEV, FTAG); 2269 2270 if (guid == 0) { 2271 /* 2272 * Add a new device. 
2273 */ 2274 nvlist_t *nvroot = make_vdev_root(NULL, aux, 2275 (zopt_vdev_size * 5) / 4, 0, 0, 0, 0, 1); 2276 error = spa_vdev_add(spa, nvroot); 2277 if (error != 0) 2278 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); 2279 nvlist_free(nvroot); 2280 } else { 2281 /* 2282 * Remove an existing device. Sometimes, dirty its 2283 * vdev state first to make sure we handle removal 2284 * of devices that have pending state changes. 2285 */ 2286 if (ztest_random(2) == 0) 2287 (void) vdev_online(spa, guid, 0, NULL); 2288 2289 error = spa_vdev_remove(spa, guid, B_FALSE); 2290 if (error != 0 && error != EBUSY) 2291 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); 2292 } 2293 2294 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2295} 2296 2297/* 2298 * split a pool if it has mirror tlvdevs 2299 */ 2300/* ARGSUSED */ 2301void 2302ztest_split_pool(ztest_ds_t *zd, uint64_t id) 2303{ 2304 ztest_shared_t *zs = ztest_shared; 2305 spa_t *spa = zs->zs_spa; 2306 vdev_t *rvd = spa->spa_root_vdev; 2307 nvlist_t *tree, **child, *config, *split, **schild; 2308 uint_t c, children, schildren = 0, lastlogid = 0; 2309 int error = 0; 2310 2311 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 2312 2313 /* ensure we have a useable config; mirrors of raidz aren't supported */ 2314 if (zs->zs_mirrors < 3 || zopt_raidz > 1) { 2315 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2316 return; 2317 } 2318 2319 /* clean up the old pool, if any */ 2320 (void) spa_destroy("splitp"); 2321 2322 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2323 2324 /* generate a config from the existing config */ 2325 mutex_enter(&spa->spa_props_lock); 2326 VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, 2327 &tree) == 0); 2328 mutex_exit(&spa->spa_props_lock); 2329 2330 VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, 2331 &children) == 0); 2332 2333 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); 2334 for (c = 0; c < children; c++) { 2335 vdev_t *tvd = 
rvd->vdev_child[c]; 2336 nvlist_t **mchild; 2337 uint_t mchildren; 2338 2339 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 2340 VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, 2341 0) == 0); 2342 VERIFY(nvlist_add_string(schild[schildren], 2343 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); 2344 VERIFY(nvlist_add_uint64(schild[schildren], 2345 ZPOOL_CONFIG_IS_HOLE, 1) == 0); 2346 if (lastlogid == 0) 2347 lastlogid = schildren; 2348 ++schildren; 2349 continue; 2350 } 2351 lastlogid = 0; 2352 VERIFY(nvlist_lookup_nvlist_array(child[c], 2353 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); 2354 VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); 2355 } 2356 2357 /* OK, create a config that can be used to split */ 2358 VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); 2359 VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, 2360 VDEV_TYPE_ROOT) == 0); 2361 VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, 2362 lastlogid != 0 ? lastlogid : schildren) == 0); 2363 2364 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); 2365 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); 2366 2367 for (c = 0; c < schildren; c++) 2368 nvlist_free(schild[c]); 2369 free(schild); 2370 nvlist_free(split); 2371 2372 spa_config_exit(spa, SCL_VDEV, FTAG); 2373 2374 (void) rw_wrlock(&zs->zs_name_lock); 2375 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 2376 (void) rw_unlock(&zs->zs_name_lock); 2377 2378 nvlist_free(config); 2379 2380 if (error == 0) { 2381 (void) printf("successful split - results:\n"); 2382 mutex_enter(&spa_namespace_lock); 2383 show_pool_stats(spa); 2384 show_pool_stats(spa_lookup("splitp")); 2385 mutex_exit(&spa_namespace_lock); 2386 ++zs->zs_splits; 2387 --zs->zs_mirrors; 2388 } 2389 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2390 2391} 2392 2393/* 2394 * Verify that we can attach and detach devices. 
2395 */ 2396/* ARGSUSED */ 2397void 2398ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 2399{ 2400 ztest_shared_t *zs = ztest_shared; 2401 spa_t *spa = zs->zs_spa; 2402 spa_aux_vdev_t *sav = &spa->spa_spares; 2403 vdev_t *rvd = spa->spa_root_vdev; 2404 vdev_t *oldvd, *newvd, *pvd; 2405 nvlist_t *root; 2406 uint64_t leaves; 2407 uint64_t leaf, top; 2408 uint64_t ashift = ztest_get_ashift(); 2409 uint64_t oldguid, pguid; 2410 size_t oldsize, newsize; 2411 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; 2412 int replacing; 2413 int oldvd_has_siblings = B_FALSE; 2414 int newvd_is_spare = B_FALSE; 2415 int oldvd_is_log; 2416 int error, expected_error; 2417 2418 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 2419 leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; 2420 2421 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2422 2423 /* 2424 * Decide whether to do an attach or a replace. 2425 */ 2426 replacing = ztest_random(2); 2427 2428 /* 2429 * Pick a random top-level vdev. 2430 */ 2431 top = ztest_random_vdev_top(spa, B_TRUE); 2432 2433 /* 2434 * Pick a random leaf within it. 2435 */ 2436 leaf = ztest_random(leaves); 2437 2438 /* 2439 * Locate this vdev. 2440 */ 2441 oldvd = rvd->vdev_child[top]; 2442 if (zs->zs_mirrors >= 1) { 2443 ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); 2444 ASSERT(oldvd->vdev_children >= zs->zs_mirrors); 2445 oldvd = oldvd->vdev_child[leaf / zopt_raidz]; 2446 } 2447 if (zopt_raidz > 1) { 2448 ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); 2449 ASSERT(oldvd->vdev_children == zopt_raidz); 2450 oldvd = oldvd->vdev_child[leaf % zopt_raidz]; 2451 } 2452 2453 /* 2454 * If we're already doing an attach or replace, oldvd may be a 2455 * mirror vdev -- in which case, pick a random child. 
2456 */ 2457 while (oldvd->vdev_children != 0) { 2458 oldvd_has_siblings = B_TRUE; 2459 ASSERT(oldvd->vdev_children >= 2); 2460 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 2461 } 2462 2463 oldguid = oldvd->vdev_guid; 2464 oldsize = vdev_get_min_asize(oldvd); 2465 oldvd_is_log = oldvd->vdev_top->vdev_islog; 2466 (void) strcpy(oldpath, oldvd->vdev_path); 2467 pvd = oldvd->vdev_parent; 2468 pguid = pvd->vdev_guid; 2469 2470 /* 2471 * If oldvd has siblings, then half of the time, detach it. 2472 */ 2473 if (oldvd_has_siblings && ztest_random(2) == 0) { 2474 spa_config_exit(spa, SCL_VDEV, FTAG); 2475 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 2476 if (error != 0 && error != ENODEV && error != EBUSY && 2477 error != ENOTSUP) 2478 fatal(0, "detach (%s) returned %d", oldpath, error); 2479 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2480 return; 2481 } 2482 2483 /* 2484 * For the new vdev, choose with equal probability between the two 2485 * standard paths (ending in either 'a' or 'b') or a random hot spare. 2486 */ 2487 if (sav->sav_count != 0 && ztest_random(3) == 0) { 2488 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 2489 newvd_is_spare = B_TRUE; 2490 (void) strcpy(newpath, newvd->vdev_path); 2491 } else { 2492 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, 2493 zopt_dir, zopt_pool, top * leaves + leaf); 2494 if (ztest_random(2) == 0) 2495 newpath[strlen(newpath) - 1] = 'b'; 2496 newvd = vdev_lookup_by_path(rvd, newpath); 2497 } 2498 2499 if (newvd) { 2500 newsize = vdev_get_min_asize(newvd); 2501 } else { 2502 /* 2503 * Make newsize a little bigger or smaller than oldsize. 2504 * If it's smaller, the attach should fail. 2505 * If it's larger, and we're doing a replace, 2506 * we should get dynamic LUN growth when we're done. 
2507 */ 2508 newsize = 10 * oldsize / (9 + ztest_random(3)); 2509 } 2510 2511 /* 2512 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 2513 * unless it's a replace; in that case any non-replacing parent is OK. 2514 * 2515 * If newvd is already part of the pool, it should fail with EBUSY. 2516 * 2517 * If newvd is too small, it should fail with EOVERFLOW. 2518 */ 2519 if (pvd->vdev_ops != &vdev_mirror_ops && 2520 pvd->vdev_ops != &vdev_root_ops && (!replacing || 2521 pvd->vdev_ops == &vdev_replacing_ops || 2522 pvd->vdev_ops == &vdev_spare_ops)) 2523 expected_error = ENOTSUP; 2524 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 2525 expected_error = ENOTSUP; 2526 else if (newvd == oldvd) 2527 expected_error = replacing ? 0 : EBUSY; 2528 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 2529 expected_error = EBUSY; 2530 else if (newsize < oldsize) 2531 expected_error = EOVERFLOW; 2532 else if (ashift > oldvd->vdev_top->vdev_ashift) 2533 expected_error = EDOM; 2534 else 2535 expected_error = 0; 2536 2537 spa_config_exit(spa, SCL_VDEV, FTAG); 2538 2539 /* 2540 * Build the nvlist describing newpath. 2541 */ 2542 root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0, 2543 ashift, 0, 0, 0, 1); 2544 2545 error = spa_vdev_attach(spa, oldguid, root, replacing); 2546 2547 nvlist_free(root); 2548 2549 /* 2550 * If our parent was the replacing vdev, but the replace completed, 2551 * then instead of failing with ENOTSUP we may either succeed, 2552 * fail with ENODEV, or fail with EOVERFLOW. 2553 */ 2554 if (expected_error == ENOTSUP && 2555 (error == 0 || error == ENODEV || error == EOVERFLOW)) 2556 expected_error = error; 2557 2558 /* 2559 * If someone grew the LUN, the replacement may be too small. 
2560 */ 2561 if (error == EOVERFLOW || error == EBUSY) 2562 expected_error = error; 2563 2564 /* XXX workaround 6690467 */ 2565 if (error != expected_error && expected_error != EBUSY) { 2566 fatal(0, "attach (%s %llu, %s %llu, %d) " 2567 "returned %d, expected %d", 2568 oldpath, (longlong_t)oldsize, newpath, 2569 (longlong_t)newsize, replacing, error, expected_error); 2570 } 2571 2572 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2573} 2574 2575/* 2576 * Callback function which expands the physical size of the vdev. 2577 */ 2578vdev_t * 2579grow_vdev(vdev_t *vd, void *arg) 2580{ 2581 spa_t *spa = vd->vdev_spa; 2582 size_t *newsize = arg; 2583 size_t fsize; 2584 int fd; 2585 2586 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 2587 ASSERT(vd->vdev_ops->vdev_op_leaf); 2588 2589 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 2590 return (vd); 2591 2592 fsize = lseek(fd, 0, SEEK_END); 2593 (void) ftruncate(fd, *newsize); 2594 2595 if (zopt_verbose >= 6) { 2596 (void) printf("%s grew from %lu to %lu bytes\n", 2597 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 2598 } 2599 (void) close(fd); 2600 return (NULL); 2601} 2602 2603/* 2604 * Callback function which expands a given vdev by calling vdev_online(). 
2605 */ 2606/* ARGSUSED */ 2607vdev_t * 2608online_vdev(vdev_t *vd, void *arg) 2609{ 2610 spa_t *spa = vd->vdev_spa; 2611 vdev_t *tvd = vd->vdev_top; 2612 uint64_t guid = vd->vdev_guid; 2613 uint64_t generation = spa->spa_config_generation + 1; 2614 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 2615 int error; 2616 2617 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 2618 ASSERT(vd->vdev_ops->vdev_op_leaf); 2619 2620 /* Calling vdev_online will initialize the new metaslabs */ 2621 spa_config_exit(spa, SCL_STATE, spa); 2622 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 2623 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 2624 2625 /* 2626 * If vdev_online returned an error or the underlying vdev_open 2627 * failed then we abort the expand. The only way to know that 2628 * vdev_open fails is by checking the returned newstate. 2629 */ 2630 if (error || newstate != VDEV_STATE_HEALTHY) { 2631 if (zopt_verbose >= 5) { 2632 (void) printf("Unable to expand vdev, state %llu, " 2633 "error %d\n", (u_longlong_t)newstate, error); 2634 } 2635 return (vd); 2636 } 2637 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 2638 2639 /* 2640 * Since we dropped the lock we need to ensure that we're 2641 * still talking to the original vdev. It's possible this 2642 * vdev may have been detached/replaced while we were 2643 * trying to online it. 2644 */ 2645 if (generation != spa->spa_config_generation) { 2646 if (zopt_verbose >= 5) { 2647 (void) printf("vdev configuration has changed, " 2648 "guid %llu, state %llu, expected gen %llu, " 2649 "got gen %llu\n", 2650 (u_longlong_t)guid, 2651 (u_longlong_t)tvd->vdev_state, 2652 (u_longlong_t)generation, 2653 (u_longlong_t)spa->spa_config_generation); 2654 } 2655 return (vd); 2656 } 2657 return (NULL); 2658} 2659 2660/* 2661 * Traverse the vdev tree calling the supplied function. 2662 * We continue to walk the tree until we either have walked all 2663 * children or we receive a non-NULL return from the callback. 
2664 * If a NULL callback is passed, then we just return back the first 2665 * leaf vdev we encounter. 2666 */ 2667vdev_t * 2668vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 2669{ 2670 if (vd->vdev_ops->vdev_op_leaf) { 2671 if (func == NULL) 2672 return (vd); 2673 else 2674 return (func(vd, arg)); 2675 } 2676 2677 for (uint_t c = 0; c < vd->vdev_children; c++) { 2678 vdev_t *cvd = vd->vdev_child[c]; 2679 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 2680 return (cvd); 2681 } 2682 return (NULL); 2683} 2684 2685/* 2686 * Verify that dynamic LUN growth works as expected. 2687 */ 2688/* ARGSUSED */ 2689void 2690ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 2691{ 2692 ztest_shared_t *zs = ztest_shared; 2693 spa_t *spa = zs->zs_spa; 2694 vdev_t *vd, *tvd; 2695 metaslab_class_t *mc; 2696 metaslab_group_t *mg; 2697 size_t psize, newsize; 2698 uint64_t top; 2699 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 2700 2701 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 2702 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 2703 2704 top = ztest_random_vdev_top(spa, B_TRUE); 2705 2706 tvd = spa->spa_root_vdev->vdev_child[top]; 2707 mg = tvd->vdev_mg; 2708 mc = mg->mg_class; 2709 old_ms_count = tvd->vdev_ms_count; 2710 old_class_space = metaslab_class_get_space(mc); 2711 2712 /* 2713 * Determine the size of the first leaf vdev associated with 2714 * our top-level device. 2715 */ 2716 vd = vdev_walk_tree(tvd, NULL, NULL); 2717 ASSERT3P(vd, !=, NULL); 2718 ASSERT(vd->vdev_ops->vdev_op_leaf); 2719 2720 psize = vd->vdev_psize; 2721 2722 /* 2723 * We only try to expand the vdev if it's healthy, less than 4x its 2724 * original size, and it has a valid psize. 
2725 */ 2726 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 2727 psize == 0 || psize >= 4 * zopt_vdev_size) { 2728 spa_config_exit(spa, SCL_STATE, spa); 2729 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2730 return; 2731 } 2732 ASSERT(psize > 0); 2733 newsize = psize + psize / 8; 2734 ASSERT3U(newsize, >, psize); 2735 2736 if (zopt_verbose >= 6) { 2737 (void) printf("Expanding LUN %s from %lu to %lu\n", 2738 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 2739 } 2740 2741 /* 2742 * Growing the vdev is a two step process: 2743 * 1). expand the physical size (i.e. relabel) 2744 * 2). online the vdev to create the new metaslabs 2745 */ 2746 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 2747 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 2748 tvd->vdev_state != VDEV_STATE_HEALTHY) { 2749 if (zopt_verbose >= 5) { 2750 (void) printf("Could not expand LUN because " 2751 "the vdev configuration changed.\n"); 2752 } 2753 spa_config_exit(spa, SCL_STATE, spa); 2754 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2755 return; 2756 } 2757 2758 spa_config_exit(spa, SCL_STATE, spa); 2759 2760 /* 2761 * Expanding the LUN will update the config asynchronously, 2762 * thus we must wait for the async thread to complete any 2763 * pending tasks before proceeding. 
2764 */ 2765 for (;;) { 2766 boolean_t done; 2767 mutex_enter(&spa->spa_async_lock); 2768 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 2769 mutex_exit(&spa->spa_async_lock); 2770 if (done) 2771 break; 2772 txg_wait_synced(spa_get_dsl(spa), 0); 2773 (void) poll(NULL, 0, 100); 2774 } 2775 2776 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 2777 2778 tvd = spa->spa_root_vdev->vdev_child[top]; 2779 new_ms_count = tvd->vdev_ms_count; 2780 new_class_space = metaslab_class_get_space(mc); 2781 2782 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 2783 if (zopt_verbose >= 5) { 2784 (void) printf("Could not verify LUN expansion due to " 2785 "intervening vdev offline or remove.\n"); 2786 } 2787 spa_config_exit(spa, SCL_STATE, spa); 2788 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2789 return; 2790 } 2791 2792 /* 2793 * Make sure we were able to grow the vdev. 2794 */ 2795 if (new_ms_count <= old_ms_count) 2796 fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n", 2797 old_ms_count, new_ms_count); 2798 2799 /* 2800 * Make sure we were able to grow the pool. 2801 */ 2802 if (new_class_space <= old_class_space) 2803 fatal(0, "LUN expansion failed: class_space %llu <= %llu\n", 2804 old_class_space, new_class_space); 2805 2806 if (zopt_verbose >= 5) { 2807 char oldnumbuf[6], newnumbuf[6]; 2808 2809 nicenum(old_class_space, oldnumbuf); 2810 nicenum(new_class_space, newnumbuf); 2811 (void) printf("%s grew from %s to %s\n", 2812 spa->spa_name, oldnumbuf, newnumbuf); 2813 } 2814 2815 spa_config_exit(spa, SCL_STATE, spa); 2816 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2817} 2818 2819/* 2820 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 2821 */ 2822/* ARGSUSED */ 2823static void 2824ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 2825{ 2826 /* 2827 * Create the objects common to all ztest datasets. 
2828 */ 2829 VERIFY(zap_create_claim(os, ZTEST_DIROBJ, 2830 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); 2831} 2832 2833static int 2834ztest_dataset_create(char *dsname) 2835{ 2836 uint64_t zilset = ztest_random(100); 2837 int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, 2838 ztest_objset_create_cb, NULL); 2839 2840 if (err || zilset < 80) 2841 return (err); 2842 2843 (void) printf("Setting dataset %s to sync always\n", dsname); 2844 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 2845 ZFS_SYNC_ALWAYS, B_FALSE)); 2846} 2847 2848/* ARGSUSED */ 2849static int 2850ztest_objset_destroy_cb(const char *name, void *arg) 2851{ 2852 objset_t *os; 2853 dmu_object_info_t doi; 2854 int error; 2855 2856 /* 2857 * Verify that the dataset contains a directory object. 2858 */ 2859 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os)); 2860 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 2861 if (error != ENOENT) { 2862 /* We could have crashed in the middle of destroying it */ 2863 ASSERT3U(error, ==, 0); 2864 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 2865 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 2866 } 2867 dmu_objset_rele(os, FTAG); 2868 2869 /* 2870 * Destroy the dataset. 
2871 */ 2872 VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE)); 2873 return (0); 2874} 2875 2876static boolean_t 2877ztest_snapshot_create(char *osname, uint64_t id) 2878{ 2879 char snapname[MAXNAMELEN]; 2880 int error; 2881 2882 (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, 2883 (u_longlong_t)id); 2884 2885 error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, 2886 NULL, NULL, B_FALSE, B_FALSE, -1); 2887 if (error == ENOSPC) { 2888 ztest_record_enospc(FTAG); 2889 return (B_FALSE); 2890 } 2891 if (error != 0 && error != EEXIST) 2892 fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error); 2893 return (B_TRUE); 2894} 2895 2896static boolean_t 2897ztest_snapshot_destroy(char *osname, uint64_t id) 2898{ 2899 char snapname[MAXNAMELEN]; 2900 int error; 2901 2902 (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, 2903 (u_longlong_t)id); 2904 2905 error = dmu_objset_destroy(snapname, B_FALSE); 2906 if (error != 0 && error != ENOENT) 2907 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); 2908 return (B_TRUE); 2909} 2910 2911/* ARGSUSED */ 2912void 2913ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 2914{ 2915 ztest_shared_t *zs = ztest_shared; 2916 ztest_ds_t zdtmp; 2917 int iters; 2918 int error; 2919 objset_t *os, *os2; 2920 char name[MAXNAMELEN]; 2921 zilog_t *zilog; 2922 2923 (void) rw_rdlock(&zs->zs_name_lock); 2924 2925 (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu", 2926 zs->zs_pool, (u_longlong_t)id); 2927 2928 /* 2929 * If this dataset exists from a previous run, process its replay log 2930 * half of the time. If we don't replay it, then dmu_objset_destroy() 2931 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 
2932 */ 2933 if (ztest_random(2) == 0 && 2934 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { 2935 ztest_zd_init(&zdtmp, os); 2936 zil_replay(os, &zdtmp, ztest_replay_vector); 2937 ztest_zd_fini(&zdtmp); 2938 dmu_objset_disown(os, FTAG); 2939 } 2940 2941 /* 2942 * There may be an old instance of the dataset we're about to 2943 * create lying around from a previous run. If so, destroy it 2944 * and all of its snapshots. 2945 */ 2946 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 2947 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2948 2949 /* 2950 * Verify that the destroyed dataset is no longer in the namespace. 2951 */ 2952 VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os)); 2953 2954 /* 2955 * Verify that we can create a new dataset. 2956 */ 2957 error = ztest_dataset_create(name); 2958 if (error) { 2959 if (error == ENOSPC) { 2960 ztest_record_enospc(FTAG); 2961 (void) rw_unlock(&zs->zs_name_lock); 2962 return; 2963 } 2964 fatal(0, "dmu_objset_create(%s) = %d", name, error); 2965 } 2966 2967 VERIFY3U(0, ==, 2968 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); 2969 2970 ztest_zd_init(&zdtmp, os); 2971 2972 /* 2973 * Open the intent log for it. 2974 */ 2975 zilog = zil_open(os, ztest_get_data); 2976 2977 /* 2978 * Put some objects in there, do a little I/O to them, 2979 * and randomly take a couple of snapshots along the way. 2980 */ 2981 iters = ztest_random(5); 2982 for (int i = 0; i < iters; i++) { 2983 ztest_dmu_object_alloc_free(&zdtmp, id); 2984 if (ztest_random(iters) == 0) 2985 (void) ztest_snapshot_create(name, i); 2986 } 2987 2988 /* 2989 * Verify that we cannot create an existing dataset. 2990 */ 2991 VERIFY3U(EEXIST, ==, 2992 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL)); 2993 2994 /* 2995 * Verify that we can hold an objset that is also owned. 
2996 */ 2997 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); 2998 dmu_objset_rele(os2, FTAG); 2999 3000 /* 3001 * Verify that we cannot own an objset that is already owned. 3002 */ 3003 VERIFY3U(EBUSY, ==, 3004 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2)); 3005 3006 zil_close(zilog); 3007 dmu_objset_disown(os, FTAG); 3008 ztest_zd_fini(&zdtmp); 3009 3010 (void) rw_unlock(&zs->zs_name_lock); 3011} 3012 3013/* 3014 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 3015 */ 3016void 3017ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 3018{ 3019 ztest_shared_t *zs = ztest_shared; 3020 3021 (void) rw_rdlock(&zs->zs_name_lock); 3022 (void) ztest_snapshot_destroy(zd->zd_name, id); 3023 (void) ztest_snapshot_create(zd->zd_name, id); 3024 (void) rw_unlock(&zs->zs_name_lock); 3025} 3026 3027/* 3028 * Cleanup non-standard snapshots and clones. 3029 */ 3030void 3031ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 3032{ 3033 char snap1name[MAXNAMELEN]; 3034 char clone1name[MAXNAMELEN]; 3035 char snap2name[MAXNAMELEN]; 3036 char clone2name[MAXNAMELEN]; 3037 char snap3name[MAXNAMELEN]; 3038 int error; 3039 3040 (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); 3041 (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); 3042 (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); 3043 (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); 3044 (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); 3045 3046 error = dmu_objset_destroy(clone2name, B_FALSE); 3047 if (error && error != ENOENT) 3048 fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); 3049 error = dmu_objset_destroy(snap3name, B_FALSE); 3050 if (error && error != ENOENT) 3051 fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); 3052 error = dmu_objset_destroy(snap2name, B_FALSE); 3053 if (error && error != ENOENT) 3054 fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); 3055 
error = dmu_objset_destroy(clone1name, B_FALSE); 3056 if (error && error != ENOENT) 3057 fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); 3058 error = dmu_objset_destroy(snap1name, B_FALSE); 3059 if (error && error != ENOENT) 3060 fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); 3061} 3062 3063/* 3064 * Verify dsl_dataset_promote handles EBUSY 3065 */ 3066void 3067ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 3068{ 3069 ztest_shared_t *zs = ztest_shared; 3070 objset_t *clone; 3071 dsl_dataset_t *ds; 3072 char snap1name[MAXNAMELEN]; 3073 char clone1name[MAXNAMELEN]; 3074 char snap2name[MAXNAMELEN]; 3075 char clone2name[MAXNAMELEN]; 3076 char snap3name[MAXNAMELEN]; 3077 char *osname = zd->zd_name; 3078 int error; 3079 3080 (void) rw_rdlock(&zs->zs_name_lock); 3081 3082 ztest_dsl_dataset_cleanup(osname, id); 3083 3084 (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); 3085 (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); 3086 (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); 3087 (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); 3088 (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); 3089 3090 error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, 3091 NULL, NULL, B_FALSE, B_FALSE, -1); 3092 if (error && error != EEXIST) { 3093 if (error == ENOSPC) { 3094 ztest_record_enospc(FTAG); 3095 goto out; 3096 } 3097 fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); 3098 } 3099 3100 error = dmu_objset_hold(snap1name, FTAG, &clone); 3101 if (error) 3102 fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error); 3103 3104 error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0); 3105 dmu_objset_rele(clone, FTAG); 3106 if (error) { 3107 if (error == ENOSPC) { 3108 ztest_record_enospc(FTAG); 3109 goto out; 3110 } 3111 fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); 3112 } 3113 3114 error = dmu_objset_snapshot(clone1name, 
strchr(snap2name, '@')+1, 3115 NULL, NULL, B_FALSE, B_FALSE, -1); 3116 if (error && error != EEXIST) { 3117 if (error == ENOSPC) { 3118 ztest_record_enospc(FTAG); 3119 goto out; 3120 } 3121 fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); 3122 } 3123 3124 error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1, 3125 NULL, NULL, B_FALSE, B_FALSE, -1); 3126 if (error && error != EEXIST) { 3127 if (error == ENOSPC) { 3128 ztest_record_enospc(FTAG); 3129 goto out; 3130 } 3131 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); 3132 } 3133 3134 error = dmu_objset_hold(snap3name, FTAG, &clone); 3135 if (error) 3136 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); 3137 3138 error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0); 3139 dmu_objset_rele(clone, FTAG); 3140 if (error) { 3141 if (error == ENOSPC) { 3142 ztest_record_enospc(FTAG); 3143 goto out; 3144 } 3145 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); 3146 } 3147 3148 error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds); 3149 if (error) 3150 fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error); 3151 error = dsl_dataset_promote(clone2name, NULL); 3152 if (error != EBUSY) 3153 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, 3154 error); 3155 dsl_dataset_disown(ds, FTAG); 3156 3157out: 3158 ztest_dsl_dataset_cleanup(osname, id); 3159 3160 (void) rw_unlock(&zs->zs_name_lock); 3161} 3162 3163/* 3164 * Verify that dmu_object_{alloc,free} work as expected. 3165 */ 3166void 3167ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 3168{ 3169 ztest_od_t od[4]; 3170 int batchsize = sizeof (od) / sizeof (od[0]); 3171 3172 for (int b = 0; b < batchsize; b++) 3173 ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0); 3174 3175 /* 3176 * Destroy the previous batch of objects, create a new batch, 3177 * and do some I/O on the new objects. 
3178 */ 3179 if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) 3180 return; 3181 3182 while (ztest_random(4 * batchsize) != 0) 3183 ztest_io(zd, od[ztest_random(batchsize)].od_object, 3184 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 3185} 3186 3187/* 3188 * Verify that dmu_{read,write} work as expected. 3189 */ 3190void 3191ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 3192{ 3193 objset_t *os = zd->zd_os; 3194 ztest_od_t od[2]; 3195 dmu_tx_t *tx; 3196 int i, freeit, error; 3197 uint64_t n, s, txg; 3198 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 3199 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 3200 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 3201 uint64_t regions = 997; 3202 uint64_t stride = 123456789ULL; 3203 uint64_t width = 40; 3204 int free_percent = 5; 3205 3206 /* 3207 * This test uses two objects, packobj and bigobj, that are always 3208 * updated together (i.e. in the same tx) so that their contents are 3209 * in sync and can be compared. Their contents relate to each other 3210 * in a simple way: packobj is a dense array of 'bufwad' structures, 3211 * while bigobj is a sparse array of the same bufwads. Specifically, 3212 * for any index n, there are three bufwads that should be identical: 3213 * 3214 * packobj, at offset n * sizeof (bufwad_t) 3215 * bigobj, at the head of the nth chunk 3216 * bigobj, at the tail of the nth chunk 3217 * 3218 * The chunk size is arbitrary. It doesn't have to be a power of two, 3219 * and it doesn't have any relation to the object blocksize. 3220 * The only requirement is that it can hold at least two bufwads. 3221 * 3222 * Normally, we write the bufwad to each of these locations. 3223 * However, free_percent of the time we instead write zeroes to 3224 * packobj and perform a dmu_free_range() on bigobj. 
By comparing 3225 * bigobj to packobj, we can verify that the DMU is correctly 3226 * tracking which parts of an object are allocated and free, 3227 * and that the contents of the allocated blocks are correct. 3228 */ 3229 3230 /* 3231 * Read the directory info. If it's the first time, set things up. 3232 */ 3233 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize); 3234 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); 3235 3236 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 3237 return; 3238 3239 bigobj = od[0].od_object; 3240 packobj = od[1].od_object; 3241 chunksize = od[0].od_gen; 3242 ASSERT(chunksize == od[1].od_gen); 3243 3244 /* 3245 * Prefetch a random chunk of the big object. 3246 * Our aim here is to get some async reads in flight 3247 * for blocks that we may free below; the DMU should 3248 * handle this race correctly. 3249 */ 3250 n = ztest_random(regions) * stride + ztest_random(width); 3251 s = 1 + ztest_random(2 * width - 1); 3252 dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); 3253 3254 /* 3255 * Pick a random index and compute the offsets into packobj and bigobj. 3256 */ 3257 n = ztest_random(regions) * stride + ztest_random(width); 3258 s = 1 + ztest_random(width - 1); 3259 3260 packoff = n * sizeof (bufwad_t); 3261 packsize = s * sizeof (bufwad_t); 3262 3263 bigoff = n * chunksize; 3264 bigsize = s * chunksize; 3265 3266 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 3267 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 3268 3269 /* 3270 * free_percent of the time, free a range of bigobj rather than 3271 * overwriting it. 3272 */ 3273 freeit = (ztest_random(100) < free_percent); 3274 3275 /* 3276 * Read the current contents of our objects. 
3277 */ 3278 error = dmu_read(os, packobj, packoff, packsize, packbuf, 3279 DMU_READ_PREFETCH); 3280 ASSERT3U(error, ==, 0); 3281 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 3282 DMU_READ_PREFETCH); 3283 ASSERT3U(error, ==, 0); 3284 3285 /* 3286 * Get a tx for the mods to both packobj and bigobj. 3287 */ 3288 tx = dmu_tx_create(os); 3289 3290 dmu_tx_hold_write(tx, packobj, packoff, packsize); 3291 3292 if (freeit) 3293 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 3294 else 3295 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 3296 3297 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3298 if (txg == 0) { 3299 umem_free(packbuf, packsize); 3300 umem_free(bigbuf, bigsize); 3301 return; 3302 } 3303 3304 dmu_object_set_checksum(os, bigobj, 3305 (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx); 3306 3307 dmu_object_set_compress(os, bigobj, 3308 (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx); 3309 3310 /* 3311 * For each index from n to n + s, verify that the existing bufwad 3312 * in packobj matches the bufwads at the head and tail of the 3313 * corresponding chunk in bigobj. Then update all three bufwads 3314 * with the new values we want to write out. 
3315 */ 3316 for (i = 0; i < s; i++) { 3317 /* LINTED */ 3318 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 3319 /* LINTED */ 3320 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 3321 /* LINTED */ 3322 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 3323 3324 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 3325 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 3326 3327 if (pack->bw_txg > txg) 3328 fatal(0, "future leak: got %llx, open txg is %llx", 3329 pack->bw_txg, txg); 3330 3331 if (pack->bw_data != 0 && pack->bw_index != n + i) 3332 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 3333 pack->bw_index, n, i); 3334 3335 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 3336 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 3337 3338 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 3339 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 3340 3341 if (freeit) { 3342 bzero(pack, sizeof (bufwad_t)); 3343 } else { 3344 pack->bw_index = n + i; 3345 pack->bw_txg = txg; 3346 pack->bw_data = 1 + ztest_random(-2ULL); 3347 } 3348 *bigH = *pack; 3349 *bigT = *pack; 3350 } 3351 3352 /* 3353 * We've verified all the old bufwads, and made new ones. 3354 * Now write them out. 3355 */ 3356 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 3357 3358 if (freeit) { 3359 if (zopt_verbose >= 7) { 3360 (void) printf("freeing offset %llx size %llx" 3361 " txg %llx\n", 3362 (u_longlong_t)bigoff, 3363 (u_longlong_t)bigsize, 3364 (u_longlong_t)txg); 3365 } 3366 VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 3367 } else { 3368 if (zopt_verbose >= 7) { 3369 (void) printf("writing offset %llx size %llx" 3370 " txg %llx\n", 3371 (u_longlong_t)bigoff, 3372 (u_longlong_t)bigsize, 3373 (u_longlong_t)txg); 3374 } 3375 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 3376 } 3377 3378 dmu_tx_commit(tx); 3379 3380 /* 3381 * Sanity check the stuff we just wrote. 
3382 */ 3383 { 3384 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 3385 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 3386 3387 VERIFY(0 == dmu_read(os, packobj, packoff, 3388 packsize, packcheck, DMU_READ_PREFETCH)); 3389 VERIFY(0 == dmu_read(os, bigobj, bigoff, 3390 bigsize, bigcheck, DMU_READ_PREFETCH)); 3391 3392 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 3393 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 3394 3395 umem_free(packcheck, packsize); 3396 umem_free(bigcheck, bigsize); 3397 } 3398 3399 umem_free(packbuf, packsize); 3400 umem_free(bigbuf, bigsize); 3401} 3402 3403void 3404compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 3405 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 3406{ 3407 uint64_t i; 3408 bufwad_t *pack; 3409 bufwad_t *bigH; 3410 bufwad_t *bigT; 3411 3412 /* 3413 * For each index from n to n + s, verify that the existing bufwad 3414 * in packobj matches the bufwads at the head and tail of the 3415 * corresponding chunk in bigobj. Then update all three bufwads 3416 * with the new values we want to write out. 
3417 */ 3418 for (i = 0; i < s; i++) { 3419 /* LINTED */ 3420 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 3421 /* LINTED */ 3422 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 3423 /* LINTED */ 3424 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 3425 3426 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 3427 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 3428 3429 if (pack->bw_txg > txg) 3430 fatal(0, "future leak: got %llx, open txg is %llx", 3431 pack->bw_txg, txg); 3432 3433 if (pack->bw_data != 0 && pack->bw_index != n + i) 3434 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 3435 pack->bw_index, n, i); 3436 3437 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 3438 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 3439 3440 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 3441 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 3442 3443 pack->bw_index = n + i; 3444 pack->bw_txg = txg; 3445 pack->bw_data = 1 + ztest_random(-2ULL); 3446 3447 *bigH = *pack; 3448 *bigT = *pack; 3449 } 3450} 3451 3452void 3453ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 3454{ 3455 objset_t *os = zd->zd_os; 3456 ztest_od_t od[2]; 3457 dmu_tx_t *tx; 3458 uint64_t i; 3459 int error; 3460 uint64_t n, s, txg; 3461 bufwad_t *packbuf, *bigbuf; 3462 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 3463 uint64_t blocksize = ztest_random_blocksize(); 3464 uint64_t chunksize = blocksize; 3465 uint64_t regions = 997; 3466 uint64_t stride = 123456789ULL; 3467 uint64_t width = 9; 3468 dmu_buf_t *bonus_db; 3469 arc_buf_t **bigbuf_arcbufs; 3470 dmu_object_info_t doi; 3471 3472 /* 3473 * This test uses two objects, packobj and bigobj, that are always 3474 * updated together (i.e. in the same tx) so that their contents are 3475 * in sync and can be compared. Their contents relate to each other 3476 * in a simple way: packobj is a dense array of 'bufwad' structures, 3477 * while bigobj is a sparse array of the same bufwads. 
Specifically, 3478 * for any index n, there are three bufwads that should be identical: 3479 * 3480 * packobj, at offset n * sizeof (bufwad_t) 3481 * bigobj, at the head of the nth chunk 3482 * bigobj, at the tail of the nth chunk 3483 * 3484 * The chunk size is set equal to bigobj block size so that 3485 * dmu_assign_arcbuf() can be tested for object updates. 3486 */ 3487 3488 /* 3489 * Read the directory info. If it's the first time, set things up. 3490 */ 3491 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); 3492 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); 3493 3494 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 3495 return; 3496 3497 bigobj = od[0].od_object; 3498 packobj = od[1].od_object; 3499 blocksize = od[0].od_blocksize; 3500 chunksize = blocksize; 3501 ASSERT(chunksize == od[1].od_gen); 3502 3503 VERIFY(dmu_object_info(os, bigobj, &doi) == 0); 3504 VERIFY(ISP2(doi.doi_data_block_size)); 3505 VERIFY(chunksize == doi.doi_data_block_size); 3506 VERIFY(chunksize >= 2 * sizeof (bufwad_t)); 3507 3508 /* 3509 * Pick a random index and compute the offsets into packobj and bigobj. 3510 */ 3511 n = ztest_random(regions) * stride + ztest_random(width); 3512 s = 1 + ztest_random(width - 1); 3513 3514 packoff = n * sizeof (bufwad_t); 3515 packsize = s * sizeof (bufwad_t); 3516 3517 bigoff = n * chunksize; 3518 bigsize = s * chunksize; 3519 3520 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 3521 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 3522 3523 VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 3524 3525 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 3526 3527 /* 3528 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 3529 * Iteration 1 test zcopy to already referenced dbufs. 3530 * Iteration 2 test zcopy to dirty dbuf in the same txg. 3531 * Iteration 3 test zcopy to dbuf dirty in previous txg. 3532 * Iteration 4 test zcopy when dbuf is no longer dirty. 
3533 * Iteration 5 test zcopy when it can't be done. 3534 * Iteration 6 one more zcopy write. 3535 */ 3536 for (i = 0; i < 7; i++) { 3537 uint64_t j; 3538 uint64_t off; 3539 3540 /* 3541 * In iteration 5 (i == 5) use arcbufs 3542 * that don't match bigobj blksz to test 3543 * dmu_assign_arcbuf() when it can't directly 3544 * assign an arcbuf to a dbuf. 3545 */ 3546 for (j = 0; j < s; j++) { 3547 if (i != 5) { 3548 bigbuf_arcbufs[j] = 3549 dmu_request_arcbuf(bonus_db, chunksize); 3550 } else { 3551 bigbuf_arcbufs[2 * j] = 3552 dmu_request_arcbuf(bonus_db, chunksize / 2); 3553 bigbuf_arcbufs[2 * j + 1] = 3554 dmu_request_arcbuf(bonus_db, chunksize / 2); 3555 } 3556 } 3557 3558 /* 3559 * Get a tx for the mods to both packobj and bigobj. 3560 */ 3561 tx = dmu_tx_create(os); 3562 3563 dmu_tx_hold_write(tx, packobj, packoff, packsize); 3564 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 3565 3566 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3567 if (txg == 0) { 3568 umem_free(packbuf, packsize); 3569 umem_free(bigbuf, bigsize); 3570 for (j = 0; j < s; j++) { 3571 if (i != 5) { 3572 dmu_return_arcbuf(bigbuf_arcbufs[j]); 3573 } else { 3574 dmu_return_arcbuf( 3575 bigbuf_arcbufs[2 * j]); 3576 dmu_return_arcbuf( 3577 bigbuf_arcbufs[2 * j + 1]); 3578 } 3579 } 3580 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 3581 dmu_buf_rele(bonus_db, FTAG); 3582 return; 3583 } 3584 3585 /* 3586 * 50% of the time don't read objects in the 1st iteration to 3587 * test dmu_assign_arcbuf() for the case when there're no 3588 * existing dbufs for the specified offsets. 
3589 */ 3590 if (i != 0 || ztest_random(2) != 0) { 3591 error = dmu_read(os, packobj, packoff, 3592 packsize, packbuf, DMU_READ_PREFETCH); 3593 ASSERT3U(error, ==, 0); 3594 error = dmu_read(os, bigobj, bigoff, bigsize, 3595 bigbuf, DMU_READ_PREFETCH); 3596 ASSERT3U(error, ==, 0); 3597 } 3598 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 3599 n, chunksize, txg); 3600 3601 /* 3602 * We've verified all the old bufwads, and made new ones. 3603 * Now write them out. 3604 */ 3605 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 3606 if (zopt_verbose >= 7) { 3607 (void) printf("writing offset %llx size %llx" 3608 " txg %llx\n", 3609 (u_longlong_t)bigoff, 3610 (u_longlong_t)bigsize, 3611 (u_longlong_t)txg); 3612 } 3613 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 3614 dmu_buf_t *dbt; 3615 if (i != 5) { 3616 bcopy((caddr_t)bigbuf + (off - bigoff), 3617 bigbuf_arcbufs[j]->b_data, chunksize); 3618 } else { 3619 bcopy((caddr_t)bigbuf + (off - bigoff), 3620 bigbuf_arcbufs[2 * j]->b_data, 3621 chunksize / 2); 3622 bcopy((caddr_t)bigbuf + (off - bigoff) + 3623 chunksize / 2, 3624 bigbuf_arcbufs[2 * j + 1]->b_data, 3625 chunksize / 2); 3626 } 3627 3628 if (i == 1) { 3629 VERIFY(dmu_buf_hold(os, bigobj, off, 3630 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 3631 } 3632 if (i != 5) { 3633 dmu_assign_arcbuf(bonus_db, off, 3634 bigbuf_arcbufs[j], tx); 3635 } else { 3636 dmu_assign_arcbuf(bonus_db, off, 3637 bigbuf_arcbufs[2 * j], tx); 3638 dmu_assign_arcbuf(bonus_db, 3639 off + chunksize / 2, 3640 bigbuf_arcbufs[2 * j + 1], tx); 3641 } 3642 if (i == 1) { 3643 dmu_buf_rele(dbt, FTAG); 3644 } 3645 } 3646 dmu_tx_commit(tx); 3647 3648 /* 3649 * Sanity check the stuff we just wrote. 
3650 */ 3651 { 3652 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 3653 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 3654 3655 VERIFY(0 == dmu_read(os, packobj, packoff, 3656 packsize, packcheck, DMU_READ_PREFETCH)); 3657 VERIFY(0 == dmu_read(os, bigobj, bigoff, 3658 bigsize, bigcheck, DMU_READ_PREFETCH)); 3659 3660 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 3661 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 3662 3663 umem_free(packcheck, packsize); 3664 umem_free(bigcheck, bigsize); 3665 } 3666 if (i == 2) { 3667 txg_wait_open(dmu_objset_pool(os), 0); 3668 } else if (i == 3) { 3669 txg_wait_synced(dmu_objset_pool(os), 0); 3670 } 3671 } 3672 3673 dmu_buf_rele(bonus_db, FTAG); 3674 umem_free(packbuf, packsize); 3675 umem_free(bigbuf, bigsize); 3676 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 3677} 3678 3679/* ARGSUSED */ 3680void 3681ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 3682{ 3683 ztest_od_t od[1]; 3684 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 3685 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 3686 3687 /* 3688 * Have multiple threads write to large offsets in an object 3689 * to verify that parallel writes to an object -- even to the 3690 * same blocks within the object -- doesn't cause any trouble. 
3691 */ 3692 ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); 3693 3694 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 3695 return; 3696 3697 while (ztest_random(10) != 0) 3698 ztest_io(zd, od[0].od_object, offset); 3699} 3700 3701void 3702ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 3703{ 3704 ztest_od_t od[1]; 3705 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 3706 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 3707 uint64_t count = ztest_random(20) + 1; 3708 uint64_t blocksize = ztest_random_blocksize(); 3709 void *data; 3710 3711 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); 3712 3713 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 3714 return; 3715 3716 if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0) 3717 return; 3718 3719 ztest_prealloc(zd, od[0].od_object, offset, count * blocksize); 3720 3721 data = umem_zalloc(blocksize, UMEM_NOFAIL); 3722 3723 while (ztest_random(count) != 0) { 3724 uint64_t randoff = offset + (ztest_random(count) * blocksize); 3725 if (ztest_write(zd, od[0].od_object, randoff, blocksize, 3726 data) != 0) 3727 break; 3728 while (ztest_random(4) != 0) 3729 ztest_io(zd, od[0].od_object, randoff); 3730 } 3731 3732 umem_free(data, blocksize); 3733} 3734 3735/* 3736 * Verify that zap_{create,destroy,add,remove,update} work as expected. 
3737 */ 3738#define ZTEST_ZAP_MIN_INTS 1 3739#define ZTEST_ZAP_MAX_INTS 4 3740#define ZTEST_ZAP_MAX_PROPS 1000 3741 3742void 3743ztest_zap(ztest_ds_t *zd, uint64_t id) 3744{ 3745 objset_t *os = zd->zd_os; 3746 ztest_od_t od[1]; 3747 uint64_t object; 3748 uint64_t txg, last_txg; 3749 uint64_t value[ZTEST_ZAP_MAX_INTS]; 3750 uint64_t zl_ints, zl_intsize, prop; 3751 int i, ints; 3752 dmu_tx_t *tx; 3753 char propname[100], txgname[100]; 3754 int error; 3755 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 3756 3757 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); 3758 3759 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 3760 return; 3761 3762 object = od[0].od_object; 3763 3764 /* 3765 * Generate a known hash collision, and verify that 3766 * we can lookup and remove both entries. 3767 */ 3768 tx = dmu_tx_create(os); 3769 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 3770 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3771 if (txg == 0) 3772 return; 3773 for (i = 0; i < 2; i++) { 3774 value[i] = i; 3775 VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 3776 1, &value[i], tx)); 3777 } 3778 for (i = 0; i < 2; i++) { 3779 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 3780 sizeof (uint64_t), 1, &value[i], tx)); 3781 VERIFY3U(0, ==, 3782 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 3783 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 3784 ASSERT3U(zl_ints, ==, 1); 3785 } 3786 for (i = 0; i < 2; i++) { 3787 VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); 3788 } 3789 dmu_tx_commit(tx); 3790 3791 /* 3792 * Generate a buch of random entries. 3793 */ 3794 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 3795 3796 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 3797 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 3798 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 3799 bzero(value, sizeof (value)); 3800 last_txg = 0; 3801 3802 /* 3803 * If these zap entries already exist, validate their contents. 
3804 */ 3805 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 3806 if (error == 0) { 3807 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 3808 ASSERT3U(zl_ints, ==, 1); 3809 3810 VERIFY(zap_lookup(os, object, txgname, zl_intsize, 3811 zl_ints, &last_txg) == 0); 3812 3813 VERIFY(zap_length(os, object, propname, &zl_intsize, 3814 &zl_ints) == 0); 3815 3816 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 3817 ASSERT3U(zl_ints, ==, ints); 3818 3819 VERIFY(zap_lookup(os, object, propname, zl_intsize, 3820 zl_ints, value) == 0); 3821 3822 for (i = 0; i < ints; i++) { 3823 ASSERT3U(value[i], ==, last_txg + object + i); 3824 } 3825 } else { 3826 ASSERT3U(error, ==, ENOENT); 3827 } 3828 3829 /* 3830 * Atomically update two entries in our zap object. 3831 * The first is named txg_%llu, and contains the txg 3832 * in which the property was last updated. The second 3833 * is named prop_%llu, and the nth element of its value 3834 * should be txg + object + n. 3835 */ 3836 tx = dmu_tx_create(os); 3837 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 3838 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3839 if (txg == 0) 3840 return; 3841 3842 if (last_txg > txg) 3843 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); 3844 3845 for (i = 0; i < ints; i++) 3846 value[i] = txg + object + i; 3847 3848 VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), 3849 1, &txg, tx)); 3850 VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), 3851 ints, value, tx)); 3852 3853 dmu_tx_commit(tx); 3854 3855 /* 3856 * Remove a random pair of entries. 
3857 */ 3858 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 3859 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 3860 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 3861 3862 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 3863 3864 if (error == ENOENT) 3865 return; 3866 3867 ASSERT3U(error, ==, 0); 3868 3869 tx = dmu_tx_create(os); 3870 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 3871 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3872 if (txg == 0) 3873 return; 3874 VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); 3875 VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); 3876 dmu_tx_commit(tx); 3877} 3878 3879/* 3880 * Testcase to test the upgrading of a microzap to fatzap. 3881 */ 3882void 3883ztest_fzap(ztest_ds_t *zd, uint64_t id) 3884{ 3885 objset_t *os = zd->zd_os; 3886 ztest_od_t od[1]; 3887 uint64_t object, txg; 3888 3889 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); 3890 3891 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 3892 return; 3893 3894 object = od[0].od_object; 3895 3896 /* 3897 * Add entries to this ZAP and make sure it spills over 3898 * and gets upgraded to a fatzap. Also, since we are adding 3899 * 2050 entries we should see ptrtbl growth and leaf-block split. 
3900 */ 3901 for (int i = 0; i < 2050; i++) { 3902 char name[MAXNAMELEN]; 3903 uint64_t value = i; 3904 dmu_tx_t *tx; 3905 int error; 3906 3907 (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", 3908 id, value); 3909 3910 tx = dmu_tx_create(os); 3911 dmu_tx_hold_zap(tx, object, B_TRUE, name); 3912 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3913 if (txg == 0) 3914 return; 3915 error = zap_add(os, object, name, sizeof (uint64_t), 1, 3916 &value, tx); 3917 ASSERT(error == 0 || error == EEXIST); 3918 dmu_tx_commit(tx); 3919 } 3920} 3921 3922/* ARGSUSED */ 3923void 3924ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 3925{ 3926 objset_t *os = zd->zd_os; 3927 ztest_od_t od[1]; 3928 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 3929 dmu_tx_t *tx; 3930 int i, namelen, error; 3931 int micro = ztest_random(2); 3932 char name[20], string_value[20]; 3933 void *data; 3934 3935 ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); 3936 3937 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 3938 return; 3939 3940 object = od[0].od_object; 3941 3942 /* 3943 * Generate a random name of the form 'xxx.....' where each 3944 * x is a random printable character and the dots are dots. 3945 * There are 94 such characters, and the name length goes from 3946 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 3947 */ 3948 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 3949 3950 for (i = 0; i < 3; i++) 3951 name[i] = '!' + ztest_random('~' - '!' + 1); 3952 for (; i < namelen - 1; i++) 3953 name[i] = '.'; 3954 name[i] = '\0'; 3955 3956 if ((namelen & 1) || micro) { 3957 wsize = sizeof (txg); 3958 wc = 1; 3959 data = &txg; 3960 } else { 3961 wsize = 1; 3962 wc = namelen; 3963 data = string_value; 3964 } 3965 3966 count = -1ULL; 3967 VERIFY(zap_count(os, object, &count) == 0); 3968 ASSERT(count != -1ULL); 3969 3970 /* 3971 * Select an operation: length, lookup, add, update, remove. 
3972 */ 3973 i = ztest_random(5); 3974 3975 if (i >= 2) { 3976 tx = dmu_tx_create(os); 3977 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 3978 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3979 if (txg == 0) 3980 return; 3981 bcopy(name, string_value, namelen); 3982 } else { 3983 tx = NULL; 3984 txg = 0; 3985 bzero(string_value, namelen); 3986 } 3987 3988 switch (i) { 3989 3990 case 0: 3991 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 3992 if (error == 0) { 3993 ASSERT3U(wsize, ==, zl_wsize); 3994 ASSERT3U(wc, ==, zl_wc); 3995 } else { 3996 ASSERT3U(error, ==, ENOENT); 3997 } 3998 break; 3999 4000 case 1: 4001 error = zap_lookup(os, object, name, wsize, wc, data); 4002 if (error == 0) { 4003 if (data == string_value && 4004 bcmp(name, data, namelen) != 0) 4005 fatal(0, "name '%s' != val '%s' len %d", 4006 name, data, namelen); 4007 } else { 4008 ASSERT3U(error, ==, ENOENT); 4009 } 4010 break; 4011 4012 case 2: 4013 error = zap_add(os, object, name, wsize, wc, data, tx); 4014 ASSERT(error == 0 || error == EEXIST); 4015 break; 4016 4017 case 3: 4018 VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); 4019 break; 4020 4021 case 4: 4022 error = zap_remove(os, object, name, tx); 4023 ASSERT(error == 0 || error == ENOENT); 4024 break; 4025 } 4026 4027 if (tx != NULL) 4028 dmu_tx_commit(tx); 4029} 4030 4031/* 4032 * Commit callback data. 
4033 */ 4034typedef struct ztest_cb_data { 4035 list_node_t zcd_node; 4036 uint64_t zcd_txg; 4037 int zcd_expected_err; 4038 boolean_t zcd_added; 4039 boolean_t zcd_called; 4040 spa_t *zcd_spa; 4041} ztest_cb_data_t; 4042 4043/* This is the actual commit callback function */ 4044static void 4045ztest_commit_callback(void *arg, int error) 4046{ 4047 ztest_cb_data_t *data = arg; 4048 uint64_t synced_txg; 4049 4050 VERIFY(data != NULL); 4051 VERIFY3S(data->zcd_expected_err, ==, error); 4052 VERIFY(!data->zcd_called); 4053 4054 synced_txg = spa_last_synced_txg(data->zcd_spa); 4055 if (data->zcd_txg > synced_txg) 4056 fatal(0, "commit callback of txg %" PRIu64 " called prematurely" 4057 ", last synced txg = %" PRIu64 "\n", data->zcd_txg, 4058 synced_txg); 4059 4060 data->zcd_called = B_TRUE; 4061 4062 if (error == ECANCELED) { 4063 ASSERT3U(data->zcd_txg, ==, 0); 4064 ASSERT(!data->zcd_added); 4065 4066 /* 4067 * The private callback data should be destroyed here, but 4068 * since we are going to check the zcd_called field after 4069 * dmu_tx_abort(), we will destroy it there. 4070 */ 4071 return; 4072 } 4073 4074 /* Was this callback added to the global callback list? 
*/ 4075 if (!data->zcd_added) 4076 goto out; 4077 4078 ASSERT3U(data->zcd_txg, !=, 0); 4079 4080 /* Remove our callback from the list */ 4081 (void) mutex_lock(&zcl.zcl_callbacks_lock); 4082 list_remove(&zcl.zcl_callbacks, data); 4083 (void) mutex_unlock(&zcl.zcl_callbacks_lock); 4084 4085out: 4086 umem_free(data, sizeof (ztest_cb_data_t)); 4087} 4088 4089/* Allocate and initialize callback data structure */ 4090static ztest_cb_data_t * 4091ztest_create_cb_data(objset_t *os, uint64_t txg) 4092{ 4093 ztest_cb_data_t *cb_data; 4094 4095 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 4096 4097 cb_data->zcd_txg = txg; 4098 cb_data->zcd_spa = dmu_objset_spa(os); 4099 4100 return (cb_data); 4101} 4102 4103/* 4104 * If a number of txgs equal to this threshold have been created after a commit 4105 * callback has been registered but not called, then we assume there is an 4106 * implementation bug. 4107 */ 4108#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) 4109 4110/* 4111 * Commit callback test. 4112 */ 4113void 4114ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 4115{ 4116 objset_t *os = zd->zd_os; 4117 ztest_od_t od[1]; 4118 dmu_tx_t *tx; 4119 ztest_cb_data_t *cb_data[3], *tmp_cb; 4120 uint64_t old_txg, txg; 4121 int i, error; 4122 4123 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); 4124 4125 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4126 return; 4127 4128 tx = dmu_tx_create(os); 4129 4130 cb_data[0] = ztest_create_cb_data(os, 0); 4131 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 4132 4133 dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); 4134 4135 /* Every once in a while, abort the transaction on purpose */ 4136 if (ztest_random(100) == 0) 4137 error = -1; 4138 4139 if (!error) 4140 error = dmu_tx_assign(tx, TXG_NOWAIT); 4141 4142 txg = error ? 
0 : dmu_tx_get_txg(tx); 4143 4144 cb_data[0]->zcd_txg = txg; 4145 cb_data[1] = ztest_create_cb_data(os, txg); 4146 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 4147 4148 if (error) { 4149 /* 4150 * It's not a strict requirement to call the registered 4151 * callbacks from inside dmu_tx_abort(), but that's what 4152 * it's supposed to happen in the current implementation 4153 * so we will check for that. 4154 */ 4155 for (i = 0; i < 2; i++) { 4156 cb_data[i]->zcd_expected_err = ECANCELED; 4157 VERIFY(!cb_data[i]->zcd_called); 4158 } 4159 4160 dmu_tx_abort(tx); 4161 4162 for (i = 0; i < 2; i++) { 4163 VERIFY(cb_data[i]->zcd_called); 4164 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 4165 } 4166 4167 return; 4168 } 4169 4170 cb_data[2] = ztest_create_cb_data(os, txg); 4171 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 4172 4173 /* 4174 * Read existing data to make sure there isn't a future leak. 4175 */ 4176 VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), 4177 &old_txg, DMU_READ_PREFETCH)); 4178 4179 if (old_txg > txg) 4180 fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, 4181 old_txg, txg); 4182 4183 dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); 4184 4185 (void) mutex_lock(&zcl.zcl_callbacks_lock); 4186 4187 /* 4188 * Since commit callbacks don't have any ordering requirement and since 4189 * it is theoretically possible for a commit callback to be called 4190 * after an arbitrary amount of time has elapsed since its txg has been 4191 * synced, it is difficult to reliably determine whether a commit 4192 * callback hasn't been called due to high load or due to a flawed 4193 * implementation. 4194 * 4195 * In practice, we will assume that if after a certain number of txgs a 4196 * commit callback hasn't been called, then most likely there's an 4197 * implementation bug.. 
4198 */ 4199 tmp_cb = list_head(&zcl.zcl_callbacks); 4200 if (tmp_cb != NULL && 4201 tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) { 4202 fatal(0, "Commit callback threshold exceeded, oldest txg: %" 4203 PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); 4204 } 4205 4206 /* 4207 * Let's find the place to insert our callbacks. 4208 * 4209 * Even though the list is ordered by txg, it is possible for the 4210 * insertion point to not be the end because our txg may already be 4211 * quiescing at this point and other callbacks in the open txg 4212 * (from other objsets) may have sneaked in. 4213 */ 4214 tmp_cb = list_tail(&zcl.zcl_callbacks); 4215 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 4216 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 4217 4218 /* Add the 3 callbacks to the list */ 4219 for (i = 0; i < 3; i++) { 4220 if (tmp_cb == NULL) 4221 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 4222 else 4223 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 4224 cb_data[i]); 4225 4226 cb_data[i]->zcd_added = B_TRUE; 4227 VERIFY(!cb_data[i]->zcd_called); 4228 4229 tmp_cb = cb_data[i]; 4230 } 4231 4232 (void) mutex_unlock(&zcl.zcl_callbacks_lock); 4233 4234 dmu_tx_commit(tx); 4235} 4236 4237/* ARGSUSED */ 4238void 4239ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 4240{ 4241 zfs_prop_t proplist[] = { 4242 ZFS_PROP_CHECKSUM, 4243 ZFS_PROP_COMPRESSION, 4244 ZFS_PROP_COPIES, 4245 ZFS_PROP_DEDUP 4246 }; 4247 ztest_shared_t *zs = ztest_shared; 4248 4249 (void) rw_rdlock(&zs->zs_name_lock); 4250 4251 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 4252 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 4253 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 4254 4255 (void) rw_unlock(&zs->zs_name_lock); 4256} 4257 4258/* ARGSUSED */ 4259void 4260ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 4261{ 4262 ztest_shared_t *zs = ztest_shared; 4263 nvlist_t *props = NULL; 4264 4265 (void) rw_rdlock(&zs->zs_name_lock); 
4266 4267 (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO, 4268 ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); 4269 4270 VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0); 4271 4272 if (zopt_verbose >= 6) 4273 dump_nvlist(props, 4); 4274 4275 nvlist_free(props); 4276 4277 (void) rw_unlock(&zs->zs_name_lock); 4278} 4279 4280/* 4281 * Test snapshot hold/release and deferred destroy. 4282 */ 4283void 4284ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 4285{ 4286 int error; 4287 objset_t *os = zd->zd_os; 4288 objset_t *origin; 4289 char snapname[100]; 4290 char fullname[100]; 4291 char clonename[100]; 4292 char tag[100]; 4293 char osname[MAXNAMELEN]; 4294 4295 (void) rw_rdlock(&ztest_shared->zs_name_lock); 4296 4297 dmu_objset_name(os, osname); 4298 4299 (void) snprintf(snapname, 100, "sh1_%llu", id); 4300 (void) snprintf(fullname, 100, "%s@%s", osname, snapname); 4301 (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id); 4302 (void) snprintf(tag, 100, "%tag_%llu", id); 4303 4304 /* 4305 * Clean up from any previous run. 4306 */ 4307 (void) dmu_objset_destroy(clonename, B_FALSE); 4308 (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE); 4309 (void) dmu_objset_destroy(fullname, B_FALSE); 4310 4311 /* 4312 * Create snapshot, clone it, mark snap for deferred destroy, 4313 * destroy clone, verify snap was also destroyed. 
4314 */ 4315 error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, 4316 FALSE, -1); 4317 if (error) { 4318 if (error == ENOSPC) { 4319 ztest_record_enospc("dmu_objset_snapshot"); 4320 goto out; 4321 } 4322 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 4323 } 4324 4325 error = dmu_objset_hold(fullname, FTAG, &origin); 4326 if (error) 4327 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); 4328 4329 error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0); 4330 dmu_objset_rele(origin, FTAG); 4331 if (error) { 4332 if (error == ENOSPC) { 4333 ztest_record_enospc("dmu_objset_clone"); 4334 goto out; 4335 } 4336 fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); 4337 } 4338 4339 error = dmu_objset_destroy(fullname, B_TRUE); 4340 if (error) { 4341 fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", 4342 fullname, error); 4343 } 4344 4345 error = dmu_objset_destroy(clonename, B_FALSE); 4346 if (error) 4347 fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error); 4348 4349 error = dmu_objset_hold(fullname, FTAG, &origin); 4350 if (error != ENOENT) 4351 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); 4352 4353 /* 4354 * Create snapshot, add temporary hold, verify that we can't 4355 * destroy a held snapshot, mark for deferred destroy, 4356 * release hold, verify snapshot was destroyed. 
4357 */ 4358 error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, 4359 FALSE, -1); 4360 if (error) { 4361 if (error == ENOSPC) { 4362 ztest_record_enospc("dmu_objset_snapshot"); 4363 goto out; 4364 } 4365 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 4366 } 4367 4368 error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, 4369 B_TRUE, -1); 4370 if (error) 4371 fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag); 4372 4373 error = dmu_objset_destroy(fullname, B_FALSE); 4374 if (error != EBUSY) { 4375 fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d", 4376 fullname, error); 4377 } 4378 4379 error = dmu_objset_destroy(fullname, B_TRUE); 4380 if (error) { 4381 fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", 4382 fullname, error); 4383 } 4384 4385 error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE); 4386 if (error) 4387 fatal(0, "dsl_dataset_user_release(%s)", fullname, tag); 4388 4389 VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT); 4390 4391out: 4392 (void) rw_unlock(&ztest_shared->zs_name_lock); 4393} 4394 4395/* 4396 * Inject random faults into the on-disk data. 
4397 */ 4398/* ARGSUSED */ 4399void 4400ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 4401{ 4402 ztest_shared_t *zs = ztest_shared; 4403 spa_t *spa = zs->zs_spa; 4404 int fd; 4405 uint64_t offset; 4406 uint64_t leaves; 4407 uint64_t bad = 0x1990c0ffeedecadeULL; 4408 uint64_t top, leaf; 4409 char path0[MAXPATHLEN]; 4410 char pathrand[MAXPATHLEN]; 4411 size_t fsize; 4412 int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ 4413 int iters = 1000; 4414 int maxfaults; 4415 int mirror_save; 4416 vdev_t *vd0 = NULL; 4417 uint64_t guid0 = 0; 4418 boolean_t islog = B_FALSE; 4419 4420 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 4421 maxfaults = MAXFAULTS(); 4422 leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; 4423 mirror_save = zs->zs_mirrors; 4424 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 4425 4426 ASSERT(leaves >= 1); 4427 4428 /* 4429 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 4430 */ 4431 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4432 4433 if (ztest_random(2) == 0) { 4434 /* 4435 * Inject errors on a normal data device or slog device. 4436 */ 4437 top = ztest_random_vdev_top(spa, B_TRUE); 4438 leaf = ztest_random(leaves) + zs->zs_splits; 4439 4440 /* 4441 * Generate paths to the first leaf in this top-level vdev, 4442 * and to the random leaf we selected. We'll induce transient 4443 * write failures and random online/offline activity on leaf 0, 4444 * and we'll write random garbage to the randomly chosen leaf. 
4445 */ 4446 (void) snprintf(path0, sizeof (path0), ztest_dev_template, 4447 zopt_dir, zopt_pool, top * leaves + zs->zs_splits); 4448 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, 4449 zopt_dir, zopt_pool, top * leaves + leaf); 4450 4451 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 4452 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 4453 islog = B_TRUE; 4454 4455 if (vd0 != NULL && maxfaults != 1) { 4456 /* 4457 * Make vd0 explicitly claim to be unreadable, 4458 * or unwriteable, or reach behind its back 4459 * and close the underlying fd. We can do this if 4460 * maxfaults == 0 because we'll fail and reexecute, 4461 * and we can do it if maxfaults >= 2 because we'll 4462 * have enough redundancy. If maxfaults == 1, the 4463 * combination of this with injection of random data 4464 * corruption below exceeds the pool's fault tolerance. 4465 */ 4466 vdev_file_t *vf = vd0->vdev_tsd; 4467 4468 if (vf != NULL && ztest_random(3) == 0) { 4469 (void) close(vf->vf_vnode->v_fd); 4470 vf->vf_vnode->v_fd = -1; 4471 } else if (ztest_random(2) == 0) { 4472 vd0->vdev_cant_read = B_TRUE; 4473 } else { 4474 vd0->vdev_cant_write = B_TRUE; 4475 } 4476 guid0 = vd0->vdev_guid; 4477 } 4478 } else { 4479 /* 4480 * Inject errors on an l2cache device. 4481 */ 4482 spa_aux_vdev_t *sav = &spa->spa_l2cache; 4483 4484 if (sav->sav_count == 0) { 4485 spa_config_exit(spa, SCL_STATE, FTAG); 4486 return; 4487 } 4488 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 4489 guid0 = vd0->vdev_guid; 4490 (void) strcpy(path0, vd0->vdev_path); 4491 (void) strcpy(pathrand, vd0->vdev_path); 4492 4493 leaf = 0; 4494 leaves = 1; 4495 maxfaults = INT_MAX; /* no limit on cache devices */ 4496 } 4497 4498 spa_config_exit(spa, SCL_STATE, FTAG); 4499 4500 /* 4501 * If we can tolerate two or more faults, or we're dealing 4502 * with a slog, randomly online/offline vd0. 
4503 */ 4504 if ((maxfaults >= 2 || islog) && guid0 != 0) { 4505 if (ztest_random(10) < 6) { 4506 int flags = (ztest_random(2) == 0 ? 4507 ZFS_OFFLINE_TEMPORARY : 0); 4508 4509 /* 4510 * We have to grab the zs_name_lock as writer to 4511 * prevent a race between offlining a slog and 4512 * destroying a dataset. Offlining the slog will 4513 * grab a reference on the dataset which may cause 4514 * dmu_objset_destroy() to fail with EBUSY thus 4515 * leaving the dataset in an inconsistent state. 4516 */ 4517 if (islog) 4518 (void) rw_wrlock(&ztest_shared->zs_name_lock); 4519 4520 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); 4521 4522 if (islog) 4523 (void) rw_unlock(&ztest_shared->zs_name_lock); 4524 } else { 4525 (void) vdev_online(spa, guid0, 0, NULL); 4526 } 4527 } 4528 4529 if (maxfaults == 0) 4530 return; 4531 4532 /* 4533 * We have at least single-fault tolerance, so inject data corruption. 4534 */ 4535 fd = open(pathrand, O_RDWR); 4536 4537 if (fd == -1) /* we hit a gap in the device namespace */ 4538 return; 4539 4540 fsize = lseek(fd, 0, SEEK_END); 4541 4542 while (--iters != 0) { 4543 offset = ztest_random(fsize / (leaves << bshift)) * 4544 (leaves << bshift) + (leaf << bshift) + 4545 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 4546 4547 if (offset >= fsize) 4548 continue; 4549 4550 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 4551 if (mirror_save != zs->zs_mirrors) { 4552 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 4553 (void) close(fd); 4554 return; 4555 } 4556 4557 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 4558 fatal(1, "can't inject bad word at 0x%llx in %s", 4559 offset, pathrand); 4560 4561 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 4562 4563 if (zopt_verbose >= 7) 4564 (void) printf("injected bad word into %s," 4565 " offset 0x%llx\n", pathrand, (u_longlong_t)offset); 4566 } 4567 4568 (void) close(fd); 4569} 4570 4571/* 4572 * Verify that DDT repair works as expected. 
4573 */ 4574void 4575ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) 4576{ 4577 ztest_shared_t *zs = ztest_shared; 4578 spa_t *spa = zs->zs_spa; 4579 objset_t *os = zd->zd_os; 4580 ztest_od_t od[1]; 4581 uint64_t object, blocksize, txg, pattern, psize; 4582 enum zio_checksum checksum = spa_dedup_checksum(spa); 4583 dmu_buf_t *db; 4584 dmu_tx_t *tx; 4585 void *buf; 4586 blkptr_t blk; 4587 int copies = 2 * ZIO_DEDUPDITTO_MIN; 4588 4589 blocksize = ztest_random_blocksize(); 4590 blocksize = MIN(blocksize, 2048); /* because we write so many */ 4591 4592 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); 4593 4594 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4595 return; 4596 4597 /* 4598 * Take the name lock as writer to prevent anyone else from changing 4599 * the pool and dataset properies we need to maintain during this test. 4600 */ 4601 (void) rw_wrlock(&zs->zs_name_lock); 4602 4603 if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, 4604 B_FALSE) != 0 || 4605 ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, 4606 B_FALSE) != 0) { 4607 (void) rw_unlock(&zs->zs_name_lock); 4608 return; 4609 } 4610 4611 object = od[0].od_object; 4612 blocksize = od[0].od_blocksize; 4613 pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os); 4614 4615 ASSERT(object != 0); 4616 4617 tx = dmu_tx_create(os); 4618 dmu_tx_hold_write(tx, object, 0, copies * blocksize); 4619 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 4620 if (txg == 0) { 4621 (void) rw_unlock(&zs->zs_name_lock); 4622 return; 4623 } 4624 4625 /* 4626 * Write all the copies of our block. 
4627 */ 4628 for (int i = 0; i < copies; i++) { 4629 uint64_t offset = i * blocksize; 4630 VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db, 4631 DMU_READ_NO_PREFETCH) == 0); 4632 ASSERT(db->db_offset == offset); 4633 ASSERT(db->db_size == blocksize); 4634 ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || 4635 ztest_pattern_match(db->db_data, db->db_size, 0ULL)); 4636 dmu_buf_will_fill(db, tx); 4637 ztest_pattern_set(db->db_data, db->db_size, pattern); 4638 dmu_buf_rele(db, FTAG); 4639 } 4640 4641 dmu_tx_commit(tx); 4642 txg_wait_synced(spa_get_dsl(spa), txg); 4643 4644 /* 4645 * Find out what block we got. 4646 */ 4647 VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db, 4648 DMU_READ_NO_PREFETCH) == 0); 4649 blk = *((dmu_buf_impl_t *)db)->db_blkptr; 4650 dmu_buf_rele(db, FTAG); 4651 4652 /* 4653 * Damage the block. Dedup-ditto will save us when we read it later. 4654 */ 4655 psize = BP_GET_PSIZE(&blk); 4656 buf = zio_buf_alloc(psize); 4657 ztest_pattern_set(buf, psize, ~pattern); 4658 4659 (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, 4660 buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, 4661 ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); 4662 4663 zio_buf_free(buf, psize); 4664 4665 (void) rw_unlock(&zs->zs_name_lock); 4666} 4667 4668/* 4669 * Scrub the pool. 4670 */ 4671/* ARGSUSED */ 4672void 4673ztest_scrub(ztest_ds_t *zd, uint64_t id) 4674{ 4675 ztest_shared_t *zs = ztest_shared; 4676 spa_t *spa = zs->zs_spa; 4677 4678 (void) spa_scan(spa, POOL_SCAN_SCRUB); 4679 (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ 4680 (void) spa_scan(spa, POOL_SCAN_SCRUB); 4681} 4682 4683/* 4684 * Rename the pool to a different name and then rename it back. 
4685 */ 4686/* ARGSUSED */ 4687void 4688ztest_spa_rename(ztest_ds_t *zd, uint64_t id) 4689{ 4690 ztest_shared_t *zs = ztest_shared; 4691 char *oldname, *newname; 4692 spa_t *spa; 4693 4694 (void) rw_wrlock(&zs->zs_name_lock); 4695 4696 oldname = zs->zs_pool; 4697 newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); 4698 (void) strcpy(newname, oldname); 4699 (void) strcat(newname, "_tmp"); 4700 4701 /* 4702 * Do the rename 4703 */ 4704 VERIFY3U(0, ==, spa_rename(oldname, newname)); 4705 4706 /* 4707 * Try to open it under the old name, which shouldn't exist 4708 */ 4709 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 4710 4711 /* 4712 * Open it under the new name and make sure it's still the same spa_t. 4713 */ 4714 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); 4715 4716 ASSERT(spa == zs->zs_spa); 4717 spa_close(spa, FTAG); 4718 4719 /* 4720 * Rename it back to the original 4721 */ 4722 VERIFY3U(0, ==, spa_rename(newname, oldname)); 4723 4724 /* 4725 * Make sure it can still be opened 4726 */ 4727 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); 4728 4729 ASSERT(spa == zs->zs_spa); 4730 spa_close(spa, FTAG); 4731 4732 umem_free(newname, strlen(newname) + 1); 4733 4734 (void) rw_unlock(&zs->zs_name_lock); 4735} 4736 4737/* 4738 * Verify pool integrity by running zdb. 4739 */ 4740static void 4741ztest_run_zdb(char *pool) 4742{ 4743 int status; 4744 char zdb[MAXPATHLEN + MAXNAMELEN + 20]; 4745 char zbuf[1024]; 4746 char *bin; 4747 char *ztest; 4748 char *isa; 4749 int isalen; 4750 FILE *fp; 4751 4752 strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb)); 4753 4754 /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ 4755 bin = strstr(zdb, "/usr/bin/"); 4756 ztest = strstr(bin, "/ztest"); 4757 isa = bin + 8; 4758 isalen = ztest - isa; 4759 isa = strdup(isa); 4760 /* LINTED */ 4761 (void) sprintf(bin, 4762 "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s", 4763 isalen, 4764 isa, 4765 zopt_verbose >= 3 ? "s" : "", 4766 zopt_verbose >= 4 ? 
"v" : "", 4767 spa_config_path, 4768 pool); 4769 free(isa); 4770 4771 if (zopt_verbose >= 5) 4772 (void) printf("Executing %s\n", strstr(zdb, "zdb ")); 4773 4774 fp = popen(zdb, "r"); 4775 assert(fp != NULL); 4776 4777 while (fgets(zbuf, sizeof (zbuf), fp) != NULL) 4778 if (zopt_verbose >= 3) 4779 (void) printf("%s", zbuf); 4780 4781 status = pclose(fp); 4782 4783 if (status == 0) 4784 return; 4785 4786 ztest_dump_core = 0; 4787 if (WIFEXITED(status)) 4788 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 4789 else 4790 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); 4791} 4792 4793static void 4794ztest_walk_pool_directory(char *header) 4795{ 4796 spa_t *spa = NULL; 4797 4798 if (zopt_verbose >= 6) 4799 (void) printf("%s\n", header); 4800 4801 mutex_enter(&spa_namespace_lock); 4802 while ((spa = spa_next(spa)) != NULL) 4803 if (zopt_verbose >= 6) 4804 (void) printf("\t%s\n", spa_name(spa)); 4805 mutex_exit(&spa_namespace_lock); 4806} 4807 4808static void 4809ztest_spa_import_export(char *oldname, char *newname) 4810{ 4811 nvlist_t *config, *newconfig; 4812 uint64_t pool_guid; 4813 spa_t *spa; 4814 4815 if (zopt_verbose >= 4) { 4816 (void) printf("import/export: old = %s, new = %s\n", 4817 oldname, newname); 4818 } 4819 4820 /* 4821 * Clean up from previous runs. 4822 */ 4823 (void) spa_destroy(newname); 4824 4825 /* 4826 * Get the pool's configuration and guid. 4827 */ 4828 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); 4829 4830 /* 4831 * Kick off a scrub to tickle scrub/export races. 4832 */ 4833 if (ztest_random(2) == 0) 4834 (void) spa_scan(spa, POOL_SCAN_SCRUB); 4835 4836 pool_guid = spa_guid(spa); 4837 spa_close(spa, FTAG); 4838 4839 ztest_walk_pool_directory("pools before export"); 4840 4841 /* 4842 * Export it. 4843 */ 4844 VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); 4845 4846 ztest_walk_pool_directory("pools after export"); 4847 4848 /* 4849 * Try to import it. 
4850 */ 4851 newconfig = spa_tryimport(config); 4852 ASSERT(newconfig != NULL); 4853 nvlist_free(newconfig); 4854 4855 /* 4856 * Import it under the new name. 4857 */ 4858 VERIFY3U(0, ==, spa_import(newname, config, NULL, 0)); 4859 4860 ztest_walk_pool_directory("pools after import"); 4861 4862 /* 4863 * Try to import it again -- should fail with EEXIST. 4864 */ 4865 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 4866 4867 /* 4868 * Try to import it under a different name -- should fail with EEXIST. 4869 */ 4870 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 4871 4872 /* 4873 * Verify that the pool is no longer visible under the old name. 4874 */ 4875 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 4876 4877 /* 4878 * Verify that we can open and close the pool using the new name. 4879 */ 4880 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); 4881 ASSERT(pool_guid == spa_guid(spa)); 4882 spa_close(spa, FTAG); 4883 4884 nvlist_free(config); 4885} 4886 4887static void 4888ztest_resume(spa_t *spa) 4889{ 4890 if (spa_suspended(spa) && zopt_verbose >= 6) 4891 (void) printf("resuming from suspended state\n"); 4892 spa_vdev_state_enter(spa, SCL_NONE); 4893 vdev_clear(spa, NULL); 4894 (void) spa_vdev_state_exit(spa, NULL, 0); 4895 (void) zio_resume(spa); 4896} 4897 4898static void * 4899ztest_resume_thread(void *arg) 4900{ 4901 spa_t *spa = arg; 4902 4903 while (!ztest_exiting) { 4904 if (spa_suspended(spa)) 4905 ztest_resume(spa); 4906 (void) poll(NULL, 0, 100); 4907 } 4908 return (NULL); 4909} 4910 4911static void * 4912ztest_deadman_thread(void *arg) 4913{ 4914 ztest_shared_t *zs = arg; 4915 int grace = 300; 4916 hrtime_t delta; 4917 4918 delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; 4919 4920 (void) poll(NULL, 0, (int)(1000 * delta)); 4921 4922 fatal(0, "failed to complete within %d seconds of deadline", grace); 4923 4924 return (NULL); 4925} 4926 4927static void 4928ztest_execute(ztest_info_t *zi, uint64_t id) 4929{ 
4930 ztest_shared_t *zs = ztest_shared; 4931 ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets]; 4932 hrtime_t functime = gethrtime(); 4933 4934 for (int i = 0; i < zi->zi_iters; i++) 4935 zi->zi_func(zd, id); 4936 4937 functime = gethrtime() - functime; 4938 4939 atomic_add_64(&zi->zi_call_count, 1); 4940 atomic_add_64(&zi->zi_call_time, functime); 4941 4942 if (zopt_verbose >= 4) { 4943 Dl_info dli; 4944 (void) dladdr((void *)zi->zi_func, &dli); 4945 (void) printf("%6.2f sec in %s\n", 4946 (double)functime / NANOSEC, dli.dli_sname); 4947 } 4948} 4949 4950static void * 4951ztest_thread(void *arg) 4952{ 4953 uint64_t id = (uintptr_t)arg; 4954 ztest_shared_t *zs = ztest_shared; 4955 uint64_t call_next; 4956 hrtime_t now; 4957 ztest_info_t *zi; 4958 4959 while ((now = gethrtime()) < zs->zs_thread_stop) { 4960 /* 4961 * See if it's time to force a crash. 4962 */ 4963 if (now > zs->zs_thread_kill) 4964 ztest_kill(zs); 4965 4966 /* 4967 * If we're getting ENOSPC with some regularity, stop. 4968 */ 4969 if (zs->zs_enospc_count > 10) 4970 break; 4971 4972 /* 4973 * Pick a random function to execute. 4974 */ 4975 zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)]; 4976 call_next = zi->zi_call_next; 4977 4978 if (now >= call_next && 4979 atomic_cas_64(&zi->zi_call_next, call_next, call_next + 4980 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) 4981 ztest_execute(zi, id); 4982 } 4983 4984 return (NULL); 4985} 4986 4987static void 4988ztest_dataset_name(char *dsname, char *pool, int d) 4989{ 4990 (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d); 4991} 4992 4993static void 4994ztest_dataset_destroy(ztest_shared_t *zs, int d) 4995{ 4996 char name[MAXNAMELEN]; 4997 4998 ztest_dataset_name(name, zs->zs_pool, d); 4999 5000 if (zopt_verbose >= 3) 5001 (void) printf("Destroying %s to free up space\n", name); 5002 5003 /* 5004 * Cleanup any non-standard clones and snapshots. 
In general, 5005 * ztest thread t operates on dataset (t % zopt_datasets), 5006 * so there may be more than one thing to clean up. 5007 */ 5008 for (int t = d; t < zopt_threads; t += zopt_datasets) 5009 ztest_dsl_dataset_cleanup(name, t); 5010 5011 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 5012 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 5013} 5014 5015static void 5016ztest_dataset_dirobj_verify(ztest_ds_t *zd) 5017{ 5018 uint64_t usedobjs, dirobjs, scratch; 5019 5020 /* 5021 * ZTEST_DIROBJ is the object directory for the entire dataset. 5022 * Therefore, the number of objects in use should equal the 5023 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 5024 * If not, we have an object leak. 5025 * 5026 * Note that we can only check this in ztest_dataset_open(), 5027 * when the open-context and syncing-context values agree. 5028 * That's because zap_count() returns the open-context value, 5029 * while dmu_objset_space() returns the rootbp fill count. 5030 */ 5031 VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 5032 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 5033 ASSERT3U(dirobjs + 1, ==, usedobjs); 5034} 5035 5036static int 5037ztest_dataset_open(ztest_shared_t *zs, int d) 5038{ 5039 ztest_ds_t *zd = &zs->zs_zd[d]; 5040 uint64_t committed_seq = zd->zd_seq; 5041 objset_t *os; 5042 zilog_t *zilog; 5043 char name[MAXNAMELEN]; 5044 int error; 5045 5046 ztest_dataset_name(name, zs->zs_pool, d); 5047 5048 (void) rw_rdlock(&zs->zs_name_lock); 5049 5050 error = ztest_dataset_create(name); 5051 if (error == ENOSPC) { 5052 (void) rw_unlock(&zs->zs_name_lock); 5053 ztest_record_enospc(FTAG); 5054 return (error); 5055 } 5056 ASSERT(error == 0 || error == EEXIST); 5057 5058 VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0); 5059 (void) rw_unlock(&zs->zs_name_lock); 5060 5061 ztest_zd_init(zd, os); 5062 5063 zilog = zd->zd_zilog; 5064 5065 if (zilog->zl_header->zh_claim_lr_seq != 0 && 5066 
zilog->zl_header->zh_claim_lr_seq < committed_seq) 5067 fatal(0, "missing log records: claimed %llu < committed %llu", 5068 zilog->zl_header->zh_claim_lr_seq, committed_seq); 5069 5070 ztest_dataset_dirobj_verify(zd); 5071 5072 zil_replay(os, zd, ztest_replay_vector); 5073 5074 ztest_dataset_dirobj_verify(zd); 5075 5076 if (zopt_verbose >= 6) 5077 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", 5078 zd->zd_name, 5079 (u_longlong_t)zilog->zl_parse_blk_count, 5080 (u_longlong_t)zilog->zl_parse_lr_count, 5081 (u_longlong_t)zilog->zl_replaying_seq); 5082 5083 zilog = zil_open(os, ztest_get_data); 5084 5085 if (zilog->zl_replaying_seq != 0 && 5086 zilog->zl_replaying_seq < committed_seq) 5087 fatal(0, "missing log records: replayed %llu < committed %llu", 5088 zilog->zl_replaying_seq, committed_seq); 5089 5090 return (0); 5091} 5092 5093static void 5094ztest_dataset_close(ztest_shared_t *zs, int d) 5095{ 5096 ztest_ds_t *zd = &zs->zs_zd[d]; 5097 5098 zil_close(zd->zd_zilog); 5099 dmu_objset_rele(zd->zd_os, zd); 5100 5101 ztest_zd_fini(zd); 5102} 5103 5104/* 5105 * Kick off threads to run tests on all datasets in parallel. 5106 */ 5107static void 5108ztest_run(ztest_shared_t *zs) 5109{ 5110 thread_t *tid; 5111 spa_t *spa; 5112 thread_t resume_tid; 5113 int error; 5114 5115 ztest_exiting = B_FALSE; 5116 5117 /* 5118 * Initialize parent/child shared state. 
5119 */ 5120 VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); 5121 VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); 5122 5123 zs->zs_thread_start = gethrtime(); 5124 zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC; 5125 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 5126 zs->zs_thread_kill = zs->zs_thread_stop; 5127 if (ztest_random(100) < zopt_killrate) 5128 zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC); 5129 5130 (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL); 5131 5132 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 5133 offsetof(ztest_cb_data_t, zcd_node)); 5134 5135 /* 5136 * Open our pool. 5137 */ 5138 kernel_init(FREAD | FWRITE); 5139 VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0); 5140 zs->zs_spa = spa; 5141 5142 spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; 5143 5144 /* 5145 * We don't expect the pool to suspend unless maxfaults == 0, 5146 * in which case ztest_fault_inject() temporarily takes away 5147 * the only valid replica. 5148 */ 5149 if (MAXFAULTS() == 0) 5150 spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; 5151 else 5152 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; 5153 5154 /* 5155 * Create a thread to periodically resume suspended I/O. 5156 */ 5157 VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, 5158 &resume_tid) == 0); 5159 5160 /* 5161 * Create a deadman thread to abort() if we hang. 5162 */ 5163 VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, 5164 NULL) == 0); 5165 5166 /* 5167 * Verify that we can safely inquire about about any object, 5168 * whether it's allocated or not. To make it interesting, 5169 * we probe a 5-wide window around each power of two. 5170 * This hits all edge cases, including zero and the max. 
5171 */ 5172 for (int t = 0; t < 64; t++) { 5173 for (int d = -5; d <= 5; d++) { 5174 error = dmu_object_info(spa->spa_meta_objset, 5175 (1ULL << t) + d, NULL); 5176 ASSERT(error == 0 || error == ENOENT || 5177 error == EINVAL); 5178 } 5179 } 5180 5181 /* 5182 * If we got any ENOSPC errors on the previous run, destroy something. 5183 */ 5184 if (zs->zs_enospc_count != 0) { 5185 int d = ztest_random(zopt_datasets); 5186 ztest_dataset_destroy(zs, d); 5187 } 5188 zs->zs_enospc_count = 0; 5189 5190 tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL); 5191 5192 if (zopt_verbose >= 4) 5193 (void) printf("starting main threads...\n"); 5194 5195 /* 5196 * Kick off all the tests that run in parallel. 5197 */ 5198 for (int t = 0; t < zopt_threads; t++) { 5199 if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0) 5200 return; 5201 VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, 5202 THR_BOUND, &tid[t]) == 0); 5203 } 5204 5205 /* 5206 * Wait for all of the tests to complete. We go in reverse order 5207 * so we don't close datasets while threads are still using them. 5208 */ 5209 for (int t = zopt_threads - 1; t >= 0; t--) { 5210 VERIFY(thr_join(tid[t], NULL, NULL) == 0); 5211 if (t < zopt_datasets) 5212 ztest_dataset_close(zs, t); 5213 } 5214 5215 txg_wait_synced(spa_get_dsl(spa), 0); 5216 5217 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 5218 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 5219 5220 umem_free(tid, zopt_threads * sizeof (thread_t)); 5221 5222 /* Kill the resume thread */ 5223 ztest_exiting = B_TRUE; 5224 VERIFY(thr_join(resume_tid, NULL, NULL) == 0); 5225 ztest_resume(spa); 5226 5227 /* 5228 * Right before closing the pool, kick off a bunch of async I/O; 5229 * spa_close() should wait for it to complete. 
5230 */ 5231 for (uint64_t object = 1; object < 50; object++) 5232 dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); 5233 5234 spa_close(spa, FTAG); 5235 5236 /* 5237 * Verify that we can loop over all pools. 5238 */ 5239 mutex_enter(&spa_namespace_lock); 5240 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 5241 if (zopt_verbose > 3) 5242 (void) printf("spa_next: found %s\n", spa_name(spa)); 5243 mutex_exit(&spa_namespace_lock); 5244 5245 /* 5246 * Verify that we can export the pool and reimport it under a 5247 * different name. 5248 */ 5249 if (ztest_random(2) == 0) { 5250 char name[MAXNAMELEN]; 5251 (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool); 5252 ztest_spa_import_export(zs->zs_pool, name); 5253 ztest_spa_import_export(name, zs->zs_pool); 5254 } 5255 5256 kernel_fini(); 5257 5258 list_destroy(&zcl.zcl_callbacks); 5259 5260 (void) _mutex_destroy(&zcl.zcl_callbacks_lock); 5261 5262 (void) rwlock_destroy(&zs->zs_name_lock); 5263 (void) _mutex_destroy(&zs->zs_vdev_lock); 5264} 5265 5266static void 5267ztest_freeze(ztest_shared_t *zs) 5268{ 5269 ztest_ds_t *zd = &zs->zs_zd[0]; 5270 spa_t *spa; 5271 int numloops = 0; 5272 5273 if (zopt_verbose >= 3) 5274 (void) printf("testing spa_freeze()...\n"); 5275 5276 kernel_init(FREAD | FWRITE); 5277 VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); 5278 VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); 5279 5280 /* 5281 * Force the first log block to be transactionally allocated. 5282 * We have to do this before we freeze the pool -- otherwise 5283 * the log chain won't be anchored. 5284 */ 5285 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 5286 ztest_dmu_object_alloc_free(zd, 0); 5287 zil_commit(zd->zd_zilog, 0); 5288 } 5289 5290 txg_wait_synced(spa_get_dsl(spa), 0); 5291 5292 /* 5293 * Freeze the pool. This stops spa_sync() from doing anything, 5294 * so that the only way to record changes from now on is the ZIL. 
5295 */ 5296 spa_freeze(spa); 5297 5298 /* 5299 * Run tests that generate log records but don't alter the pool config 5300 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 5301 * We do a txg_wait_synced() after each iteration to force the txg 5302 * to increase well beyond the last synced value in the uberblock. 5303 * The ZIL should be OK with that. 5304 */ 5305 while (ztest_random(10) != 0 && numloops++ < zopt_maxloops) { 5306 ztest_dmu_write_parallel(zd, 0); 5307 ztest_dmu_object_alloc_free(zd, 0); 5308 txg_wait_synced(spa_get_dsl(spa), 0); 5309 } 5310 5311 /* 5312 * Commit all of the changes we just generated. 5313 */ 5314 zil_commit(zd->zd_zilog, 0); 5315 txg_wait_synced(spa_get_dsl(spa), 0); 5316 5317 /* 5318 * Close our dataset and close the pool. 5319 */ 5320 ztest_dataset_close(zs, 0); 5321 spa_close(spa, FTAG); 5322 kernel_fini(); 5323 5324 /* 5325 * Open and close the pool and dataset to induce log replay. 5326 */ 5327 kernel_init(FREAD | FWRITE); 5328 VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); 5329 VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); 5330 ztest_dataset_close(zs, 0); 5331 spa_close(spa, FTAG); 5332 kernel_fini(); 5333} 5334 5335void 5336print_time(hrtime_t t, char *timebuf) 5337{ 5338 hrtime_t s = t / NANOSEC; 5339 hrtime_t m = s / 60; 5340 hrtime_t h = m / 60; 5341 hrtime_t d = h / 24; 5342 5343 s -= m * 60; 5344 m -= h * 60; 5345 h -= d * 24; 5346 5347 timebuf[0] = '\0'; 5348 5349 if (d) 5350 (void) sprintf(timebuf, 5351 "%llud%02lluh%02llum%02llus", d, h, m, s); 5352 else if (h) 5353 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 5354 else if (m) 5355 (void) sprintf(timebuf, "%llum%02llus", m, s); 5356 else 5357 (void) sprintf(timebuf, "%llus", s); 5358} 5359 5360static nvlist_t * 5361make_random_props() 5362{ 5363 nvlist_t *props; 5364 5365 if (ztest_random(2) == 0) 5366 return (NULL); 5367 5368 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); 5369 VERIFY(nvlist_add_uint64(props, "autoreplace", 1) 
== 0); 5370 5371 (void) printf("props:\n"); 5372 dump_nvlist(props, 4); 5373 5374 return (props); 5375} 5376 5377/* 5378 * Create a storage pool with the given name and initial vdev size. 5379 * Then test spa_freeze() functionality. 5380 */ 5381static void 5382ztest_init(ztest_shared_t *zs) 5383{ 5384 spa_t *spa; 5385 nvlist_t *nvroot, *props; 5386 5387 VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); 5388 VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); 5389 5390 kernel_init(FREAD | FWRITE); 5391 5392 /* 5393 * Create the storage pool. 5394 */ 5395 (void) spa_destroy(zs->zs_pool); 5396 ztest_shared->zs_vdev_next_leaf = 0; 5397 zs->zs_splits = 0; 5398 zs->zs_mirrors = zopt_mirrors; 5399 nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, 5400 0, zopt_raidz, zs->zs_mirrors, 1); 5401 props = make_random_props(); 5402 VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL)); 5403 nvlist_free(nvroot); 5404 5405 VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); 5406 metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 5407 spa_close(spa, FTAG); 5408 5409 kernel_fini(); 5410 5411 ztest_run_zdb(zs->zs_pool); 5412 5413 ztest_freeze(zs); 5414 5415 ztest_run_zdb(zs->zs_pool); 5416 5417 (void) rwlock_destroy(&zs->zs_name_lock); 5418 (void) _mutex_destroy(&zs->zs_vdev_lock); 5419} 5420 5421int 5422main(int argc, char **argv) 5423{ 5424 int kills = 0; 5425 int iters = 0; 5426 ztest_shared_t *zs; 5427 size_t shared_size; 5428 ztest_info_t *zi; 5429 char timebuf[100]; 5430 char numbuf[6]; 5431 spa_t *spa; 5432 5433 (void) setvbuf(stdout, NULL, _IOLBF, 0); 5434 5435 ztest_random_fd = open("/dev/urandom", O_RDONLY); 5436 5437 process_options(argc, argv); 5438 5439 /* Override location of zpool.cache */ 5440 (void) asprintf((char **)&spa_config_path, "%s/zpool.cache", zopt_dir); 5441 5442 /* 5443 * Blow away any existing copy of zpool.cache 5444 */ 5445 if (zopt_init != 0) 5446 (void) remove(spa_config_path); 5447 
5448 shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t); 5449 5450 zs = ztest_shared = (void *)mmap(0, 5451 P2ROUNDUP(shared_size, getpagesize()), 5452 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); 5453 5454 if (zopt_verbose >= 1) { 5455 (void) printf("%llu vdevs, %d datasets, %d threads," 5456 " %llu seconds...\n", 5457 (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads, 5458 (u_longlong_t)zopt_time); 5459 } 5460 5461 /* 5462 * Create and initialize our storage pool. 5463 */ 5464 for (int i = 1; i <= zopt_init; i++) { 5465 bzero(zs, sizeof (ztest_shared_t)); 5466 if (zopt_verbose >= 3 && zopt_init != 1) 5467 (void) printf("ztest_init(), pass %d\n", i); 5468 zs->zs_pool = zopt_pool; 5469 ztest_init(zs); 5470 } 5471 5472 zs->zs_pool = zopt_pool; 5473 zs->zs_proc_start = gethrtime(); 5474 zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC; 5475 5476 for (int f = 0; f < ZTEST_FUNCS; f++) { 5477 zi = &zs->zs_info[f]; 5478 *zi = ztest_info[f]; 5479 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 5480 zi->zi_call_next = UINT64_MAX; 5481 else 5482 zi->zi_call_next = zs->zs_proc_start + 5483 ztest_random(2 * zi->zi_interval[0] + 1); 5484 } 5485 5486 /* 5487 * Run the tests in a loop. These tests include fault injection 5488 * to verify that self-healing data works, and forced crashes 5489 * to verify that we never lose on-disk consistency. 5490 */ 5491 while (gethrtime() < zs->zs_proc_stop) { 5492 int status; 5493 pid_t pid; 5494 5495 /* 5496 * Initialize the workload counters for each function. 
5497 */ 5498 for (int f = 0; f < ZTEST_FUNCS; f++) { 5499 zi = &zs->zs_info[f]; 5500 zi->zi_call_count = 0; 5501 zi->zi_call_time = 0; 5502 } 5503 5504 /* Set the allocation switch size */ 5505 metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1; 5506 5507 pid = fork(); 5508 5509 if (pid == -1) 5510 fatal(1, "fork failed"); 5511 5512 if (pid == 0) { /* child */ 5513 struct rlimit rl = { 1024, 1024 }; 5514 (void) setrlimit(RLIMIT_NOFILE, &rl); 5515 (void) enable_extended_FILE_stdio(-1, -1); 5516 ztest_run(zs); 5517 exit(0); 5518 } 5519 5520 while (waitpid(pid, &status, 0) != pid) 5521 continue; 5522 5523 if (WIFEXITED(status)) { 5524 if (WEXITSTATUS(status) != 0) { 5525 (void) fprintf(stderr, 5526 "child exited with code %d\n", 5527 WEXITSTATUS(status)); 5528 exit(2); 5529 } 5530 } else if (WIFSIGNALED(status)) { 5531 if (WTERMSIG(status) != SIGKILL) { 5532 (void) fprintf(stderr, 5533 "child died with signal %d\n", 5534 WTERMSIG(status)); 5535 exit(3); 5536 } 5537 kills++; 5538 } else { 5539 (void) fprintf(stderr, "something strange happened " 5540 "to child\n"); 5541 exit(4); 5542 } 5543 5544 iters++; 5545 5546 if (zopt_verbose >= 1) { 5547 hrtime_t now = gethrtime(); 5548 5549 now = MIN(now, zs->zs_proc_stop); 5550 print_time(zs->zs_proc_stop - now, timebuf); 5551 nicenum(zs->zs_space, numbuf); 5552 5553 (void) printf("Pass %3d, %8s, %3llu ENOSPC, " 5554 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 5555 iters, 5556 WIFEXITED(status) ? 
"Complete" : "SIGKILL", 5557 (u_longlong_t)zs->zs_enospc_count, 5558 100.0 * zs->zs_alloc / zs->zs_space, 5559 numbuf, 5560 100.0 * (now - zs->zs_proc_start) / 5561 (zopt_time * NANOSEC), timebuf); 5562 } 5563 5564 if (zopt_verbose >= 2) { 5565 (void) printf("\nWorkload summary:\n\n"); 5566 (void) printf("%7s %9s %s\n", 5567 "Calls", "Time", "Function"); 5568 (void) printf("%7s %9s %s\n", 5569 "-----", "----", "--------"); 5570 for (int f = 0; f < ZTEST_FUNCS; f++) { 5571 Dl_info dli; 5572 5573 zi = &zs->zs_info[f]; 5574 print_time(zi->zi_call_time, timebuf); 5575 (void) dladdr((void *)zi->zi_func, &dli); 5576 (void) printf("%7llu %9s %s\n", 5577 (u_longlong_t)zi->zi_call_count, timebuf, 5578 dli.dli_sname); 5579 } 5580 (void) printf("\n"); 5581 } 5582 5583 /* 5584 * It's possible that we killed a child during a rename test, 5585 * in which case we'll have a 'ztest_tmp' pool lying around 5586 * instead of 'ztest'. Do a blind rename in case this happened. 5587 */ 5588 kernel_init(FREAD); 5589 if (spa_open(zopt_pool, &spa, FTAG) == 0) { 5590 spa_close(spa, FTAG); 5591 } else { 5592 char tmpname[MAXNAMELEN]; 5593 kernel_fini(); 5594 kernel_init(FREAD | FWRITE); 5595 (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp", 5596 zopt_pool); 5597 (void) spa_rename(tmpname, zopt_pool); 5598 } 5599 kernel_fini(); 5600 5601 ztest_run_zdb(zopt_pool); 5602 } 5603 5604 if (zopt_verbose >= 1) { 5605 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 5606 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 5607 } 5608 5609 return (0); 5610} 5611