1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 26 * Copyright (c) 2013 Steven Hartland. All rights reserved. 27 * Copyright (c) 2014 Integros [integros.com] 28 * Copyright 2017 Joyent, Inc. 29 * Copyright (c) 2017, Intel Corporation. 30 * Copyright 2017 RackTop Systems. 31 */ 32 33/* 34 * The objective of this program is to provide a DMU/ZAP/SPA stress test 35 * that runs entirely in userland, is easy to use, and easy to extend. 36 * 37 * The overall design of the ztest program is as follows: 38 * 39 * (1) For each major functional area (e.g. adding vdevs to a pool, 40 * creating and destroying datasets, reading and writing objects, etc) 41 * we have a simple routine to test that functionality. These 42 * individual routines do not have to do anything "stressful". 
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions. Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired. If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data. When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful. To get a little bit of information,
 * specify -V. To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
80 * 81 * When ztest invokes itself it passes all relevant information through a 82 * temporary file which is mmap-ed in the child process. This allows shared 83 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always 84 * stored at offset 0 of this file and contains information on the size and 85 * number of shared structures in the file. The information stored in this file 86 * must remain backwards compatible with older versions of ztest so that 87 * ztest can invoke them during backwards compatibility testing (-B). 88 */ 89 90#include <sys/zfs_context.h> 91#include <sys/spa.h> 92#include <sys/dmu.h> 93#include <sys/txg.h> 94#include <sys/dbuf.h> 95#include <sys/zap.h> 96#include <sys/dmu_objset.h> 97#include <sys/poll.h> 98#include <sys/stat.h> 99#include <sys/time.h> 100#include <sys/wait.h> 101#include <sys/mman.h> 102#include <sys/resource.h> 103#include <sys/zio.h> 104#include <sys/zil.h> 105#include <sys/zil_impl.h> 106#include <sys/vdev_impl.h> 107#include <sys/vdev_file.h> 108#include <sys/vdev_initialize.h> 109#include <sys/spa_impl.h> 110#include <sys/metaslab_impl.h> 111#include <sys/dsl_prop.h> 112#include <sys/dsl_dataset.h> 113#include <sys/dsl_destroy.h> 114#include <sys/dsl_scan.h> 115#include <sys/zio_checksum.h> 116#include <sys/refcount.h> 117#include <sys/zfeature.h> 118#include <sys/dsl_userhold.h> 119#include <sys/abd.h> 120#include <stdio.h> 121#include <stdio_ext.h> 122#include <stdlib.h> 123#include <unistd.h> 124#include <signal.h> 125#include <umem.h> 126#include <dlfcn.h> 127#include <ctype.h> 128#include <math.h> 129#include <errno.h> 130#include <sys/fs/zfs.h> 131#include <libnvpair.h> 132#include <libzfs.h> 133#include <libcmdutils.h> 134 135static int ztest_fd_data = -1; 136static int ztest_fd_rand = -1; 137 138typedef struct ztest_shared_hdr { 139 uint64_t zh_hdr_size; 140 uint64_t zh_opts_size; 141 uint64_t zh_size; 142 uint64_t zh_stats_size; 143 uint64_t zh_stats_count; 144 uint64_t zh_ds_size; 145 
uint64_t zh_ds_count; 146} ztest_shared_hdr_t; 147 148static ztest_shared_hdr_t *ztest_shared_hdr; 149 150enum ztest_class_state { 151 ZTEST_VDEV_CLASS_OFF, 152 ZTEST_VDEV_CLASS_ON, 153 ZTEST_VDEV_CLASS_RND 154}; 155 156typedef struct ztest_shared_opts { 157 char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; 158 char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; 159 char zo_alt_ztest[MAXNAMELEN]; 160 char zo_alt_libpath[MAXNAMELEN]; 161 uint64_t zo_vdevs; 162 uint64_t zo_vdevtime; 163 size_t zo_vdev_size; 164 int zo_ashift; 165 int zo_mirrors; 166 int zo_raidz; 167 int zo_raidz_parity; 168 int zo_datasets; 169 int zo_threads; 170 uint64_t zo_passtime; 171 uint64_t zo_killrate; 172 int zo_verbose; 173 int zo_init; 174 uint64_t zo_time; 175 uint64_t zo_maxloops; 176 uint64_t zo_metaslab_force_ganging; 177 int zo_mmp_test; 178 int zo_special_vdevs; 179} ztest_shared_opts_t; 180 181static const ztest_shared_opts_t ztest_opts_defaults = { 182 .zo_pool = { 'z', 't', 'e', 's', 't', '\0' }, 183 .zo_dir = { '/', 't', 'm', 'p', '\0' }, 184 .zo_alt_ztest = { '\0' }, 185 .zo_alt_libpath = { '\0' }, 186 .zo_vdevs = 5, 187 .zo_ashift = SPA_MINBLOCKSHIFT, 188 .zo_mirrors = 2, 189 .zo_raidz = 4, 190 .zo_raidz_parity = 1, 191 .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ 192 .zo_datasets = 7, 193 .zo_threads = 23, 194 .zo_passtime = 60, /* 60 seconds */ 195 .zo_killrate = 70, /* 70% kill rate */ 196 .zo_verbose = 0, 197 .zo_mmp_test = 0, 198 .zo_init = 1, 199 .zo_time = 300, /* 5 minutes */ 200 .zo_maxloops = 50, /* max loops during spa_freeze() */ 201 .zo_metaslab_force_ganging = 32 << 10, 202 .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, 203}; 204 205extern uint64_t metaslab_force_ganging; 206extern uint64_t metaslab_df_alloc_threshold; 207extern uint64_t zfs_deadman_synctime_ms; 208extern int metaslab_preload_limit; 209extern boolean_t zfs_compressed_arc_enabled; 210extern boolean_t zfs_abd_scatter_enabled; 211extern int dmu_object_alloc_chunk_shift; 212extern boolean_t 
zfs_force_some_double_word_sm_entries; 213extern unsigned long zfs_reconstruct_indirect_damage_fraction; 214 215static ztest_shared_opts_t *ztest_shared_opts; 216static ztest_shared_opts_t ztest_opts; 217 218typedef struct ztest_shared_ds { 219 uint64_t zd_seq; 220} ztest_shared_ds_t; 221 222static ztest_shared_ds_t *ztest_shared_ds; 223#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) 224 225#define BT_MAGIC 0x123456789abcdefULL 226#define MAXFAULTS() \ 227 (MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) 228 229enum ztest_io_type { 230 ZTEST_IO_WRITE_TAG, 231 ZTEST_IO_WRITE_PATTERN, 232 ZTEST_IO_WRITE_ZEROES, 233 ZTEST_IO_TRUNCATE, 234 ZTEST_IO_SETATTR, 235 ZTEST_IO_REWRITE, 236 ZTEST_IO_TYPES 237}; 238 239typedef struct ztest_block_tag { 240 uint64_t bt_magic; 241 uint64_t bt_objset; 242 uint64_t bt_object; 243 uint64_t bt_dnodesize; 244 uint64_t bt_offset; 245 uint64_t bt_gen; 246 uint64_t bt_txg; 247 uint64_t bt_crtxg; 248} ztest_block_tag_t; 249 250typedef struct bufwad { 251 uint64_t bw_index; 252 uint64_t bw_txg; 253 uint64_t bw_data; 254} bufwad_t; 255 256/* 257 * It would be better to use a rangelock_t per object. Unfortunately 258 * the rangelock_t is not a drop-in replacement for rl_t, because we 259 * still need to map from object ID to rangelock_t. 260 */ 261typedef enum { 262 RL_READER, 263 RL_WRITER, 264 RL_APPEND 265} rl_type_t; 266 267typedef struct rll { 268 void *rll_writer; 269 int rll_readers; 270 kmutex_t rll_lock; 271 kcondvar_t rll_cv; 272} rll_t; 273 274typedef struct rl { 275 uint64_t rl_object; 276 uint64_t rl_offset; 277 uint64_t rl_size; 278 rll_t *rl_lock; 279} rl_t; 280 281#define ZTEST_RANGE_LOCKS 64 282#define ZTEST_OBJECT_LOCKS 64 283 284/* 285 * Object descriptor. Used as a template for object lookup/create/remove. 
286 */ 287typedef struct ztest_od { 288 uint64_t od_dir; 289 uint64_t od_object; 290 dmu_object_type_t od_type; 291 dmu_object_type_t od_crtype; 292 uint64_t od_blocksize; 293 uint64_t od_crblocksize; 294 uint64_t od_crdnodesize; 295 uint64_t od_gen; 296 uint64_t od_crgen; 297 char od_name[ZFS_MAX_DATASET_NAME_LEN]; 298} ztest_od_t; 299 300/* 301 * Per-dataset state. 302 */ 303typedef struct ztest_ds { 304 ztest_shared_ds_t *zd_shared; 305 objset_t *zd_os; 306 krwlock_t zd_zilog_lock; 307 zilog_t *zd_zilog; 308 ztest_od_t *zd_od; /* debugging aid */ 309 char zd_name[ZFS_MAX_DATASET_NAME_LEN]; 310 kmutex_t zd_dirobj_lock; 311 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; 312 rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; 313} ztest_ds_t; 314 315/* 316 * Per-iteration state. 317 */ 318typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); 319 320typedef struct ztest_info { 321 ztest_func_t *zi_func; /* test function */ 322 uint64_t zi_iters; /* iterations per execution */ 323 uint64_t *zi_interval; /* execute every <interval> seconds */ 324} ztest_info_t; 325 326typedef struct ztest_shared_callstate { 327 uint64_t zc_count; /* per-pass count */ 328 uint64_t zc_time; /* per-pass time */ 329 uint64_t zc_next; /* next time to call this function */ 330} ztest_shared_callstate_t; 331 332static ztest_shared_callstate_t *ztest_shared_callstate; 333#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) 334 335/* 336 * Note: these aren't static because we want dladdr() to work. 
337 */ 338ztest_func_t ztest_dmu_read_write; 339ztest_func_t ztest_dmu_write_parallel; 340ztest_func_t ztest_dmu_object_alloc_free; 341ztest_func_t ztest_dmu_object_next_chunk; 342ztest_func_t ztest_dmu_commit_callbacks; 343ztest_func_t ztest_zap; 344ztest_func_t ztest_zap_parallel; 345ztest_func_t ztest_zil_commit; 346ztest_func_t ztest_zil_remount; 347ztest_func_t ztest_dmu_read_write_zcopy; 348ztest_func_t ztest_dmu_objset_create_destroy; 349ztest_func_t ztest_dmu_prealloc; 350ztest_func_t ztest_fzap; 351ztest_func_t ztest_dmu_snapshot_create_destroy; 352ztest_func_t ztest_dsl_prop_get_set; 353ztest_func_t ztest_spa_prop_get_set; 354ztest_func_t ztest_spa_create_destroy; 355ztest_func_t ztest_fault_inject; 356ztest_func_t ztest_ddt_repair; 357ztest_func_t ztest_dmu_snapshot_hold; 358ztest_func_t ztest_mmp_enable_disable; 359ztest_func_t ztest_scrub; 360ztest_func_t ztest_dsl_dataset_promote_busy; 361ztest_func_t ztest_vdev_attach_detach; 362ztest_func_t ztest_vdev_LUN_growth; 363ztest_func_t ztest_vdev_add_remove; 364ztest_func_t ztest_vdev_class_add; 365ztest_func_t ztest_vdev_aux_add_remove; 366ztest_func_t ztest_split_pool; 367ztest_func_t ztest_reguid; 368ztest_func_t ztest_spa_upgrade; 369ztest_func_t ztest_device_removal; 370ztest_func_t ztest_remap_blocks; 371ztest_func_t ztest_spa_checkpoint_create_discard; 372ztest_func_t ztest_initialize; 373ztest_func_t ztest_verify_dnode_bt; 374 375uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 376uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ 377uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ 378uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ 379uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ 380 381ztest_info_t ztest_info[] = { 382 { ztest_dmu_read_write, 1, &zopt_always }, 383 { ztest_dmu_write_parallel, 10, &zopt_always }, 384 { ztest_dmu_object_alloc_free, 1, &zopt_always }, 385 { ztest_dmu_object_next_chunk, 1, &zopt_sometimes }, 386 { 
ztest_dmu_commit_callbacks, 1, &zopt_always }, 387 { ztest_zap, 30, &zopt_always }, 388 { ztest_zap_parallel, 100, &zopt_always }, 389 { ztest_split_pool, 1, &zopt_always }, 390 { ztest_zil_commit, 1, &zopt_incessant }, 391 { ztest_zil_remount, 1, &zopt_sometimes }, 392 { ztest_dmu_read_write_zcopy, 1, &zopt_often }, 393 { ztest_dmu_objset_create_destroy, 1, &zopt_often }, 394 { ztest_dsl_prop_get_set, 1, &zopt_often }, 395 { ztest_spa_prop_get_set, 1, &zopt_sometimes }, 396#if 0 397 { ztest_dmu_prealloc, 1, &zopt_sometimes }, 398#endif 399 { ztest_fzap, 1, &zopt_sometimes }, 400 { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, 401 { ztest_spa_create_destroy, 1, &zopt_sometimes }, 402 { ztest_fault_inject, 1, &zopt_incessant }, 403 { ztest_ddt_repair, 1, &zopt_sometimes }, 404 { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, 405 { ztest_mmp_enable_disable, 1, &zopt_sometimes }, 406 { ztest_reguid, 1, &zopt_rarely }, 407 { ztest_scrub, 1, &zopt_often }, 408 { ztest_spa_upgrade, 1, &zopt_rarely }, 409 { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, 410 { ztest_vdev_attach_detach, 1, &zopt_incessant }, 411 { ztest_vdev_LUN_growth, 1, &zopt_rarely }, 412 { ztest_vdev_add_remove, 1, 413 &ztest_opts.zo_vdevtime }, 414 { ztest_vdev_class_add, 1, 415 &ztest_opts.zo_vdevtime }, 416 { ztest_vdev_aux_add_remove, 1, 417 &ztest_opts.zo_vdevtime }, 418 { ztest_device_removal, 1, &zopt_sometimes }, 419 { ztest_remap_blocks, 1, &zopt_sometimes }, 420 { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }, 421 { ztest_initialize, 1, &zopt_sometimes }, 422 { ztest_verify_dnode_bt, 1, &zopt_sometimes } 423}; 424 425#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) 426 427/* 428 * The following struct is used to hold a list of uncalled commit callbacks. 429 * The callbacks are ordered by txg number. 
430 */ 431typedef struct ztest_cb_list { 432 kmutex_t zcl_callbacks_lock; 433 list_t zcl_callbacks; 434} ztest_cb_list_t; 435 436/* 437 * Stuff we need to share writably between parent and child. 438 */ 439typedef struct ztest_shared { 440 boolean_t zs_do_init; 441 hrtime_t zs_proc_start; 442 hrtime_t zs_proc_stop; 443 hrtime_t zs_thread_start; 444 hrtime_t zs_thread_stop; 445 hrtime_t zs_thread_kill; 446 uint64_t zs_enospc_count; 447 uint64_t zs_vdev_next_leaf; 448 uint64_t zs_vdev_aux; 449 uint64_t zs_alloc; 450 uint64_t zs_space; 451 uint64_t zs_splits; 452 uint64_t zs_mirrors; 453 uint64_t zs_metaslab_sz; 454 uint64_t zs_metaslab_df_alloc_threshold; 455 uint64_t zs_guid; 456} ztest_shared_t; 457 458#define ID_PARALLEL -1ULL 459 460static char ztest_dev_template[] = "%s/%s.%llua"; 461static char ztest_aux_template[] = "%s/%s.%s.%llu"; 462ztest_shared_t *ztest_shared; 463 464static spa_t *ztest_spa = NULL; 465static ztest_ds_t *ztest_ds; 466 467static kmutex_t ztest_vdev_lock; 468static boolean_t ztest_device_removal_active = B_FALSE; 469static kmutex_t ztest_checkpoint_lock; 470 471/* 472 * The ztest_name_lock protects the pool and dataset namespace used by 473 * the individual tests. To modify the namespace, consumers must grab 474 * this lock as writer. Grabbing the lock as reader will ensure that the 475 * namespace does not change while the lock is held. 476 */ 477static krwlock_t ztest_name_lock; 478 479static boolean_t ztest_dump_core = B_TRUE; 480static boolean_t ztest_exiting; 481 482/* Global commit callback list */ 483static ztest_cb_list_t zcl; 484 485enum ztest_object { 486 ZTEST_META_DNODE = 0, 487 ZTEST_DIROBJ, 488 ZTEST_OBJECTS 489}; 490 491static void usage(boolean_t) __NORETURN; 492 493/* 494 * These libumem hooks provide a reasonable set of defaults for the allocator's 495 * debugging facilities. 
496 */ 497const char * 498_umem_debug_init() 499{ 500 return ("default,verbose"); /* $UMEM_DEBUG setting */ 501} 502 503const char * 504_umem_logging_init(void) 505{ 506 return ("fail,contents"); /* $UMEM_LOGGING setting */ 507} 508 509#define FATAL_MSG_SZ 1024 510 511char *fatal_msg; 512 513static void 514fatal(int do_perror, char *message, ...) 515{ 516 va_list args; 517 int save_errno = errno; 518 char buf[FATAL_MSG_SZ]; 519 520 (void) fflush(stdout); 521 522 va_start(args, message); 523 (void) sprintf(buf, "ztest: "); 524 /* LINTED */ 525 (void) vsprintf(buf + strlen(buf), message, args); 526 va_end(args); 527 if (do_perror) { 528 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 529 ": %s", strerror(save_errno)); 530 } 531 (void) fprintf(stderr, "%s\n", buf); 532 fatal_msg = buf; /* to ease debugging */ 533 if (ztest_dump_core) 534 abort(); 535 exit(3); 536} 537 538static int 539str2shift(const char *buf) 540{ 541 const char *ends = "BKMGTPEZ"; 542 int i; 543 544 if (buf[0] == '\0') 545 return (0); 546 for (i = 0; i < strlen(ends); i++) { 547 if (toupper(buf[0]) == ends[i]) 548 break; 549 } 550 if (i == strlen(ends)) { 551 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 552 buf); 553 usage(B_FALSE); 554 } 555 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 556 return (10*i); 557 } 558 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 559 usage(B_FALSE); 560 /* NOTREACHED */ 561} 562 563static uint64_t 564nicenumtoull(const char *buf) 565{ 566 char *end; 567 uint64_t val; 568 569 val = strtoull(buf, &end, 0); 570 if (end == buf) { 571 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 572 usage(B_FALSE); 573 } else if (end[0] == '.') { 574 double fval = strtod(buf, &end); 575 fval *= pow(2, str2shift(end)); 576 if (fval > UINT64_MAX) { 577 (void) fprintf(stderr, "ztest: value too large: %s\n", 578 buf); 579 usage(B_FALSE); 580 } 581 val = (uint64_t)fval; 582 } else { 583 int shift = 
str2shift(end); 584 if (shift >= 64 || (val << shift) >> shift != val) { 585 (void) fprintf(stderr, "ztest: value too large: %s\n", 586 buf); 587 usage(B_FALSE); 588 } 589 val <<= shift; 590 } 591 return (val); 592} 593 594static void 595usage(boolean_t requested) 596{ 597 const ztest_shared_opts_t *zo = &ztest_opts_defaults; 598 599 char nice_vdev_size[NN_NUMBUF_SZ]; 600 char nice_force_ganging[NN_NUMBUF_SZ]; 601 FILE *fp = requested ? stdout : stderr; 602 603 nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); 604 nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, 605 sizeof (nice_force_ganging)); 606 607 (void) fprintf(fp, "Usage: %s\n" 608 "\t[-v vdevs (default: %llu)]\n" 609 "\t[-s size_of_each_vdev (default: %s)]\n" 610 "\t[-a alignment_shift (default: %d)] use 0 for random\n" 611 "\t[-m mirror_copies (default: %d)]\n" 612 "\t[-r raidz_disks (default: %d)]\n" 613 "\t[-R raidz_parity (default: %d)]\n" 614 "\t[-d datasets (default: %d)]\n" 615 "\t[-t threads (default: %d)]\n" 616 "\t[-g gang_block_threshold (default: %s)]\n" 617 "\t[-i init_count (default: %d)] initialize pool i times\n" 618 "\t[-k kill_percentage (default: %llu%%)]\n" 619 "\t[-p pool_name (default: %s)]\n" 620 "\t[-f dir (default: %s)] file directory for vdev files\n" 621 "\t[-M] Multi-host simulate pool imported on remote host\n" 622 "\t[-V] verbose (use multiple times for ever more blather)\n" 623 "\t[-E] use existing pool instead of creating new one\n" 624 "\t[-T time (default: %llu sec)] total run time\n" 625 "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" 626 "\t[-P passtime (default: %llu sec)] time per pass\n" 627 "\t[-B alt_ztest (default: <none>)] alternate ztest path\n" 628 "\t[-C vdev class state (default: random)] special=on|off|random\n" 629 "\t[-o variable=value] ... 
set global variable to an unsigned\n" 630 "\t 32-bit integer value\n" 631 "\t[-h] (print help)\n" 632 "", 633 zo->zo_pool, 634 (u_longlong_t)zo->zo_vdevs, /* -v */ 635 nice_vdev_size, /* -s */ 636 zo->zo_ashift, /* -a */ 637 zo->zo_mirrors, /* -m */ 638 zo->zo_raidz, /* -r */ 639 zo->zo_raidz_parity, /* -R */ 640 zo->zo_datasets, /* -d */ 641 zo->zo_threads, /* -t */ 642 nice_force_ganging, /* -g */ 643 zo->zo_init, /* -i */ 644 (u_longlong_t)zo->zo_killrate, /* -k */ 645 zo->zo_pool, /* -p */ 646 zo->zo_dir, /* -f */ 647 (u_longlong_t)zo->zo_time, /* -T */ 648 (u_longlong_t)zo->zo_maxloops, /* -F */ 649 (u_longlong_t)zo->zo_passtime); 650 exit(requested ? 0 : 1); 651} 652 653 654static void 655ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 656{ 657 char name[32]; 658 char *value; 659 int state = ZTEST_VDEV_CLASS_RND; 660 661 (void) strlcpy(name, input, sizeof (name)); 662 663 value = strchr(name, '='); 664 if (value == NULL) { 665 (void) fprintf(stderr, "missing value in property=value " 666 "'-C' argument (%s)\n", input); 667 usage(B_FALSE); 668 } 669 *(value) = '\0'; 670 value++; 671 672 if (strcmp(value, "on") == 0) { 673 state = ZTEST_VDEV_CLASS_ON; 674 } else if (strcmp(value, "off") == 0) { 675 state = ZTEST_VDEV_CLASS_OFF; 676 } else if (strcmp(value, "random") == 0) { 677 state = ZTEST_VDEV_CLASS_RND; 678 } else { 679 (void) fprintf(stderr, "invalid property value '%s'\n", value); 680 usage(B_FALSE); 681 } 682 683 if (strcmp(name, "special") == 0) { 684 zo->zo_special_vdevs = state; 685 } else { 686 (void) fprintf(stderr, "invalid property name '%s'\n", name); 687 usage(B_FALSE); 688 } 689 if (zo->zo_verbose >= 3) 690 (void) printf("%s vdev state is '%s'\n", name, value); 691} 692 693static void 694process_options(int argc, char **argv) 695{ 696 char *path; 697 ztest_shared_opts_t *zo = &ztest_opts; 698 699 int opt; 700 uint64_t value; 701 char altdir[MAXNAMELEN] = { 0 }; 702 703 bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); 704 705 
while ((opt = getopt(argc, argv, 706 "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:")) != EOF) { 707 value = 0; 708 switch (opt) { 709 case 'v': 710 case 's': 711 case 'a': 712 case 'm': 713 case 'r': 714 case 'R': 715 case 'd': 716 case 't': 717 case 'g': 718 case 'i': 719 case 'k': 720 case 'T': 721 case 'P': 722 case 'F': 723 value = nicenumtoull(optarg); 724 } 725 switch (opt) { 726 case 'v': 727 zo->zo_vdevs = value; 728 break; 729 case 's': 730 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 731 break; 732 case 'a': 733 zo->zo_ashift = value; 734 break; 735 case 'm': 736 zo->zo_mirrors = value; 737 break; 738 case 'r': 739 zo->zo_raidz = MAX(1, value); 740 break; 741 case 'R': 742 zo->zo_raidz_parity = MIN(MAX(value, 1), 3); 743 break; 744 case 'd': 745 zo->zo_datasets = MAX(1, value); 746 break; 747 case 't': 748 zo->zo_threads = MAX(1, value); 749 break; 750 case 'g': 751 zo->zo_metaslab_force_ganging = 752 MAX(SPA_MINBLOCKSIZE << 1, value); 753 break; 754 case 'i': 755 zo->zo_init = value; 756 break; 757 case 'k': 758 zo->zo_killrate = value; 759 break; 760 case 'p': 761 (void) strlcpy(zo->zo_pool, optarg, 762 sizeof (zo->zo_pool)); 763 break; 764 case 'f': 765 path = realpath(optarg, NULL); 766 if (path == NULL) { 767 (void) fprintf(stderr, "error: %s: %s\n", 768 optarg, strerror(errno)); 769 usage(B_FALSE); 770 } else { 771 (void) strlcpy(zo->zo_dir, path, 772 sizeof (zo->zo_dir)); 773 } 774 break; 775 case 'M': 776 zo->zo_mmp_test = 1; 777 break; 778 case 'V': 779 zo->zo_verbose++; 780 break; 781 case 'E': 782 zo->zo_init = 0; 783 break; 784 case 'T': 785 zo->zo_time = value; 786 break; 787 case 'P': 788 zo->zo_passtime = MAX(1, value); 789 break; 790 case 'F': 791 zo->zo_maxloops = MAX(1, value); 792 break; 793 case 'B': 794 (void) strlcpy(altdir, optarg, sizeof (altdir)); 795 break; 796 case 'C': 797 ztest_parse_name_value(optarg, zo); 798 break; 799 case 'o': 800 if (set_global_var(optarg) != 0) 801 usage(B_FALSE); 802 break; 803 case 'h': 804 
usage(B_TRUE); 805 break; 806 case '?': 807 default: 808 usage(B_FALSE); 809 break; 810 } 811 } 812 813 zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); 814 815 zo->zo_vdevtime = 816 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 817 UINT64_MAX >> 2); 818 819 if (strlen(altdir) > 0) { 820 char *cmd; 821 char *realaltdir; 822 char *bin; 823 char *ztest; 824 char *isa; 825 int isalen; 826 827 cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 828 realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 829 830 VERIFY(NULL != realpath(getexecname(), cmd)); 831 if (0 != access(altdir, F_OK)) { 832 ztest_dump_core = B_FALSE; 833 fatal(B_TRUE, "invalid alternate ztest path: %s", 834 altdir); 835 } 836 VERIFY(NULL != realpath(altdir, realaltdir)); 837 838 /* 839 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest". 840 * We want to extract <isa> to determine if we should use 841 * 32 or 64 bit binaries. 842 */ 843 bin = strstr(cmd, "/usr/bin/"); 844 ztest = strstr(bin, "/ztest"); 845 isa = bin + 9; 846 isalen = ztest - isa; 847 (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), 848 "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); 849 (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), 850 "%s/usr/lib/%.*s", realaltdir, isalen, isa); 851 852 if (0 != access(zo->zo_alt_ztest, X_OK)) { 853 ztest_dump_core = B_FALSE; 854 fatal(B_TRUE, "invalid alternate ztest: %s", 855 zo->zo_alt_ztest); 856 } else if (0 != access(zo->zo_alt_libpath, X_OK)) { 857 ztest_dump_core = B_FALSE; 858 fatal(B_TRUE, "invalid alternate lib directory %s", 859 zo->zo_alt_libpath); 860 } 861 862 umem_free(cmd, MAXPATHLEN); 863 umem_free(realaltdir, MAXPATHLEN); 864 } 865} 866 867static void 868ztest_kill(ztest_shared_t *zs) 869{ 870 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 871 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 872 873 /* 874 * Before we kill off ztest, make sure that the config is updated. 
875 * See comment above spa_write_cachefile(). 876 */ 877 mutex_enter(&spa_namespace_lock); 878 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); 879 mutex_exit(&spa_namespace_lock); 880 881 zfs_dbgmsg_print(FTAG); 882 (void) kill(getpid(), SIGKILL); 883} 884 885static uint64_t 886ztest_random(uint64_t range) 887{ 888 uint64_t r; 889 890 ASSERT3S(ztest_fd_rand, >=, 0); 891 892 if (range == 0) 893 return (0); 894 895 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 896 fatal(1, "short read from /dev/urandom"); 897 898 return (r % range); 899} 900 901/* ARGSUSED */ 902static void 903ztest_record_enospc(const char *s) 904{ 905 ztest_shared->zs_enospc_count++; 906} 907 908static uint64_t 909ztest_get_ashift(void) 910{ 911 if (ztest_opts.zo_ashift == 0) 912 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 913 return (ztest_opts.zo_ashift); 914} 915 916static nvlist_t * 917make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) 918{ 919 char pathbuf[MAXPATHLEN]; 920 uint64_t vdev; 921 nvlist_t *file; 922 923 if (ashift == 0) 924 ashift = ztest_get_ashift(); 925 926 if (path == NULL) { 927 path = pathbuf; 928 929 if (aux != NULL) { 930 vdev = ztest_shared->zs_vdev_aux; 931 (void) snprintf(path, sizeof (pathbuf), 932 ztest_aux_template, ztest_opts.zo_dir, 933 pool == NULL ? ztest_opts.zo_pool : pool, 934 aux, vdev); 935 } else { 936 vdev = ztest_shared->zs_vdev_next_leaf++; 937 (void) snprintf(path, sizeof (pathbuf), 938 ztest_dev_template, ztest_opts.zo_dir, 939 pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); 940 } 941 } 942 943 if (size != 0) { 944 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 945 if (fd == -1) 946 fatal(1, "can't open %s", path); 947 if (ftruncate(fd, size) != 0) 948 fatal(1, "can't ftruncate %s", path); 949 (void) close(fd); 950 } 951 952 VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); 953 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); 954 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); 955 VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); 956 957 return (file); 958} 959 960static nvlist_t * 961make_vdev_raidz(char *path, char *aux, char *pool, size_t size, 962 uint64_t ashift, int r) 963{ 964 nvlist_t *raidz, **child; 965 int c; 966 967 if (r < 2) 968 return (make_vdev_file(path, aux, pool, size, ashift)); 969 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 970 971 for (c = 0; c < r; c++) 972 child[c] = make_vdev_file(path, aux, pool, size, ashift); 973 974 VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); 975 VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, 976 VDEV_TYPE_RAIDZ) == 0); 977 VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, 978 ztest_opts.zo_raidz_parity) == 0); 979 VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, 980 child, r) == 0); 981 982 for (c = 0; c < r; c++) 983 nvlist_free(child[c]); 984 985 umem_free(child, r * sizeof (nvlist_t *)); 986 987 return (raidz); 988} 989 990static nvlist_t * 991make_vdev_mirror(char *path, char *aux, char *pool, size_t size, 992 uint64_t ashift, int r, int m) 993{ 994 nvlist_t *mirror, **child; 995 int c; 996 997 if (m < 1) 998 return (make_vdev_raidz(path, aux, pool, size, ashift, r)); 999 1000 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1001 1002 for (c = 0; c < m; c++) 1003 child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); 1004 1005 VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); 1006 VERIFY(nvlist_add_string(mirror, 
ZPOOL_CONFIG_TYPE, 1007 VDEV_TYPE_MIRROR) == 0); 1008 VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 1009 child, m) == 0); 1010 1011 for (c = 0; c < m; c++) 1012 nvlist_free(child[c]); 1013 1014 umem_free(child, m * sizeof (nvlist_t *)); 1015 1016 return (mirror); 1017} 1018 1019static nvlist_t * 1020make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, 1021 const char *class, int r, int m, int t) 1022{ 1023 nvlist_t *root, **child; 1024 int c; 1025 boolean_t log; 1026 1027 ASSERT(t > 0); 1028 1029 log = (class != NULL && strcmp(class, "log") == 0); 1030 1031 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1032 1033 for (c = 0; c < t; c++) { 1034 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1035 r, m); 1036 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, 1037 log) == 0); 1038 1039 if (class != NULL && class[0] != '\0') { 1040 ASSERT(m > 1 || log); /* expecting a mirror */ 1041 VERIFY(nvlist_add_string(child[c], 1042 ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0); 1043 } 1044 } 1045 1046 VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); 1047 VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); 1048 VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1049 child, t) == 0); 1050 1051 for (c = 0; c < t; c++) 1052 nvlist_free(child[c]); 1053 1054 umem_free(child, t * sizeof (nvlist_t *)); 1055 1056 return (root); 1057} 1058 1059/* 1060 * Find a random spa version. Returns back a random spa version in the 1061 * range [initial_version, SPA_VERSION_FEATURES]. 
1062 */ 1063static uint64_t 1064ztest_random_spa_version(uint64_t initial_version) 1065{ 1066 uint64_t version = initial_version; 1067 1068 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1069 version = version + 1070 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1071 } 1072 1073 if (version > SPA_VERSION_BEFORE_FEATURES) 1074 version = SPA_VERSION_FEATURES; 1075 1076 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1077 return (version); 1078} 1079 1080static int 1081ztest_random_blocksize(void) 1082{ 1083 uint64_t block_shift; 1084 1085 ASSERT(ztest_spa->spa_max_ashift != 0); 1086 1087 /* 1088 * Choose a block size >= the ashift. 1089 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 1090 */ 1091 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1092 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1093 maxbs = 20; 1094 block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1095 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1096} 1097 1098static int 1099ztest_random_dnodesize(void) 1100{ 1101 int slots; 1102 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1103 1104 if (max_slots == DNODE_MIN_SLOTS) 1105 return (DNODE_MIN_SIZE); 1106 1107 /* 1108 * Weight the random distribution more heavily toward smaller 1109 * dnode sizes since that is more likely to reflect real-world 1110 * usage. 1111 */ 1112 ASSERT3U(max_slots, >, 4); 1113 switch (ztest_random(10)) { 1114 case 0: 1115 slots = 5 + ztest_random(max_slots - 4); 1116 break; 1117 case 1 ... 
4: 1118 slots = 2 + ztest_random(3); 1119 break; 1120 default: 1121 slots = 1; 1122 break; 1123 } 1124 1125 return (slots << DNODE_SHIFT); 1126} 1127 1128static int 1129ztest_random_ibshift(void) 1130{ 1131 return (DN_MIN_INDBLKSHIFT + 1132 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1133} 1134 1135static uint64_t 1136ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1137{ 1138 uint64_t top; 1139 vdev_t *rvd = spa->spa_root_vdev; 1140 vdev_t *tvd; 1141 1142 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1143 1144 do { 1145 top = ztest_random(rvd->vdev_children); 1146 tvd = rvd->vdev_child[top]; 1147 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1148 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1149 1150 return (top); 1151} 1152 1153static uint64_t 1154ztest_random_dsl_prop(zfs_prop_t prop) 1155{ 1156 uint64_t value; 1157 1158 do { 1159 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1160 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1161 1162 return (value); 1163} 1164 1165static int 1166ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1167 boolean_t inherit) 1168{ 1169 const char *propname = zfs_prop_to_name(prop); 1170 const char *valname; 1171 char setpoint[MAXPATHLEN]; 1172 uint64_t curval; 1173 int error; 1174 1175 error = dsl_prop_set_int(osname, propname, 1176 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1177 1178 if (error == ENOSPC) { 1179 ztest_record_enospc(FTAG); 1180 return (error); 1181 } 1182 ASSERT0(error); 1183 1184 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1185 1186 if (ztest_opts.zo_verbose >= 6) { 1187 VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); 1188 (void) printf("%s %s = %s at '%s'\n", 1189 osname, propname, valname, setpoint); 1190 } 1191 1192 return (error); 1193} 1194 1195static int 1196ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1197{ 1198 spa_t *spa = ztest_spa; 1199 nvlist_t *props = NULL; 1200 int error; 1201 1202 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); 1203 VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); 1204 1205 error = spa_prop_set(spa, props); 1206 1207 nvlist_free(props); 1208 1209 if (error == ENOSPC) { 1210 ztest_record_enospc(FTAG); 1211 return (error); 1212 } 1213 ASSERT0(error); 1214 1215 return (error); 1216} 1217 1218static void 1219ztest_rll_init(rll_t *rll) 1220{ 1221 rll->rll_writer = NULL; 1222 rll->rll_readers = 0; 1223 mutex_init(&rll->rll_lock, NULL, USYNC_THREAD, NULL); 1224 cv_init(&rll->rll_cv, NULL, USYNC_THREAD, NULL); 1225} 1226 1227static void 1228ztest_rll_destroy(rll_t *rll) 1229{ 1230 ASSERT(rll->rll_writer == NULL); 1231 ASSERT(rll->rll_readers == 0); 1232 mutex_destroy(&rll->rll_lock); 1233 cv_destroy(&rll->rll_cv); 1234} 1235 1236static void 1237ztest_rll_lock(rll_t *rll, rl_type_t type) 1238{ 1239 mutex_enter(&rll->rll_lock); 1240 1241 if (type == RL_READER) { 1242 while (rll->rll_writer != NULL) 1243 cv_wait(&rll->rll_cv, &rll->rll_lock); 1244 rll->rll_readers++; 1245 } else { 1246 while (rll->rll_writer != NULL || rll->rll_readers) 1247 cv_wait(&rll->rll_cv, &rll->rll_lock); 1248 rll->rll_writer = curthread; 1249 } 1250 1251 mutex_exit(&rll->rll_lock); 1252} 1253 1254static void 1255ztest_rll_unlock(rll_t *rll) 1256{ 1257 mutex_enter(&rll->rll_lock); 1258 1259 if 
(rll->rll_writer) { 1260 ASSERT(rll->rll_readers == 0); 1261 rll->rll_writer = NULL; 1262 } else { 1263 ASSERT(rll->rll_readers != 0); 1264 ASSERT(rll->rll_writer == NULL); 1265 rll->rll_readers--; 1266 } 1267 1268 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1269 cv_broadcast(&rll->rll_cv); 1270 1271 mutex_exit(&rll->rll_lock); 1272} 1273 1274static void 1275ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1276{ 1277 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1278 1279 ztest_rll_lock(rll, type); 1280} 1281 1282static void 1283ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1284{ 1285 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1286 1287 ztest_rll_unlock(rll); 1288} 1289 1290static rl_t * 1291ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1292 uint64_t size, rl_type_t type) 1293{ 1294 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1295 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1296 rl_t *rl; 1297 1298 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1299 rl->rl_object = object; 1300 rl->rl_offset = offset; 1301 rl->rl_size = size; 1302 rl->rl_lock = rll; 1303 1304 ztest_rll_lock(rll, type); 1305 1306 return (rl); 1307} 1308 1309static void 1310ztest_range_unlock(rl_t *rl) 1311{ 1312 rll_t *rll = rl->rl_lock; 1313 1314 ztest_rll_unlock(rll); 1315 1316 umem_free(rl, sizeof (*rl)); 1317} 1318 1319static void 1320ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1321{ 1322 zd->zd_os = os; 1323 zd->zd_zilog = dmu_objset_zil(os); 1324 zd->zd_shared = szd; 1325 dmu_objset_name(os, zd->zd_name); 1326 1327 if (zd->zd_shared != NULL) 1328 zd->zd_shared->zd_seq = 0; 1329 1330 rw_init(&zd->zd_zilog_lock, NULL, USYNC_THREAD, NULL); 1331 mutex_init(&zd->zd_dirobj_lock, NULL, USYNC_THREAD, NULL); 1332 1333 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1334 ztest_rll_init(&zd->zd_object_lock[l]); 1335 1336 for (int l = 0; l < 
ZTEST_RANGE_LOCKS; l++) 1337 ztest_rll_init(&zd->zd_range_lock[l]); 1338} 1339 1340static void 1341ztest_zd_fini(ztest_ds_t *zd) 1342{ 1343 mutex_destroy(&zd->zd_dirobj_lock); 1344 1345 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1346 ztest_rll_destroy(&zd->zd_object_lock[l]); 1347 1348 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) 1349 ztest_rll_destroy(&zd->zd_range_lock[l]); 1350} 1351 1352#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) 1353 1354static uint64_t 1355ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1356{ 1357 uint64_t txg; 1358 int error; 1359 1360 /* 1361 * Attempt to assign tx to some transaction group. 1362 */ 1363 error = dmu_tx_assign(tx, txg_how); 1364 if (error) { 1365 if (error == ERESTART) { 1366 ASSERT(txg_how == TXG_NOWAIT); 1367 dmu_tx_wait(tx); 1368 } else { 1369 ASSERT3U(error, ==, ENOSPC); 1370 ztest_record_enospc(tag); 1371 } 1372 dmu_tx_abort(tx); 1373 return (0); 1374 } 1375 txg = dmu_tx_get_txg(tx); 1376 ASSERT(txg != 0); 1377 return (txg); 1378} 1379 1380static void 1381ztest_pattern_set(void *buf, uint64_t size, uint64_t value) 1382{ 1383 uint64_t *ip = buf; 1384 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); 1385 1386 while (ip < ip_end) 1387 *ip++ = value; 1388} 1389 1390static boolean_t 1391ztest_pattern_match(void *buf, uint64_t size, uint64_t value) 1392{ 1393 uint64_t *ip = buf; 1394 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); 1395 uint64_t diff = 0; 1396 1397 while (ip < ip_end) 1398 diff |= (value - *ip++); 1399 1400 return (diff == 0); 1401} 1402 1403static void 1404ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1405 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1406 uint64_t crtxg) 1407{ 1408 bt->bt_magic = BT_MAGIC; 1409 bt->bt_objset = dmu_objset_id(os); 1410 bt->bt_object = object; 1411 bt->bt_dnodesize = dnodesize; 1412 bt->bt_offset = offset; 1413 bt->bt_gen = gen; 1414 bt->bt_txg = txg; 
1415 bt->bt_crtxg = crtxg; 1416} 1417 1418static void 1419ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1420 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1421 uint64_t crtxg) 1422{ 1423 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1424 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1425 ASSERT3U(bt->bt_object, ==, object); 1426 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1427 ASSERT3U(bt->bt_offset, ==, offset); 1428 ASSERT3U(bt->bt_gen, <=, gen); 1429 ASSERT3U(bt->bt_txg, <=, txg); 1430 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1431} 1432 1433static ztest_block_tag_t * 1434ztest_bt_bonus(dmu_buf_t *db) 1435{ 1436 dmu_object_info_t doi; 1437 ztest_block_tag_t *bt; 1438 1439 dmu_object_info_from_db(db, &doi); 1440 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1441 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1442 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1443 1444 return (bt); 1445} 1446 1447/* 1448 * Generate a token to fill up unused bonus buffer space. Try to make 1449 * it unique to the object, generation, and offset to verify that data 1450 * is not getting overwritten by data from other dnodes. 1451 */ 1452#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1453 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1454 1455/* 1456 * Fill up the unused bonus buffer region before the block tag with a 1457 * verifiable pattern. Filling the whole bonus area with non-zero data 1458 * helps ensure that all dnode traversal code properly skips the 1459 * interior regions of large dnodes. 
1460 */ 1461void 1462ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1463 objset_t *os, uint64_t gen) 1464{ 1465 uint64_t *bonusp; 1466 1467 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1468 1469 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1470 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1471 gen, bonusp - (uint64_t *)db->db_data); 1472 *bonusp = token; 1473 } 1474} 1475 1476/* 1477 * Verify that the unused area of a bonus buffer is filled with the 1478 * expected tokens. 1479 */ 1480void 1481ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1482 objset_t *os, uint64_t gen) 1483{ 1484 uint64_t *bonusp; 1485 1486 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1487 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1488 gen, bonusp - (uint64_t *)db->db_data); 1489 VERIFY3U(*bonusp, ==, token); 1490 } 1491} 1492 1493/* 1494 * ZIL logging ops 1495 */ 1496 1497#define lrz_type lr_mode 1498#define lrz_blocksize lr_uid 1499#define lrz_ibshift lr_gid 1500#define lrz_bonustype lr_rdev 1501#define lrz_dnodesize lr_crtime[1] 1502 1503static void 1504ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1505{ 1506 char *name = (void *)(lr + 1); /* name follows lr */ 1507 size_t namesize = strlen(name) + 1; 1508 itx_t *itx; 1509 1510 if (zil_replaying(zd->zd_zilog, tx)) 1511 return; 1512 1513 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1514 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1515 sizeof (*lr) + namesize - sizeof (lr_t)); 1516 1517 zil_itx_assign(zd->zd_zilog, itx, tx); 1518} 1519 1520static void 1521ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1522{ 1523 char *name = (void *)(lr + 1); /* name follows lr */ 1524 size_t namesize = strlen(name) + 1; 1525 itx_t *itx; 1526 1527 if (zil_replaying(zd->zd_zilog, tx)) 1528 return; 1529 1530 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1531 
bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1532 sizeof (*lr) + namesize - sizeof (lr_t)); 1533 1534 itx->itx_oid = object; 1535 zil_itx_assign(zd->zd_zilog, itx, tx); 1536} 1537 1538static void 1539ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1540{ 1541 itx_t *itx; 1542 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1543 1544 if (zil_replaying(zd->zd_zilog, tx)) 1545 return; 1546 1547 if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) 1548 write_state = WR_INDIRECT; 1549 1550 itx = zil_itx_create(TX_WRITE, 1551 sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); 1552 1553 if (write_state == WR_COPIED && 1554 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1555 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1556 zil_itx_destroy(itx); 1557 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1558 write_state = WR_NEED_COPY; 1559 } 1560 itx->itx_private = zd; 1561 itx->itx_wr_state = write_state; 1562 itx->itx_sync = (ztest_random(8) == 0); 1563 1564 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1565 sizeof (*lr) - sizeof (lr_t)); 1566 1567 zil_itx_assign(zd->zd_zilog, itx, tx); 1568} 1569 1570static void 1571ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 1572{ 1573 itx_t *itx; 1574 1575 if (zil_replaying(zd->zd_zilog, tx)) 1576 return; 1577 1578 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 1579 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1580 sizeof (*lr) - sizeof (lr_t)); 1581 1582 itx->itx_sync = B_FALSE; 1583 zil_itx_assign(zd->zd_zilog, itx, tx); 1584} 1585 1586static void 1587ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 1588{ 1589 itx_t *itx; 1590 1591 if (zil_replaying(zd->zd_zilog, tx)) 1592 return; 1593 1594 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 1595 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1596 sizeof (*lr) - sizeof (lr_t)); 1597 1598 itx->itx_sync = B_FALSE; 1599 zil_itx_assign(zd->zd_zilog, itx, tx); 1600} 1601 1602/* 1603 * ZIL 
replay ops 1604 */ 1605static int 1606ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 1607{ 1608 ztest_ds_t *zd = arg1; 1609 lr_create_t *lr = arg2; 1610 char *name = (void *)(lr + 1); /* name follows lr */ 1611 objset_t *os = zd->zd_os; 1612 ztest_block_tag_t *bbt; 1613 dmu_buf_t *db; 1614 dmu_tx_t *tx; 1615 uint64_t txg; 1616 int error = 0; 1617 int bonuslen; 1618 1619 if (byteswap) 1620 byteswap_uint64_array(lr, sizeof (*lr)); 1621 1622 ASSERT(lr->lr_doid == ZTEST_DIROBJ); 1623 ASSERT(name[0] != '\0'); 1624 1625 tx = dmu_tx_create(os); 1626 1627 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 1628 1629 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1630 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1631 } else { 1632 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1633 } 1634 1635 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1636 if (txg == 0) 1637 return (ENOSPC); 1638 1639 ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); 1640 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 1641 1642 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1643 if (lr->lr_foid == 0) { 1644 lr->lr_foid = zap_create_dnsize(os, 1645 lr->lrz_type, lr->lrz_bonustype, 1646 bonuslen, lr->lrz_dnodesize, tx); 1647 } else { 1648 error = zap_create_claim_dnsize(os, lr->lr_foid, 1649 lr->lrz_type, lr->lrz_bonustype, 1650 bonuslen, lr->lrz_dnodesize, tx); 1651 } 1652 } else { 1653 if (lr->lr_foid == 0) { 1654 lr->lr_foid = dmu_object_alloc_dnsize(os, 1655 lr->lrz_type, 0, lr->lrz_bonustype, 1656 bonuslen, lr->lrz_dnodesize, tx); 1657 } else { 1658 error = dmu_object_claim_dnsize(os, lr->lr_foid, 1659 lr->lrz_type, 0, lr->lrz_bonustype, 1660 bonuslen, lr->lrz_dnodesize, tx); 1661 } 1662 } 1663 1664 if (error) { 1665 ASSERT3U(error, ==, EEXIST); 1666 ASSERT(zd->zd_zilog->zl_replay); 1667 dmu_tx_commit(tx); 1668 return (error); 1669 } 1670 1671 ASSERT(lr->lr_foid != 0); 1672 1673 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 1674 VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, 1675 lr->lrz_blocksize, 
lr->lrz_ibshift, tx)); 1676 1677 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1678 bbt = ztest_bt_bonus(db); 1679 dmu_buf_will_dirty(db, tx); 1680 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 1681 lr->lr_gen, txg, txg); 1682 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 1683 dmu_buf_rele(db, FTAG); 1684 1685 VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 1686 &lr->lr_foid, tx)); 1687 1688 (void) ztest_log_create(zd, tx, lr); 1689 1690 dmu_tx_commit(tx); 1691 1692 return (0); 1693} 1694 1695static int 1696ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 1697{ 1698 ztest_ds_t *zd = arg1; 1699 lr_remove_t *lr = arg2; 1700 char *name = (void *)(lr + 1); /* name follows lr */ 1701 objset_t *os = zd->zd_os; 1702 dmu_object_info_t doi; 1703 dmu_tx_t *tx; 1704 uint64_t object, txg; 1705 1706 if (byteswap) 1707 byteswap_uint64_array(lr, sizeof (*lr)); 1708 1709 ASSERT(lr->lr_doid == ZTEST_DIROBJ); 1710 ASSERT(name[0] != '\0'); 1711 1712 VERIFY3U(0, ==, 1713 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 1714 ASSERT(object != 0); 1715 1716 ztest_object_lock(zd, object, RL_WRITER); 1717 1718 VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); 1719 1720 tx = dmu_tx_create(os); 1721 1722 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 1723 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 1724 1725 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1726 if (txg == 0) { 1727 ztest_object_unlock(zd, object); 1728 return (ENOSPC); 1729 } 1730 1731 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 1732 VERIFY3U(0, ==, zap_destroy(os, object, tx)); 1733 } else { 1734 VERIFY3U(0, ==, dmu_object_free(os, object, tx)); 1735 } 1736 1737 VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); 1738 1739 (void) ztest_log_remove(zd, tx, lr, object); 1740 1741 dmu_tx_commit(tx); 1742 1743 ztest_object_unlock(zd, object); 1744 1745 return (0); 1746} 1747 1748static int 1749ztest_replay_write(void *arg1, 
void *arg2, boolean_t byteswap) 1750{ 1751 ztest_ds_t *zd = arg1; 1752 lr_write_t *lr = arg2; 1753 objset_t *os = zd->zd_os; 1754 void *data = lr + 1; /* data follows lr */ 1755 uint64_t offset, length; 1756 ztest_block_tag_t *bt = data; 1757 ztest_block_tag_t *bbt; 1758 uint64_t gen, txg, lrtxg, crtxg; 1759 dmu_object_info_t doi; 1760 dmu_tx_t *tx; 1761 dmu_buf_t *db; 1762 arc_buf_t *abuf = NULL; 1763 rl_t *rl; 1764 1765 if (byteswap) 1766 byteswap_uint64_array(lr, sizeof (*lr)); 1767 1768 offset = lr->lr_offset; 1769 length = lr->lr_length; 1770 1771 /* If it's a dmu_sync() block, write the whole block */ 1772 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 1773 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 1774 if (length < blocksize) { 1775 offset -= offset % blocksize; 1776 length = blocksize; 1777 } 1778 } 1779 1780 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 1781 byteswap_uint64_array(bt, sizeof (*bt)); 1782 1783 if (bt->bt_magic != BT_MAGIC) 1784 bt = NULL; 1785 1786 ztest_object_lock(zd, lr->lr_foid, RL_READER); 1787 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); 1788 1789 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1790 1791 dmu_object_info_from_db(db, &doi); 1792 1793 bbt = ztest_bt_bonus(db); 1794 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1795 gen = bbt->bt_gen; 1796 crtxg = bbt->bt_crtxg; 1797 lrtxg = lr->lr_common.lrc_txg; 1798 1799 tx = dmu_tx_create(os); 1800 1801 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 1802 1803 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 1804 P2PHASE(offset, length) == 0) 1805 abuf = dmu_request_arcbuf(db, length); 1806 1807 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1808 if (txg == 0) { 1809 if (abuf != NULL) 1810 dmu_return_arcbuf(abuf); 1811 dmu_buf_rele(db, FTAG); 1812 ztest_range_unlock(rl); 1813 ztest_object_unlock(zd, lr->lr_foid); 1814 return (ENOSPC); 1815 } 1816 1817 if (bt != NULL) { 1818 /* 1819 * Usually, verify the old data before writing new 
data -- 1820 * but not always, because we also want to verify correct 1821 * behavior when the data was not recently read into cache. 1822 */ 1823 ASSERT(offset % doi.doi_data_block_size == 0); 1824 if (ztest_random(4) != 0) { 1825 int prefetch = ztest_random(2) ? 1826 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 1827 ztest_block_tag_t rbt; 1828 1829 VERIFY(dmu_read(os, lr->lr_foid, offset, 1830 sizeof (rbt), &rbt, prefetch) == 0); 1831 if (rbt.bt_magic == BT_MAGIC) { 1832 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 1833 offset, gen, txg, crtxg); 1834 } 1835 } 1836 1837 /* 1838 * Writes can appear to be newer than the bonus buffer because 1839 * the ztest_get_data() callback does a dmu_read() of the 1840 * open-context data, which may be different than the data 1841 * as it was when the write was generated. 1842 */ 1843 if (zd->zd_zilog->zl_replay) { 1844 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 1845 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 1846 bt->bt_crtxg); 1847 } 1848 1849 /* 1850 * Set the bt's gen/txg to the bonus buffer's gen/txg 1851 * so that all of the usual ASSERTs will work. 
1852 */ 1853 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 1854 crtxg); 1855 } 1856 1857 if (abuf == NULL) { 1858 dmu_write(os, lr->lr_foid, offset, length, data, tx); 1859 } else { 1860 bcopy(data, abuf->b_data, length); 1861 dmu_assign_arcbuf(db, offset, abuf, tx); 1862 } 1863 1864 (void) ztest_log_write(zd, tx, lr); 1865 1866 dmu_buf_rele(db, FTAG); 1867 1868 dmu_tx_commit(tx); 1869 1870 ztest_range_unlock(rl); 1871 ztest_object_unlock(zd, lr->lr_foid); 1872 1873 return (0); 1874} 1875 1876static int 1877ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 1878{ 1879 ztest_ds_t *zd = arg1; 1880 lr_truncate_t *lr = arg2; 1881 objset_t *os = zd->zd_os; 1882 dmu_tx_t *tx; 1883 uint64_t txg; 1884 rl_t *rl; 1885 1886 if (byteswap) 1887 byteswap_uint64_array(lr, sizeof (*lr)); 1888 1889 ztest_object_lock(zd, lr->lr_foid, RL_READER); 1890 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 1891 RL_WRITER); 1892 1893 tx = dmu_tx_create(os); 1894 1895 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 1896 1897 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1898 if (txg == 0) { 1899 ztest_range_unlock(rl); 1900 ztest_object_unlock(zd, lr->lr_foid); 1901 return (ENOSPC); 1902 } 1903 1904 VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 1905 lr->lr_length, tx) == 0); 1906 1907 (void) ztest_log_truncate(zd, tx, lr); 1908 1909 dmu_tx_commit(tx); 1910 1911 ztest_range_unlock(rl); 1912 ztest_object_unlock(zd, lr->lr_foid); 1913 1914 return (0); 1915} 1916 1917static int 1918ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 1919{ 1920 ztest_ds_t *zd = arg1; 1921 lr_setattr_t *lr = arg2; 1922 objset_t *os = zd->zd_os; 1923 dmu_tx_t *tx; 1924 dmu_buf_t *db; 1925 ztest_block_tag_t *bbt; 1926 uint64_t txg, lrtxg, crtxg, dnodesize; 1927 1928 if (byteswap) 1929 byteswap_uint64_array(lr, sizeof (*lr)); 1930 1931 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 1932 1933 VERIFY3U(0, ==, dmu_bonus_hold(os, 
lr->lr_foid, FTAG, &db)); 1934 1935 tx = dmu_tx_create(os); 1936 dmu_tx_hold_bonus(tx, lr->lr_foid); 1937 1938 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1939 if (txg == 0) { 1940 dmu_buf_rele(db, FTAG); 1941 ztest_object_unlock(zd, lr->lr_foid); 1942 return (ENOSPC); 1943 } 1944 1945 bbt = ztest_bt_bonus(db); 1946 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1947 crtxg = bbt->bt_crtxg; 1948 lrtxg = lr->lr_common.lrc_txg; 1949 dnodesize = bbt->bt_dnodesize; 1950 1951 if (zd->zd_zilog->zl_replay) { 1952 ASSERT(lr->lr_size != 0); 1953 ASSERT(lr->lr_mode != 0); 1954 ASSERT(lrtxg != 0); 1955 } else { 1956 /* 1957 * Randomly change the size and increment the generation. 1958 */ 1959 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 1960 sizeof (*bbt); 1961 lr->lr_mode = bbt->bt_gen + 1; 1962 ASSERT(lrtxg == 0); 1963 } 1964 1965 /* 1966 * Verify that the current bonus buffer is not newer than our txg. 1967 */ 1968 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 1969 MAX(txg, lrtxg), crtxg); 1970 1971 dmu_buf_will_dirty(db, tx); 1972 1973 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 1974 ASSERT3U(lr->lr_size, <=, db->db_size); 1975 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 1976 bbt = ztest_bt_bonus(db); 1977 1978 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 1979 txg, crtxg); 1980 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 1981 1982 dmu_buf_rele(db, FTAG); 1983 1984 (void) ztest_log_setattr(zd, tx, lr); 1985 1986 dmu_tx_commit(tx); 1987 1988 ztest_object_unlock(zd, lr->lr_foid); 1989 1990 return (0); 1991} 1992 1993zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 1994 NULL, /* 0 no such transaction type */ 1995 ztest_replay_create, /* TX_CREATE */ 1996 NULL, /* TX_MKDIR */ 1997 NULL, /* TX_MKXATTR */ 1998 NULL, /* TX_SYMLINK */ 1999 ztest_replay_remove, /* TX_REMOVE */ 2000 NULL, /* TX_RMDIR */ 2001 NULL, /* TX_LINK */ 2002 NULL, /* TX_RENAME */ 2003 ztest_replay_write, /* TX_WRITE */ 2004 
ztest_replay_truncate, /* TX_TRUNCATE */ 2005 ztest_replay_setattr, /* TX_SETATTR */ 2006 NULL, /* TX_ACL */ 2007 NULL, /* TX_CREATE_ACL */ 2008 NULL, /* TX_CREATE_ATTR */ 2009 NULL, /* TX_CREATE_ACL_ATTR */ 2010 NULL, /* TX_MKDIR_ACL */ 2011 NULL, /* TX_MKDIR_ATTR */ 2012 NULL, /* TX_MKDIR_ACL_ATTR */ 2013 NULL, /* TX_WRITE2 */ 2014}; 2015 2016/* 2017 * ZIL get_data callbacks 2018 */ 2019 2020/* ARGSUSED */ 2021static void 2022ztest_get_done(zgd_t *zgd, int error) 2023{ 2024 ztest_ds_t *zd = zgd->zgd_private; 2025 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2026 2027 if (zgd->zgd_db) 2028 dmu_buf_rele(zgd->zgd_db, zgd); 2029 2030 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2031 ztest_object_unlock(zd, object); 2032 2033 umem_free(zgd, sizeof (*zgd)); 2034} 2035 2036static int 2037ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, 2038 zio_t *zio) 2039{ 2040 ztest_ds_t *zd = arg; 2041 objset_t *os = zd->zd_os; 2042 uint64_t object = lr->lr_foid; 2043 uint64_t offset = lr->lr_offset; 2044 uint64_t size = lr->lr_length; 2045 uint64_t txg = lr->lr_common.lrc_txg; 2046 uint64_t crtxg; 2047 dmu_object_info_t doi; 2048 dmu_buf_t *db; 2049 zgd_t *zgd; 2050 int error; 2051 2052 ASSERT3P(lwb, !=, NULL); 2053 ASSERT3P(zio, !=, NULL); 2054 ASSERT3U(size, !=, 0); 2055 2056 ztest_object_lock(zd, object, RL_READER); 2057 error = dmu_bonus_hold(os, object, FTAG, &db); 2058 if (error) { 2059 ztest_object_unlock(zd, object); 2060 return (error); 2061 } 2062 2063 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2064 2065 if (crtxg == 0 || crtxg > txg) { 2066 dmu_buf_rele(db, FTAG); 2067 ztest_object_unlock(zd, object); 2068 return (ENOENT); 2069 } 2070 2071 dmu_object_info_from_db(db, &doi); 2072 dmu_buf_rele(db, FTAG); 2073 db = NULL; 2074 2075 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2076 zgd->zgd_lwb = lwb; 2077 zgd->zgd_private = zd; 2078 2079 if (buf != NULL) { /* immediate write */ 2080 zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, 2081 object, 
offset, size, RL_READER); 2082 2083 error = dmu_read(os, object, offset, size, buf, 2084 DMU_READ_NO_PREFETCH); 2085 ASSERT(error == 0); 2086 } else { 2087 size = doi.doi_data_block_size; 2088 if (ISP2(size)) { 2089 offset = P2ALIGN(offset, size); 2090 } else { 2091 ASSERT(offset < size); 2092 offset = 0; 2093 } 2094 2095 zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, 2096 object, offset, size, RL_READER); 2097 2098 error = dmu_buf_hold(os, object, offset, zgd, &db, 2099 DMU_READ_NO_PREFETCH); 2100 2101 if (error == 0) { 2102 blkptr_t *bp = &lr->lr_blkptr; 2103 2104 zgd->zgd_db = db; 2105 zgd->zgd_bp = bp; 2106 2107 ASSERT(db->db_offset == offset); 2108 ASSERT(db->db_size == size); 2109 2110 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2111 ztest_get_done, zgd); 2112 2113 if (error == 0) 2114 return (0); 2115 } 2116 } 2117 2118 ztest_get_done(zgd, error); 2119 2120 return (error); 2121} 2122 2123static void * 2124ztest_lr_alloc(size_t lrsize, char *name) 2125{ 2126 char *lr; 2127 size_t namesize = name ? strlen(name) + 1 : 0; 2128 2129 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2130 2131 if (name) 2132 bcopy(name, lr + lrsize, namesize); 2133 2134 return (lr); 2135} 2136 2137void 2138ztest_lr_free(void *lr, size_t lrsize, char *name) 2139{ 2140 size_t namesize = name ? strlen(name) + 1 : 0; 2141 2142 umem_free(lr, lrsize + namesize); 2143} 2144 2145/* 2146 * Lookup a bunch of objects. Returns the number of objects not found. 
2147 */ 2148static int 2149ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2150{ 2151 int missing = 0; 2152 int error; 2153 2154 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2155 2156 for (int i = 0; i < count; i++, od++) { 2157 od->od_object = 0; 2158 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2159 sizeof (uint64_t), 1, &od->od_object); 2160 if (error) { 2161 ASSERT(error == ENOENT); 2162 ASSERT(od->od_object == 0); 2163 missing++; 2164 } else { 2165 dmu_buf_t *db; 2166 ztest_block_tag_t *bbt; 2167 dmu_object_info_t doi; 2168 2169 ASSERT(od->od_object != 0); 2170 ASSERT(missing == 0); /* there should be no gaps */ 2171 2172 ztest_object_lock(zd, od->od_object, RL_READER); 2173 VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, 2174 od->od_object, FTAG, &db)); 2175 dmu_object_info_from_db(db, &doi); 2176 bbt = ztest_bt_bonus(db); 2177 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2178 od->od_type = doi.doi_type; 2179 od->od_blocksize = doi.doi_data_block_size; 2180 od->od_gen = bbt->bt_gen; 2181 dmu_buf_rele(db, FTAG); 2182 ztest_object_unlock(zd, od->od_object); 2183 } 2184 } 2185 2186 return (missing); 2187} 2188 2189static int 2190ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2191{ 2192 int missing = 0; 2193 2194 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2195 2196 for (int i = 0; i < count; i++, od++) { 2197 if (missing) { 2198 od->od_object = 0; 2199 missing++; 2200 continue; 2201 } 2202 2203 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2204 2205 lr->lr_doid = od->od_dir; 2206 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2207 lr->lrz_type = od->od_crtype; 2208 lr->lrz_blocksize = od->od_crblocksize; 2209 lr->lrz_ibshift = ztest_random_ibshift(); 2210 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2211 lr->lrz_dnodesize = od->od_crdnodesize; 2212 lr->lr_gen = od->od_crgen; 2213 lr->lr_crtime[0] = time(NULL); 2214 2215 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2216 ASSERT(missing == 0); 2217 od->od_object = 0; 2218 missing++; 2219 
} else { 2220 od->od_object = lr->lr_foid; 2221 od->od_type = od->od_crtype; 2222 od->od_blocksize = od->od_crblocksize; 2223 od->od_gen = od->od_crgen; 2224 ASSERT(od->od_object != 0); 2225 } 2226 2227 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2228 } 2229 2230 return (missing); 2231} 2232 2233static int 2234ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2235{ 2236 int missing = 0; 2237 int error; 2238 2239 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2240 2241 od += count - 1; 2242 2243 for (int i = count - 1; i >= 0; i--, od--) { 2244 if (missing) { 2245 missing++; 2246 continue; 2247 } 2248 2249 /* 2250 * No object was found. 2251 */ 2252 if (od->od_object == 0) 2253 continue; 2254 2255 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2256 2257 lr->lr_doid = od->od_dir; 2258 2259 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2260 ASSERT3U(error, ==, ENOSPC); 2261 missing++; 2262 } else { 2263 od->od_object = 0; 2264 } 2265 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2266 } 2267 2268 return (missing); 2269} 2270 2271static int 2272ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2273 void *data) 2274{ 2275 lr_write_t *lr; 2276 int error; 2277 2278 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2279 2280 lr->lr_foid = object; 2281 lr->lr_offset = offset; 2282 lr->lr_length = size; 2283 lr->lr_blkoff = 0; 2284 BP_ZERO(&lr->lr_blkptr); 2285 2286 bcopy(data, lr + 1, size); 2287 2288 error = ztest_replay_write(zd, lr, B_FALSE); 2289 2290 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2291 2292 return (error); 2293} 2294 2295static int 2296ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2297{ 2298 lr_truncate_t *lr; 2299 int error; 2300 2301 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2302 2303 lr->lr_foid = object; 2304 lr->lr_offset = offset; 2305 lr->lr_length = size; 2306 2307 error = ztest_replay_truncate(zd, lr, B_FALSE); 2308 2309 ztest_lr_free(lr, sizeof (*lr), NULL); 
2310 2311 return (error); 2312} 2313 2314static int 2315ztest_setattr(ztest_ds_t *zd, uint64_t object) 2316{ 2317 lr_setattr_t *lr; 2318 int error; 2319 2320 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2321 2322 lr->lr_foid = object; 2323 lr->lr_size = 0; 2324 lr->lr_mode = 0; 2325 2326 error = ztest_replay_setattr(zd, lr, B_FALSE); 2327 2328 ztest_lr_free(lr, sizeof (*lr), NULL); 2329 2330 return (error); 2331} 2332 2333static void 2334ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2335{ 2336 objset_t *os = zd->zd_os; 2337 dmu_tx_t *tx; 2338 uint64_t txg; 2339 rl_t *rl; 2340 2341 txg_wait_synced(dmu_objset_pool(os), 0); 2342 2343 ztest_object_lock(zd, object, RL_READER); 2344 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 2345 2346 tx = dmu_tx_create(os); 2347 2348 dmu_tx_hold_write(tx, object, offset, size); 2349 2350 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2351 2352 if (txg != 0) { 2353 dmu_prealloc(os, object, offset, size, tx); 2354 dmu_tx_commit(tx); 2355 txg_wait_synced(dmu_objset_pool(os), txg); 2356 } else { 2357 (void) dmu_free_long_range(os, object, offset, size); 2358 } 2359 2360 ztest_range_unlock(rl); 2361 ztest_object_unlock(zd, object); 2362} 2363 2364static void 2365ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2366{ 2367 int err; 2368 ztest_block_tag_t wbt; 2369 dmu_object_info_t doi; 2370 enum ztest_io_type io_type; 2371 uint64_t blocksize; 2372 void *data; 2373 2374 VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); 2375 blocksize = doi.doi_data_block_size; 2376 data = umem_alloc(blocksize, UMEM_NOFAIL); 2377 2378 /* 2379 * Pick an i/o type at random, biased toward writing block tags. 
2380 */ 2381 io_type = ztest_random(ZTEST_IO_TYPES); 2382 if (ztest_random(2) == 0) 2383 io_type = ZTEST_IO_WRITE_TAG; 2384 2385 rw_enter(&zd->zd_zilog_lock, RW_READER); 2386 2387 switch (io_type) { 2388 2389 case ZTEST_IO_WRITE_TAG: 2390 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2391 offset, 0, 0, 0); 2392 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2393 break; 2394 2395 case ZTEST_IO_WRITE_PATTERN: 2396 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2397 if (ztest_random(2) == 0) { 2398 /* 2399 * Induce fletcher2 collisions to ensure that 2400 * zio_ddt_collision() detects and resolves them 2401 * when using fletcher2-verify for deduplication. 2402 */ 2403 ((uint64_t *)data)[0] ^= 1ULL << 63; 2404 ((uint64_t *)data)[4] ^= 1ULL << 63; 2405 } 2406 (void) ztest_write(zd, object, offset, blocksize, data); 2407 break; 2408 2409 case ZTEST_IO_WRITE_ZEROES: 2410 bzero(data, blocksize); 2411 (void) ztest_write(zd, object, offset, blocksize, data); 2412 break; 2413 2414 case ZTEST_IO_TRUNCATE: 2415 (void) ztest_truncate(zd, object, offset, blocksize); 2416 break; 2417 2418 case ZTEST_IO_SETATTR: 2419 (void) ztest_setattr(zd, object); 2420 break; 2421 2422 case ZTEST_IO_REWRITE: 2423 rw_enter(&ztest_name_lock, RW_READER); 2424 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2425 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2426 B_FALSE); 2427 VERIFY(err == 0 || err == ENOSPC); 2428 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2429 ZFS_PROP_COMPRESSION, 2430 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2431 B_FALSE); 2432 VERIFY(err == 0 || err == ENOSPC); 2433 rw_exit(&ztest_name_lock); 2434 2435 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2436 DMU_READ_NO_PREFETCH)); 2437 2438 (void) ztest_write(zd, object, offset, blocksize, data); 2439 break; 2440 } 2441 2442 rw_exit(&zd->zd_zilog_lock); 2443 2444 umem_free(data, blocksize); 2445} 2446 2447/* 2448 * Initialize an object description template. 
2449 */ 2450static void 2451ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, 2452 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2453 uint64_t gen) 2454{ 2455 od->od_dir = ZTEST_DIROBJ; 2456 od->od_object = 0; 2457 2458 od->od_crtype = type; 2459 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2460 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2461 od->od_crgen = gen; 2462 2463 od->od_type = DMU_OT_NONE; 2464 od->od_blocksize = 0; 2465 od->od_gen = 0; 2466 2467 (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", 2468 tag, (int64_t)id, index); 2469} 2470 2471/* 2472 * Lookup or create the objects for a test using the od template. 2473 * If the objects do not all exist, or if 'remove' is specified, 2474 * remove any existing objects and create new ones. Otherwise, 2475 * use the existing objects. 2476 */ 2477static int 2478ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2479{ 2480 int count = size / sizeof (*od); 2481 int rv = 0; 2482 2483 mutex_enter(&zd->zd_dirobj_lock); 2484 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2485 (ztest_remove(zd, od, count) != 0 || 2486 ztest_create(zd, od, count) != 0)) 2487 rv = -1; 2488 zd->zd_od = od; 2489 mutex_exit(&zd->zd_dirobj_lock); 2490 2491 return (rv); 2492} 2493 2494/* ARGSUSED */ 2495void 2496ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2497{ 2498 zilog_t *zilog = zd->zd_zilog; 2499 2500 rw_enter(&zd->zd_zilog_lock, RW_READER); 2501 2502 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2503 2504 /* 2505 * Remember the committed values in zd, which is in parent/child 2506 * shared memory. If we die, the next iteration of ztest_run() 2507 * will verify that the log really does contain this record. 
2508 */ 2509 mutex_enter(&zilog->zl_lock); 2510 ASSERT(zd->zd_shared != NULL); 2511 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2512 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2513 mutex_exit(&zilog->zl_lock); 2514 2515 rw_exit(&zd->zd_zilog_lock); 2516} 2517 2518/* 2519 * This function is designed to simulate the operations that occur during a 2520 * mount/unmount operation. We hold the dataset across these operations in an 2521 * attempt to expose any implicit assumptions about ZIL management. 2522 */ 2523/* ARGSUSED */ 2524void 2525ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2526{ 2527 objset_t *os = zd->zd_os; 2528 2529 /* 2530 * We grab the zd_dirobj_lock to ensure that no other thread is 2531 * updating the zil (i.e. adding in-memory log records) and the 2532 * zd_zilog_lock to block any I/O. 2533 */ 2534 mutex_enter(&zd->zd_dirobj_lock); 2535 rw_enter(&zd->zd_zilog_lock, RW_WRITER); 2536 2537 /* zfsvfs_teardown() */ 2538 zil_close(zd->zd_zilog); 2539 2540 /* zfsvfs_setup() */ 2541 VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); 2542 zil_replay(os, zd, ztest_replay_vector); 2543 2544 rw_exit(&zd->zd_zilog_lock); 2545 mutex_exit(&zd->zd_dirobj_lock); 2546} 2547 2548/* 2549 * Verify that we can't destroy an active pool, create an existing pool, 2550 * or create a pool with a bad vdev spec. 2551 */ 2552/* ARGSUSED */ 2553void 2554ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2555{ 2556 ztest_shared_opts_t *zo = &ztest_opts; 2557 spa_t *spa; 2558 nvlist_t *nvroot; 2559 2560 if (zo->zo_mmp_test) 2561 return; 2562 2563 /* 2564 * Attempt to create using a bad file. 2565 */ 2566 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2567 VERIFY3U(ENOENT, ==, 2568 spa_create("ztest_bad_file", nvroot, NULL, NULL)); 2569 nvlist_free(nvroot); 2570 2571 /* 2572 * Attempt to create using a bad mirror. 
2573 */ 2574 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 2575 VERIFY3U(ENOENT, ==, 2576 spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); 2577 nvlist_free(nvroot); 2578 2579 /* 2580 * Attempt to create an existing pool. It shouldn't matter 2581 * what's in the nvroot; we should fail with EEXIST. 2582 */ 2583 rw_enter(&ztest_name_lock, RW_READER); 2584 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2585 VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); 2586 nvlist_free(nvroot); 2587 VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); 2588 VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); 2589 spa_close(spa, FTAG); 2590 2591 rw_exit(&ztest_name_lock); 2592} 2593 2594/* 2595 * Start and then stop the MMP threads to ensure the startup and shutdown code 2596 * works properly. Actual protection and property-related code tested via ZTS. 2597 */ 2598/* ARGSUSED */ 2599void 2600ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 2601{ 2602 ztest_shared_opts_t *zo = &ztest_opts; 2603 spa_t *spa = ztest_spa; 2604 2605 if (zo->zo_mmp_test) 2606 return; 2607 2608 /* 2609 * Since enabling MMP involves setting a property, it could not be done 2610 * while the pool is suspended. 
2611 */ 2612 if (spa_suspended(spa)) 2613 return; 2614 2615 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2616 mutex_enter(&spa->spa_props_lock); 2617 2618 zfs_multihost_fail_intervals = 0; 2619 2620 if (!spa_multihost(spa)) { 2621 spa->spa_multihost = B_TRUE; 2622 mmp_thread_start(spa); 2623 } 2624 2625 mutex_exit(&spa->spa_props_lock); 2626 spa_config_exit(spa, SCL_CONFIG, FTAG); 2627 2628 txg_wait_synced(spa_get_dsl(spa), 0); 2629 mmp_signal_all_threads(); 2630 txg_wait_synced(spa_get_dsl(spa), 0); 2631 2632 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2633 mutex_enter(&spa->spa_props_lock); 2634 2635 if (spa_multihost(spa)) { 2636 mmp_thread_stop(spa); 2637 spa->spa_multihost = B_FALSE; 2638 } 2639 2640 mutex_exit(&spa->spa_props_lock); 2641 spa_config_exit(spa, SCL_CONFIG, FTAG); 2642} 2643 2644/* ARGSUSED */ 2645void 2646ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 2647{ 2648 spa_t *spa; 2649 uint64_t initial_version = SPA_VERSION_INITIAL; 2650 uint64_t version, newversion; 2651 nvlist_t *nvroot, *props; 2652 char *name; 2653 2654 if (ztest_opts.zo_mmp_test) 2655 return; 2656 2657 mutex_enter(&ztest_vdev_lock); 2658 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 2659 2660 /* 2661 * Clean up from previous runs. 2662 */ 2663 (void) spa_destroy(name); 2664 2665 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 2666 NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); 2667 2668 /* 2669 * If we're configuring a RAIDZ device then make sure that the 2670 * the initial version is capable of supporting that feature. 2671 */ 2672 switch (ztest_opts.zo_raidz_parity) { 2673 case 0: 2674 case 1: 2675 initial_version = SPA_VERSION_INITIAL; 2676 break; 2677 case 2: 2678 initial_version = SPA_VERSION_RAIDZ2; 2679 break; 2680 case 3: 2681 initial_version = SPA_VERSION_RAIDZ3; 2682 break; 2683 } 2684 2685 /* 2686 * Create a pool with a spa version that can be upgraded. 
Pick 2687 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 2688 */ 2689 do { 2690 version = ztest_random_spa_version(initial_version); 2691 } while (version > SPA_VERSION_BEFORE_FEATURES); 2692 2693 props = fnvlist_alloc(); 2694 fnvlist_add_uint64(props, 2695 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 2696 VERIFY0(spa_create(name, nvroot, props, NULL)); 2697 fnvlist_free(nvroot); 2698 fnvlist_free(props); 2699 2700 VERIFY0(spa_open(name, &spa, FTAG)); 2701 VERIFY3U(spa_version(spa), ==, version); 2702 newversion = ztest_random_spa_version(version + 1); 2703 2704 if (ztest_opts.zo_verbose >= 4) { 2705 (void) printf("upgrading spa version from %llu to %llu\n", 2706 (u_longlong_t)version, (u_longlong_t)newversion); 2707 } 2708 2709 spa_upgrade(spa, newversion); 2710 VERIFY3U(spa_version(spa), >, version); 2711 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 2712 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 2713 spa_close(spa, FTAG); 2714 2715 strfree(name); 2716 mutex_exit(&ztest_vdev_lock); 2717} 2718 2719static void 2720ztest_spa_checkpoint(spa_t *spa) 2721{ 2722 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 2723 2724 int error = spa_checkpoint(spa->spa_name); 2725 2726 switch (error) { 2727 case 0: 2728 case ZFS_ERR_DEVRM_IN_PROGRESS: 2729 case ZFS_ERR_DISCARDING_CHECKPOINT: 2730 case ZFS_ERR_CHECKPOINT_EXISTS: 2731 break; 2732 case ENOSPC: 2733 ztest_record_enospc(FTAG); 2734 break; 2735 default: 2736 fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); 2737 } 2738} 2739 2740static void 2741ztest_spa_discard_checkpoint(spa_t *spa) 2742{ 2743 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 2744 2745 int error = spa_checkpoint_discard(spa->spa_name); 2746 2747 switch (error) { 2748 case 0: 2749 case ZFS_ERR_DISCARDING_CHECKPOINT: 2750 case ZFS_ERR_NO_CHECKPOINT: 2751 break; 2752 default: 2753 fatal(0, "spa_discard_checkpoint(%s) = %d", 2754 spa->spa_name, error); 2755 } 2756 2757} 2758 2759/* ARGSUSED */ 2760void 
2761ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 2762{ 2763 spa_t *spa = ztest_spa; 2764 2765 mutex_enter(&ztest_checkpoint_lock); 2766 if (ztest_random(2) == 0) { 2767 ztest_spa_checkpoint(spa); 2768 } else { 2769 ztest_spa_discard_checkpoint(spa); 2770 } 2771 mutex_exit(&ztest_checkpoint_lock); 2772} 2773 2774 2775static vdev_t * 2776vdev_lookup_by_path(vdev_t *vd, const char *path) 2777{ 2778 vdev_t *mvd; 2779 2780 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 2781 return (vd); 2782 2783 for (int c = 0; c < vd->vdev_children; c++) 2784 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 2785 NULL) 2786 return (mvd); 2787 2788 return (NULL); 2789} 2790 2791/* 2792 * Find the first available hole which can be used as a top-level. 2793 */ 2794int 2795find_vdev_hole(spa_t *spa) 2796{ 2797 vdev_t *rvd = spa->spa_root_vdev; 2798 int c; 2799 2800 ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); 2801 2802 for (c = 0; c < rvd->vdev_children; c++) { 2803 vdev_t *cvd = rvd->vdev_child[c]; 2804 2805 if (cvd->vdev_ishole) 2806 break; 2807 } 2808 return (c); 2809} 2810 2811/* 2812 * Verify that vdev_add() works as expected. 2813 */ 2814/* ARGSUSED */ 2815void 2816ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 2817{ 2818 ztest_shared_t *zs = ztest_shared; 2819 spa_t *spa = ztest_spa; 2820 uint64_t leaves; 2821 uint64_t guid; 2822 nvlist_t *nvroot; 2823 int error; 2824 2825 if (ztest_opts.zo_mmp_test) 2826 return; 2827 2828 mutex_enter(&ztest_vdev_lock); 2829 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; 2830 2831 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2832 2833 ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; 2834 2835 /* 2836 * If we have slogs then remove them 1/4 of the time. 
2837 */ 2838 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 2839 metaslab_group_t *mg; 2840 2841 /* 2842 * find the first real slog in log allocation class 2843 */ 2844 mg = spa_log_class(spa)->mc_rotor; 2845 while (!mg->mg_vd->vdev_islog) 2846 mg = mg->mg_next; 2847 2848 guid = mg->mg_vd->vdev_guid; 2849 2850 spa_config_exit(spa, SCL_VDEV, FTAG); 2851 2852 /* 2853 * We have to grab the zs_name_lock as writer to 2854 * prevent a race between removing a slog (dmu_objset_find) 2855 * and destroying a dataset. Removing the slog will 2856 * grab a reference on the dataset which may cause 2857 * dmu_objset_destroy() to fail with EBUSY thus 2858 * leaving the dataset in an inconsistent state. 2859 */ 2860 rw_enter(&ztest_name_lock, RW_WRITER); 2861 error = spa_vdev_remove(spa, guid, B_FALSE); 2862 rw_exit(&ztest_name_lock); 2863 2864 switch (error) { 2865 case 0: 2866 case EEXIST: 2867 case ZFS_ERR_CHECKPOINT_EXISTS: 2868 case ZFS_ERR_DISCARDING_CHECKPOINT: 2869 break; 2870 default: 2871 fatal(0, "spa_vdev_remove() = %d", error); 2872 } 2873 } else { 2874 spa_config_exit(spa, SCL_VDEV, FTAG); 2875 2876 /* 2877 * Make 1/4 of the devices be log devices 2878 */ 2879 nvroot = make_vdev_root(NULL, NULL, NULL, 2880 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 2881 "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); 2882 2883 error = spa_vdev_add(spa, nvroot); 2884 nvlist_free(nvroot); 2885 2886 switch (error) { 2887 case 0: 2888 break; 2889 case ENOSPC: 2890 ztest_record_enospc("spa_vdev_add"); 2891 break; 2892 default: 2893 fatal(0, "spa_vdev_add() = %d", error); 2894 } 2895 } 2896 2897 mutex_exit(&ztest_vdev_lock); 2898} 2899 2900/* ARGSUSED */ 2901void 2902ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 2903{ 2904 ztest_shared_t *zs = ztest_shared; 2905 spa_t *spa = ztest_spa; 2906 uint64_t leaves; 2907 nvlist_t *nvroot; 2908 const char *class = (ztest_random(2) == 0) ? 
2909 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 2910 int error; 2911 2912 /* 2913 * By default add a special vdev 50% of the time 2914 */ 2915 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 2916 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 2917 ztest_random(2) == 0)) { 2918 return; 2919 } 2920 2921 mutex_enter(&ztest_vdev_lock); 2922 2923 /* Only test with mirrors */ 2924 if (zs->zs_mirrors < 2) { 2925 mutex_exit(&ztest_vdev_lock); 2926 return; 2927 } 2928 2929 /* requires feature@allocation_classes */ 2930 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 2931 mutex_exit(&ztest_vdev_lock); 2932 return; 2933 } 2934 2935 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; 2936 2937 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2938 ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; 2939 spa_config_exit(spa, SCL_VDEV, FTAG); 2940 2941 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 2942 class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); 2943 2944 error = spa_vdev_add(spa, nvroot); 2945 nvlist_free(nvroot); 2946 2947 if (error == ENOSPC) 2948 ztest_record_enospc("spa_vdev_add"); 2949 else if (error != 0) 2950 fatal(0, "spa_vdev_add() = %d", error); 2951 2952 /* 2953 * 50% of the time allow small blocks in the special class 2954 */ 2955 if (error == 0 && 2956 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 2957 if (ztest_opts.zo_verbose >= 3) 2958 (void) printf("Enabling special VDEV small blocks\n"); 2959 (void) ztest_dsl_prop_set_uint64(zd->zd_name, 2960 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 2961 } 2962 2963 mutex_exit(&ztest_vdev_lock); 2964 2965 if (ztest_opts.zo_verbose >= 3) { 2966 metaslab_class_t *mc; 2967 2968 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 2969 mc = spa_special_class(spa); 2970 else 2971 mc = spa_dedup_class(spa); 2972 (void) printf("Added a %s mirrored vdev (of %d)\n", 2973 class, (int)mc->mc_groups); 2974 } 
2975} 2976 2977/* 2978 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 2979 */ 2980/* ARGSUSED */ 2981void 2982ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 2983{ 2984 ztest_shared_t *zs = ztest_shared; 2985 spa_t *spa = ztest_spa; 2986 vdev_t *rvd = spa->spa_root_vdev; 2987 spa_aux_vdev_t *sav; 2988 char *aux; 2989 uint64_t guid = 0; 2990 int error; 2991 2992 if (ztest_opts.zo_mmp_test) 2993 return; 2994 2995 if (ztest_random(2) == 0) { 2996 sav = &spa->spa_spares; 2997 aux = ZPOOL_CONFIG_SPARES; 2998 } else { 2999 sav = &spa->spa_l2cache; 3000 aux = ZPOOL_CONFIG_L2CACHE; 3001 } 3002 3003 mutex_enter(&ztest_vdev_lock); 3004 3005 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3006 3007 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3008 /* 3009 * Pick a random device to remove. 3010 */ 3011 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; 3012 } else { 3013 /* 3014 * Find an unused device we can add. 3015 */ 3016 zs->zs_vdev_aux = 0; 3017 for (;;) { 3018 char path[MAXPATHLEN]; 3019 int c; 3020 (void) snprintf(path, sizeof (path), ztest_aux_template, 3021 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3022 zs->zs_vdev_aux); 3023 for (c = 0; c < sav->sav_count; c++) 3024 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3025 path) == 0) 3026 break; 3027 if (c == sav->sav_count && 3028 vdev_lookup_by_path(rvd, path) == NULL) 3029 break; 3030 zs->zs_vdev_aux++; 3031 } 3032 } 3033 3034 spa_config_exit(spa, SCL_VDEV, FTAG); 3035 3036 if (guid == 0) { 3037 /* 3038 * Add a new device. 3039 */ 3040 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3041 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3042 error = spa_vdev_add(spa, nvroot); 3043 3044 switch (error) { 3045 case 0: 3046 break; 3047 default: 3048 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); 3049 } 3050 nvlist_free(nvroot); 3051 } else { 3052 /* 3053 * Remove an existing device. 
Sometimes, dirty its 3054 * vdev state first to make sure we handle removal 3055 * of devices that have pending state changes. 3056 */ 3057 if (ztest_random(2) == 0) 3058 (void) vdev_online(spa, guid, 0, NULL); 3059 3060 error = spa_vdev_remove(spa, guid, B_FALSE); 3061 3062 switch (error) { 3063 case 0: 3064 case EBUSY: 3065 case ZFS_ERR_CHECKPOINT_EXISTS: 3066 case ZFS_ERR_DISCARDING_CHECKPOINT: 3067 break; 3068 default: 3069 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); 3070 } 3071 } 3072 3073 mutex_exit(&ztest_vdev_lock); 3074} 3075 3076/* 3077 * split a pool if it has mirror tlvdevs 3078 */ 3079/* ARGSUSED */ 3080void 3081ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3082{ 3083 ztest_shared_t *zs = ztest_shared; 3084 spa_t *spa = ztest_spa; 3085 vdev_t *rvd = spa->spa_root_vdev; 3086 nvlist_t *tree, **child, *config, *split, **schild; 3087 uint_t c, children, schildren = 0, lastlogid = 0; 3088 int error = 0; 3089 3090 if (ztest_opts.zo_mmp_test) 3091 return; 3092 3093 mutex_enter(&ztest_vdev_lock); 3094 3095 /* ensure we have a useable config; mirrors of raidz aren't supported */ 3096 if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { 3097 mutex_exit(&ztest_vdev_lock); 3098 return; 3099 } 3100 3101 /* clean up the old pool, if any */ 3102 (void) spa_destroy("splitp"); 3103 3104 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3105 3106 /* generate a config from the existing config */ 3107 mutex_enter(&spa->spa_props_lock); 3108 VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, 3109 &tree) == 0); 3110 mutex_exit(&spa->spa_props_lock); 3111 3112 VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, 3113 &children) == 0); 3114 3115 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); 3116 for (c = 0; c < children; c++) { 3117 vdev_t *tvd = rvd->vdev_child[c]; 3118 nvlist_t **mchild; 3119 uint_t mchildren; 3120 3121 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3122 
VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, 3123 0) == 0); 3124 VERIFY(nvlist_add_string(schild[schildren], 3125 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); 3126 VERIFY(nvlist_add_uint64(schild[schildren], 3127 ZPOOL_CONFIG_IS_HOLE, 1) == 0); 3128 if (lastlogid == 0) 3129 lastlogid = schildren; 3130 ++schildren; 3131 continue; 3132 } 3133 lastlogid = 0; 3134 VERIFY(nvlist_lookup_nvlist_array(child[c], 3135 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); 3136 VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); 3137 } 3138 3139 /* OK, create a config that can be used to split */ 3140 VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); 3141 VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, 3142 VDEV_TYPE_ROOT) == 0); 3143 VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, 3144 lastlogid != 0 ? lastlogid : schildren) == 0); 3145 3146 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); 3147 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); 3148 3149 for (c = 0; c < schildren; c++) 3150 nvlist_free(schild[c]); 3151 free(schild); 3152 nvlist_free(split); 3153 3154 spa_config_exit(spa, SCL_VDEV, FTAG); 3155 3156 rw_enter(&ztest_name_lock, RW_WRITER); 3157 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3158 rw_exit(&ztest_name_lock); 3159 3160 nvlist_free(config); 3161 3162 if (error == 0) { 3163 (void) printf("successful split - results:\n"); 3164 mutex_enter(&spa_namespace_lock); 3165 show_pool_stats(spa); 3166 show_pool_stats(spa_lookup("splitp")); 3167 mutex_exit(&spa_namespace_lock); 3168 ++zs->zs_splits; 3169 --zs->zs_mirrors; 3170 } 3171 mutex_exit(&ztest_vdev_lock); 3172} 3173 3174/* 3175 * Verify that we can attach and detach devices. 
3176 */ 3177/* ARGSUSED */ 3178void 3179ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3180{ 3181 ztest_shared_t *zs = ztest_shared; 3182 spa_t *spa = ztest_spa; 3183 spa_aux_vdev_t *sav = &spa->spa_spares; 3184 vdev_t *rvd = spa->spa_root_vdev; 3185 vdev_t *oldvd, *newvd, *pvd; 3186 nvlist_t *root; 3187 uint64_t leaves; 3188 uint64_t leaf, top; 3189 uint64_t ashift = ztest_get_ashift(); 3190 uint64_t oldguid, pguid; 3191 uint64_t oldsize, newsize; 3192 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; 3193 int replacing; 3194 int oldvd_has_siblings = B_FALSE; 3195 int newvd_is_spare = B_FALSE; 3196 int oldvd_is_log; 3197 int error, expected_error; 3198 3199 if (ztest_opts.zo_mmp_test) 3200 return; 3201 3202 mutex_enter(&ztest_vdev_lock); 3203 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; 3204 3205 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3206 3207 /* 3208 * If a vdev is in the process of being removed, its removal may 3209 * finish while we are in progress, leading to an unexpected error 3210 * value. Don't bother trying to attach while we are in the middle 3211 * of removal. 3212 */ 3213 if (ztest_device_removal_active) { 3214 spa_config_exit(spa, SCL_ALL, FTAG); 3215 mutex_exit(&ztest_vdev_lock); 3216 return; 3217 } 3218 3219 /* 3220 * Decide whether to do an attach or a replace. 3221 */ 3222 replacing = ztest_random(2); 3223 3224 /* 3225 * Pick a random top-level vdev. 3226 */ 3227 top = ztest_random_vdev_top(spa, B_TRUE); 3228 3229 /* 3230 * Pick a random leaf within it. 3231 */ 3232 leaf = ztest_random(leaves); 3233 3234 /* 3235 * Locate this vdev. 
3236 */ 3237 oldvd = rvd->vdev_child[top]; 3238 3239 /* pick a child from the mirror */ 3240 if (zs->zs_mirrors >= 1) { 3241 ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); 3242 ASSERT(oldvd->vdev_children >= zs->zs_mirrors); 3243 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; 3244 } 3245 3246 /* pick a child out of the raidz group */ 3247 if (ztest_opts.zo_raidz > 1) { 3248 ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); 3249 ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); 3250 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; 3251 } 3252 3253 /* 3254 * If we're already doing an attach or replace, oldvd may be a 3255 * mirror vdev -- in which case, pick a random child. 3256 */ 3257 while (oldvd->vdev_children != 0) { 3258 oldvd_has_siblings = B_TRUE; 3259 ASSERT(oldvd->vdev_children >= 2); 3260 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3261 } 3262 3263 oldguid = oldvd->vdev_guid; 3264 oldsize = vdev_get_min_asize(oldvd); 3265 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3266 (void) strcpy(oldpath, oldvd->vdev_path); 3267 pvd = oldvd->vdev_parent; 3268 pguid = pvd->vdev_guid; 3269 3270 /* 3271 * If oldvd has siblings, then half of the time, detach it. 3272 */ 3273 if (oldvd_has_siblings && ztest_random(2) == 0) { 3274 spa_config_exit(spa, SCL_ALL, FTAG); 3275 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3276 if (error != 0 && error != ENODEV && error != EBUSY && 3277 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3278 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3279 fatal(0, "detach (%s) returned %d", oldpath, error); 3280 mutex_exit(&ztest_vdev_lock); 3281 return; 3282 } 3283 3284 /* 3285 * For the new vdev, choose with equal probability between the two 3286 * standard paths (ending in either 'a' or 'b') or a random hot spare. 
3287 */ 3288 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3289 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3290 newvd_is_spare = B_TRUE; 3291 (void) strcpy(newpath, newvd->vdev_path); 3292 } else { 3293 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, 3294 ztest_opts.zo_dir, ztest_opts.zo_pool, 3295 top * leaves + leaf); 3296 if (ztest_random(2) == 0) 3297 newpath[strlen(newpath) - 1] = 'b'; 3298 newvd = vdev_lookup_by_path(rvd, newpath); 3299 } 3300 3301 if (newvd) { 3302 /* 3303 * Reopen to ensure the vdev's asize field isn't stale. 3304 */ 3305 vdev_reopen(newvd); 3306 newsize = vdev_get_min_asize(newvd); 3307 } else { 3308 /* 3309 * Make newsize a little bigger or smaller than oldsize. 3310 * If it's smaller, the attach should fail. 3311 * If it's larger, and we're doing a replace, 3312 * we should get dynamic LUN growth when we're done. 3313 */ 3314 newsize = 10 * oldsize / (9 + ztest_random(3)); 3315 } 3316 3317 /* 3318 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3319 * unless it's a replace; in that case any non-replacing parent is OK. 3320 * 3321 * If newvd is already part of the pool, it should fail with EBUSY. 3322 * 3323 * If newvd is too small, it should fail with EOVERFLOW. 3324 */ 3325 if (pvd->vdev_ops != &vdev_mirror_ops && 3326 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3327 pvd->vdev_ops == &vdev_replacing_ops || 3328 pvd->vdev_ops == &vdev_spare_ops)) 3329 expected_error = ENOTSUP; 3330 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 3331 expected_error = ENOTSUP; 3332 else if (newvd == oldvd) 3333 expected_error = replacing ? 
0 : EBUSY; 3334 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3335 expected_error = EBUSY; 3336 else if (newsize < oldsize) 3337 expected_error = EOVERFLOW; 3338 else if (ashift > oldvd->vdev_top->vdev_ashift) 3339 expected_error = EDOM; 3340 else 3341 expected_error = 0; 3342 3343 spa_config_exit(spa, SCL_ALL, FTAG); 3344 3345 /* 3346 * Build the nvlist describing newpath. 3347 */ 3348 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3349 ashift, NULL, 0, 0, 1); 3350 3351 error = spa_vdev_attach(spa, oldguid, root, replacing); 3352 3353 nvlist_free(root); 3354 3355 /* 3356 * If our parent was the replacing vdev, but the replace completed, 3357 * then instead of failing with ENOTSUP we may either succeed, 3358 * fail with ENODEV, or fail with EOVERFLOW. 3359 */ 3360 if (expected_error == ENOTSUP && 3361 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3362 expected_error = error; 3363 3364 /* 3365 * If someone grew the LUN, the replacement may be too small. 3366 */ 3367 if (error == EOVERFLOW || error == EBUSY) 3368 expected_error = error; 3369 3370 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3371 error == ZFS_ERR_DISCARDING_CHECKPOINT) 3372 expected_error = error; 3373 3374 /* XXX workaround 6690467 */ 3375 if (error != expected_error && expected_error != EBUSY) { 3376 fatal(0, "attach (%s %llu, %s %llu, %d) " 3377 "returned %d, expected %d", 3378 oldpath, oldsize, newpath, 3379 newsize, replacing, error, expected_error); 3380 } 3381 3382 mutex_exit(&ztest_vdev_lock); 3383} 3384 3385/* ARGSUSED */ 3386void 3387ztest_device_removal(ztest_ds_t *zd, uint64_t id) 3388{ 3389 spa_t *spa = ztest_spa; 3390 vdev_t *vd; 3391 uint64_t guid; 3392 int error; 3393 3394 mutex_enter(&ztest_vdev_lock); 3395 3396 if (ztest_device_removal_active) { 3397 mutex_exit(&ztest_vdev_lock); 3398 return; 3399 } 3400 3401 /* 3402 * Remove a random top-level vdev and wait for removal to finish. 
3403 */ 3404 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3405 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 3406 guid = vd->vdev_guid; 3407 spa_config_exit(spa, SCL_VDEV, FTAG); 3408 3409 error = spa_vdev_remove(spa, guid, B_FALSE); 3410 if (error == 0) { 3411 ztest_device_removal_active = B_TRUE; 3412 mutex_exit(&ztest_vdev_lock); 3413 3414 while (spa->spa_vdev_removal != NULL) 3415 txg_wait_synced(spa_get_dsl(spa), 0); 3416 } else { 3417 mutex_exit(&ztest_vdev_lock); 3418 return; 3419 } 3420 3421 /* 3422 * The pool needs to be scrubbed after completing device removal. 3423 * Failure to do so may result in checksum errors due to the 3424 * strategy employed by ztest_fault_inject() when selecting which 3425 * offset are redundant and can be damaged. 3426 */ 3427 error = spa_scan(spa, POOL_SCAN_SCRUB); 3428 if (error == 0) { 3429 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 3430 txg_wait_synced(spa_get_dsl(spa), 0); 3431 } 3432 3433 mutex_enter(&ztest_vdev_lock); 3434 ztest_device_removal_active = B_FALSE; 3435 mutex_exit(&ztest_vdev_lock); 3436} 3437 3438/* 3439 * Callback function which expands the physical size of the vdev. 3440 */ 3441vdev_t * 3442grow_vdev(vdev_t *vd, void *arg) 3443{ 3444 spa_t *spa = vd->vdev_spa; 3445 size_t *newsize = arg; 3446 size_t fsize; 3447 int fd; 3448 3449 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 3450 ASSERT(vd->vdev_ops->vdev_op_leaf); 3451 3452 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 3453 return (vd); 3454 3455 fsize = lseek(fd, 0, SEEK_END); 3456 (void) ftruncate(fd, *newsize); 3457 3458 if (ztest_opts.zo_verbose >= 6) { 3459 (void) printf("%s grew from %lu to %lu bytes\n", 3460 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 3461 } 3462 (void) close(fd); 3463 return (NULL); 3464} 3465 3466/* 3467 * Callback function which expands a given vdev by calling vdev_online(). 
3468 */ 3469/* ARGSUSED */ 3470vdev_t * 3471online_vdev(vdev_t *vd, void *arg) 3472{ 3473 spa_t *spa = vd->vdev_spa; 3474 vdev_t *tvd = vd->vdev_top; 3475 uint64_t guid = vd->vdev_guid; 3476 uint64_t generation = spa->spa_config_generation + 1; 3477 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 3478 int error; 3479 3480 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 3481 ASSERT(vd->vdev_ops->vdev_op_leaf); 3482 3483 /* Calling vdev_online will initialize the new metaslabs */ 3484 spa_config_exit(spa, SCL_STATE, spa); 3485 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 3486 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3487 3488 /* 3489 * If vdev_online returned an error or the underlying vdev_open 3490 * failed then we abort the expand. The only way to know that 3491 * vdev_open fails is by checking the returned newstate. 3492 */ 3493 if (error || newstate != VDEV_STATE_HEALTHY) { 3494 if (ztest_opts.zo_verbose >= 5) { 3495 (void) printf("Unable to expand vdev, state %llu, " 3496 "error %d\n", (u_longlong_t)newstate, error); 3497 } 3498 return (vd); 3499 } 3500 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 3501 3502 /* 3503 * Since we dropped the lock we need to ensure that we're 3504 * still talking to the original vdev. It's possible this 3505 * vdev may have been detached/replaced while we were 3506 * trying to online it. 3507 */ 3508 if (generation != spa->spa_config_generation) { 3509 if (ztest_opts.zo_verbose >= 5) { 3510 (void) printf("vdev configuration has changed, " 3511 "guid %llu, state %llu, expected gen %llu, " 3512 "got gen %llu\n", 3513 (u_longlong_t)guid, 3514 (u_longlong_t)tvd->vdev_state, 3515 (u_longlong_t)generation, 3516 (u_longlong_t)spa->spa_config_generation); 3517 } 3518 return (vd); 3519 } 3520 return (NULL); 3521} 3522 3523/* 3524 * Traverse the vdev tree calling the supplied function. 
3525 * We continue to walk the tree until we either have walked all 3526 * children or we receive a non-NULL return from the callback. 3527 * If a NULL callback is passed, then we just return back the first 3528 * leaf vdev we encounter. 3529 */ 3530vdev_t * 3531vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 3532{ 3533 if (vd->vdev_ops->vdev_op_leaf) { 3534 if (func == NULL) 3535 return (vd); 3536 else 3537 return (func(vd, arg)); 3538 } 3539 3540 for (uint_t c = 0; c < vd->vdev_children; c++) { 3541 vdev_t *cvd = vd->vdev_child[c]; 3542 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 3543 return (cvd); 3544 } 3545 return (NULL); 3546} 3547 3548/* 3549 * Verify that dynamic LUN growth works as expected. 3550 */ 3551/* ARGSUSED */ 3552void 3553ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 3554{ 3555 spa_t *spa = ztest_spa; 3556 vdev_t *vd, *tvd; 3557 metaslab_class_t *mc; 3558 metaslab_group_t *mg; 3559 size_t psize, newsize; 3560 uint64_t top; 3561 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 3562 3563 mutex_enter(&ztest_checkpoint_lock); 3564 mutex_enter(&ztest_vdev_lock); 3565 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3566 3567 /* 3568 * If there is a vdev removal in progress, it could complete while 3569 * we are running, in which case we would not be able to verify 3570 * that the metaslab_class space increased (because it decreases 3571 * when the device removal completes). 
3572 */ 3573 if (ztest_device_removal_active) { 3574 spa_config_exit(spa, SCL_STATE, spa); 3575 mutex_exit(&ztest_vdev_lock); 3576 mutex_exit(&ztest_checkpoint_lock); 3577 return; 3578 } 3579 3580 top = ztest_random_vdev_top(spa, B_TRUE); 3581 3582 tvd = spa->spa_root_vdev->vdev_child[top]; 3583 mg = tvd->vdev_mg; 3584 mc = mg->mg_class; 3585 old_ms_count = tvd->vdev_ms_count; 3586 old_class_space = metaslab_class_get_space(mc); 3587 3588 /* 3589 * Determine the size of the first leaf vdev associated with 3590 * our top-level device. 3591 */ 3592 vd = vdev_walk_tree(tvd, NULL, NULL); 3593 ASSERT3P(vd, !=, NULL); 3594 ASSERT(vd->vdev_ops->vdev_op_leaf); 3595 3596 psize = vd->vdev_psize; 3597 3598 /* 3599 * We only try to expand the vdev if it's healthy, less than 4x its 3600 * original size, and it has a valid psize. 3601 */ 3602 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 3603 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 3604 spa_config_exit(spa, SCL_STATE, spa); 3605 mutex_exit(&ztest_vdev_lock); 3606 mutex_exit(&ztest_checkpoint_lock); 3607 return; 3608 } 3609 ASSERT(psize > 0); 3610 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 3611 ASSERT3U(newsize, >, psize); 3612 3613 if (ztest_opts.zo_verbose >= 6) { 3614 (void) printf("Expanding LUN %s from %lu to %lu\n", 3615 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 3616 } 3617 3618 /* 3619 * Growing the vdev is a two step process: 3620 * 1). expand the physical size (i.e. relabel) 3621 * 2). 
online the vdev to create the new metaslabs 3622 */ 3623 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 3624 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 3625 tvd->vdev_state != VDEV_STATE_HEALTHY) { 3626 if (ztest_opts.zo_verbose >= 5) { 3627 (void) printf("Could not expand LUN because " 3628 "the vdev configuration changed.\n"); 3629 } 3630 spa_config_exit(spa, SCL_STATE, spa); 3631 mutex_exit(&ztest_vdev_lock); 3632 mutex_exit(&ztest_checkpoint_lock); 3633 return; 3634 } 3635 3636 spa_config_exit(spa, SCL_STATE, spa); 3637 3638 /* 3639 * Expanding the LUN will update the config asynchronously, 3640 * thus we must wait for the async thread to complete any 3641 * pending tasks before proceeding. 3642 */ 3643 for (;;) { 3644 boolean_t done; 3645 mutex_enter(&spa->spa_async_lock); 3646 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 3647 mutex_exit(&spa->spa_async_lock); 3648 if (done) 3649 break; 3650 txg_wait_synced(spa_get_dsl(spa), 0); 3651 (void) poll(NULL, 0, 100); 3652 } 3653 3654 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3655 3656 tvd = spa->spa_root_vdev->vdev_child[top]; 3657 new_ms_count = tvd->vdev_ms_count; 3658 new_class_space = metaslab_class_get_space(mc); 3659 3660 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 3661 if (ztest_opts.zo_verbose >= 5) { 3662 (void) printf("Could not verify LUN expansion due to " 3663 "intervening vdev offline or remove.\n"); 3664 } 3665 spa_config_exit(spa, SCL_STATE, spa); 3666 mutex_exit(&ztest_vdev_lock); 3667 mutex_exit(&ztest_checkpoint_lock); 3668 return; 3669 } 3670 3671 /* 3672 * Make sure we were able to grow the vdev. 3673 */ 3674 if (new_ms_count <= old_ms_count) { 3675 fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", 3676 old_ms_count, new_ms_count); 3677 } 3678 3679 /* 3680 * Make sure we were able to grow the pool. 
3681 */ 3682 if (new_class_space <= old_class_space) { 3683 fatal(0, "LUN expansion failed: class_space %llu < %llu\n", 3684 old_class_space, new_class_space); 3685 } 3686 3687 if (ztest_opts.zo_verbose >= 5) { 3688 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 3689 3690 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 3691 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 3692 (void) printf("%s grew from %s to %s\n", 3693 spa->spa_name, oldnumbuf, newnumbuf); 3694 } 3695 3696 spa_config_exit(spa, SCL_STATE, spa); 3697 mutex_exit(&ztest_vdev_lock); 3698 mutex_exit(&ztest_checkpoint_lock); 3699} 3700 3701/* 3702 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 3703 */ 3704/* ARGSUSED */ 3705static void 3706ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 3707{ 3708 /* 3709 * Create the objects common to all ztest datasets. 3710 */ 3711 VERIFY(zap_create_claim(os, ZTEST_DIROBJ, 3712 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); 3713} 3714 3715static int 3716ztest_dataset_create(char *dsname) 3717{ 3718 uint64_t zilset = ztest_random(100); 3719 int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, 3720 ztest_objset_create_cb, NULL); 3721 3722 if (err || zilset < 80) 3723 return (err); 3724 3725 if (ztest_opts.zo_verbose >= 6) 3726 (void) printf("Setting dataset %s to sync always\n", dsname); 3727 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 3728 ZFS_SYNC_ALWAYS, B_FALSE)); 3729} 3730 3731/* ARGSUSED */ 3732static int 3733ztest_objset_destroy_cb(const char *name, void *arg) 3734{ 3735 objset_t *os; 3736 dmu_object_info_t doi; 3737 int error; 3738 3739 /* 3740 * Verify that the dataset contains a directory object. 
3741 */ 3742 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os)); 3743 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 3744 if (error != ENOENT) { 3745 /* We could have crashed in the middle of destroying it */ 3746 ASSERT0(error); 3747 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 3748 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 3749 } 3750 dmu_objset_disown(os, FTAG); 3751 3752 /* 3753 * Destroy the dataset. 3754 */ 3755 if (strchr(name, '@') != NULL) { 3756 VERIFY0(dsl_destroy_snapshot(name, B_FALSE)); 3757 } else { 3758 VERIFY0(dsl_destroy_head(name)); 3759 } 3760 return (0); 3761} 3762 3763static boolean_t 3764ztest_snapshot_create(char *osname, uint64_t id) 3765{ 3766 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 3767 int error; 3768 3769 (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); 3770 3771 error = dmu_objset_snapshot_one(osname, snapname); 3772 if (error == ENOSPC) { 3773 ztest_record_enospc(FTAG); 3774 return (B_FALSE); 3775 } 3776 if (error != 0 && error != EEXIST) { 3777 fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, 3778 snapname, error); 3779 } 3780 return (B_TRUE); 3781} 3782 3783static boolean_t 3784ztest_snapshot_destroy(char *osname, uint64_t id) 3785{ 3786 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 3787 int error; 3788 3789 (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, 3790 (u_longlong_t)id); 3791 3792 error = dsl_destroy_snapshot(snapname, B_FALSE); 3793 if (error != 0 && error != ENOENT) 3794 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); 3795 return (B_TRUE); 3796} 3797 3798/* ARGSUSED */ 3799void 3800ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 3801{ 3802 ztest_ds_t zdtmp; 3803 int iters; 3804 int error; 3805 objset_t *os, *os2; 3806 char name[ZFS_MAX_DATASET_NAME_LEN]; 3807 zilog_t *zilog; 3808 3809 rw_enter(&ztest_name_lock, RW_READER); 3810 3811 (void) snprintf(name, sizeof (name), "%s/temp_%llu", 3812 ztest_opts.zo_pool, (u_longlong_t)id); 3813 3814 
/* 3815 * If this dataset exists from a previous run, process its replay log 3816 * half of the time. If we don't replay it, then dmu_objset_destroy() 3817 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 3818 */ 3819 if (ztest_random(2) == 0 && 3820 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { 3821 ztest_zd_init(&zdtmp, NULL, os); 3822 zil_replay(os, &zdtmp, ztest_replay_vector); 3823 ztest_zd_fini(&zdtmp); 3824 dmu_objset_disown(os, FTAG); 3825 } 3826 3827 /* 3828 * There may be an old instance of the dataset we're about to 3829 * create lying around from a previous run. If so, destroy it 3830 * and all of its snapshots. 3831 */ 3832 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 3833 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 3834 3835 /* 3836 * Verify that the destroyed dataset is no longer in the namespace. 3837 */ 3838 VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 3839 FTAG, &os)); 3840 3841 /* 3842 * Verify that we can create a new dataset. 3843 */ 3844 error = ztest_dataset_create(name); 3845 if (error) { 3846 if (error == ENOSPC) { 3847 ztest_record_enospc(FTAG); 3848 rw_exit(&ztest_name_lock); 3849 return; 3850 } 3851 fatal(0, "dmu_objset_create(%s) = %d", name, error); 3852 } 3853 3854 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); 3855 3856 ztest_zd_init(&zdtmp, NULL, os); 3857 3858 /* 3859 * Open the intent log for it. 3860 */ 3861 zilog = zil_open(os, ztest_get_data); 3862 3863 /* 3864 * Put some objects in there, do a little I/O to them, 3865 * and randomly take a couple of snapshots along the way. 3866 */ 3867 iters = ztest_random(5); 3868 for (int i = 0; i < iters; i++) { 3869 ztest_dmu_object_alloc_free(&zdtmp, id); 3870 if (ztest_random(iters) == 0) 3871 (void) ztest_snapshot_create(name, i); 3872 } 3873 3874 /* 3875 * Verify that we cannot create an existing dataset. 
3876 */ 3877 VERIFY3U(EEXIST, ==, 3878 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL)); 3879 3880 /* 3881 * Verify that we can hold an objset that is also owned. 3882 */ 3883 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); 3884 dmu_objset_rele(os2, FTAG); 3885 3886 /* 3887 * Verify that we cannot own an objset that is already owned. 3888 */ 3889 VERIFY3U(EBUSY, ==, 3890 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2)); 3891 3892 zil_close(zilog); 3893 dmu_objset_disown(os, FTAG); 3894 ztest_zd_fini(&zdtmp); 3895 3896 rw_exit(&ztest_name_lock); 3897} 3898 3899/* 3900 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 3901 */ 3902void 3903ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 3904{ 3905 rw_enter(&ztest_name_lock, RW_READER); 3906 (void) ztest_snapshot_destroy(zd->zd_name, id); 3907 (void) ztest_snapshot_create(zd->zd_name, id); 3908 rw_exit(&ztest_name_lock); 3909} 3910 3911/* 3912 * Cleanup non-standard snapshots and clones. 
3913 */ 3914void 3915ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 3916{ 3917 char snap1name[ZFS_MAX_DATASET_NAME_LEN]; 3918 char clone1name[ZFS_MAX_DATASET_NAME_LEN]; 3919 char snap2name[ZFS_MAX_DATASET_NAME_LEN]; 3920 char clone2name[ZFS_MAX_DATASET_NAME_LEN]; 3921 char snap3name[ZFS_MAX_DATASET_NAME_LEN]; 3922 int error; 3923 3924 (void) snprintf(snap1name, sizeof (snap1name), 3925 "%s@s1_%llu", osname, id); 3926 (void) snprintf(clone1name, sizeof (clone1name), 3927 "%s/c1_%llu", osname, id); 3928 (void) snprintf(snap2name, sizeof (snap2name), 3929 "%s@s2_%llu", clone1name, id); 3930 (void) snprintf(clone2name, sizeof (clone2name), 3931 "%s/c2_%llu", osname, id); 3932 (void) snprintf(snap3name, sizeof (snap3name), 3933 "%s@s3_%llu", clone1name, id); 3934 3935 error = dsl_destroy_head(clone2name); 3936 if (error && error != ENOENT) 3937 fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); 3938 error = dsl_destroy_snapshot(snap3name, B_FALSE); 3939 if (error && error != ENOENT) 3940 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); 3941 error = dsl_destroy_snapshot(snap2name, B_FALSE); 3942 if (error && error != ENOENT) 3943 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); 3944 error = dsl_destroy_head(clone1name); 3945 if (error && error != ENOENT) 3946 fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); 3947 error = dsl_destroy_snapshot(snap1name, B_FALSE); 3948 if (error && error != ENOENT) 3949 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); 3950} 3951 3952/* 3953 * Verify dsl_dataset_promote handles EBUSY 3954 */ 3955void 3956ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 3957{ 3958 objset_t *os; 3959 char snap1name[ZFS_MAX_DATASET_NAME_LEN]; 3960 char clone1name[ZFS_MAX_DATASET_NAME_LEN]; 3961 char snap2name[ZFS_MAX_DATASET_NAME_LEN]; 3962 char clone2name[ZFS_MAX_DATASET_NAME_LEN]; 3963 char snap3name[ZFS_MAX_DATASET_NAME_LEN]; 3964 char *osname = zd->zd_name; 3965 int error; 3966 3967 
rw_enter(&ztest_name_lock, RW_READER); 3968 3969 ztest_dsl_dataset_cleanup(osname, id); 3970 3971 (void) snprintf(snap1name, sizeof (snap1name), 3972 "%s@s1_%llu", osname, id); 3973 (void) snprintf(clone1name, sizeof (clone1name), 3974 "%s/c1_%llu", osname, id); 3975 (void) snprintf(snap2name, sizeof (snap2name), 3976 "%s@s2_%llu", clone1name, id); 3977 (void) snprintf(clone2name, sizeof (clone2name), 3978 "%s/c2_%llu", osname, id); 3979 (void) snprintf(snap3name, sizeof (snap3name), 3980 "%s@s3_%llu", clone1name, id); 3981 3982 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 3983 if (error && error != EEXIST) { 3984 if (error == ENOSPC) { 3985 ztest_record_enospc(FTAG); 3986 goto out; 3987 } 3988 fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); 3989 } 3990 3991 error = dmu_objset_clone(clone1name, snap1name); 3992 if (error) { 3993 if (error == ENOSPC) { 3994 ztest_record_enospc(FTAG); 3995 goto out; 3996 } 3997 fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); 3998 } 3999 4000 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4001 if (error && error != EEXIST) { 4002 if (error == ENOSPC) { 4003 ztest_record_enospc(FTAG); 4004 goto out; 4005 } 4006 fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); 4007 } 4008 4009 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4010 if (error && error != EEXIST) { 4011 if (error == ENOSPC) { 4012 ztest_record_enospc(FTAG); 4013 goto out; 4014 } 4015 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); 4016 } 4017 4018 error = dmu_objset_clone(clone2name, snap3name); 4019 if (error) { 4020 if (error == ENOSPC) { 4021 ztest_record_enospc(FTAG); 4022 goto out; 4023 } 4024 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); 4025 } 4026 4027 error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os); 4028 if (error) 4029 fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); 4030 error = dsl_dataset_promote(clone2name, 
NULL); 4031 if (error == ENOSPC) { 4032 dmu_objset_disown(os, FTAG); 4033 ztest_record_enospc(FTAG); 4034 goto out; 4035 } 4036 if (error != EBUSY) 4037 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, 4038 error); 4039 dmu_objset_disown(os, FTAG); 4040 4041out: 4042 ztest_dsl_dataset_cleanup(osname, id); 4043 4044 rw_exit(&ztest_name_lock); 4045} 4046 4047/* 4048 * Verify that dmu_object_{alloc,free} work as expected. 4049 */ 4050void 4051ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4052{ 4053 ztest_od_t od[4]; 4054 int batchsize = sizeof (od) / sizeof (od[0]); 4055 4056 for (int b = 0; b < batchsize; b++) { 4057 ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 4058 0, 0, 0); 4059 } 4060 4061 /* 4062 * Destroy the previous batch of objects, create a new batch, 4063 * and do some I/O on the new objects. 4064 */ 4065 if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) 4066 return; 4067 4068 while (ztest_random(4 * batchsize) != 0) 4069 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4070 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4071} 4072 4073/* 4074 * Rewind the global allocator to verify object allocation backfilling. 4075 */ 4076void 4077ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4078{ 4079 objset_t *os = zd->zd_os; 4080 int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 4081 uint64_t object; 4082 4083 /* 4084 * Rewind the global allocator randomly back to a lower object number 4085 * to force backfilling and reclamation of recently freed dnodes. 4086 */ 4087 mutex_enter(&os->os_obj_lock); 4088 object = ztest_random(os->os_obj_next_chunk); 4089 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 4090 mutex_exit(&os->os_obj_lock); 4091} 4092 4093/* 4094 * Verify that dmu_{read,write} work as expected. 
4095 */ 4096void 4097ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 4098{ 4099 objset_t *os = zd->zd_os; 4100 ztest_od_t od[2]; 4101 dmu_tx_t *tx; 4102 int i, freeit, error; 4103 uint64_t n, s, txg; 4104 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 4105 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4106 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 4107 uint64_t regions = 997; 4108 uint64_t stride = 123456789ULL; 4109 uint64_t width = 40; 4110 int free_percent = 5; 4111 4112 /* 4113 * This test uses two objects, packobj and bigobj, that are always 4114 * updated together (i.e. in the same tx) so that their contents are 4115 * in sync and can be compared. Their contents relate to each other 4116 * in a simple way: packobj is a dense array of 'bufwad' structures, 4117 * while bigobj is a sparse array of the same bufwads. Specifically, 4118 * for any index n, there are three bufwads that should be identical: 4119 * 4120 * packobj, at offset n * sizeof (bufwad_t) 4121 * bigobj, at the head of the nth chunk 4122 * bigobj, at the tail of the nth chunk 4123 * 4124 * The chunk size is arbitrary. It doesn't have to be a power of two, 4125 * and it doesn't have any relation to the object blocksize. 4126 * The only requirement is that it can hold at least two bufwads. 4127 * 4128 * Normally, we write the bufwad to each of these locations. 4129 * However, free_percent of the time we instead write zeroes to 4130 * packobj and perform a dmu_free_range() on bigobj. By comparing 4131 * bigobj to packobj, we can verify that the DMU is correctly 4132 * tracking which parts of an object are allocated and free, 4133 * and that the contents of the allocated blocks are correct. 4134 */ 4135 4136 /* 4137 * Read the directory info. If it's the first time, set things up. 
4138 */ 4139 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 4140 chunksize); 4141 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4142 chunksize); 4143 4144 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4145 return; 4146 4147 bigobj = od[0].od_object; 4148 packobj = od[1].od_object; 4149 chunksize = od[0].od_gen; 4150 ASSERT(chunksize == od[1].od_gen); 4151 4152 /* 4153 * Prefetch a random chunk of the big object. 4154 * Our aim here is to get some async reads in flight 4155 * for blocks that we may free below; the DMU should 4156 * handle this race correctly. 4157 */ 4158 n = ztest_random(regions) * stride + ztest_random(width); 4159 s = 1 + ztest_random(2 * width - 1); 4160 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 4161 ZIO_PRIORITY_SYNC_READ); 4162 4163 /* 4164 * Pick a random index and compute the offsets into packobj and bigobj. 4165 */ 4166 n = ztest_random(regions) * stride + ztest_random(width); 4167 s = 1 + ztest_random(width - 1); 4168 4169 packoff = n * sizeof (bufwad_t); 4170 packsize = s * sizeof (bufwad_t); 4171 4172 bigoff = n * chunksize; 4173 bigsize = s * chunksize; 4174 4175 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 4176 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 4177 4178 /* 4179 * free_percent of the time, free a range of bigobj rather than 4180 * overwriting it. 4181 */ 4182 freeit = (ztest_random(100) < free_percent); 4183 4184 /* 4185 * Read the current contents of our objects. 4186 */ 4187 error = dmu_read(os, packobj, packoff, packsize, packbuf, 4188 DMU_READ_PREFETCH); 4189 ASSERT0(error); 4190 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 4191 DMU_READ_PREFETCH); 4192 ASSERT0(error); 4193 4194 /* 4195 * Get a tx for the mods to both packobj and bigobj. 
4196 */ 4197 tx = dmu_tx_create(os); 4198 4199 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4200 4201 if (freeit) 4202 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 4203 else 4204 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4205 4206 /* This accounts for setting the checksum/compression. */ 4207 dmu_tx_hold_bonus(tx, bigobj); 4208 4209 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4210 if (txg == 0) { 4211 umem_free(packbuf, packsize); 4212 umem_free(bigbuf, bigsize); 4213 return; 4214 } 4215 4216 enum zio_checksum cksum; 4217 do { 4218 cksum = (enum zio_checksum) 4219 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 4220 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 4221 dmu_object_set_checksum(os, bigobj, cksum, tx); 4222 4223 enum zio_compress comp; 4224 do { 4225 comp = (enum zio_compress) 4226 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 4227 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 4228 dmu_object_set_compress(os, bigobj, comp, tx); 4229 4230 /* 4231 * For each index from n to n + s, verify that the existing bufwad 4232 * in packobj matches the bufwads at the head and tail of the 4233 * corresponding chunk in bigobj. Then update all three bufwads 4234 * with the new values we want to write out. 
4235 */ 4236 for (i = 0; i < s; i++) { 4237 /* LINTED */ 4238 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4239 /* LINTED */ 4240 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4241 /* LINTED */ 4242 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4243 4244 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 4245 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 4246 4247 if (pack->bw_txg > txg) 4248 fatal(0, "future leak: got %llx, open txg is %llx", 4249 pack->bw_txg, txg); 4250 4251 if (pack->bw_data != 0 && pack->bw_index != n + i) 4252 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 4253 pack->bw_index, n, i); 4254 4255 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4256 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 4257 4258 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4259 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 4260 4261 if (freeit) { 4262 bzero(pack, sizeof (bufwad_t)); 4263 } else { 4264 pack->bw_index = n + i; 4265 pack->bw_txg = txg; 4266 pack->bw_data = 1 + ztest_random(-2ULL); 4267 } 4268 *bigH = *pack; 4269 *bigT = *pack; 4270 } 4271 4272 /* 4273 * We've verified all the old bufwads, and made new ones. 4274 * Now write them out. 4275 */ 4276 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4277 4278 if (freeit) { 4279 if (ztest_opts.zo_verbose >= 7) { 4280 (void) printf("freeing offset %llx size %llx" 4281 " txg %llx\n", 4282 (u_longlong_t)bigoff, 4283 (u_longlong_t)bigsize, 4284 (u_longlong_t)txg); 4285 } 4286 VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4287 } else { 4288 if (ztest_opts.zo_verbose >= 7) { 4289 (void) printf("writing offset %llx size %llx" 4290 " txg %llx\n", 4291 (u_longlong_t)bigoff, 4292 (u_longlong_t)bigsize, 4293 (u_longlong_t)txg); 4294 } 4295 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4296 } 4297 4298 dmu_tx_commit(tx); 4299 4300 /* 4301 * Sanity check the stuff we just wrote. 
4302 */ 4303 { 4304 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4305 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4306 4307 VERIFY(0 == dmu_read(os, packobj, packoff, 4308 packsize, packcheck, DMU_READ_PREFETCH)); 4309 VERIFY(0 == dmu_read(os, bigobj, bigoff, 4310 bigsize, bigcheck, DMU_READ_PREFETCH)); 4311 4312 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 4313 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 4314 4315 umem_free(packcheck, packsize); 4316 umem_free(bigcheck, bigsize); 4317 } 4318 4319 umem_free(packbuf, packsize); 4320 umem_free(bigbuf, bigsize); 4321} 4322 4323void 4324compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4325 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4326{ 4327 uint64_t i; 4328 bufwad_t *pack; 4329 bufwad_t *bigH; 4330 bufwad_t *bigT; 4331 4332 /* 4333 * For each index from n to n + s, verify that the existing bufwad 4334 * in packobj matches the bufwads at the head and tail of the 4335 * corresponding chunk in bigobj. Then update all three bufwads 4336 * with the new values we want to write out. 
4337 */ 4338 for (i = 0; i < s; i++) { 4339 /* LINTED */ 4340 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4341 /* LINTED */ 4342 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4343 /* LINTED */ 4344 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4345 4346 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 4347 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 4348 4349 if (pack->bw_txg > txg) 4350 fatal(0, "future leak: got %llx, open txg is %llx", 4351 pack->bw_txg, txg); 4352 4353 if (pack->bw_data != 0 && pack->bw_index != n + i) 4354 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 4355 pack->bw_index, n, i); 4356 4357 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4358 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 4359 4360 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4361 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 4362 4363 pack->bw_index = n + i; 4364 pack->bw_txg = txg; 4365 pack->bw_data = 1 + ztest_random(-2ULL); 4366 4367 *bigH = *pack; 4368 *bigT = *pack; 4369 } 4370} 4371 4372void 4373ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4374{ 4375 objset_t *os = zd->zd_os; 4376 ztest_od_t od[2]; 4377 dmu_tx_t *tx; 4378 uint64_t i; 4379 int error; 4380 uint64_t n, s, txg; 4381 bufwad_t *packbuf, *bigbuf; 4382 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4383 uint64_t blocksize = ztest_random_blocksize(); 4384 uint64_t chunksize = blocksize; 4385 uint64_t regions = 997; 4386 uint64_t stride = 123456789ULL; 4387 uint64_t width = 9; 4388 dmu_buf_t *bonus_db; 4389 arc_buf_t **bigbuf_arcbufs; 4390 dmu_object_info_t doi; 4391 4392 /* 4393 * This test uses two objects, packobj and bigobj, that are always 4394 * updated together (i.e. in the same tx) so that their contents are 4395 * in sync and can be compared. Their contents relate to each other 4396 * in a simple way: packobj is a dense array of 'bufwad' structures, 4397 * while bigobj is a sparse array of the same bufwads. 
Specifically, 4398 * for any index n, there are three bufwads that should be identical: 4399 * 4400 * packobj, at offset n * sizeof (bufwad_t) 4401 * bigobj, at the head of the nth chunk 4402 * bigobj, at the tail of the nth chunk 4403 * 4404 * The chunk size is set equal to bigobj block size so that 4405 * dmu_assign_arcbuf() can be tested for object updates. 4406 */ 4407 4408 /* 4409 * Read the directory info. If it's the first time, set things up. 4410 */ 4411 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 4412 0, 0); 4413 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4414 chunksize); 4415 4416 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4417 return; 4418 4419 bigobj = od[0].od_object; 4420 packobj = od[1].od_object; 4421 blocksize = od[0].od_blocksize; 4422 chunksize = blocksize; 4423 ASSERT(chunksize == od[1].od_gen); 4424 4425 VERIFY(dmu_object_info(os, bigobj, &doi) == 0); 4426 VERIFY(ISP2(doi.doi_data_block_size)); 4427 VERIFY(chunksize == doi.doi_data_block_size); 4428 VERIFY(chunksize >= 2 * sizeof (bufwad_t)); 4429 4430 /* 4431 * Pick a random index and compute the offsets into packobj and bigobj. 4432 */ 4433 n = ztest_random(regions) * stride + ztest_random(width); 4434 s = 1 + ztest_random(width - 1); 4435 4436 packoff = n * sizeof (bufwad_t); 4437 packsize = s * sizeof (bufwad_t); 4438 4439 bigoff = n * chunksize; 4440 bigsize = s * chunksize; 4441 4442 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 4443 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 4444 4445 VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 4446 4447 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 4448 4449 /* 4450 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 4451 * Iteration 1 test zcopy to already referenced dbufs. 4452 * Iteration 2 test zcopy to dirty dbuf in the same txg. 4453 * Iteration 3 test zcopy to dbuf dirty in previous txg. 4454 * Iteration 4 test zcopy when dbuf is no longer dirty. 
4455 * Iteration 5 test zcopy when it can't be done. 4456 * Iteration 6 one more zcopy write. 4457 */ 4458 for (i = 0; i < 7; i++) { 4459 uint64_t j; 4460 uint64_t off; 4461 4462 /* 4463 * In iteration 5 (i == 5) use arcbufs 4464 * that don't match bigobj blksz to test 4465 * dmu_assign_arcbuf() when it can't directly 4466 * assign an arcbuf to a dbuf. 4467 */ 4468 for (j = 0; j < s; j++) { 4469 if (i != 5) { 4470 bigbuf_arcbufs[j] = 4471 dmu_request_arcbuf(bonus_db, chunksize); 4472 } else { 4473 bigbuf_arcbufs[2 * j] = 4474 dmu_request_arcbuf(bonus_db, chunksize / 2); 4475 bigbuf_arcbufs[2 * j + 1] = 4476 dmu_request_arcbuf(bonus_db, chunksize / 2); 4477 } 4478 } 4479 4480 /* 4481 * Get a tx for the mods to both packobj and bigobj. 4482 */ 4483 tx = dmu_tx_create(os); 4484 4485 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4486 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4487 4488 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4489 if (txg == 0) { 4490 umem_free(packbuf, packsize); 4491 umem_free(bigbuf, bigsize); 4492 for (j = 0; j < s; j++) { 4493 if (i != 5) { 4494 dmu_return_arcbuf(bigbuf_arcbufs[j]); 4495 } else { 4496 dmu_return_arcbuf( 4497 bigbuf_arcbufs[2 * j]); 4498 dmu_return_arcbuf( 4499 bigbuf_arcbufs[2 * j + 1]); 4500 } 4501 } 4502 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 4503 dmu_buf_rele(bonus_db, FTAG); 4504 return; 4505 } 4506 4507 /* 4508 * 50% of the time don't read objects in the 1st iteration to 4509 * test dmu_assign_arcbuf() for the case when there're no 4510 * existing dbufs for the specified offsets. 
4511 */ 4512 if (i != 0 || ztest_random(2) != 0) { 4513 error = dmu_read(os, packobj, packoff, 4514 packsize, packbuf, DMU_READ_PREFETCH); 4515 ASSERT0(error); 4516 error = dmu_read(os, bigobj, bigoff, bigsize, 4517 bigbuf, DMU_READ_PREFETCH); 4518 ASSERT0(error); 4519 } 4520 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 4521 n, chunksize, txg); 4522 4523 /* 4524 * We've verified all the old bufwads, and made new ones. 4525 * Now write them out. 4526 */ 4527 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4528 if (ztest_opts.zo_verbose >= 7) { 4529 (void) printf("writing offset %llx size %llx" 4530 " txg %llx\n", 4531 (u_longlong_t)bigoff, 4532 (u_longlong_t)bigsize, 4533 (u_longlong_t)txg); 4534 } 4535 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 4536 dmu_buf_t *dbt; 4537 if (i != 5) { 4538 bcopy((caddr_t)bigbuf + (off - bigoff), 4539 bigbuf_arcbufs[j]->b_data, chunksize); 4540 } else { 4541 bcopy((caddr_t)bigbuf + (off - bigoff), 4542 bigbuf_arcbufs[2 * j]->b_data, 4543 chunksize / 2); 4544 bcopy((caddr_t)bigbuf + (off - bigoff) + 4545 chunksize / 2, 4546 bigbuf_arcbufs[2 * j + 1]->b_data, 4547 chunksize / 2); 4548 } 4549 4550 if (i == 1) { 4551 VERIFY(dmu_buf_hold(os, bigobj, off, 4552 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 4553 } 4554 if (i != 5) { 4555 dmu_assign_arcbuf(bonus_db, off, 4556 bigbuf_arcbufs[j], tx); 4557 } else { 4558 dmu_assign_arcbuf(bonus_db, off, 4559 bigbuf_arcbufs[2 * j], tx); 4560 dmu_assign_arcbuf(bonus_db, 4561 off + chunksize / 2, 4562 bigbuf_arcbufs[2 * j + 1], tx); 4563 } 4564 if (i == 1) { 4565 dmu_buf_rele(dbt, FTAG); 4566 } 4567 } 4568 dmu_tx_commit(tx); 4569 4570 /* 4571 * Sanity check the stuff we just wrote. 
4572 */ 4573 { 4574 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4575 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4576 4577 VERIFY(0 == dmu_read(os, packobj, packoff, 4578 packsize, packcheck, DMU_READ_PREFETCH)); 4579 VERIFY(0 == dmu_read(os, bigobj, bigoff, 4580 bigsize, bigcheck, DMU_READ_PREFETCH)); 4581 4582 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 4583 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 4584 4585 umem_free(packcheck, packsize); 4586 umem_free(bigcheck, bigsize); 4587 } 4588 if (i == 2) { 4589 txg_wait_open(dmu_objset_pool(os), 0); 4590 } else if (i == 3) { 4591 txg_wait_synced(dmu_objset_pool(os), 0); 4592 } 4593 } 4594 4595 dmu_buf_rele(bonus_db, FTAG); 4596 umem_free(packbuf, packsize); 4597 umem_free(bigbuf, bigsize); 4598 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 4599} 4600 4601/* ARGSUSED */ 4602void 4603ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 4604{ 4605 ztest_od_t od[1]; 4606 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 4607 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4608 4609 /* 4610 * Have multiple threads write to large offsets in an object 4611 * to verify that parallel writes to an object -- even to the 4612 * same blocks within the object -- doesn't cause any trouble. 
4613 */ 4614 ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 4615 0, 0, 0); 4616 4617 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4618 return; 4619 4620 while (ztest_random(10) != 0) 4621 ztest_io(zd, od[0].od_object, offset); 4622} 4623 4624void 4625ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 4626{ 4627 ztest_od_t od[1]; 4628 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 4629 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4630 uint64_t count = ztest_random(20) + 1; 4631 uint64_t blocksize = ztest_random_blocksize(); 4632 void *data; 4633 4634 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 4635 0, 0); 4636 4637 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4638 return; 4639 4640 if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0) 4641 return; 4642 4643 ztest_prealloc(zd, od[0].od_object, offset, count * blocksize); 4644 4645 data = umem_zalloc(blocksize, UMEM_NOFAIL); 4646 4647 while (ztest_random(count) != 0) { 4648 uint64_t randoff = offset + (ztest_random(count) * blocksize); 4649 if (ztest_write(zd, od[0].od_object, randoff, blocksize, 4650 data) != 0) 4651 break; 4652 while (ztest_random(4) != 0) 4653 ztest_io(zd, od[0].od_object, randoff); 4654 } 4655 4656 umem_free(data, blocksize); 4657} 4658 4659/* 4660 * Verify that zap_{create,destroy,add,remove,update} work as expected. 
4661 */ 4662#define ZTEST_ZAP_MIN_INTS 1 4663#define ZTEST_ZAP_MAX_INTS 4 4664#define ZTEST_ZAP_MAX_PROPS 1000 4665 4666void 4667ztest_zap(ztest_ds_t *zd, uint64_t id) 4668{ 4669 objset_t *os = zd->zd_os; 4670 ztest_od_t od[1]; 4671 uint64_t object; 4672 uint64_t txg, last_txg; 4673 uint64_t value[ZTEST_ZAP_MAX_INTS]; 4674 uint64_t zl_ints, zl_intsize, prop; 4675 int i, ints; 4676 dmu_tx_t *tx; 4677 char propname[100], txgname[100]; 4678 int error; 4679 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 4680 4681 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 4682 4683 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4684 return; 4685 4686 object = od[0].od_object; 4687 4688 /* 4689 * Generate a known hash collision, and verify that 4690 * we can lookup and remove both entries. 4691 */ 4692 tx = dmu_tx_create(os); 4693 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4694 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4695 if (txg == 0) 4696 return; 4697 for (i = 0; i < 2; i++) { 4698 value[i] = i; 4699 VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 4700 1, &value[i], tx)); 4701 } 4702 for (i = 0; i < 2; i++) { 4703 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 4704 sizeof (uint64_t), 1, &value[i], tx)); 4705 VERIFY3U(0, ==, 4706 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 4707 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4708 ASSERT3U(zl_ints, ==, 1); 4709 } 4710 for (i = 0; i < 2; i++) { 4711 VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); 4712 } 4713 dmu_tx_commit(tx); 4714 4715 /* 4716 * Generate a buch of random entries. 4717 */ 4718 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 4719 4720 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 4721 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 4722 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 4723 bzero(value, sizeof (value)); 4724 last_txg = 0; 4725 4726 /* 4727 * If these zap entries already exist, validate their contents. 
4728 */ 4729 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 4730 if (error == 0) { 4731 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4732 ASSERT3U(zl_ints, ==, 1); 4733 4734 VERIFY(zap_lookup(os, object, txgname, zl_intsize, 4735 zl_ints, &last_txg) == 0); 4736 4737 VERIFY(zap_length(os, object, propname, &zl_intsize, 4738 &zl_ints) == 0); 4739 4740 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4741 ASSERT3U(zl_ints, ==, ints); 4742 4743 VERIFY(zap_lookup(os, object, propname, zl_intsize, 4744 zl_ints, value) == 0); 4745 4746 for (i = 0; i < ints; i++) { 4747 ASSERT3U(value[i], ==, last_txg + object + i); 4748 } 4749 } else { 4750 ASSERT3U(error, ==, ENOENT); 4751 } 4752 4753 /* 4754 * Atomically update two entries in our zap object. 4755 * The first is named txg_%llu, and contains the txg 4756 * in which the property was last updated. The second 4757 * is named prop_%llu, and the nth element of its value 4758 * should be txg + object + n. 4759 */ 4760 tx = dmu_tx_create(os); 4761 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4762 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4763 if (txg == 0) 4764 return; 4765 4766 if (last_txg > txg) 4767 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); 4768 4769 for (i = 0; i < ints; i++) 4770 value[i] = txg + object + i; 4771 4772 VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), 4773 1, &txg, tx)); 4774 VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), 4775 ints, value, tx)); 4776 4777 dmu_tx_commit(tx); 4778 4779 /* 4780 * Remove a random pair of entries. 
4781 */ 4782 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 4783 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 4784 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 4785 4786 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 4787 4788 if (error == ENOENT) 4789 return; 4790 4791 ASSERT0(error); 4792 4793 tx = dmu_tx_create(os); 4794 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4795 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4796 if (txg == 0) 4797 return; 4798 VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); 4799 VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); 4800 dmu_tx_commit(tx); 4801} 4802 4803/* 4804 * Testcase to test the upgrading of a microzap to fatzap. 4805 */ 4806void 4807ztest_fzap(ztest_ds_t *zd, uint64_t id) 4808{ 4809 objset_t *os = zd->zd_os; 4810 ztest_od_t od[1]; 4811 uint64_t object, txg; 4812 4813 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 4814 4815 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4816 return; 4817 4818 object = od[0].od_object; 4819 4820 /* 4821 * Add entries to this ZAP and make sure it spills over 4822 * and gets upgraded to a fatzap. Also, since we are adding 4823 * 2050 entries we should see ptrtbl growth and leaf-block split. 
4824 */ 4825 for (int i = 0; i < 2050; i++) { 4826 char name[ZFS_MAX_DATASET_NAME_LEN]; 4827 uint64_t value = i; 4828 dmu_tx_t *tx; 4829 int error; 4830 4831 (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", 4832 id, value); 4833 4834 tx = dmu_tx_create(os); 4835 dmu_tx_hold_zap(tx, object, B_TRUE, name); 4836 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4837 if (txg == 0) 4838 return; 4839 error = zap_add(os, object, name, sizeof (uint64_t), 1, 4840 &value, tx); 4841 ASSERT(error == 0 || error == EEXIST); 4842 dmu_tx_commit(tx); 4843 } 4844} 4845 4846/* ARGSUSED */ 4847void 4848ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 4849{ 4850 objset_t *os = zd->zd_os; 4851 ztest_od_t od[1]; 4852 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 4853 dmu_tx_t *tx; 4854 int i, namelen, error; 4855 int micro = ztest_random(2); 4856 char name[20], string_value[20]; 4857 void *data; 4858 4859 ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 4860 0, 0, 0); 4861 4862 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4863 return; 4864 4865 object = od[0].od_object; 4866 4867 /* 4868 * Generate a random name of the form 'xxx.....' where each 4869 * x is a random printable character and the dots are dots. 4870 * There are 94 such characters, and the name length goes from 4871 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 4872 */ 4873 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 4874 4875 for (i = 0; i < 3; i++) 4876 name[i] = '!' + ztest_random('~' - '!' + 1); 4877 for (; i < namelen - 1; i++) 4878 name[i] = '.'; 4879 name[i] = '\0'; 4880 4881 if ((namelen & 1) || micro) { 4882 wsize = sizeof (txg); 4883 wc = 1; 4884 data = &txg; 4885 } else { 4886 wsize = 1; 4887 wc = namelen; 4888 data = string_value; 4889 } 4890 4891 count = -1ULL; 4892 VERIFY0(zap_count(os, object, &count)); 4893 ASSERT(count != -1ULL); 4894 4895 /* 4896 * Select an operation: length, lookup, add, update, remove. 
4897 */ 4898 i = ztest_random(5); 4899 4900 if (i >= 2) { 4901 tx = dmu_tx_create(os); 4902 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4903 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4904 if (txg == 0) 4905 return; 4906 bcopy(name, string_value, namelen); 4907 } else { 4908 tx = NULL; 4909 txg = 0; 4910 bzero(string_value, namelen); 4911 } 4912 4913 switch (i) { 4914 4915 case 0: 4916 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 4917 if (error == 0) { 4918 ASSERT3U(wsize, ==, zl_wsize); 4919 ASSERT3U(wc, ==, zl_wc); 4920 } else { 4921 ASSERT3U(error, ==, ENOENT); 4922 } 4923 break; 4924 4925 case 1: 4926 error = zap_lookup(os, object, name, wsize, wc, data); 4927 if (error == 0) { 4928 if (data == string_value && 4929 bcmp(name, data, namelen) != 0) 4930 fatal(0, "name '%s' != val '%s' len %d", 4931 name, data, namelen); 4932 } else { 4933 ASSERT3U(error, ==, ENOENT); 4934 } 4935 break; 4936 4937 case 2: 4938 error = zap_add(os, object, name, wsize, wc, data, tx); 4939 ASSERT(error == 0 || error == EEXIST); 4940 break; 4941 4942 case 3: 4943 VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); 4944 break; 4945 4946 case 4: 4947 error = zap_remove(os, object, name, tx); 4948 ASSERT(error == 0 || error == ENOENT); 4949 break; 4950 } 4951 4952 if (tx != NULL) 4953 dmu_tx_commit(tx); 4954} 4955 4956/* 4957 * Commit callback data. 
4958 */ 4959typedef struct ztest_cb_data { 4960 list_node_t zcd_node; 4961 uint64_t zcd_txg; 4962 int zcd_expected_err; 4963 boolean_t zcd_added; 4964 boolean_t zcd_called; 4965 spa_t *zcd_spa; 4966} ztest_cb_data_t; 4967 4968/* This is the actual commit callback function */ 4969static void 4970ztest_commit_callback(void *arg, int error) 4971{ 4972 ztest_cb_data_t *data = arg; 4973 uint64_t synced_txg; 4974 4975 VERIFY(data != NULL); 4976 VERIFY3S(data->zcd_expected_err, ==, error); 4977 VERIFY(!data->zcd_called); 4978 4979 synced_txg = spa_last_synced_txg(data->zcd_spa); 4980 if (data->zcd_txg > synced_txg) 4981 fatal(0, "commit callback of txg %" PRIu64 " called prematurely" 4982 ", last synced txg = %" PRIu64 "\n", data->zcd_txg, 4983 synced_txg); 4984 4985 data->zcd_called = B_TRUE; 4986 4987 if (error == ECANCELED) { 4988 ASSERT0(data->zcd_txg); 4989 ASSERT(!data->zcd_added); 4990 4991 /* 4992 * The private callback data should be destroyed here, but 4993 * since we are going to check the zcd_called field after 4994 * dmu_tx_abort(), we will destroy it there. 4995 */ 4996 return; 4997 } 4998 4999 /* Was this callback added to the global callback list? 
*/ 5000 if (!data->zcd_added) 5001 goto out; 5002 5003 ASSERT3U(data->zcd_txg, !=, 0); 5004 5005 /* Remove our callback from the list */ 5006 mutex_enter(&zcl.zcl_callbacks_lock); 5007 list_remove(&zcl.zcl_callbacks, data); 5008 mutex_exit(&zcl.zcl_callbacks_lock); 5009 5010out: 5011 umem_free(data, sizeof (ztest_cb_data_t)); 5012} 5013 5014/* Allocate and initialize callback data structure */ 5015static ztest_cb_data_t * 5016ztest_create_cb_data(objset_t *os, uint64_t txg) 5017{ 5018 ztest_cb_data_t *cb_data; 5019 5020 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5021 5022 cb_data->zcd_txg = txg; 5023 cb_data->zcd_spa = dmu_objset_spa(os); 5024 5025 return (cb_data); 5026} 5027 5028/* 5029 * If a number of txgs equal to this threshold have been created after a commit 5030 * callback has been registered but not called, then we assume there is an 5031 * implementation bug. 5032 */ 5033#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) 5034 5035/* 5036 * Commit callback test. 5037 */ 5038void 5039ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 5040{ 5041 objset_t *os = zd->zd_os; 5042 ztest_od_t od[1]; 5043 dmu_tx_t *tx; 5044 ztest_cb_data_t *cb_data[3], *tmp_cb; 5045 uint64_t old_txg, txg; 5046 int i, error; 5047 5048 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5049 5050 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 5051 return; 5052 5053 tx = dmu_tx_create(os); 5054 5055 cb_data[0] = ztest_create_cb_data(os, 0); 5056 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 5057 5058 dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); 5059 5060 /* Every once in a while, abort the transaction on purpose */ 5061 if (ztest_random(100) == 0) 5062 error = -1; 5063 5064 if (!error) 5065 error = dmu_tx_assign(tx, TXG_NOWAIT); 5066 5067 txg = error ? 
0 : dmu_tx_get_txg(tx); 5068 5069 cb_data[0]->zcd_txg = txg; 5070 cb_data[1] = ztest_create_cb_data(os, txg); 5071 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 5072 5073 if (error) { 5074 /* 5075 * It's not a strict requirement to call the registered 5076 * callbacks from inside dmu_tx_abort(), but that's what 5077 * it's supposed to happen in the current implementation 5078 * so we will check for that. 5079 */ 5080 for (i = 0; i < 2; i++) { 5081 cb_data[i]->zcd_expected_err = ECANCELED; 5082 VERIFY(!cb_data[i]->zcd_called); 5083 } 5084 5085 dmu_tx_abort(tx); 5086 5087 for (i = 0; i < 2; i++) { 5088 VERIFY(cb_data[i]->zcd_called); 5089 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 5090 } 5091 5092 return; 5093 } 5094 5095 cb_data[2] = ztest_create_cb_data(os, txg); 5096 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 5097 5098 /* 5099 * Read existing data to make sure there isn't a future leak. 5100 */ 5101 VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), 5102 &old_txg, DMU_READ_PREFETCH)); 5103 5104 if (old_txg > txg) 5105 fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, 5106 old_txg, txg); 5107 5108 dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); 5109 5110 mutex_enter(&zcl.zcl_callbacks_lock); 5111 5112 /* 5113 * Since commit callbacks don't have any ordering requirement and since 5114 * it is theoretically possible for a commit callback to be called 5115 * after an arbitrary amount of time has elapsed since its txg has been 5116 * synced, it is difficult to reliably determine whether a commit 5117 * callback hasn't been called due to high load or due to a flawed 5118 * implementation. 5119 * 5120 * In practice, we will assume that if after a certain number of txgs a 5121 * commit callback hasn't been called, then most likely there's an 5122 * implementation bug.. 
5123 */ 5124 tmp_cb = list_head(&zcl.zcl_callbacks); 5125 if (tmp_cb != NULL && 5126 (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) { 5127 fatal(0, "Commit callback threshold exceeded, oldest txg: %" 5128 PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); 5129 } 5130 5131 /* 5132 * Let's find the place to insert our callbacks. 5133 * 5134 * Even though the list is ordered by txg, it is possible for the 5135 * insertion point to not be the end because our txg may already be 5136 * quiescing at this point and other callbacks in the open txg 5137 * (from other objsets) may have sneaked in. 5138 */ 5139 tmp_cb = list_tail(&zcl.zcl_callbacks); 5140 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 5141 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 5142 5143 /* Add the 3 callbacks to the list */ 5144 for (i = 0; i < 3; i++) { 5145 if (tmp_cb == NULL) 5146 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 5147 else 5148 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 5149 cb_data[i]); 5150 5151 cb_data[i]->zcd_added = B_TRUE; 5152 VERIFY(!cb_data[i]->zcd_called); 5153 5154 tmp_cb = cb_data[i]; 5155 } 5156 5157 mutex_exit(&zcl.zcl_callbacks_lock); 5158 5159 dmu_tx_commit(tx); 5160} 5161 5162/* 5163 * Visit each object in the dataset. Verify that its properties 5164 * are consistent what was stored in the block tag when it was created, 5165 * and that its unused bonus buffer space has not been overwritten. 
5166 */ 5167void 5168ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 5169{ 5170 objset_t *os = zd->zd_os; 5171 uint64_t obj; 5172 int err = 0; 5173 5174 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 5175 ztest_block_tag_t *bt = NULL; 5176 dmu_object_info_t doi; 5177 dmu_buf_t *db; 5178 5179 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) 5180 continue; 5181 5182 dmu_object_info_from_db(db, &doi); 5183 if (doi.doi_bonus_size >= sizeof (*bt)) 5184 bt = ztest_bt_bonus(db); 5185 5186 if (bt && bt->bt_magic == BT_MAGIC) { 5187 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 5188 bt->bt_offset, bt->bt_gen, bt->bt_txg, 5189 bt->bt_crtxg); 5190 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 5191 } 5192 5193 dmu_buf_rele(db, FTAG); 5194 } 5195} 5196 5197/* ARGSUSED */ 5198void 5199ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 5200{ 5201 zfs_prop_t proplist[] = { 5202 ZFS_PROP_CHECKSUM, 5203 ZFS_PROP_COMPRESSION, 5204 ZFS_PROP_COPIES, 5205 ZFS_PROP_DEDUP 5206 }; 5207 5208 rw_enter(&ztest_name_lock, RW_READER); 5209 5210 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 5211 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 5212 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 5213 5214 rw_exit(&ztest_name_lock); 5215} 5216 5217/* ARGSUSED */ 5218void 5219ztest_remap_blocks(ztest_ds_t *zd, uint64_t id) 5220{ 5221 rw_enter(&ztest_name_lock, RW_READER); 5222 5223 int error = dmu_objset_remap_indirects(zd->zd_name); 5224 if (error == ENOSPC) 5225 error = 0; 5226 ASSERT0(error); 5227 5228 rw_exit(&ztest_name_lock); 5229} 5230 5231/* ARGSUSED */ 5232void 5233ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5234{ 5235 nvlist_t *props = NULL; 5236 5237 rw_enter(&ztest_name_lock, RW_READER); 5238 5239 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, 5240 ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); 5241 5242 VERIFY0(spa_prop_get(ztest_spa, &props)); 5243 5244 if (ztest_opts.zo_verbose >= 6) 
5245 dump_nvlist(props, 4); 5246 5247 nvlist_free(props); 5248 5249 rw_exit(&ztest_name_lock); 5250} 5251 5252static int 5253user_release_one(const char *snapname, const char *holdname) 5254{ 5255 nvlist_t *snaps, *holds; 5256 int error; 5257 5258 snaps = fnvlist_alloc(); 5259 holds = fnvlist_alloc(); 5260 fnvlist_add_boolean(holds, holdname); 5261 fnvlist_add_nvlist(snaps, snapname, holds); 5262 fnvlist_free(holds); 5263 error = dsl_dataset_user_release(snaps, NULL); 5264 fnvlist_free(snaps); 5265 return (error); 5266} 5267 5268/* 5269 * Test snapshot hold/release and deferred destroy. 5270 */ 5271void 5272ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5273{ 5274 int error; 5275 objset_t *os = zd->zd_os; 5276 objset_t *origin; 5277 char snapname[100]; 5278 char fullname[100]; 5279 char clonename[100]; 5280 char tag[100]; 5281 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5282 nvlist_t *holds; 5283 5284 rw_enter(&ztest_name_lock, RW_READER); 5285 5286 dmu_objset_name(os, osname); 5287 5288 (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id); 5289 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5290 (void) snprintf(clonename, sizeof (clonename), 5291 "%s/ch1_%llu", osname, id); 5292 (void) snprintf(tag, sizeof (tag), "tag_%llu", id); 5293 5294 /* 5295 * Clean up from any previous run. 5296 */ 5297 error = dsl_destroy_head(clonename); 5298 if (error != ENOENT) 5299 ASSERT0(error); 5300 error = user_release_one(fullname, tag); 5301 if (error != ESRCH && error != ENOENT) 5302 ASSERT0(error); 5303 error = dsl_destroy_snapshot(fullname, B_FALSE); 5304 if (error != ENOENT) 5305 ASSERT0(error); 5306 5307 /* 5308 * Create snapshot, clone it, mark snap for deferred destroy, 5309 * destroy clone, verify snap was also destroyed. 
5310 */ 5311 error = dmu_objset_snapshot_one(osname, snapname); 5312 if (error) { 5313 if (error == ENOSPC) { 5314 ztest_record_enospc("dmu_objset_snapshot"); 5315 goto out; 5316 } 5317 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 5318 } 5319 5320 error = dmu_objset_clone(clonename, fullname); 5321 if (error) { 5322 if (error == ENOSPC) { 5323 ztest_record_enospc("dmu_objset_clone"); 5324 goto out; 5325 } 5326 fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); 5327 } 5328 5329 error = dsl_destroy_snapshot(fullname, B_TRUE); 5330 if (error) { 5331 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5332 fullname, error); 5333 } 5334 5335 error = dsl_destroy_head(clonename); 5336 if (error) 5337 fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); 5338 5339 error = dmu_objset_hold(fullname, FTAG, &origin); 5340 if (error != ENOENT) 5341 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); 5342 5343 /* 5344 * Create snapshot, add temporary hold, verify that we can't 5345 * destroy a held snapshot, mark for deferred destroy, 5346 * release hold, verify snapshot was destroyed. 
5347 */ 5348 error = dmu_objset_snapshot_one(osname, snapname); 5349 if (error) { 5350 if (error == ENOSPC) { 5351 ztest_record_enospc("dmu_objset_snapshot"); 5352 goto out; 5353 } 5354 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 5355 } 5356 5357 holds = fnvlist_alloc(); 5358 fnvlist_add_string(holds, fullname, tag); 5359 error = dsl_dataset_user_hold(holds, 0, NULL); 5360 fnvlist_free(holds); 5361 5362 if (error == ENOSPC) { 5363 ztest_record_enospc("dsl_dataset_user_hold"); 5364 goto out; 5365 } else if (error) { 5366 fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", 5367 fullname, tag, error); 5368 } 5369 5370 error = dsl_destroy_snapshot(fullname, B_FALSE); 5371 if (error != EBUSY) { 5372 fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 5373 fullname, error); 5374 } 5375 5376 error = dsl_destroy_snapshot(fullname, B_TRUE); 5377 if (error) { 5378 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5379 fullname, error); 5380 } 5381 5382 error = user_release_one(fullname, tag); 5383 if (error) 5384 fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); 5385 5386 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 5387 5388out: 5389 rw_exit(&ztest_name_lock); 5390} 5391 5392/* 5393 * Inject random faults into the on-disk data. 
5394 */ 5395/* ARGSUSED */ 5396void 5397ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 5398{ 5399 ztest_shared_t *zs = ztest_shared; 5400 spa_t *spa = ztest_spa; 5401 int fd; 5402 uint64_t offset; 5403 uint64_t leaves; 5404 uint64_t bad = 0x1990c0ffeedecadeULL; 5405 uint64_t top, leaf; 5406 char path0[MAXPATHLEN]; 5407 char pathrand[MAXPATHLEN]; 5408 size_t fsize; 5409 int bshift = SPA_MAXBLOCKSHIFT + 2; 5410 int iters = 1000; 5411 int maxfaults; 5412 int mirror_save; 5413 vdev_t *vd0 = NULL; 5414 uint64_t guid0 = 0; 5415 boolean_t islog = B_FALSE; 5416 5417 mutex_enter(&ztest_vdev_lock); 5418 5419 /* 5420 * Device removal is in progress, fault injection must be disabled 5421 * until it completes and the pool is scrubbed. The fault injection 5422 * strategy for damaging blocks does not take in to account evacuated 5423 * blocks which may have already been damaged. 5424 */ 5425 if (ztest_device_removal_active) { 5426 mutex_exit(&ztest_vdev_lock); 5427 return; 5428 } 5429 5430 maxfaults = MAXFAULTS(); 5431 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; 5432 mirror_save = zs->zs_mirrors; 5433 mutex_exit(&ztest_vdev_lock); 5434 5435 ASSERT(leaves >= 1); 5436 5437 /* 5438 * Grab the name lock as reader. There are some operations 5439 * which don't like to have their vdevs changed while 5440 * they are in progress (i.e. spa_change_guid). Those 5441 * operations will have grabbed the name lock as writer. 5442 */ 5443 rw_enter(&ztest_name_lock, RW_READER); 5444 5445 /* 5446 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 5447 */ 5448 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5449 5450 if (ztest_random(2) == 0) { 5451 /* 5452 * Inject errors on a normal data device or slog device. 5453 */ 5454 top = ztest_random_vdev_top(spa, B_TRUE); 5455 leaf = ztest_random(leaves) + zs->zs_splits; 5456 5457 /* 5458 * Generate paths to the first leaf in this top-level vdev, 5459 * and to the random leaf we selected. 
We'll induce transient 5460 * write failures and random online/offline activity on leaf 0, 5461 * and we'll write random garbage to the randomly chosen leaf. 5462 */ 5463 (void) snprintf(path0, sizeof (path0), ztest_dev_template, 5464 ztest_opts.zo_dir, ztest_opts.zo_pool, 5465 top * leaves + zs->zs_splits); 5466 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, 5467 ztest_opts.zo_dir, ztest_opts.zo_pool, 5468 top * leaves + leaf); 5469 5470 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 5471 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 5472 islog = B_TRUE; 5473 5474 /* 5475 * If the top-level vdev needs to be resilvered 5476 * then we only allow faults on the device that is 5477 * resilvering. 5478 */ 5479 if (vd0 != NULL && maxfaults != 1 && 5480 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 5481 vd0->vdev_resilver_txg != 0)) { 5482 /* 5483 * Make vd0 explicitly claim to be unreadable, 5484 * or unwriteable, or reach behind its back 5485 * and close the underlying fd. We can do this if 5486 * maxfaults == 0 because we'll fail and reexecute, 5487 * and we can do it if maxfaults >= 2 because we'll 5488 * have enough redundancy. If maxfaults == 1, the 5489 * combination of this with injection of random data 5490 * corruption below exceeds the pool's fault tolerance. 5491 */ 5492 vdev_file_t *vf = vd0->vdev_tsd; 5493 5494 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 5495 (long long)vd0->vdev_id, (int)maxfaults); 5496 5497 if (vf != NULL && ztest_random(3) == 0) { 5498 (void) close(vf->vf_vnode->v_fd); 5499 vf->vf_vnode->v_fd = -1; 5500 } else if (ztest_random(2) == 0) { 5501 vd0->vdev_cant_read = B_TRUE; 5502 } else { 5503 vd0->vdev_cant_write = B_TRUE; 5504 } 5505 guid0 = vd0->vdev_guid; 5506 } 5507 } else { 5508 /* 5509 * Inject errors on an l2cache device. 
5510 */ 5511 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5512 5513 if (sav->sav_count == 0) { 5514 spa_config_exit(spa, SCL_STATE, FTAG); 5515 rw_exit(&ztest_name_lock); 5516 return; 5517 } 5518 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 5519 guid0 = vd0->vdev_guid; 5520 (void) strcpy(path0, vd0->vdev_path); 5521 (void) strcpy(pathrand, vd0->vdev_path); 5522 5523 leaf = 0; 5524 leaves = 1; 5525 maxfaults = INT_MAX; /* no limit on cache devices */ 5526 } 5527 5528 spa_config_exit(spa, SCL_STATE, FTAG); 5529 rw_exit(&ztest_name_lock); 5530 5531 /* 5532 * If we can tolerate two or more faults, or we're dealing 5533 * with a slog, randomly online/offline vd0. 5534 */ 5535 if ((maxfaults >= 2 || islog) && guid0 != 0) { 5536 if (ztest_random(10) < 6) { 5537 int flags = (ztest_random(2) == 0 ? 5538 ZFS_OFFLINE_TEMPORARY : 0); 5539 5540 /* 5541 * We have to grab the zs_name_lock as writer to 5542 * prevent a race between offlining a slog and 5543 * destroying a dataset. Offlining the slog will 5544 * grab a reference on the dataset which may cause 5545 * dmu_objset_destroy() to fail with EBUSY thus 5546 * leaving the dataset in an inconsistent state. 5547 */ 5548 if (islog) 5549 rw_enter(&ztest_name_lock, RW_WRITER); 5550 5551 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); 5552 5553 if (islog) 5554 rw_exit(&ztest_name_lock); 5555 } else { 5556 /* 5557 * Ideally we would like to be able to randomly 5558 * call vdev_[on|off]line without holding locks 5559 * to force unpredictable failures but the side 5560 * effects of vdev_[on|off]line prevent us from 5561 * doing so. We grab the ztest_vdev_lock here to 5562 * prevent a race between injection testing and 5563 * aux_vdev removal. 5564 */ 5565 mutex_enter(&ztest_vdev_lock); 5566 (void) vdev_online(spa, guid0, 0, NULL); 5567 mutex_exit(&ztest_vdev_lock); 5568 } 5569 } 5570 5571 if (maxfaults == 0) 5572 return; 5573 5574 /* 5575 * We have at least single-fault tolerance, so inject data corruption. 
5576 */ 5577 fd = open(pathrand, O_RDWR); 5578 5579 if (fd == -1) /* we hit a gap in the device namespace */ 5580 return; 5581 5582 fsize = lseek(fd, 0, SEEK_END); 5583 5584 while (--iters != 0) { 5585 /* 5586 * The offset must be chosen carefully to ensure that 5587 * we do not inject a given logical block with errors 5588 * on two different leaf devices, because ZFS can not 5589 * tolerate that (if maxfaults==1). 5590 * 5591 * We divide each leaf into chunks of size 5592 * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk 5593 * there is a series of ranges to which we can inject errors. 5594 * Each range can accept errors on only a single leaf vdev. 5595 * The error injection ranges are separated by ranges 5596 * which we will not inject errors on any device (DMZs). 5597 * Each DMZ must be large enough such that a single block 5598 * can not straddle it, so that a single block can not be 5599 * a target in two different injection ranges (on different 5600 * leaf vdevs). 5601 * 5602 * For example, with 3 leaves, each chunk looks like: 5603 * 0 to 32M: injection range for leaf 0 5604 * 32M to 64M: DMZ - no injection allowed 5605 * 64M to 96M: injection range for leaf 1 5606 * 96M to 128M: DMZ - no injection allowed 5607 * 128M to 160M: injection range for leaf 2 5608 * 160M to 192M: DMZ - no injection allowed 5609 */ 5610 offset = ztest_random(fsize / (leaves << bshift)) * 5611 (leaves << bshift) + (leaf << bshift) + 5612 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 5613 5614 /* 5615 * Only allow damage to the labels at one end of the vdev. 5616 * 5617 * If all labels are damaged, the device will be totally 5618 * inaccessible, which will result in loss of data, 5619 * because we also damage (parts of) the other side of 5620 * the mirror/raidz. 5621 * 5622 * Additionally, we will always have both an even and an 5623 * odd label, so that we can handle crashes in the 5624 * middle of vdev_config_sync(). 
5625 */ 5626 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 5627 continue; 5628 5629 /* 5630 * The two end labels are stored at the "end" of the disk, but 5631 * the end of the disk (vdev_psize) is aligned to 5632 * sizeof (vdev_label_t). 5633 */ 5634 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 5635 if ((leaf & 1) == 1 && 5636 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 5637 continue; 5638 5639 mutex_enter(&ztest_vdev_lock); 5640 if (mirror_save != zs->zs_mirrors) { 5641 mutex_exit(&ztest_vdev_lock); 5642 (void) close(fd); 5643 return; 5644 } 5645 5646 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 5647 fatal(1, "can't inject bad word at 0x%llx in %s", 5648 offset, pathrand); 5649 5650 mutex_exit(&ztest_vdev_lock); 5651 5652 if (ztest_opts.zo_verbose >= 7) 5653 (void) printf("injected bad word into %s," 5654 " offset 0x%llx\n", pathrand, (u_longlong_t)offset); 5655 } 5656 5657 (void) close(fd); 5658} 5659 5660/* 5661 * Verify that DDT repair works as expected. 5662 */ 5663void 5664ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) 5665{ 5666 ztest_shared_t *zs = ztest_shared; 5667 spa_t *spa = ztest_spa; 5668 objset_t *os = zd->zd_os; 5669 ztest_od_t od[1]; 5670 uint64_t object, blocksize, txg, pattern, psize; 5671 enum zio_checksum checksum = spa_dedup_checksum(spa); 5672 dmu_buf_t *db; 5673 dmu_tx_t *tx; 5674 abd_t *abd; 5675 blkptr_t blk; 5676 int copies = 2 * ZIO_DEDUPDITTO_MIN; 5677 5678 blocksize = ztest_random_blocksize(); 5679 blocksize = MIN(blocksize, 2048); /* because we write so many */ 5680 5681 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 5682 0, 0); 5683 5684 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 5685 return; 5686 5687 /* 5688 * Take the name lock as writer to prevent anyone else from changing 5689 * the pool and dataset properies we need to maintain during this test. 
5690 */ 5691 rw_enter(&ztest_name_lock, RW_WRITER); 5692 5693 if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, 5694 B_FALSE) != 0 || 5695 ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, 5696 B_FALSE) != 0) { 5697 rw_exit(&ztest_name_lock); 5698 return; 5699 } 5700 5701 dmu_objset_stats_t dds; 5702 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 5703 dmu_objset_fast_stat(os, &dds); 5704 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 5705 5706 object = od[0].od_object; 5707 blocksize = od[0].od_blocksize; 5708 pattern = zs->zs_guid ^ dds.dds_guid; 5709 5710 ASSERT(object != 0); 5711 5712 tx = dmu_tx_create(os); 5713 dmu_tx_hold_write(tx, object, 0, copies * blocksize); 5714 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 5715 if (txg == 0) { 5716 rw_exit(&ztest_name_lock); 5717 return; 5718 } 5719 5720 /* 5721 * Write all the copies of our block. 5722 */ 5723 for (int i = 0; i < copies; i++) { 5724 uint64_t offset = i * blocksize; 5725 int error = dmu_buf_hold(os, object, offset, FTAG, &db, 5726 DMU_READ_NO_PREFETCH); 5727 if (error != 0) { 5728 fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", 5729 os, (long long)object, (long long) offset, error); 5730 } 5731 ASSERT(db->db_offset == offset); 5732 ASSERT(db->db_size == blocksize); 5733 ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || 5734 ztest_pattern_match(db->db_data, db->db_size, 0ULL)); 5735 dmu_buf_will_fill(db, tx); 5736 ztest_pattern_set(db->db_data, db->db_size, pattern); 5737 dmu_buf_rele(db, FTAG); 5738 } 5739 5740 dmu_tx_commit(tx); 5741 txg_wait_synced(spa_get_dsl(spa), txg); 5742 5743 /* 5744 * Find out what block we got. 5745 */ 5746 VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, 5747 DMU_READ_NO_PREFETCH)); 5748 blk = *((dmu_buf_impl_t *)db)->db_blkptr; 5749 dmu_buf_rele(db, FTAG); 5750 5751 /* 5752 * Damage the block. Dedup-ditto will save us when we read it later. 
5753 */ 5754 psize = BP_GET_PSIZE(&blk); 5755 abd = abd_alloc_linear(psize, B_TRUE); 5756 ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); 5757 5758 (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, 5759 abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, 5760 ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); 5761 5762 abd_free(abd); 5763 5764 rw_exit(&ztest_name_lock); 5765} 5766 5767/* 5768 * Scrub the pool. 5769 */ 5770/* ARGSUSED */ 5771void 5772ztest_scrub(ztest_ds_t *zd, uint64_t id) 5773{ 5774 spa_t *spa = ztest_spa; 5775 5776 /* 5777 * Scrub in progress by device removal. 5778 */ 5779 if (ztest_device_removal_active) 5780 return; 5781 5782 (void) spa_scan(spa, POOL_SCAN_SCRUB); 5783 (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ 5784 (void) spa_scan(spa, POOL_SCAN_SCRUB); 5785} 5786 5787/* 5788 * Change the guid for the pool. 5789 */ 5790/* ARGSUSED */ 5791void 5792ztest_reguid(ztest_ds_t *zd, uint64_t id) 5793{ 5794 spa_t *spa = ztest_spa; 5795 uint64_t orig, load; 5796 int error; 5797 5798 if (ztest_opts.zo_mmp_test) 5799 return; 5800 5801 orig = spa_guid(spa); 5802 load = spa_load_guid(spa); 5803 5804 rw_enter(&ztest_name_lock, RW_WRITER); 5805 error = spa_change_guid(spa); 5806 rw_exit(&ztest_name_lock); 5807 5808 if (error != 0) 5809 return; 5810 5811 if (ztest_opts.zo_verbose >= 4) { 5812 (void) printf("Changed guid old %llu -> %llu\n", 5813 (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); 5814 } 5815 5816 VERIFY3U(orig, !=, spa_guid(spa)); 5817 VERIFY3U(load, ==, spa_load_guid(spa)); 5818} 5819 5820static vdev_t * 5821ztest_random_concrete_vdev_leaf(vdev_t *vd) 5822{ 5823 if (vd == NULL) 5824 return (NULL); 5825 5826 if (vd->vdev_children == 0) 5827 return (vd); 5828 5829 vdev_t *eligible[vd->vdev_children]; 5830 int eligible_idx = 0, i; 5831 for (i = 0; i < vd->vdev_children; i++) { 5832 vdev_t *cvd = vd->vdev_child[i]; 5833 if (cvd->vdev_top->vdev_removing) 5834 continue; 5835 if (cvd->vdev_children > 0 || 5836 
(vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 5837 eligible[eligible_idx++] = cvd; 5838 } 5839 } 5840 VERIFY(eligible_idx > 0); 5841 5842 uint64_t child_no = ztest_random(eligible_idx); 5843 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 5844} 5845 5846/* ARGSUSED */ 5847void 5848ztest_initialize(ztest_ds_t *zd, uint64_t id) 5849{ 5850 spa_t *spa = ztest_spa; 5851 int error = 0; 5852 5853 mutex_enter(&ztest_vdev_lock); 5854 5855 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 5856 5857 /* Random leaf vdev */ 5858 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 5859 if (rand_vd == NULL) { 5860 spa_config_exit(spa, SCL_VDEV, FTAG); 5861 mutex_exit(&ztest_vdev_lock); 5862 return; 5863 } 5864 5865 /* 5866 * The random vdev we've selected may change as soon as we 5867 * drop the spa_config_lock. We create local copies of things 5868 * we're interested in. 5869 */ 5870 uint64_t guid = rand_vd->vdev_guid; 5871 char *path = strdup(rand_vd->vdev_path); 5872 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 5873 5874 zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); 5875 spa_config_exit(spa, SCL_VDEV, FTAG); 5876 5877 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 5878 error = spa_vdev_initialize(spa, guid, cmd); 5879 switch (cmd) { 5880 case POOL_INITIALIZE_CANCEL: 5881 if (ztest_opts.zo_verbose >= 4) { 5882 (void) printf("Cancel initialize %s", path); 5883 if (!active) 5884 (void) printf(" failed (no initialize active)"); 5885 (void) printf("\n"); 5886 } 5887 break; 5888 case POOL_INITIALIZE_DO: 5889 if (ztest_opts.zo_verbose >= 4) { 5890 (void) printf("Start initialize %s", path); 5891 if (active && error == 0) 5892 (void) printf(" failed (already active)"); 5893 else if (error != 0) 5894 (void) printf(" failed (error %d)", error); 5895 (void) printf("\n"); 5896 } 5897 break; 5898 case POOL_INITIALIZE_SUSPEND: 5899 if (ztest_opts.zo_verbose >= 4) { 5900 (void) printf("Suspend initialize %s", path); 5901 if (!active) 
5902 (void) printf(" failed (no initialize active)"); 5903 (void) printf("\n"); 5904 } 5905 break; 5906 } 5907 free(path); 5908 mutex_exit(&ztest_vdev_lock); 5909} 5910 5911/* 5912 * Verify pool integrity by running zdb. 5913 */ 5914static void 5915ztest_run_zdb(char *pool) 5916{ 5917 int status; 5918 char zdb[MAXPATHLEN + MAXNAMELEN + 20]; 5919 char zbuf[1024]; 5920 char *bin; 5921 char *ztest; 5922 char *isa; 5923 int isalen; 5924 FILE *fp; 5925 5926 strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb)); 5927 5928 /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ 5929 bin = strstr(zdb, "/usr/bin/"); 5930 ztest = strstr(bin, "/ztest"); 5931 isa = bin + 8; 5932 isalen = ztest - isa; 5933 isa = strdup(isa); 5934 /* LINTED */ 5935 (void) sprintf(bin, 5936 "/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s " 5937 "-o zfs_reconstruct_indirect_combinations_max=65536 %s", 5938 isalen, 5939 isa, 5940 ztest_opts.zo_verbose >= 3 ? "s" : "", 5941 ztest_opts.zo_verbose >= 4 ? "v" : "", 5942 spa_config_path, 5943 pool); 5944 free(isa); 5945 5946 if (ztest_opts.zo_verbose >= 5) 5947 (void) printf("Executing %s\n", strstr(zdb, "zdb ")); 5948 5949 fp = popen(zdb, "r"); 5950 assert(fp != NULL); 5951 5952 while (fgets(zbuf, sizeof (zbuf), fp) != NULL) 5953 if (ztest_opts.zo_verbose >= 3) 5954 (void) printf("%s", zbuf); 5955 5956 status = pclose(fp); 5957 5958 if (status == 0) 5959 return; 5960 5961 ztest_dump_core = 0; 5962 if (WIFEXITED(status)) 5963 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 5964 else 5965 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); 5966} 5967 5968static void 5969ztest_walk_pool_directory(char *header) 5970{ 5971 spa_t *spa = NULL; 5972 5973 if (ztest_opts.zo_verbose >= 6) 5974 (void) printf("%s\n", header); 5975 5976 mutex_enter(&spa_namespace_lock); 5977 while ((spa = spa_next(spa)) != NULL) 5978 if (ztest_opts.zo_verbose >= 6) 5979 (void) printf("\t%s\n", spa_name(spa)); 5980 mutex_exit(&spa_namespace_lock); 5981} 5982 5983static void 
5984ztest_spa_import_export(char *oldname, char *newname) 5985{ 5986 nvlist_t *config, *newconfig; 5987 uint64_t pool_guid; 5988 spa_t *spa; 5989 int error; 5990 5991 if (ztest_opts.zo_verbose >= 4) { 5992 (void) printf("import/export: old = %s, new = %s\n", 5993 oldname, newname); 5994 } 5995 5996 /* 5997 * Clean up from previous runs. 5998 */ 5999 (void) spa_destroy(newname); 6000 6001 /* 6002 * Get the pool's configuration and guid. 6003 */ 6004 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); 6005 6006 /* 6007 * Kick off a scrub to tickle scrub/export races. 6008 */ 6009 if (ztest_random(2) == 0) 6010 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6011 6012 pool_guid = spa_guid(spa); 6013 spa_close(spa, FTAG); 6014 6015 ztest_walk_pool_directory("pools before export"); 6016 6017 /* 6018 * Export it. 6019 */ 6020 VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); 6021 6022 ztest_walk_pool_directory("pools after export"); 6023 6024 /* 6025 * Try to import it. 6026 */ 6027 newconfig = spa_tryimport(config); 6028 ASSERT(newconfig != NULL); 6029 nvlist_free(newconfig); 6030 6031 /* 6032 * Import it under the new name. 6033 */ 6034 error = spa_import(newname, config, NULL, 0); 6035 if (error != 0) { 6036 dump_nvlist(config, 0); 6037 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 6038 oldname, newname, error); 6039 } 6040 6041 ztest_walk_pool_directory("pools after import"); 6042 6043 /* 6044 * Try to import it again -- should fail with EEXIST. 6045 */ 6046 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 6047 6048 /* 6049 * Try to import it under a different name -- should fail with EEXIST. 6050 */ 6051 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 6052 6053 /* 6054 * Verify that the pool is no longer visible under the old name. 6055 */ 6056 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 6057 6058 /* 6059 * Verify that we can open and close the pool using the new name. 
6060 */ 6061 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); 6062 ASSERT(pool_guid == spa_guid(spa)); 6063 spa_close(spa, FTAG); 6064 6065 nvlist_free(config); 6066} 6067 6068static void 6069ztest_resume(spa_t *spa) 6070{ 6071 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 6072 (void) printf("resuming from suspended state\n"); 6073 spa_vdev_state_enter(spa, SCL_NONE); 6074 vdev_clear(spa, NULL); 6075 (void) spa_vdev_state_exit(spa, NULL, 0); 6076 (void) zio_resume(spa); 6077} 6078 6079static void * 6080ztest_resume_thread(void *arg) 6081{ 6082 spa_t *spa = arg; 6083 6084 while (!ztest_exiting) { 6085 if (spa_suspended(spa)) 6086 ztest_resume(spa); 6087 (void) poll(NULL, 0, 100); 6088 6089 /* 6090 * Periodically change the zfs_compressed_arc_enabled setting. 6091 */ 6092 if (ztest_random(10) == 0) 6093 zfs_compressed_arc_enabled = ztest_random(2); 6094 6095 /* 6096 * Periodically change the zfs_abd_scatter_enabled setting. 6097 */ 6098 if (ztest_random(10) == 0) 6099 zfs_abd_scatter_enabled = ztest_random(2); 6100 } 6101 return (NULL); 6102} 6103 6104static void * 6105ztest_deadman_thread(void *arg) 6106{ 6107 ztest_shared_t *zs = arg; 6108 spa_t *spa = ztest_spa; 6109 hrtime_t delta, total = 0; 6110 6111 for (;;) { 6112 delta = zs->zs_thread_stop - zs->zs_thread_start + 6113 MSEC2NSEC(zfs_deadman_synctime_ms); 6114 6115 (void) poll(NULL, 0, (int)NSEC2MSEC(delta)); 6116 6117 /* 6118 * If the pool is suspended then fail immediately. Otherwise, 6119 * check to see if the pool is making any progress. If 6120 * vdev_deadman() discovers that there hasn't been any recent 6121 * I/Os then it will end up aborting the tests. 
6122 */ 6123 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 6124 fatal(0, "aborting test after %llu seconds because " 6125 "pool has transitioned to a suspended state.", 6126 zfs_deadman_synctime_ms / 1000); 6127 return (NULL); 6128 } 6129 vdev_deadman(spa->spa_root_vdev); 6130 6131 total += zfs_deadman_synctime_ms/1000; 6132 (void) printf("ztest has been running for %lld seconds\n", 6133 total); 6134 } 6135} 6136 6137static void 6138ztest_execute(int test, ztest_info_t *zi, uint64_t id) 6139{ 6140 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 6141 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 6142 hrtime_t functime = gethrtime(); 6143 6144 for (int i = 0; i < zi->zi_iters; i++) 6145 zi->zi_func(zd, id); 6146 6147 functime = gethrtime() - functime; 6148 6149 atomic_add_64(&zc->zc_count, 1); 6150 atomic_add_64(&zc->zc_time, functime); 6151 6152 if (ztest_opts.zo_verbose >= 4) { 6153 Dl_info dli; 6154 (void) dladdr((void *)zi->zi_func, &dli); 6155 (void) printf("%6.2f sec in %s\n", 6156 (double)functime / NANOSEC, dli.dli_sname); 6157 } 6158} 6159 6160static void * 6161ztest_thread(void *arg) 6162{ 6163 int rand; 6164 uint64_t id = (uintptr_t)arg; 6165 ztest_shared_t *zs = ztest_shared; 6166 uint64_t call_next; 6167 hrtime_t now; 6168 ztest_info_t *zi; 6169 ztest_shared_callstate_t *zc; 6170 6171 while ((now = gethrtime()) < zs->zs_thread_stop) { 6172 /* 6173 * See if it's time to force a crash. 6174 */ 6175 if (now > zs->zs_thread_kill) 6176 ztest_kill(zs); 6177 6178 /* 6179 * If we're getting ENOSPC with some regularity, stop. 6180 */ 6181 if (zs->zs_enospc_count > 10) 6182 break; 6183 6184 /* 6185 * Pick a random function to execute. 
6186 */ 6187 rand = ztest_random(ZTEST_FUNCS); 6188 zi = &ztest_info[rand]; 6189 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 6190 call_next = zc->zc_next; 6191 6192 if (now >= call_next && 6193 atomic_cas_64(&zc->zc_next, call_next, call_next + 6194 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 6195 ztest_execute(rand, zi, id); 6196 } 6197 } 6198 6199 return (NULL); 6200} 6201 6202static void 6203ztest_dataset_name(char *dsname, char *pool, int d) 6204{ 6205 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 6206} 6207 6208static void 6209ztest_dataset_destroy(int d) 6210{ 6211 char name[ZFS_MAX_DATASET_NAME_LEN]; 6212 6213 ztest_dataset_name(name, ztest_opts.zo_pool, d); 6214 6215 if (ztest_opts.zo_verbose >= 3) 6216 (void) printf("Destroying %s to free up space\n", name); 6217 6218 /* 6219 * Cleanup any non-standard clones and snapshots. In general, 6220 * ztest thread t operates on dataset (t % zopt_datasets), 6221 * so there may be more than one thing to clean up. 6222 */ 6223 for (int t = d; t < ztest_opts.zo_threads; 6224 t += ztest_opts.zo_datasets) { 6225 ztest_dsl_dataset_cleanup(name, t); 6226 } 6227 6228 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 6229 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 6230} 6231 6232static void 6233ztest_dataset_dirobj_verify(ztest_ds_t *zd) 6234{ 6235 uint64_t usedobjs, dirobjs, scratch; 6236 6237 /* 6238 * ZTEST_DIROBJ is the object directory for the entire dataset. 6239 * Therefore, the number of objects in use should equal the 6240 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 6241 * If not, we have an object leak. 6242 * 6243 * Note that we can only check this in ztest_dataset_open(), 6244 * when the open-context and syncing-context values agree. 6245 * That's because zap_count() returns the open-context value, 6246 * while dmu_objset_space() returns the rootbp fill count. 
6247 */ 6248 VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 6249 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 6250 ASSERT3U(dirobjs + 1, ==, usedobjs); 6251} 6252 6253static int 6254ztest_dataset_open(int d) 6255{ 6256 ztest_ds_t *zd = &ztest_ds[d]; 6257 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 6258 objset_t *os; 6259 zilog_t *zilog; 6260 char name[ZFS_MAX_DATASET_NAME_LEN]; 6261 int error; 6262 6263 ztest_dataset_name(name, ztest_opts.zo_pool, d); 6264 6265 rw_enter(&ztest_name_lock, RW_READER); 6266 6267 error = ztest_dataset_create(name); 6268 if (error == ENOSPC) { 6269 rw_exit(&ztest_name_lock); 6270 ztest_record_enospc(FTAG); 6271 return (error); 6272 } 6273 ASSERT(error == 0 || error == EEXIST); 6274 6275 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os)); 6276 rw_exit(&ztest_name_lock); 6277 6278 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 6279 6280 zilog = zd->zd_zilog; 6281 6282 if (zilog->zl_header->zh_claim_lr_seq != 0 && 6283 zilog->zl_header->zh_claim_lr_seq < committed_seq) 6284 fatal(0, "missing log records: claimed %llu < committed %llu", 6285 zilog->zl_header->zh_claim_lr_seq, committed_seq); 6286 6287 ztest_dataset_dirobj_verify(zd); 6288 6289 zil_replay(os, zd, ztest_replay_vector); 6290 6291 ztest_dataset_dirobj_verify(zd); 6292 6293 if (ztest_opts.zo_verbose >= 6) 6294 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", 6295 zd->zd_name, 6296 (u_longlong_t)zilog->zl_parse_blk_count, 6297 (u_longlong_t)zilog->zl_parse_lr_count, 6298 (u_longlong_t)zilog->zl_replaying_seq); 6299 6300 zilog = zil_open(os, ztest_get_data); 6301 6302 if (zilog->zl_replaying_seq != 0 && 6303 zilog->zl_replaying_seq < committed_seq) 6304 fatal(0, "missing log records: replayed %llu < committed %llu", 6305 zilog->zl_replaying_seq, committed_seq); 6306 6307 return (0); 6308} 6309 6310static void 6311ztest_dataset_close(int d) 6312{ 6313 ztest_ds_t *zd = &ztest_ds[d]; 6314 6315 
zil_close(zd->zd_zilog); 6316 dmu_objset_disown(zd->zd_os, zd); 6317 6318 ztest_zd_fini(zd); 6319} 6320 6321/* 6322 * Kick off threads to run tests on all datasets in parallel. 6323 */ 6324static void 6325ztest_run(ztest_shared_t *zs) 6326{ 6327 thread_t *tid; 6328 spa_t *spa; 6329 objset_t *os; 6330 thread_t resume_tid; 6331 int error; 6332 6333 ztest_exiting = B_FALSE; 6334 6335 /* 6336 * Initialize parent/child shared state. 6337 */ 6338 mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); 6339 mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); 6340 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); 6341 6342 zs->zs_thread_start = gethrtime(); 6343 zs->zs_thread_stop = 6344 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 6345 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 6346 zs->zs_thread_kill = zs->zs_thread_stop; 6347 if (ztest_random(100) < ztest_opts.zo_killrate) { 6348 zs->zs_thread_kill -= 6349 ztest_random(ztest_opts.zo_passtime * NANOSEC); 6350 } 6351 6352 mutex_init(&zcl.zcl_callbacks_lock, NULL, USYNC_THREAD, NULL); 6353 6354 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 6355 offsetof(ztest_cb_data_t, zcd_node)); 6356 6357 /* 6358 * Open our pool. 6359 */ 6360 kernel_init(FREAD | FWRITE); 6361 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6362 metaslab_preload_limit = ztest_random(20) + 1; 6363 ztest_spa = spa; 6364 6365 dmu_objset_stats_t dds; 6366 VERIFY0(dmu_objset_own(ztest_opts.zo_pool, 6367 DMU_OST_ANY, B_TRUE, FTAG, &os)); 6368 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 6369 dmu_objset_fast_stat(os, &dds); 6370 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 6371 zs->zs_guid = dds.dds_guid; 6372 dmu_objset_disown(os, FTAG); 6373 6374 spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; 6375 6376 /* 6377 * We don't expect the pool to suspend unless maxfaults == 0, 6378 * in which case ztest_fault_inject() temporarily takes away 6379 * the only valid replica. 
6380 */ 6381 if (MAXFAULTS() == 0) 6382 spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; 6383 else 6384 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; 6385 6386 /* 6387 * Create a thread to periodically resume suspended I/O. 6388 */ 6389 VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, 6390 &resume_tid) == 0); 6391 6392 /* 6393 * Create a deadman thread to abort() if we hang. 6394 */ 6395 VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, 6396 NULL) == 0); 6397 6398 /* 6399 * Verify that we can safely inquire about any object, 6400 * whether it's allocated or not. To make it interesting, 6401 * we probe a 5-wide window around each power of two. 6402 * This hits all edge cases, including zero and the max. 6403 */ 6404 for (int t = 0; t < 64; t++) { 6405 for (int d = -5; d <= 5; d++) { 6406 error = dmu_object_info(spa->spa_meta_objset, 6407 (1ULL << t) + d, NULL); 6408 ASSERT(error == 0 || error == ENOENT || 6409 error == EINVAL); 6410 } 6411 } 6412 6413 /* 6414 * If we got any ENOSPC errors on the previous run, destroy something. 6415 */ 6416 if (zs->zs_enospc_count != 0) { 6417 int d = ztest_random(ztest_opts.zo_datasets); 6418 ztest_dataset_destroy(d); 6419 } 6420 zs->zs_enospc_count = 0; 6421 6422 tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t), 6423 UMEM_NOFAIL); 6424 6425 if (ztest_opts.zo_verbose >= 4) 6426 (void) printf("starting main threads...\n"); 6427 6428 /* 6429 * Kick off all the tests that run in parallel. 6430 */ 6431 for (int t = 0; t < ztest_opts.zo_threads; t++) { 6432 if (t < ztest_opts.zo_datasets && 6433 ztest_dataset_open(t) != 0) 6434 return; 6435 VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, 6436 THR_BOUND, &tid[t]) == 0); 6437 } 6438 6439 /* 6440 * Wait for all of the tests to complete. We go in reverse order 6441 * so we don't close datasets while threads are still using them. 
6442 */ 6443 for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) { 6444 VERIFY(thr_join(tid[t], NULL, NULL) == 0); 6445 if (t < ztest_opts.zo_datasets) 6446 ztest_dataset_close(t); 6447 } 6448 6449 txg_wait_synced(spa_get_dsl(spa), 0); 6450 6451 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 6452 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 6453 zfs_dbgmsg_print(FTAG); 6454 6455 umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t)); 6456 6457 /* Kill the resume thread */ 6458 ztest_exiting = B_TRUE; 6459 VERIFY(thr_join(resume_tid, NULL, NULL) == 0); 6460 ztest_resume(spa); 6461 6462 /* 6463 * Right before closing the pool, kick off a bunch of async I/O; 6464 * spa_close() should wait for it to complete. 6465 */ 6466 for (uint64_t object = 1; object < 50; object++) { 6467 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 6468 ZIO_PRIORITY_SYNC_READ); 6469 } 6470 6471 spa_close(spa, FTAG); 6472 6473 /* 6474 * Verify that we can loop over all pools. 6475 */ 6476 mutex_enter(&spa_namespace_lock); 6477 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 6478 if (ztest_opts.zo_verbose > 3) 6479 (void) printf("spa_next: found %s\n", spa_name(spa)); 6480 mutex_exit(&spa_namespace_lock); 6481 6482 /* 6483 * Verify that we can export the pool and reimport it under a 6484 * different name. 
6485 */ 6486 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 6487 char name[ZFS_MAX_DATASET_NAME_LEN]; 6488 (void) snprintf(name, sizeof (name), "%s_import", 6489 ztest_opts.zo_pool); 6490 ztest_spa_import_export(ztest_opts.zo_pool, name); 6491 ztest_spa_import_export(name, ztest_opts.zo_pool); 6492 } 6493 6494 kernel_fini(); 6495 6496 list_destroy(&zcl.zcl_callbacks); 6497 6498 mutex_destroy(&zcl.zcl_callbacks_lock); 6499 6500 rw_destroy(&ztest_name_lock); 6501 mutex_destroy(&ztest_vdev_lock); 6502 mutex_destroy(&ztest_checkpoint_lock); 6503} 6504 6505static void 6506ztest_freeze(void) 6507{ 6508 ztest_ds_t *zd = &ztest_ds[0]; 6509 spa_t *spa; 6510 int numloops = 0; 6511 6512 if (ztest_opts.zo_verbose >= 3) 6513 (void) printf("testing spa_freeze()...\n"); 6514 6515 kernel_init(FREAD | FWRITE); 6516 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6517 VERIFY3U(0, ==, ztest_dataset_open(0)); 6518 ztest_spa = spa; 6519 6520 /* 6521 * Force the first log block to be transactionally allocated. 6522 * We have to do this before we freeze the pool -- otherwise 6523 * the log chain won't be anchored. 6524 */ 6525 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 6526 ztest_dmu_object_alloc_free(zd, 0); 6527 zil_commit(zd->zd_zilog, 0); 6528 } 6529 6530 txg_wait_synced(spa_get_dsl(spa), 0); 6531 6532 /* 6533 * Freeze the pool. This stops spa_sync() from doing anything, 6534 * so that the only way to record changes from now on is the ZIL. 6535 */ 6536 spa_freeze(spa); 6537 6538 /* 6539 * Because it is hard to predict how much space a write will actually 6540 * require beforehand, we leave ourselves some fudge space to write over 6541 * capacity. 6542 */ 6543 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 6544 6545 /* 6546 * Run tests that generate log records but don't alter the pool config 6547 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 
6548 * We do a txg_wait_synced() after each iteration to force the txg 6549 * to increase well beyond the last synced value in the uberblock. 6550 * The ZIL should be OK with that. 6551 * 6552 * Run a random number of times less than zo_maxloops and ensure we do 6553 * not run out of space on the pool. 6554 */ 6555 while (ztest_random(10) != 0 && 6556 numloops++ < ztest_opts.zo_maxloops && 6557 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 6558 ztest_od_t od; 6559 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 6560 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 6561 ztest_io(zd, od.od_object, 6562 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 6563 txg_wait_synced(spa_get_dsl(spa), 0); 6564 } 6565 6566 /* 6567 * Commit all of the changes we just generated. 6568 */ 6569 zil_commit(zd->zd_zilog, 0); 6570 txg_wait_synced(spa_get_dsl(spa), 0); 6571 6572 /* 6573 * Close our dataset and close the pool. 6574 */ 6575 ztest_dataset_close(0); 6576 spa_close(spa, FTAG); 6577 kernel_fini(); 6578 6579 /* 6580 * Open and close the pool and dataset to induce log replay. 
6581 */ 6582 kernel_init(FREAD | FWRITE); 6583 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6584 ASSERT(spa_freeze_txg(spa) == UINT64_MAX); 6585 VERIFY3U(0, ==, ztest_dataset_open(0)); 6586 ztest_dataset_close(0); 6587 6588 ztest_spa = spa; 6589 txg_wait_synced(spa_get_dsl(spa), 0); 6590 ztest_reguid(NULL, 0); 6591 6592 spa_close(spa, FTAG); 6593 kernel_fini(); 6594} 6595 6596void 6597print_time(hrtime_t t, char *timebuf) 6598{ 6599 hrtime_t s = t / NANOSEC; 6600 hrtime_t m = s / 60; 6601 hrtime_t h = m / 60; 6602 hrtime_t d = h / 24; 6603 6604 s -= m * 60; 6605 m -= h * 60; 6606 h -= d * 24; 6607 6608 timebuf[0] = '\0'; 6609 6610 if (d) 6611 (void) sprintf(timebuf, 6612 "%llud%02lluh%02llum%02llus", d, h, m, s); 6613 else if (h) 6614 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 6615 else if (m) 6616 (void) sprintf(timebuf, "%llum%02llus", m, s); 6617 else 6618 (void) sprintf(timebuf, "%llus", s); 6619} 6620 6621static nvlist_t * 6622make_random_props() 6623{ 6624 nvlist_t *props; 6625 6626 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); 6627 6628 if (ztest_random(2) == 0) 6629 return (props); 6630 VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); 6631 6632 return (props); 6633} 6634 6635/* 6636 * Import a storage pool with the given name. 
6637 */ 6638static void 6639ztest_import(ztest_shared_t *zs) 6640{ 6641 libzfs_handle_t *hdl; 6642 importargs_t args = { 0 }; 6643 spa_t *spa; 6644 nvlist_t *cfg = NULL; 6645 int nsearch = 1; 6646 char *searchdirs[nsearch]; 6647 char *name = ztest_opts.zo_pool; 6648 int flags = ZFS_IMPORT_MISSING_LOG; 6649 int error; 6650 6651 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 6652 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); 6653 6654 kernel_init(FREAD | FWRITE); 6655 hdl = libzfs_init(); 6656 6657 searchdirs[0] = ztest_opts.zo_dir; 6658 args.paths = nsearch; 6659 args.path = searchdirs; 6660 args.can_be_active = B_FALSE; 6661 6662 error = zpool_tryimport(hdl, name, &cfg, &args); 6663 if (error) 6664 (void) fatal(0, "No pools found\n"); 6665 6666 VERIFY0(spa_import(name, cfg, NULL, flags)); 6667 VERIFY0(spa_open(name, &spa, FTAG)); 6668 zs->zs_metaslab_sz = 6669 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 6670 spa_close(spa, FTAG); 6671 6672 libzfs_fini(hdl); 6673 kernel_fini(); 6674 6675 if (!ztest_opts.zo_mmp_test) { 6676 ztest_run_zdb(ztest_opts.zo_pool); 6677 ztest_freeze(); 6678 ztest_run_zdb(ztest_opts.zo_pool); 6679 } 6680 6681 rw_destroy(&ztest_name_lock); 6682 mutex_destroy(&ztest_vdev_lock); 6683} 6684 6685/* 6686 * Create a storage pool with the given name and initial vdev size. 6687 * Then test spa_freeze() functionality. 6688 */ 6689static void 6690ztest_init(ztest_shared_t *zs) 6691{ 6692 spa_t *spa; 6693 nvlist_t *nvroot, *props; 6694 6695 mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); 6696 mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); 6697 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); 6698 6699 kernel_init(FREAD | FWRITE); 6700 6701 /* 6702 * Create the storage pool. 
6703 */ 6704 (void) spa_destroy(ztest_opts.zo_pool); 6705 ztest_shared->zs_vdev_next_leaf = 0; 6706 zs->zs_splits = 0; 6707 zs->zs_mirrors = ztest_opts.zo_mirrors; 6708 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 6709 NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); 6710 props = make_random_props(); 6711 for (int i = 0; i < SPA_FEATURES; i++) { 6712 char buf[1024]; 6713 (void) snprintf(buf, sizeof (buf), "feature@%s", 6714 spa_feature_table[i].fi_uname); 6715 VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); 6716 } 6717 VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL)); 6718 nvlist_free(nvroot); 6719 nvlist_free(props); 6720 6721 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6722 zs->zs_metaslab_sz = 6723 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 6724 6725 spa_close(spa, FTAG); 6726 6727 kernel_fini(); 6728 6729 if (!ztest_opts.zo_mmp_test) { 6730 ztest_run_zdb(ztest_opts.zo_pool); 6731 ztest_freeze(); 6732 ztest_run_zdb(ztest_opts.zo_pool); 6733 } 6734 6735 rw_destroy(&ztest_name_lock); 6736 mutex_destroy(&ztest_vdev_lock); 6737 mutex_destroy(&ztest_checkpoint_lock); 6738} 6739 6740static void 6741setup_data_fd(void) 6742{ 6743 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 6744 6745 ztest_fd_data = mkstemp(ztest_name_data); 6746 ASSERT3S(ztest_fd_data, >=, 0); 6747 (void) unlink(ztest_name_data); 6748} 6749 6750 6751static int 6752shared_data_size(ztest_shared_hdr_t *hdr) 6753{ 6754 int size; 6755 6756 size = hdr->zh_hdr_size; 6757 size += hdr->zh_opts_size; 6758 size += hdr->zh_size; 6759 size += hdr->zh_stats_size * hdr->zh_stats_count; 6760 size += hdr->zh_ds_size * hdr->zh_ds_count; 6761 6762 return (size); 6763} 6764 6765static void 6766setup_hdr(void) 6767{ 6768 int size; 6769 ztest_shared_hdr_t *hdr; 6770 6771 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 6772 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 6773 ASSERT(hdr != MAP_FAILED); 6774 6775 
VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 6776 6777 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 6778 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 6779 hdr->zh_size = sizeof (ztest_shared_t); 6780 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 6781 hdr->zh_stats_count = ZTEST_FUNCS; 6782 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 6783 hdr->zh_ds_count = ztest_opts.zo_datasets; 6784 6785 size = shared_data_size(hdr); 6786 VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); 6787 6788 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 6789}

/*
 * Map the shared-data file (ztest_fd_data) and publish pointers to each of
 * its regions.
 *
 * The header page is first mapped read-only so that shared_data_size() can
 * compute the total size from it.  That temporary mapping is then dropped
 * and the whole file is mapped read/write.  The regions follow one another
 * in this order: header, options, ztest_shared_t, per-function call state
 * (zh_stats_count entries of zh_stats_size bytes each), per-dataset state.
 */
static void
setup_data(void)
{
	int size, offset;
	ztest_shared_hdr_t *hdr;
	uint8_t *buf;

	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
	    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
	/*
	 * VERIFY rather than ASSERT: the mapping is dereferenced right
	 * below, so the check must not depend on assertions being enabled
	 * in this build.
	 */
	VERIFY(hdr != MAP_FAILED);

	size = shared_data_size(hdr);

	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
	VERIFY(hdr != MAP_FAILED);
	buf = (uint8_t *)hdr;

	/* Walk the regions using the sizes recorded in the header. */
	offset = hdr->zh_hdr_size;
	ztest_shared_opts = (void *)&buf[offset];
	offset += hdr->zh_opts_size;
	ztest_shared = (void *)&buf[offset];
	offset += hdr->zh_size;
	ztest_shared_callstate = (void *)&buf[offset];
	offset += hdr->zh_stats_size * hdr->zh_stats_count;
	ztest_shared_ds = (void *)&buf[offset];
}

/*
 * Fork and exec a ztest child, then wait for it to terminate.
 *
 *	cmd		program to exec; NULL means re-exec getexecname()
 *	libpath		if non-NULL, exported to the child as LD_LIBRARY_PATH
 *	ignorekill	if true, a child terminated by SIGKILL is tolerated
 *	statusp		if non-NULL, receives the raw waitpid() status
 *
 * Returns B_FALSE if the child exited with status 0, and B_TRUE if it was
 * terminated by SIGKILL while ignorekill was set.  Every other outcome is
 * reported on stderr and ends this process with exit code 2 (non-zero
 * exit), 3 (unexpected signal) or 4 (neither exited nor signaled).
 */
static boolean_t
exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
{
	pid_t pid;
	int status;
	char *cmdbuf = NULL;

	/*
	 * Resolve the command before forking so that the fork() result can
	 * be checked immediately, before any other library call has a
	 * chance to disturb errno.  The child inherits its own copy of the
	 * buffer.
	 */
	if (cmd == NULL) {
		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
		cmd = cmdbuf;
	}

	pid = fork();
	if (pid == -1)
		fatal(B_TRUE, "fork failed");

	if (pid == 0) {	/* child */
		char *emptyargv[2] = { cmd, NULL };
		char fd_data_str[12];

		struct rlimit rl = { 1024, 1024 };
		(void) setrlimit(RLIMIT_NOFILE, &rl);

		(void) close(ztest_fd_rand);

		/*
		 * Hand the shared-data descriptor number to the new image
		 * through the environment; main() reads it back from
		 * ZTEST_FD_DATA.
		 */
		VERIFY3U(11, >=,
		    snprintf(fd_data_str, 12, "%d", ztest_fd_data));
		VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));

		(void) enable_extended_FILE_stdio(-1, -1);
		if (libpath != NULL)
			VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
#ifdef illumos
		(void) execv(cmd, emptyargv);
#else
		(void) execvp(cmd, emptyargv);
#endif
		/* Only reached if the exec itself failed. */
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "exec failed: %s", cmd);
	}

	/* Parent: the private command buffer is no longer needed. */
	if (cmdbuf != NULL) {
		umem_free(cmdbuf, MAXPATHLEN);
		cmd = NULL;
	}

	while (waitpid(pid, &status, 0) != pid)
		continue;
	if (statusp != NULL)
		*statusp = status;

	if (WIFEXITED(status)) {
		if (WEXITSTATUS(status) != 0) {
			(void) fprintf(stderr, "child exited with code %d\n",
			    WEXITSTATUS(status));
			exit(2);
		}
		return (B_FALSE);
	} else if (WIFSIGNALED(status)) {
		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
			(void) fprintf(stderr, "child died with signal %d\n",
			    WTERMSIG(status));
			exit(3);
		}
		return (B_TRUE);
	} else {
		(void) fprintf(stderr, "something strange happened to child\n");
		exit(4);
		/* NOTREACHED */
	}
}

/*
 * Initialization pass, run in a child when zs_do_init is set.
 *
 * Removes any existing zpool.cache.  With zo_init == 0 the pool named by
 * zo_pool is imported via ztest_import(); otherwise ztest_init() is called
 * zo_init times, with the shared state zeroed before each pass.
 */
static void
ztest_run_init(void)
{
	ztest_shared_t *zs = ztest_shared;

	/*
	 * Blow away any existing copy of zpool.cache
	 */
	(void) remove(spa_config_path);

	if (ztest_opts.zo_init == 0) {
		if (ztest_opts.zo_verbose >= 1)
			(void) printf("Importing pool %s\n",
			    ztest_opts.zo_pool);
		ztest_import(zs);
		return;
	}

	/*
	 * Create and initialize our storage pool.
	 */
	for (int i = 1; i <= ztest_opts.zo_init; i++) {
		bzero(zs, sizeof (ztest_shared_t));
		if (ztest_opts.zo_verbose >= 3 &&
		    ztest_opts.zo_init != 1) {
			(void) printf("ztest_init(), pass %d\n", i);
		}
		ztest_init(zs);
	}
}

/*
 * Program entry point; runs in one of two roles.
 *
 * Without ZTEST_FD_DATA in the environment this is the parent: it parses
 * the options, creates and maps the shared-data file, runs one
 * initialization child, and then keeps launching test children until
 * zs_proc_stop, counting how many of them were killed.
 *
 * With ZTEST_FD_DATA set this is a child started by exec_child(): it maps
 * the inherited descriptor, copies the options out of shared memory and
 * runs either ztest_run_init() or ztest_run() before exiting with 0.
 */
int
main(int argc, char **argv)
{
	int kills = 0;
	int iters = 0;
	int older = 0;
	int newer = 0;
	ztest_shared_t *zs;
	ztest_info_t *zi;
	ztest_shared_callstate_t *zc;
	char timebuf[100];
	char numbuf[NN_NUMBUF_SZ];
	char *cmd;
	boolean_t hasalt;
	char *fd_data_str = getenv("ZTEST_FD_DATA");

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);
	zfs_deadman_synctime_ms = 300000;
	/*
	 * As two-word space map entries may not come up often (especially
	 * if pool and vdev sizes are small) we want to force at least some
	 * of them so the feature gets tested.
	 */
	zfs_force_some_double_word_sm_entries = B_TRUE;

	/*
	 * Verify that even extensively damaged split blocks with many
	 * segments can be reconstructed in a reasonable amount of time
	 * when reconstruction is known to be possible.
	 */
	zfs_reconstruct_indirect_damage_fraction = 4;

	/*
	 * VERIFY rather than ASSERT so that a failed open is caught even
	 * when assertions are not enabled in this build.
	 */
	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
	VERIFY(ztest_fd_rand >= 0);

	if (!fd_data_str) {
		/* Parent: build the shared area and publish our options. */
		process_options(argc, argv);

		setup_data_fd();
		setup_hdr();
		setup_data();
		bcopy(&ztest_opts, ztest_shared_opts,
		    sizeof (*ztest_shared_opts));
	} else {
		/* Child: attach to the parent's area and read its options. */
		ztest_fd_data = atoi(fd_data_str);
		setup_data();
		bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
	}
	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);

	/* Override location of zpool.cache */
	VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
	    ztest_opts.zo_dir), !=, -1);

	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
	    UMEM_NOFAIL);
	zs = ztest_shared;

	if (fd_data_str) {
		metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging;
		metaslab_df_alloc_threshold =
		    zs->zs_metaslab_df_alloc_threshold;

		if (zs->zs_do_init)
			ztest_run_init();
		else
			ztest_run(zs);
		exit(0);
	}

	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);

	if (ztest_opts.zo_verbose >= 1) {
		(void) printf("%llu vdevs, %d datasets, %d threads,"
		    " %llu seconds...\n",
		    (u_longlong_t)ztest_opts.zo_vdevs,
		    ztest_opts.zo_datasets,
		    ztest_opts.zo_threads,
		    (u_longlong_t)ztest_opts.zo_time);
	}

	/*
	 * cmd holds an executable path, so size it with MAXPATHLEN, the
	 * same bound exec_child() uses for getexecname().
	 */
	cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	(void) strlcpy(cmd, getexecname(), MAXPATHLEN);

	/*
	 * Run the initialization child.  zs_do_init is how the child tells
	 * an init pass apart from a normal test pass.
	 */
	zs->zs_do_init = B_TRUE;
	if (hasalt) {
		if (ztest_opts.zo_verbose >= 1) {
			(void) printf("Executing older ztest for "
			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
		}
		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
	} else {
		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
	}
	zs->zs_do_init = B_FALSE;

	zs->zs_proc_start = gethrtime();
	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;

	/*
	 * Pick the first due time for every test function.  A function
	 * whose interval does not fit inside the run is given UINT64_MAX.
	 */
	for (int f = 0; f < ZTEST_FUNCS; f++) {
		zi = &ztest_info[f];
		zc = ZTEST_GET_SHARED_CALLSTATE(f);
		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
			zc->zc_next = UINT64_MAX;
		else
			zc->zc_next = zs->zs_proc_start +
			    ztest_random(2 * zi->zi_interval[0] + 1);
	}

	/*
	 * Run the tests in a loop.  These tests include fault injection
	 * to verify that self-healing data works, and forced crashes
	 * to verify that we never lose on-disk consistency.
	 */
	while (gethrtime() < zs->zs_proc_stop) {
		int status;
		boolean_t killed;

		/*
		 * Initialize the workload counters for each function.
		 */
		for (int f = 0; f < ZTEST_FUNCS; f++) {
			zc = ZTEST_GET_SHARED_CALLSTATE(f);
			zc->zc_count = 0;
			zc->zc_time = 0;
		}

		/* Set the allocation switch size */
		zs->zs_metaslab_df_alloc_threshold =
		    ztest_random(zs->zs_metaslab_sz / 4) + 1;

		/*
		 * Without an alternate binary always run this one;
		 * otherwise choose between the two at random.
		 */
		if (!hasalt || ztest_random(2) == 0) {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing newer ztest: %s\n",
				    cmd);
			}
			newer++;
			killed = exec_child(cmd, NULL, B_TRUE, &status);
		} else {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing older ztest: %s\n",
				    ztest_opts.zo_alt_ztest);
			}
			older++;
			killed = exec_child(ztest_opts.zo_alt_ztest,
			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
		}

		if (killed)
			kills++;
		iters++;

		if (ztest_opts.zo_verbose >= 1) {
			hrtime_t now = gethrtime();

			/* Clamp so the remaining time never goes negative. */
			now = MIN(now, zs->zs_proc_stop);
			print_time(zs->zs_proc_stop - now, timebuf);
			nicenum(zs->zs_space, numbuf, sizeof (numbuf));

			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
			    iters,
			    WIFEXITED(status) ? "Complete" : "SIGKILL",
			    (u_longlong_t)zs->zs_enospc_count,
			    100.0 * zs->zs_alloc / zs->zs_space,
			    numbuf,
			    100.0 * (now - zs->zs_proc_start) /
			    (ztest_opts.zo_time * NANOSEC), timebuf);
		}

		if (ztest_opts.zo_verbose >= 2) {
			(void) printf("\nWorkload summary:\n\n");
			(void) printf("%7s %9s %s\n",
			    "Calls", "Time", "Function");
			(void) printf("%7s %9s %s\n",
			    "-----", "----", "--------");
			for (int f = 0; f < ZTEST_FUNCS; f++) {
				Dl_info dli;
				const char *fname = "(unknown)";

				zi = &ztest_info[f];
				zc = ZTEST_GET_SHARED_CALLSTATE(f);
				print_time(zc->zc_time, timebuf);
				/*
				 * dladdr() returns 0 on failure and may
				 * leave dli_sname NULL when no symbol
				 * matches; never hand either case to %s.
				 */
				if (dladdr((void *)zi->zi_func, &dli) != 0 &&
				    dli.dli_sname != NULL)
					fname = dli.dli_sname;
				(void) printf("%7llu %9s %s\n",
				    (u_longlong_t)zc->zc_count, timebuf,
				    fname);
			}
			(void) printf("\n");
		}

		if (!ztest_opts.zo_mmp_test)
			ztest_run_zdb(ztest_opts.zo_pool);
	}

	if (ztest_opts.zo_verbose >= 1) {
		if (hasalt) {
			(void) printf("%d runs of older ztest: %s\n", older,
			    ztest_opts.zo_alt_ztest);
			(void) printf("%d runs of newer ztest: %s\n", newer,
			    cmd);
		}
		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
	}

	umem_free(cmd, MAXPATHLEN);

	return (0);
}