/* zio.c, FreeBSD revision 310106 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/metaslab_impl.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN,
    &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

boolean_t zio_dva_throttle_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN,
    &zio_dva_throttle_enabled, 0, "");

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

#define	ZIO_PIPELINE_CONTINUE		0x100
#define	ZIO_PIPELINE_STOP		0x101

#define	BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define	COMPARE_META_LEVEL	0x80000000ul
/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
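
/*
 * Example (illustrative): since the knobs above are declared CTLFLAG_RDTUN,
 * they can only be set as boot-time tunables, e.g. in /boot/loader.conf:
 *
 *	vfs.zfs.sync_pass_dont_compress=5
 *
 * At runtime they are visible, but read-only, through sysctl(8).
 */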

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef illumos
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
#endif

static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
	 * for each quarter-power of 2.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;

		while (!ISP2(p2))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = MIN(p2 >> 2, PAGESIZE);
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (zio_trim_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}
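
/*
 * Worked example (illustrative): after the backfill loop in zio_init(),
 * every size class maps to the nearest cache of equal or greater size.
 * With SPA_MINBLOCKSIZE == 512, sizes between 64K and 128K only get
 * dedicated caches at 16K multiples (64K, 80K, 96K, ...), so e.g. a 68K
 * request is satisfied from the 80K cache.
 */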

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
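
/*
 * Example (illustrative): a read of a compressed block pushes a decompress
 * transform, so the physical I/O lands in a scratch buffer and
 * zio_pop_transforms() later inflates it into the caller's buffer (this is
 * what zio_read_bp_init() below does):
 *
 *	void *cbuf = zio_buf_alloc(psize);
 *	zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 *
 * Transforms pop in LIFO order, each one restoring io_data/io_size to what
 * they were when it was pushed.
 */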

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
zio_t *
zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
	list_t *pl = &cio->io_parent_list;

	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
	if (*zl == NULL)
		return (NULL);

	ASSERT((*zl)->zl_child == cio);
	return ((*zl)->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio, zio_link_t **zl)
{
	list_t *cl = &pio->io_child_list;

	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
	if (*zl == NULL)
		return (NULL);

	ASSERT((*zl)->zl_parent == pio);
	return ((*zl)->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_link_t *zl = NULL;
	zio_t *pio = zio_walk_parents(cio, &zl);

	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}
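
/*
 * Example (illustrative): the walkers take a zio_link_t cursor so that a
 * traversal carries its own state; the canonical loop (used by
 * zio_reexecute() below) is:
 *
 *	zio_link_t *zl = NULL;
 *	zio_t *cio;
 *	while ((cio = zio_walk_children(pio, &zl)) != NULL)
 *		...operate on cio...
 */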

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		zio_taskq_type_t type =
		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
		    ZIO_TASKQ_INTERRUPT;
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		/*
		 * Dispatch the parent zio in its own taskq so that
		 * the child can continue to make progress. This also
		 * prevents overflowing the stack when we have deeply nested
		 * parent-child relationships.
		 */
		zio_taskq_dispatch(pio, type, B_FALSE);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

int
zio_timestamp_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_queued_timestamp < z2->io_queued_timestamp)
		return (-1);
	if (z1->io_queued_timestamp > z2->io_queued_timestamp)
		return (1);

	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
		return (-1);
	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
		return (1);

	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
		return (-1);
	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
		return (1);

	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
		return (-1);
	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
		return (1);

	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
		return (-1);
	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
	zio->io_pipeline_trace = ZIO_STAGE_OPEN;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}
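
/*
 * Example (illustrative): a root zio is a common rendezvous point for a
 * batch of independent children:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	for (...)
 *		zio_nowait(zio_read(rio, spa, bp, buf, size, NULL, NULL,
 *		    ZIO_PRIORITY_SYNC_READ, 0, &zb));
 *	error = zio_wait(rio);		// waits for every child
 */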

void
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
{
	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
		    bp, (longlong_t)BP_GET_TYPE(bp));
	}
	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
	}
	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
		    bp, (longlong_t)BP_GET_COMPRESS(bp));
	}
	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
		    bp, (longlong_t)BP_GET_LSIZE(bp));
	}
	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
		    bp, (longlong_t)BP_GET_PSIZE(bp));
	}

	if (BP_IS_EMBEDDED(bp)) {
		if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
			    bp, (longlong_t)BPE_GET_ETYPE(bp));
		}
	}

	/*
	 * Pool-specific checks.
	 *
	 * Note: it would be nice to verify that the blk_birth and
	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
	 * that are in the log) to be arbitrarily large.
	 */
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (vdevid >= spa->spa_root_vdev->vdev_children) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
		if (vd == NULL) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_hole_ops) {
			zfs_panic_recover("blkptr at %p DVA %u has hole "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_missing_ops) {
			/*
			 * "missing" vdevs are valid during import, but we
			 * don't have their detailed info (e.g. asize), so
			 * we can't perform any more checks on them.
			 */
			continue;
		}
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
		if (BP_IS_GANG(bp))
			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
		if (offset + asize > vd->vdev_asize) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "OFFSET %llu",
			    bp, i, (longlong_t)offset);
		}
	}
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zfs_blkptr_verify(spa, bp);

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}
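
/*
 * Note (added commentary): zio_write() below takes several callbacks that
 * fire at different points in the write's lifetime: children_ready once all
 * child zios have reached the ready stage (invoked from zio_write_compress(),
 * before the data is compressed), ready once the block pointer has been
 * filled in, physdone for each physical leaf write as it completes, and
 * done when the entire I/O tree finishes.
 */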

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *children_ready,
    zio_done_func_t *physdone, zio_done_func_t *done,
    void *private, zio_priority_t priority, enum zio_flag flags,
    const zbookmark_phys_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_children_ready = children_ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}
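
/*
 * Note (added commentary): zio_rewrite() above writes new data to an
 * already-allocated block pointer in place; it is used when a block's
 * location must not change, e.g. when rewriting gang headers during
 * sync-to-convergence (see zio_rewrite_gang() below).
 */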

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	flags |= ZIO_FLAG_DONT_QUEUE;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}
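
/*
 * Note (added commentary): on FreeBSD, when zfs_trim_enabled is set, frees
 * are routed through the vdev I/O stages (the stage bits added above) so
 * that the trim map can observe them and turn them into TRIM requests;
 * this is also why zio_free() defers every free to the bplist whenever
 * trim is enabled.
 */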

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
	ASSERT0(zio->io_queued_timestamp);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* Not all IO types require vdev io done stage e.g. free */
	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	/*
	 * If we're creating a child I/O that is not associated with a
	 * top-level vdev, then the child zio is not an allocating I/O.
	 * If this is a retried I/O then we ignore it since we will
	 * have already processed the original allocating I/O.
	 */
	if (flags & ZIO_FLAG_IO_ALLOCATING &&
	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
		metaslab_class_t *mc = spa_normal_class(pio->io_spa);

		ASSERT(mc->mc_alloc_throttle_enabled);
		ASSERT(type == ZIO_TYPE_WRITE);
		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
		    pio->io_child_type == ZIO_CHILD_GANG);

		flags &= ~ZIO_FLAG_IO_ALLOCATING;
	}

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}
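
/*
 * Example (illustrative): zio_flush() fans a write-cache flush out under a
 * shared root so the caller can wait once; the ZIL uses this pattern (see
 * zil.c) to flush every log device:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	for (each vdev vd to be flushed)
 *		zio_flush(rio, vd);
 *	(void) zio_wait(rio);
 */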

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize =
		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		decode_embedded_bp_compressed(bp, zio->io_data);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		blkptr_t *bp = zio->io_bp;
		zio_prop_t *zp = &zio->io_prop;

		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_EMBEDDED(bp))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		/*
		 * We were unable to handle this as an override bp, treat
		 * it as a regular write I/O.
		 */
		zio->io_bp_override = NULL;
		*bp = zio->io_bp_orig;
		zio->io_pipeline = zio->io_orig_pipeline;
	}

	return (ZIO_PIPELINE_CONTINUE);
}
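
/*
 * Note (added commentary): each pipeline stage returns
 * ZIO_PIPELINE_CONTINUE to let zio_execute() advance to the next stage bit
 * set in io_pipeline, or ZIO_PIPELINE_STOP when the zio has been handed off
 * (dispatched to a taskq, stalled on children, or issued to a vdev) and
 * will be re-driven by whoever completes it.
 */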

static int
zio_write_compress(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	if (zio->io_children_ready != NULL) {
		/*
		 * Now that all our children are ready, run the callback
		 * associated with this zio in case it wants to modify the
		 * data to be written.
		 */
		ASSERT3U(zp->zp_level, >, 0);
		zio->io_children_ready(zio);
	}

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
	ASSERT(zio->io_bp_override == NULL);

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round the compressed size up to the ashift
			 * of the smallest-ashift device, and zero the tail.
			 * This ensures that the compressed size of the BP
			 * (and thus compressratio property) are correct,
			 * in that we charge for the padding used to fill out
			 * the last sector.
			 */
			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
			size_t rounded = (size_t)P2ROUNDUP(psize,
			    1ULL << spa->spa_min_ashift);
			if (rounded >= lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
				psize = lsize;
			} else {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}

		/*
		 * We were unable to handle this as an override bp, treat
		 * it as a regular write I/O.
		 */
		zio->io_bp_override = NULL;
		*bp = zio->io_bp_orig;
		zio->io_pipeline = zio->io_orig_pipeline;
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}
	return (ZIO_PIPELINE_CONTINUE);
}
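
/*
 * Worked example (added commentary): with a 128K logical block that
 * compresses to 33500 bytes on a pool whose smallest ashift is 12 (4K
 * sectors), the rounding above yields P2ROUNDUP(33500, 4096) == 36864 and
 * zeroes the 3364-byte tail, so the BP's physical size matches what is
 * actually charged on disk.
 */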

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
#if defined(illumos) || !defined(_KERNEL)
	ASSERT(zio->io_tqent.tqent_next == NULL);
#else
	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

void
zio_delay_interrupt(zio_t *zio)
{
	/*
	 * The timeout_generic() function isn't defined in userspace, so
	 * rather than trying to implement the function, the zio delay
	 * functionality has been disabled for userspace builds.
	 */

#ifdef _KERNEL
	/*
	 * If io_target_timestamp is zero, then no delay has been registered
	 * for this IO, thus jump to the end of this function and "skip" the
	 * delay; issuing it directly to the zio layer.
	 */
	if (zio->io_target_timestamp != 0) {
		hrtime_t now = gethrtime();

		if (now >= zio->io_target_timestamp) {
			/*
			 * This IO has already taken longer than the target
			 * delay to complete, so we don't want to delay it
			 * any longer; we "miss" the delay and issue it
			 * directly to the zio layer.  This is likely due to
			 * the target latency being set to a value less than
			 * the underlying hardware can satisfy (e.g. delay
			 * set to 1ms, but the disks take 10ms to complete an
			 * IO request).
			 */

			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
			    hrtime_t, now);

			zio_interrupt(zio);
		} else {
			hrtime_t diff = zio->io_target_timestamp - now;

			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
			    hrtime_t, now, hrtime_t, diff);

			(void) timeout_generic(CALLOUT_NORMAL,
			    (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
		}

		return;
	}
#endif

	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
	zio_interrupt(zio);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	ASSERT3U(zio->io_queued_timestamp, >, 0);

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		zio->io_pipeline_trace |= zio->io_stage;
		rv = zio_pipeline[highbit64(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
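
/*
 * Note (added commentary): io_stage and io_pipeline are bit masks, so the
 * `stage <<= 1' loop above simply advances to the next stage bit that is
 * set in the pipeline; this is also why zio_wait_for_children() can
 * "rewind" a stage with io_stage >>= 1, causing the same stage to be
 * re-run once the children it stalled on complete.
 */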

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;
	ASSERT0(zio->io_queued_timestamp);
	zio->io_queued_timestamp = gethrtime();

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
	}

	ASSERT0(zio->io_queued_timestamp);
	zio->io_queued_timestamp = gethrtime();
	zio_execute(zio);
}
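
/*
 * Note (added commentary): zio_wait() destroys the zio before returning,
 * so the caller must not touch it afterwards; zio_nowait() callers either
 * hang the zio off a parent they will wait on, or let it fall to the
 * per-CPU "Godfather" root above.
 */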
1829 */ 1830 mutex_enter(&spa->spa_suspend_lock); 1831 spa->spa_suspended = B_FALSE; 1832 cv_broadcast(&spa->spa_suspend_cv); 1833 pio = spa->spa_suspend_zio_root; 1834 spa->spa_suspend_zio_root = NULL; 1835 mutex_exit(&spa->spa_suspend_lock); 1836 1837 if (pio == NULL) 1838 return (0); 1839 1840 zio_reexecute(pio); 1841 return (zio_wait(pio)); 1842} 1843 1844void 1845zio_resume_wait(spa_t *spa) 1846{ 1847 mutex_enter(&spa->spa_suspend_lock); 1848 while (spa_suspended(spa)) 1849 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1850 mutex_exit(&spa->spa_suspend_lock); 1851} 1852 1853/* 1854 * ========================================================================== 1855 * Gang blocks. 1856 * 1857 * A gang block is a collection of small blocks that looks to the DMU 1858 * like one large block. When zio_dva_allocate() cannot find a block 1859 * of the requested size, due to either severe fragmentation or the pool 1860 * being nearly full, it calls zio_write_gang_block() to construct the 1861 * block from smaller fragments. 1862 * 1863 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1864 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1865 * an indirect block: it's an array of block pointers. It consumes 1866 * only one sector and hence is allocatable regardless of fragmentation. 1867 * The gang header's bps point to its gang members, which hold the data. 1868 * 1869 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1870 * as the verifier to ensure uniqueness of the SHA256 checksum. 1871 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1872 * not the gang header. This ensures that data block signatures (needed for 1873 * deduplication) are independent of how the block is physically stored. 1874 * 1875 * Gang blocks can be nested: a gang member may itself be a gang block. 1876 * Thus every gang block is a tree in which root and all interior nodes are 1877 * gang headers, and the leaves are normal blocks that contain user data. 1878 * The root of the gang tree is called the gang leader. 1879 * 1880 * To perform any operation (read, rewrite, free, claim) on a gang block, 1881 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1882 * in the io_gang_tree field of the original logical i/o by recursively 1883 * reading the gang leader and all gang headers below it. This yields 1884 * an in-core tree containing the contents of every gang header and the 1885 * bps for every constituent of the gang block. 1886 * 1887 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1888 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1889 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1890 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1891 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1892 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1893 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1894 * of the gang header plus zio_checksum_compute() of the data to update the 1895 * gang header's blk_cksum as described above. 1896 * 1897 * The two-phase assemble/issue model solves the problem of partial failure -- 1898 * what if you'd freed part of a gang block but then couldn't read the 1899 * gang header for another part? 
Assembling the entire gang tree first 1900 * ensures that all the necessary gang header I/O has succeeded before 1901 * starting the actual work of free, claim, or write. Once the gang tree 1902 * is assembled, free and claim are in-memory operations that cannot fail. 1903 * 1904 * In the event that a gang write fails, zio_dva_unallocate() walks the 1905 * gang tree to immediately free (i.e. insert back into the space map) 1906 * everything we've allocated. This ensures that we don't get ENOSPC 1907 * errors during repeated suspend/resume cycles due to a flaky device. 1908 * 1909 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1910 * the gang tree, we won't modify the block, so we can safely defer the free 1911 * (knowing that the block is still intact). If we *can* assemble the gang 1912 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1913 * each constituent bp and we can allocate a new block on the next sync pass. 1914 * 1915 * In all cases, the gang tree allows complete recovery from partial failure. 1916 * ========================================================================== 1917 */ 1918 1919static zio_t * 1920zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1921{ 1922 if (gn != NULL) 1923 return (pio); 1924 1925 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1926 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1927 &pio->io_bookmark)); 1928} 1929 1930zio_t * 1931zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1932{ 1933 zio_t *zio; 1934 1935 if (gn != NULL) { 1936 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1937 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1938 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1939 /* 1940 * As we rewrite each gang header, the pipeline will compute 1941 * a new gang block header checksum for it; but no one will 1942 * compute a new data checksum, so we do that here. The one 1943 * exception is the gang leader: the pipeline already computed 1944 * its data checksum because that stage precedes gang assembly. 1945 * (Presently, nothing actually uses interior data checksums; 1946 * this is just good hygiene.) 1947 */ 1948 if (gn != pio->io_gang_leader->io_gang_tree) { 1949 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1950 data, BP_GET_PSIZE(bp)); 1951 } 1952 /* 1953 * If we are here to damage data for testing purposes, 1954 * leave the GBH alone so that we can detect the damage. 1955 */ 1956 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1957 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1958 } else { 1959 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1960 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1961 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1962 } 1963 1964 return (zio); 1965} 1966 1967/* ARGSUSED */ 1968zio_t * 1969zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1970{ 1971 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1972 BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 1973 ZIO_GANG_CHILD_FLAGS(pio))); 1974} 1975 1976/* ARGSUSED */ 1977zio_t * 1978zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1979{ 1980 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1981 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1982} 1983 1984static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1985 NULL, 1986 zio_read_gang, 1987 zio_rewrite_gang, 1988 zio_free_gang, 1989 zio_claim_gang, 1990 NULL 1991}; 1992 1993static void zio_gang_tree_assemble_done(zio_t *zio); 1994 1995static zio_gang_node_t * 1996zio_gang_node_alloc(zio_gang_node_t **gnpp) 1997{ 1998 zio_gang_node_t *gn; 1999 2000 ASSERT(*gnpp == NULL); 2001 2002 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 2003 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 2004 *gnpp = gn; 2005 2006 return (gn); 2007} 2008 2009static void 2010zio_gang_node_free(zio_gang_node_t **gnpp) 2011{ 2012 zio_gang_node_t *gn = *gnpp; 2013 2014 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2015 ASSERT(gn->gn_child[g] == NULL); 2016 2017 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2018 kmem_free(gn, sizeof (*gn)); 2019 *gnpp = NULL; 2020} 2021 2022static void 2023zio_gang_tree_free(zio_gang_node_t **gnpp) 2024{ 2025 zio_gang_node_t *gn = *gnpp; 2026 2027 if (gn == NULL) 2028 return; 2029 2030 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2031 zio_gang_tree_free(&gn->gn_child[g]); 2032 2033 zio_gang_node_free(gnpp); 2034} 2035 2036static void 2037zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 2038{ 2039 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 2040 2041 ASSERT(gio->io_gang_leader == gio); 2042 ASSERT(BP_IS_GANG(bp)); 2043 2044 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 2045 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 2046 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 2047} 2048 2049static void 2050zio_gang_tree_assemble_done(zio_t *zio) 2051{ 2052 zio_t *gio = zio->io_gang_leader; 2053 zio_gang_node_t *gn = zio->io_private; 2054 blkptr_t *bp = zio->io_bp; 2055 2056 ASSERT(gio == zio_unique_parent(zio)); 2057 ASSERT(zio->io_child_count == 0); 2058 2059 if (zio->io_error) 2060 return; 2061 2062 if (BP_SHOULD_BYTESWAP(bp)) 2063 byteswap_uint64_array(zio->io_data, zio->io_size); 2064 2065 ASSERT(zio->io_data == gn->gn_gbh); 2066 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 2067 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2068 2069 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2070 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 2071 if (!BP_IS_GANG(gbp)) 2072 continue; 2073 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 2074 } 2075} 2076 2077static void 2078zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 2079{ 2080 zio_t *gio = pio->io_gang_leader; 2081 zio_t *zio; 2082 2083 ASSERT(BP_IS_GANG(bp) == !!gn); 2084 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 2085 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 2086 2087 /* 2088 * If you're a gang header, your data is in gn->gn_gbh. 2089 * If you're a gang member, your data is in 'data' and gn == NULL. 
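	 * Either way, the per-type issue function selected below does the
	 * actual work; for gang headers we then recurse on each non-hole
	 * child bp, advancing 'data' past each constituent's psize.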
2090 */ 2091 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 2092 2093 if (gn != NULL) { 2094 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2095 2096 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2097 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 2098 if (BP_IS_HOLE(gbp)) 2099 continue; 2100 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 2101 data = (char *)data + BP_GET_PSIZE(gbp); 2102 } 2103 } 2104 2105 if (gn == gio->io_gang_tree && gio->io_data != NULL) 2106 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 2107 2108 if (zio != pio) 2109 zio_nowait(zio); 2110} 2111 2112static int 2113zio_gang_assemble(zio_t *zio) 2114{ 2115 blkptr_t *bp = zio->io_bp; 2116 2117 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 2118 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2119 2120 zio->io_gang_leader = zio; 2121 2122 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 2123 2124 return (ZIO_PIPELINE_CONTINUE); 2125} 2126 2127static int 2128zio_gang_issue(zio_t *zio) 2129{ 2130 blkptr_t *bp = zio->io_bp; 2131 2132 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 2133 return (ZIO_PIPELINE_STOP); 2134 2135 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 2136 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2137 2138 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 2139 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 2140 else 2141 zio_gang_tree_free(&zio->io_gang_tree); 2142 2143 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2144 2145 return (ZIO_PIPELINE_CONTINUE); 2146} 2147 2148static void 2149zio_write_gang_member_ready(zio_t *zio) 2150{ 2151 zio_t *pio = zio_unique_parent(zio); 2152 zio_t *gio = zio->io_gang_leader; 2153 dva_t *cdva = zio->io_bp->blk_dva; 2154 dva_t *pdva = pio->io_bp->blk_dva; 2155 uint64_t asize; 2156 2157 if (BP_IS_HOLE(zio->io_bp)) 2158 return; 2159 2160 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 2161 2162 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 2163 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 2164 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 2165 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 2166 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 2167 2168 mutex_enter(&pio->io_lock); 2169 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 2170 ASSERT(DVA_GET_GANG(&pdva[d])); 2171 asize = DVA_GET_ASIZE(&pdva[d]); 2172 asize += DVA_GET_ASIZE(&cdva[d]); 2173 DVA_SET_ASIZE(&pdva[d], asize); 2174 } 2175 mutex_exit(&pio->io_lock); 2176} 2177 2178static int 2179zio_write_gang_block(zio_t *pio) 2180{ 2181 spa_t *spa = pio->io_spa; 2182 metaslab_class_t *mc = spa_normal_class(spa); 2183 blkptr_t *bp = pio->io_bp; 2184 zio_t *gio = pio->io_gang_leader; 2185 zio_t *zio; 2186 zio_gang_node_t *gn, **gnpp; 2187 zio_gbh_phys_t *gbh; 2188 uint64_t txg = pio->io_txg; 2189 uint64_t resid = pio->io_size; 2190 uint64_t lsize; 2191 int copies = gio->io_prop.zp_copies; 2192 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2193 zio_prop_t zp; 2194 int error; 2195 2196 int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; 2197 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2198 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2199 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2200 2201 flags |= METASLAB_ASYNC_ALLOC; 2202 VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); 2203 2204 /* 2205 * The logical zio has already placed a reservation for 2206 * 'copies' allocation slots but gang blocks may require 2207 * additional copies. 
These additional copies 2208 * (i.e. gbh_copies - copies) are guaranteed to succeed 2209 * since metaslab_class_throttle_reserve() always allows 2210 * additional reservations for gang blocks. 2211 */ 2212 VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, 2213 pio, flags)); 2214 } 2215 2216 error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, 2217 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio); 2218 if (error) { 2219 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2220 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2221 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2222 2223 /* 2224 * If we failed to allocate the gang block header then 2225 * we remove any additional allocation reservations that 2226 * we placed here. The original reservation will 2227 * be removed when the logical I/O goes to the ready 2228 * stage. 2229 */ 2230 metaslab_class_throttle_unreserve(mc, 2231 gbh_copies - copies, pio); 2232 } 2233 pio->io_error = error; 2234 return (ZIO_PIPELINE_CONTINUE); 2235 } 2236 2237 if (pio == gio) { 2238 gnpp = &gio->io_gang_tree; 2239 } else { 2240 gnpp = pio->io_private; 2241 ASSERT(pio->io_ready == zio_write_gang_member_ready); 2242 } 2243 2244 gn = zio_gang_node_alloc(gnpp); 2245 gbh = gn->gn_gbh; 2246 bzero(gbh, SPA_GANGBLOCKSIZE); 2247 2248 /* 2249 * Create the gang header. 2250 */ 2251 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 2252 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2253 2254 /* 2255 * Create and nowait the gang children. 2256 */ 2257 for (int g = 0; resid != 0; resid -= lsize, g++) { 2258 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2259 SPA_MINBLOCKSIZE); 2260 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2261 2262 zp.zp_checksum = gio->io_prop.zp_checksum; 2263 zp.zp_compress = ZIO_COMPRESS_OFF; 2264 zp.zp_type = DMU_OT_NONE; 2265 zp.zp_level = 0; 2266 zp.zp_copies = gio->io_prop.zp_copies; 2267 zp.zp_dedup = B_FALSE; 2268 zp.zp_dedup_verify = B_FALSE; 2269 zp.zp_nopwrite = B_FALSE; 2270 2271 zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 2272 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 2273 zio_write_gang_member_ready, NULL, NULL, NULL, 2274 &gn->gn_child[g], pio->io_priority, 2275 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2276 2277 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2278 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2279 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2280 2281 /* 2282 * Gang children won't throttle but we should 2283 * account for their work, so reserve an allocation 2284 * slot for them here. 2285 */ 2286 VERIFY(metaslab_class_throttle_reserve(mc, 2287 zp.zp_copies, cio, flags)); 2288 } 2289 zio_nowait(cio); 2290 } 2291 2292 /* 2293 * Set pio's pipeline to just wait for zio to finish. 2294 */ 2295 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2296 2297 zio_nowait(zio); 2298 2299 return (ZIO_PIPELINE_CONTINUE); 2300} 2301 2302/* 2303 * The zio_nop_write stage in the pipeline determines if allocating a 2304 * new bp is necessary. The nopwrite feature can handle writes in 2305 * either syncing or open context (i.e. zil writes) and as a result is 2306 * mutually exclusive with dedup. 2307 * 2308 * By leveraging a cryptographically secure checksum, such as SHA256, we 2309 * can compare the checksums of the new data and the old to determine if 2310 * allocating a new block is required. 
Note that our requirements for 2311 * cryptographic strength are fairly weak: there can't be any accidental 2312 * hash collisions, but we don't need to be secure against intentional 2313 * (malicious) collisions. To trigger a nopwrite, you have to be able 2314 * to write the file to begin with, and triggering an incorrect (hash 2315 * collision) nopwrite is no worse than simply writing to the file. 2316 * That said, there are no known attacks against the checksum algorithms 2317 * used for nopwrite, assuming that the salt and the checksums 2318 * themselves remain secret. 2319 */ 2320static int 2321zio_nop_write(zio_t *zio) 2322{ 2323 blkptr_t *bp = zio->io_bp; 2324 blkptr_t *bp_orig = &zio->io_bp_orig; 2325 zio_prop_t *zp = &zio->io_prop; 2326 2327 ASSERT(BP_GET_LEVEL(bp) == 0); 2328 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2329 ASSERT(zp->zp_nopwrite); 2330 ASSERT(!zp->zp_dedup); 2331 ASSERT(zio->io_bp_override == NULL); 2332 ASSERT(IO_IS_ALLOCATING(zio)); 2333 2334 /* 2335 * Check to see if the original bp and the new bp have matching 2336 * characteristics (i.e. same checksum, compression algorithms, etc). 2337 * If they don't then just continue with the pipeline which will 2338 * allocate a new bp. 2339 */ 2340 if (BP_IS_HOLE(bp_orig) || 2341 !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & 2342 ZCHECKSUM_FLAG_NOPWRITE) || 2343 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 2344 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 2345 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 2346 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2347 return (ZIO_PIPELINE_CONTINUE); 2348 2349 /* 2350 * If the checksums match then reset the pipeline so that we 2351 * avoid allocating a new bp and issuing any I/O. 2352 */ 2353 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2354 ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & 2355 ZCHECKSUM_FLAG_NOPWRITE); 2356 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2357 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2358 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2359 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2360 sizeof (uint64_t)) == 0); 2361 2362 *bp = *bp_orig; 2363 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2364 zio->io_flags |= ZIO_FLAG_NOPWRITE; 2365 } 2366 2367 return (ZIO_PIPELINE_CONTINUE); 2368} 2369 2370/* 2371 * ========================================================================== 2372 * Dedup 2373 * ========================================================================== 2374 */ 2375static void 2376zio_ddt_child_read_done(zio_t *zio) 2377{ 2378 blkptr_t *bp = zio->io_bp; 2379 ddt_entry_t *dde = zio->io_private; 2380 ddt_phys_t *ddp; 2381 zio_t *pio = zio_unique_parent(zio); 2382 2383 mutex_enter(&pio->io_lock); 2384 ddp = ddt_phys_select(dde, bp); 2385 if (zio->io_error == 0) 2386 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2387 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2388 dde->dde_repair_data = zio->io_data; 2389 else 2390 zio_buf_free(zio->io_data, zio->io_size); 2391 mutex_exit(&pio->io_lock); 2392} 2393 2394static int 2395zio_ddt_read_start(zio_t *zio) 2396{ 2397 blkptr_t *bp = zio->io_bp; 2398 2399 ASSERT(BP_GET_DEDUP(bp)); 2400 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2401 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2402 2403 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2404 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2405 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2406 ddt_phys_t *ddp = dde->dde_phys; 2407 ddt_phys_t *ddp_self = ddt_phys_select(dde, 
bp); 2408 blkptr_t blk; 2409 2410 ASSERT(zio->io_vsd == NULL); 2411 zio->io_vsd = dde; 2412 2413 if (ddp_self == NULL) 2414 return (ZIO_PIPELINE_CONTINUE); 2415 2416 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2417 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2418 continue; 2419 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2420 &blk); 2421 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2422 zio_buf_alloc(zio->io_size), zio->io_size, 2423 zio_ddt_child_read_done, dde, zio->io_priority, 2424 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2425 &zio->io_bookmark)); 2426 } 2427 return (ZIO_PIPELINE_CONTINUE); 2428 } 2429 2430 zio_nowait(zio_read(zio, zio->io_spa, bp, 2431 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2432 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2433 2434 return (ZIO_PIPELINE_CONTINUE); 2435} 2436 2437static int 2438zio_ddt_read_done(zio_t *zio) 2439{ 2440 blkptr_t *bp = zio->io_bp; 2441 2442 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2443 return (ZIO_PIPELINE_STOP); 2444 2445 ASSERT(BP_GET_DEDUP(bp)); 2446 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2447 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2448 2449 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2450 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2451 ddt_entry_t *dde = zio->io_vsd; 2452 if (ddt == NULL) { 2453 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2454 return (ZIO_PIPELINE_CONTINUE); 2455 } 2456 if (dde == NULL) { 2457 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2458 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2459 return (ZIO_PIPELINE_STOP); 2460 } 2461 if (dde->dde_repair_data != NULL) { 2462 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2463 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2464 } 2465 ddt_repair_done(ddt, dde); 2466 zio->io_vsd = NULL; 2467 } 2468 2469 ASSERT(zio->io_vsd == NULL); 2470 2471 return (ZIO_PIPELINE_CONTINUE); 2472} 2473 2474static boolean_t 2475zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2476{ 2477 spa_t *spa = zio->io_spa; 2478 2479 /* 2480 * Note: we compare the original data, not the transformed data, 2481 * because when zio->io_bp is an override bp, we will not have 2482 * pushed the I/O transforms. That's an important optimization 2483 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 
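	 * Concretely: if another write (the lead zio) for this entry is
	 * still in flight, we bcmp() against its original buffer;
	 * otherwise we read one of the existing on-disk copies through
	 * the ARC, dropping the DDT lock around the read, and compare
	 * against that.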
2484 */ 2485 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2486 zio_t *lio = dde->dde_lead_zio[p]; 2487 2488 if (lio != NULL) { 2489 return (lio->io_orig_size != zio->io_orig_size || 2490 bcmp(zio->io_orig_data, lio->io_orig_data, 2491 zio->io_orig_size) != 0); 2492 } 2493 } 2494 2495 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2496 ddt_phys_t *ddp = &dde->dde_phys[p]; 2497 2498 if (ddp->ddp_phys_birth != 0) { 2499 arc_buf_t *abuf = NULL; 2500 arc_flags_t aflags = ARC_FLAG_WAIT; 2501 blkptr_t blk = *zio->io_bp; 2502 int error; 2503 2504 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2505 2506 ddt_exit(ddt); 2507 2508 error = arc_read(NULL, spa, &blk, 2509 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2510 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2511 &aflags, &zio->io_bookmark); 2512 2513 if (error == 0) { 2514 if (arc_buf_size(abuf) != zio->io_orig_size || 2515 bcmp(abuf->b_data, zio->io_orig_data, 2516 zio->io_orig_size) != 0) 2517 error = SET_ERROR(EEXIST); 2518 arc_buf_destroy(abuf, &abuf); 2519 } 2520 2521 ddt_enter(ddt); 2522 return (error != 0); 2523 } 2524 } 2525 2526 return (B_FALSE); 2527} 2528 2529static void 2530zio_ddt_child_write_ready(zio_t *zio) 2531{ 2532 int p = zio->io_prop.zp_copies; 2533 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2534 ddt_entry_t *dde = zio->io_private; 2535 ddt_phys_t *ddp = &dde->dde_phys[p]; 2536 zio_t *pio; 2537 2538 if (zio->io_error) 2539 return; 2540 2541 ddt_enter(ddt); 2542 2543 ASSERT(dde->dde_lead_zio[p] == zio); 2544 2545 ddt_phys_fill(ddp, zio->io_bp); 2546 2547 zio_link_t *zl = NULL; 2548 while ((pio = zio_walk_parents(zio, &zl)) != NULL) 2549 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2550 2551 ddt_exit(ddt); 2552} 2553 2554static void 2555zio_ddt_child_write_done(zio_t *zio) 2556{ 2557 int p = zio->io_prop.zp_copies; 2558 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2559 ddt_entry_t *dde = zio->io_private; 2560 ddt_phys_t *ddp = &dde->dde_phys[p]; 2561 2562 ddt_enter(ddt); 2563 2564 ASSERT(ddp->ddp_refcnt == 0); 2565 ASSERT(dde->dde_lead_zio[p] == zio); 2566 dde->dde_lead_zio[p] = NULL; 2567 2568 if (zio->io_error == 0) { 2569 zio_link_t *zl = NULL; 2570 while (zio_walk_parents(zio, &zl) != NULL) 2571 ddt_phys_addref(ddp); 2572 } else { 2573 ddt_phys_clear(ddp); 2574 } 2575 2576 ddt_exit(ddt); 2577} 2578 2579static void 2580zio_ddt_ditto_write_done(zio_t *zio) 2581{ 2582 int p = DDT_PHYS_DITTO; 2583 zio_prop_t *zp = &zio->io_prop; 2584 blkptr_t *bp = zio->io_bp; 2585 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2586 ddt_entry_t *dde = zio->io_private; 2587 ddt_phys_t *ddp = &dde->dde_phys[p]; 2588 ddt_key_t *ddk = &dde->dde_key; 2589 2590 ddt_enter(ddt); 2591 2592 ASSERT(ddp->ddp_refcnt == 0); 2593 ASSERT(dde->dde_lead_zio[p] == zio); 2594 dde->dde_lead_zio[p] = NULL; 2595 2596 if (zio->io_error == 0) { 2597 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2598 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2599 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2600 if (ddp->ddp_phys_birth != 0) 2601 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2602 ddt_phys_fill(ddp, bp); 2603 } 2604 2605 ddt_exit(ddt); 2606} 2607 2608static int 2609zio_ddt_write(zio_t *zio) 2610{ 2611 spa_t *spa = zio->io_spa; 2612 blkptr_t *bp = zio->io_bp; 2613 uint64_t txg = zio->io_txg; 2614 zio_prop_t *zp = &zio->io_prop; 2615 int p = zp->zp_copies; 2616 int ditto_copies; 2617 zio_t *cio = NULL; 2618 zio_t *dio = NULL; 2619 ddt_t *ddt = ddt_select(spa, bp); 2620 ddt_entry_t *dde; 2621 ddt_phys_t *ddp; 2622 2623 
ASSERT(BP_GET_DEDUP(bp)); 2624 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2625 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2626 2627 ddt_enter(ddt); 2628 dde = ddt_lookup(ddt, bp, B_TRUE); 2629 ddp = &dde->dde_phys[p]; 2630 2631 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2632 /* 2633 * If we're using a weak checksum, upgrade to a strong checksum 2634 * and try again. If we're already using a strong checksum, 2635 * we can't resolve it, so just convert to an ordinary write. 2636 * (And automatically e-mail a paper to Nature?) 2637 */ 2638 if (!(zio_checksum_table[zp->zp_checksum].ci_flags & 2639 ZCHECKSUM_FLAG_DEDUP)) { 2640 zp->zp_checksum = spa_dedup_checksum(spa); 2641 zio_pop_transforms(zio); 2642 zio->io_stage = ZIO_STAGE_OPEN; 2643 BP_ZERO(bp); 2644 } else { 2645 zp->zp_dedup = B_FALSE; 2646 } 2647 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2648 ddt_exit(ddt); 2649 return (ZIO_PIPELINE_CONTINUE); 2650 } 2651 2652 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2653 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2654 2655 if (ditto_copies > ddt_ditto_copies_present(dde) && 2656 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2657 zio_prop_t czp = *zp; 2658 2659 czp.zp_copies = ditto_copies; 2660 2661 /* 2662 * If we arrived here with an override bp, we won't have run 2663 * the transform stack, so we won't have the data we need to 2664 * generate a child i/o. So, toss the override bp and restart. 2665 * This is safe, because using the override bp is just an 2666 * optimization; and it's rare, so the cost doesn't matter. 2667 */ 2668 if (zio->io_bp_override) { 2669 zio_pop_transforms(zio); 2670 zio->io_stage = ZIO_STAGE_OPEN; 2671 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2672 zio->io_bp_override = NULL; 2673 BP_ZERO(bp); 2674 ddt_exit(ddt); 2675 return (ZIO_PIPELINE_CONTINUE); 2676 } 2677 2678 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2679 zio->io_orig_size, &czp, NULL, NULL, 2680 NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, 2681 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2682 2683 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2684 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2685 } 2686 2687 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2688 if (ddp->ddp_phys_birth != 0) 2689 ddt_bp_fill(ddp, bp, txg); 2690 if (dde->dde_lead_zio[p] != NULL) 2691 zio_add_child(zio, dde->dde_lead_zio[p]); 2692 else 2693 ddt_phys_addref(ddp); 2694 } else if (zio->io_bp_override) { 2695 ASSERT(bp->blk_birth == txg); 2696 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2697 ddt_phys_fill(ddp, bp); 2698 ddt_phys_addref(ddp); 2699 } else { 2700 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2701 zio->io_orig_size, zp, 2702 zio_ddt_child_write_ready, NULL, NULL, 2703 zio_ddt_child_write_done, dde, zio->io_priority, 2704 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2705 2706 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2707 dde->dde_lead_zio[p] = cio; 2708 } 2709 2710 ddt_exit(ddt); 2711 2712 if (cio) 2713 zio_nowait(cio); 2714 if (dio) 2715 zio_nowait(dio); 2716 2717 return (ZIO_PIPELINE_CONTINUE); 2718} 2719 2720ddt_entry_t *freedde; /* for debugging */ 2721 2722static int 2723zio_ddt_free(zio_t *zio) 2724{ 2725 spa_t *spa = zio->io_spa; 2726 blkptr_t *bp = zio->io_bp; 2727 ddt_t *ddt = ddt_select(spa, bp); 2728 ddt_entry_t *dde; 2729 ddt_phys_t *ddp; 2730 2731 ASSERT(BP_GET_DEDUP(bp)); 2732 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2733 2734 ddt_enter(ddt); 2735 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 
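	/* Release one reference on the phys variant this bp refers to. */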
2736 ddp = ddt_phys_select(dde, bp); 2737 ddt_phys_decref(ddp); 2738 ddt_exit(ddt); 2739 2740 return (ZIO_PIPELINE_CONTINUE); 2741} 2742 2743/* 2744 * ========================================================================== 2745 * Allocate and free blocks 2746 * ========================================================================== 2747 */ 2748 2749static zio_t * 2750zio_io_to_allocate(spa_t *spa) 2751{ 2752 zio_t *zio; 2753 2754 ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); 2755 2756 zio = avl_first(&spa->spa_alloc_tree); 2757 if (zio == NULL) 2758 return (NULL); 2759 2760 ASSERT(IO_IS_ALLOCATING(zio)); 2761 2762 /* 2763 * Try to place a reservation for this zio. If we're unable to 2764 * reserve then we throttle. 2765 */ 2766 if (!metaslab_class_throttle_reserve(spa_normal_class(spa), 2767 zio->io_prop.zp_copies, zio, 0)) { 2768 return (NULL); 2769 } 2770 2771 avl_remove(&spa->spa_alloc_tree, zio); 2772 ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); 2773 2774 return (zio); 2775} 2776 2777static int 2778zio_dva_throttle(zio_t *zio) 2779{ 2780 spa_t *spa = zio->io_spa; 2781 zio_t *nio; 2782 2783 if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || 2784 !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || 2785 zio->io_child_type == ZIO_CHILD_GANG || 2786 zio->io_flags & ZIO_FLAG_NODATA) { 2787 return (ZIO_PIPELINE_CONTINUE); 2788 } 2789 2790 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2791 2792 ASSERT3U(zio->io_queued_timestamp, >, 0); 2793 ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); 2794 2795 mutex_enter(&spa->spa_alloc_lock); 2796 2797 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2798 avl_add(&spa->spa_alloc_tree, zio); 2799 2800 nio = zio_io_to_allocate(zio->io_spa); 2801 mutex_exit(&spa->spa_alloc_lock); 2802 2803 if (nio == zio) 2804 return (ZIO_PIPELINE_CONTINUE); 2805 2806 if (nio != NULL) { 2807 ASSERT3U(nio->io_queued_timestamp, <=, 2808 zio->io_queued_timestamp); 2809 ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); 2810 /* 2811 * We are passing control to a new zio so make sure that 2812 * it is processed by a different thread. We do this to 2813 * avoid stack overflows that can occur when parents are 2814 * throttled and children are making progress. We allow 2815 * it to go to the head of the taskq since it's already 2816 * been waiting. 
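	 * Meanwhile the current zio remains queued on spa_alloc_tree;
	 * we return ZIO_PIPELINE_STOP below and it resumes once
	 * zio_allocate_dispatch() grants it a reservation.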
2817 */ 2818 zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); 2819 } 2820 return (ZIO_PIPELINE_STOP); 2821} 2822 2823void 2824zio_allocate_dispatch(spa_t *spa) 2825{ 2826 zio_t *zio; 2827 2828 mutex_enter(&spa->spa_alloc_lock); 2829 zio = zio_io_to_allocate(spa); 2830 mutex_exit(&spa->spa_alloc_lock); 2831 if (zio == NULL) 2832 return; 2833 2834 ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); 2835 ASSERT0(zio->io_error); 2836 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); 2837} 2838 2839static int 2840zio_dva_allocate(zio_t *zio) 2841{ 2842 spa_t *spa = zio->io_spa; 2843 metaslab_class_t *mc = spa_normal_class(spa); 2844 blkptr_t *bp = zio->io_bp; 2845 int error; 2846 int flags = 0; 2847 2848 if (zio->io_gang_leader == NULL) { 2849 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2850 zio->io_gang_leader = zio; 2851 } 2852 2853 ASSERT(BP_IS_HOLE(bp)); 2854 ASSERT0(BP_GET_NDVAS(bp)); 2855 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2856 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2857 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2858 2859 if (zio->io_flags & ZIO_FLAG_NODATA) { 2860 flags |= METASLAB_DONT_THROTTLE; 2861 } 2862 if (zio->io_flags & ZIO_FLAG_GANG_CHILD) { 2863 flags |= METASLAB_GANG_CHILD; 2864 } 2865 if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) { 2866 flags |= METASLAB_ASYNC_ALLOC; 2867 } 2868 2869 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2870 zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio); 2871 2872 if (error != 0) { 2873 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2874 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2875 error); 2876 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2877 return (zio_write_gang_block(zio)); 2878 zio->io_error = error; 2879 } 2880 2881 return (ZIO_PIPELINE_CONTINUE); 2882} 2883 2884static int 2885zio_dva_free(zio_t *zio) 2886{ 2887 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2888 2889 return (ZIO_PIPELINE_CONTINUE); 2890} 2891 2892static int 2893zio_dva_claim(zio_t *zio) 2894{ 2895 int error; 2896 2897 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2898 if (error) 2899 zio->io_error = error; 2900 2901 return (ZIO_PIPELINE_CONTINUE); 2902} 2903 2904/* 2905 * Undo an allocation. This is used by zio_done() when an I/O fails 2906 * and we want to give back the block we just allocated. 2907 * This handles both normal blocks and gang blocks. 2908 */ 2909static void 2910zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2911{ 2912 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2913 ASSERT(zio->io_bp_override == NULL); 2914 2915 if (!BP_IS_HOLE(bp)) 2916 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2917 2918 if (gn != NULL) { 2919 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2920 zio_dva_unallocate(zio, gn->gn_child[g], 2921 &gn->gn_gbh->zg_blkptr[g]); 2922 } 2923 } 2924} 2925 2926/* 2927 * Try to allocate an intent log block. Return 0 on success, errno on failure. 
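 * We try the log (slog) class first when use_slog is set and fall back
 * to the normal class on failure, so a full or absent slog degrades
 * ZIL allocation to the main pool rather than failing it.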
2928 */ 2929int 2930zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2931 uint64_t size, boolean_t use_slog) 2932{ 2933 int error = 1; 2934 2935 ASSERT(txg > spa_syncing_txg(spa)); 2936 2937 if (use_slog) { 2938 error = metaslab_alloc(spa, spa_log_class(spa), size, 2939 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); 2940 } 2941 2942 if (error) { 2943 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2944 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); 2945 } 2946 2947 if (error == 0) { 2948 BP_SET_LSIZE(new_bp, size); 2949 BP_SET_PSIZE(new_bp, size); 2950 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2951 BP_SET_CHECKSUM(new_bp, 2952 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2953 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2954 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2955 BP_SET_LEVEL(new_bp, 0); 2956 BP_SET_DEDUP(new_bp, 0); 2957 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2958 } 2959 2960 return (error); 2961} 2962 2963/* 2964 * Free an intent log block. 2965 */ 2966void 2967zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2968{ 2969 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2970 ASSERT(!BP_IS_GANG(bp)); 2971 2972 zio_free(spa, txg, bp); 2973} 2974 2975/* 2976 * ========================================================================== 2977 * Read, write and delete to physical devices 2978 * ========================================================================== 2979 */ 2980 2981 2982/* 2983 * Issue an I/O to the underlying vdev. Typically the issue pipeline 2984 * stops after this stage and will resume upon I/O completion. 2985 * However, there are instances where the vdev layer may need to 2986 * continue the pipeline when an I/O was not issued. Since the I/O 2987 * that was sent to the vdev layer might be different than the one 2988 * currently active in the pipeline (see vdev_queue_io()), we explicitly 2989 * force the underlying vdev layers to call either zio_execute() or 2990 * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 2991 */ 2992static int 2993zio_vdev_io_start(zio_t *zio) 2994{ 2995 vdev_t *vd = zio->io_vd; 2996 uint64_t align; 2997 spa_t *spa = zio->io_spa; 2998 int ret; 2999 3000 ASSERT(zio->io_error == 0); 3001 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 3002 3003 if (vd == NULL) { 3004 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3005 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 3006 3007 /* 3008 * The mirror_ops handle multiple DVAs in a single BP. 3009 */ 3010 vdev_mirror_ops.vdev_op_io_start(zio); 3011 return (ZIO_PIPELINE_STOP); 3012 } 3013 3014 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 3015 zio->io_priority == ZIO_PRIORITY_NOW) { 3016 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 3017 return (ZIO_PIPELINE_CONTINUE); 3018 } 3019 3020 ASSERT3P(zio->io_logical, !=, zio); 3021 3022 /* 3023 * We keep track of time-sensitive I/Os so that the scan thread 3024 * can quickly react to certain workloads. In particular, we care 3025 * about non-scrubbing, top-level reads and writes with the following 3026 * characteristics: 3027 * - synchronous writes of user data to non-slog devices 3028 * - any reads of user data 3029 * When these conditions are met, adjust the timestamp of spa_last_io 3030 * which allows the scan thread to adjust its workload accordingly. 
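	 * (I/O against the meta-objset and writes belonging to the
	 * currently syncing txg are excluded by the tests below.)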
3031 */ 3032 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 3033 vd == vd->vdev_top && !vd->vdev_islog && 3034 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 3035 zio->io_txg != spa_syncing_txg(spa)) { 3036 uint64_t old = spa->spa_last_io; 3037 uint64_t new = ddi_get_lbolt64(); 3038 if (old != new) 3039 (void) atomic_cas_64(&spa->spa_last_io, old, new); 3040 } 3041 3042 align = 1ULL << vd->vdev_top->vdev_ashift; 3043 3044 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 3045 P2PHASE(zio->io_size, align) != 0) { 3046 /* Transform logical writes to be a full physical block size. */ 3047 uint64_t asize = P2ROUNDUP(zio->io_size, align); 3048 char *abuf = NULL; 3049 if (zio->io_type == ZIO_TYPE_READ || 3050 zio->io_type == ZIO_TYPE_WRITE) 3051 abuf = zio_buf_alloc(asize); 3052 ASSERT(vd == vd->vdev_top); 3053 if (zio->io_type == ZIO_TYPE_WRITE) { 3054 bcopy(zio->io_data, abuf, zio->io_size); 3055 bzero(abuf + zio->io_size, asize - zio->io_size); 3056 } 3057 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 3058 zio_subblock); 3059 } 3060 3061 /* 3062 * If this is not a physical io, make sure that it is properly aligned 3063 * before proceeding. 3064 */ 3065 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 3066 ASSERT0(P2PHASE(zio->io_offset, align)); 3067 ASSERT0(P2PHASE(zio->io_size, align)); 3068 } else { 3069 /* 3070 * For the physical io we allow alignment 3071 * to a logical block size. 3072 */ 3073 uint64_t log_align = 3074 1ULL << vd->vdev_top->vdev_logical_ashift; 3075 ASSERT0(P2PHASE(zio->io_offset, log_align)); 3076 ASSERT0(P2PHASE(zio->io_size, log_align)); 3077 } 3078 3079 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 3080 3081 /* 3082 * If this is a repair I/O, and there's no self-healing involved -- 3083 * that is, we're just resilvering what we expect to resilver -- 3084 * then don't do the I/O unless zio's txg is actually in vd's DTL. 3085 * This prevents spurious resilvering with nested replication. 3086 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 3087 * A is out of date, we'll read from C+D, then use the data to 3088 * resilver A+B -- but we don't actually want to resilver B, just A. 3089 * The top-level mirror has no way to know this, so instead we just 3090 * discard unnecessary repairs as we work our way down the vdev tree. 3091 * The same logic applies to any form of nested replication: 3092 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 3093 */ 3094 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 3095 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 3096 zio->io_txg != 0 && /* not a delegated i/o */ 3097 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 3098 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3099 zio_vdev_io_bypass(zio); 3100 return (ZIO_PIPELINE_CONTINUE); 3101 } 3102 3103 if (vd->vdev_ops->vdev_op_leaf) { 3104 switch (zio->io_type) { 3105 case ZIO_TYPE_READ: 3106 if (vdev_cache_read(zio)) 3107 return (ZIO_PIPELINE_CONTINUE); 3108 /* FALLTHROUGH */ 3109 case ZIO_TYPE_WRITE: 3110 case ZIO_TYPE_FREE: 3111 if ((zio = vdev_queue_io(zio)) == NULL) 3112 return (ZIO_PIPELINE_STOP); 3113 3114 if (!vdev_accessible(vd, zio)) { 3115 zio->io_error = SET_ERROR(ENXIO); 3116 zio_interrupt(zio); 3117 return (ZIO_PIPELINE_STOP); 3118 } 3119 break; 3120 } 3121 /* 3122 * Note that we ignore repair writes for TRIM because they can 3123 * conflict with normal writes. This isn't an issue because, by 3124 * definition, we only repair blocks that aren't freed. 
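	 * (If trim_map_write_start() finds a conflict with an in-flight
	 * TRIM it returns B_FALSE; we stop the pipeline here and the
	 * write is reissued once the conflicting TRIM completes.)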
3125 */ 3126 if (zio->io_type == ZIO_TYPE_WRITE && 3127 !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 3128 !trim_map_write_start(zio)) 3129 return (ZIO_PIPELINE_STOP); 3130 } 3131 3132 vd->vdev_ops->vdev_op_io_start(zio); 3133 return (ZIO_PIPELINE_STOP); 3134} 3135 3136static int 3137zio_vdev_io_done(zio_t *zio) 3138{ 3139 vdev_t *vd = zio->io_vd; 3140 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 3141 boolean_t unexpected_error = B_FALSE; 3142 3143 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 3144 return (ZIO_PIPELINE_STOP); 3145 3146 ASSERT(zio->io_type == ZIO_TYPE_READ || 3147 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 3148 3149 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3150 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || 3151 zio->io_type == ZIO_TYPE_FREE)) { 3152 3153 if (zio->io_type == ZIO_TYPE_WRITE && 3154 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 3155 trim_map_write_done(zio); 3156 3157 vdev_queue_io_done(zio); 3158 3159 if (zio->io_type == ZIO_TYPE_WRITE) 3160 vdev_cache_write(zio); 3161 3162 if (zio_injection_enabled && zio->io_error == 0) 3163 zio->io_error = zio_handle_device_injection(vd, 3164 zio, EIO); 3165 3166 if (zio_injection_enabled && zio->io_error == 0) 3167 zio->io_error = zio_handle_label_injection(zio, EIO); 3168 3169 if (zio->io_error) { 3170 if (zio->io_error == ENOTSUP && 3171 zio->io_type == ZIO_TYPE_FREE) { 3172 /* Not all devices support TRIM. */ 3173 } else if (!vdev_accessible(vd, zio)) { 3174 zio->io_error = SET_ERROR(ENXIO); 3175 } else { 3176 unexpected_error = B_TRUE; 3177 } 3178 } 3179 } 3180 3181 ops->vdev_op_io_done(zio); 3182 3183 if (unexpected_error) 3184 VERIFY(vdev_probe(vd, zio) == NULL); 3185 3186 return (ZIO_PIPELINE_CONTINUE); 3187} 3188 3189/* 3190 * For non-raidz ZIOs, we can just copy aside the bad data read from the 3191 * disk, and use that to finish the checksum ereport later. 3192 */ 3193static void 3194zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 3195 const void *good_buf) 3196{ 3197 /* no processing needed */ 3198 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 3199} 3200 3201/*ARGSUSED*/ 3202void 3203zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 3204{ 3205 void *buf = zio_buf_alloc(zio->io_size); 3206 3207 bcopy(zio->io_data, buf, zio->io_size); 3208 3209 zcr->zcr_cbinfo = zio->io_size; 3210 zcr->zcr_cbdata = buf; 3211 zcr->zcr_finish = zio_vsd_default_cksum_finish; 3212 zcr->zcr_free = zio_buf_free; 3213} 3214 3215static int 3216zio_vdev_io_assess(zio_t *zio) 3217{ 3218 vdev_t *vd = zio->io_vd; 3219 3220 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 3221 return (ZIO_PIPELINE_STOP); 3222 3223 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3224 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 3225 3226 if (zio->io_vsd != NULL) { 3227 zio->io_vsd_ops->vsd_free(zio); 3228 zio->io_vsd = NULL; 3229 } 3230 3231 if (zio_injection_enabled && zio->io_error == 0) 3232 zio->io_error = zio_handle_fault_injection(zio, EIO); 3233 3234 if (zio->io_type == ZIO_TYPE_FREE && 3235 zio->io_priority != ZIO_PRIORITY_NOW) { 3236 switch (zio->io_error) { 3237 case 0: 3238 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 3239 ZIO_TRIM_STAT_BUMP(success); 3240 break; 3241 case EOPNOTSUPP: 3242 ZIO_TRIM_STAT_BUMP(unsupported); 3243 break; 3244 default: 3245 ZIO_TRIM_STAT_BUMP(failed); 3246 break; 3247 } 3248 } 3249 3250 /* 3251 * If the I/O failed, determine whether we should attempt to retry it. 
3252 * 3253 * On retry, we cut in line in the issue queue, since we don't want 3254 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 3255 */ 3256 if (zio->io_error && vd == NULL && 3257 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 3258 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 3259 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 3260 zio->io_error = 0; 3261 zio->io_flags |= ZIO_FLAG_IO_RETRY | 3262 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 3263 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 3264 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 3265 zio_requeue_io_start_cut_in_line); 3266 return (ZIO_PIPELINE_STOP); 3267 } 3268 3269 /* 3270 * If we got an error on a leaf device, convert it to ENXIO 3271 * if the device is not accessible at all. 3272 */ 3273 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 3274 !vdev_accessible(vd, zio)) 3275 zio->io_error = SET_ERROR(ENXIO); 3276 3277 /* 3278 * If we can't write to an interior vdev (mirror or RAID-Z), 3279 * set vdev_cant_write so that we stop trying to allocate from it. 3280 */ 3281 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 3282 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 3283 vd->vdev_cant_write = B_TRUE; 3284 } 3285 3286 if (zio->io_error) 3287 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3288 3289 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3290 zio->io_physdone != NULL) { 3291 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 3292 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 3293 zio->io_physdone(zio->io_logical); 3294 } 3295 3296 return (ZIO_PIPELINE_CONTINUE); 3297} 3298 3299void 3300zio_vdev_io_reissue(zio_t *zio) 3301{ 3302 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3303 ASSERT(zio->io_error == 0); 3304 3305 zio->io_stage >>= 1; 3306} 3307 3308void 3309zio_vdev_io_redone(zio_t *zio) 3310{ 3311 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 3312 3313 zio->io_stage >>= 1; 3314} 3315 3316void 3317zio_vdev_io_bypass(zio_t *zio) 3318{ 3319 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3320 ASSERT(zio->io_error == 0); 3321 3322 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 3323 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 3324} 3325 3326/* 3327 * ========================================================================== 3328 * Generate and verify checksums 3329 * ========================================================================== 3330 */ 3331static int 3332zio_checksum_generate(zio_t *zio) 3333{ 3334 blkptr_t *bp = zio->io_bp; 3335 enum zio_checksum checksum; 3336 3337 if (bp == NULL) { 3338 /* 3339 * This is zio_write_phys(). 3340 * We're either generating a label checksum, or none at all. 3341 */ 3342 checksum = zio->io_prop.zp_checksum; 3343 3344 if (checksum == ZIO_CHECKSUM_OFF) 3345 return (ZIO_PIPELINE_CONTINUE); 3346 3347 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3348 } else { 3349 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3350 ASSERT(!IO_IS_ALLOCATING(zio)); 3351 checksum = ZIO_CHECKSUM_GANG_HEADER; 3352 } else { 3353 checksum = BP_GET_CHECKSUM(bp); 3354 } 3355 } 3356 3357 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 3358 3359 return (ZIO_PIPELINE_CONTINUE); 3360} 3361 3362static int 3363zio_checksum_verify(zio_t *zio) 3364{ 3365 zio_bad_cksum_t info; 3366 blkptr_t *bp = zio->io_bp; 3367 int error; 3368 3369 ASSERT(zio->io_vd != NULL); 3370 3371 if (bp == NULL) { 3372 /* 3373 * This is zio_read_phys(). 
3374 * We're either verifying a label checksum, or nothing at all. 3375 */ 3376 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3377 return (ZIO_PIPELINE_CONTINUE); 3378 3379 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3380 } 3381 3382 if ((error = zio_checksum_error(zio, &info)) != 0) { 3383 zio->io_error = error; 3384 if (error == ECKSUM && 3385 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3386 zfs_ereport_start_checksum(zio->io_spa, 3387 zio->io_vd, zio, zio->io_offset, 3388 zio->io_size, NULL, &info); 3389 } 3390 } 3391 3392 return (ZIO_PIPELINE_CONTINUE); 3393} 3394 3395/* 3396 * Called by RAID-Z to ensure we don't compute the checksum twice. 3397 */ 3398void 3399zio_checksum_verified(zio_t *zio) 3400{ 3401 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3402} 3403 3404/* 3405 * ========================================================================== 3406 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3407 * An error of 0 indicates success. ENXIO indicates whole-device failure, 3408 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 3409 * indicate errors that are specific to one I/O, and most likely permanent. 3410 * Any other error is presumed to be worse because we weren't expecting it. 3411 * ========================================================================== 3412 */ 3413int 3414zio_worst_error(int e1, int e2) 3415{ 3416 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3417 int r1, r2; 3418 3419 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3420 if (e1 == zio_error_rank[r1]) 3421 break; 3422 3423 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3424 if (e2 == zio_error_rank[r2]) 3425 break; 3426 3427 return (r1 > r2 ? e1 : e2); 3428} 3429 3430/* 3431 * ========================================================================== 3432 * I/O completion 3433 * ========================================================================== 3434 */ 3435static int 3436zio_ready(zio_t *zio) 3437{ 3438 blkptr_t *bp = zio->io_bp; 3439 zio_t *pio, *pio_next; 3440 zio_link_t *zl = NULL; 3441 3442 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 3443 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 3444 return (ZIO_PIPELINE_STOP); 3445 3446 if (zio->io_ready) { 3447 ASSERT(IO_IS_ALLOCATING(zio)); 3448 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 3449 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3450 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3451 3452 zio->io_ready(zio); 3453 } 3454 3455 if (bp != NULL && bp != &zio->io_bp_copy) 3456 zio->io_bp_copy = *bp; 3457 3458 if (zio->io_error != 0) { 3459 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3460 3461 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 3462 ASSERT(IO_IS_ALLOCATING(zio)); 3463 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 3464 /* 3465 * We were unable to allocate anything, so unreserve and 3466 * issue the next I/O to allocate. 3467 */ 3468 metaslab_class_throttle_unreserve( 3469 spa_normal_class(zio->io_spa), 3470 zio->io_prop.zp_copies, zio); 3471 zio_allocate_dispatch(zio->io_spa); 3472 } 3473 } 3474 3475 mutex_enter(&zio->io_lock); 3476 zio->io_state[ZIO_WAIT_READY] = 1; 3477 pio = zio_walk_parents(zio, &zl); 3478 mutex_exit(&zio->io_lock); 3479 3480 /* 3481 * As we notify zio's parents, new parents could be added. 3482 * New parents go to the head of zio's io_parent_list, however, 3483 * so we will (correctly) not notify them.
The remainder of zio's 3484 * io_parent_list, from 'pio_next' onward, cannot change because 3485 * all parents must wait for us to be done before they can be done. 3486 */ 3487 for (; pio != NULL; pio = pio_next) { 3488 pio_next = zio_walk_parents(zio, &zl); 3489 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3490 } 3491 3492 if (zio->io_flags & ZIO_FLAG_NODATA) { 3493 if (BP_IS_GANG(bp)) { 3494 zio->io_flags &= ~ZIO_FLAG_NODATA; 3495 } else { 3496 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 3497 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3498 } 3499 } 3500 3501 if (zio_injection_enabled && 3502 zio->io_spa->spa_syncing_txg == zio->io_txg) 3503 zio_handle_ignored_writes(zio); 3504 3505 return (ZIO_PIPELINE_CONTINUE); 3506} 3507 3508/* 3509 * Update the allocation throttle accounting. 3510 */ 3511static void 3512zio_dva_throttle_done(zio_t *zio) 3513{ 3514 zio_t *lio = zio->io_logical; 3515 zio_t *pio = zio_unique_parent(zio); 3516 vdev_t *vd = zio->io_vd; 3517 int flags = METASLAB_ASYNC_ALLOC; 3518 3519 ASSERT3P(zio->io_bp, !=, NULL); 3520 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3521 ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); 3522 ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 3523 ASSERT(vd != NULL); 3524 ASSERT3P(vd, ==, vd->vdev_top); 3525 ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); 3526 ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); 3527 ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); 3528 ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); 3529 3530 /* 3531 * Parents of gang children can have two flavors -- ones that 3532 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) 3533 * and ones that allocated the constituent blocks. The allocation 3534 * throttle needs to know the allocating parent zio so we must find 3535 * it here. 3536 */ 3537 if (pio->io_child_type == ZIO_CHILD_GANG) { 3538 /* 3539 * If our parent is a rewrite gang child then our grandparent 3540 * would have been the one that performed the allocation. 3541 */ 3542 if (pio->io_flags & ZIO_FLAG_IO_REWRITE) 3543 pio = zio_unique_parent(pio); 3544 flags |= METASLAB_GANG_CHILD; 3545 } 3546 3547 ASSERT(IO_IS_ALLOCATING(pio)); 3548 ASSERT3P(zio, !=, zio->io_logical); 3549 ASSERT(zio->io_logical != NULL); 3550 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); 3551 ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); 3552 3553 mutex_enter(&pio->io_lock); 3554 metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); 3555 mutex_exit(&pio->io_lock); 3556 3557 metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), 3558 1, pio); 3559 3560 /* 3561 * Call into the pipeline to see if there is more work that 3562 * needs to be done. If there is work to be done it will be 3563 * dispatched to another taskq thread. 3564 */ 3565 zio_allocate_dispatch(zio->io_spa); 3566} 3567 3568static int 3569zio_done(zio_t *zio) 3570{ 3571 spa_t *spa = zio->io_spa; 3572 zio_t *lio = zio->io_logical; 3573 blkptr_t *bp = zio->io_bp; 3574 vdev_t *vd = zio->io_vd; 3575 uint64_t psize = zio->io_size; 3576 zio_t *pio, *pio_next; 3577 metaslab_class_t *mc = spa_normal_class(spa); 3578 zio_link_t *zl = NULL; 3579 3580 /* 3581 * If our children haven't all completed, 3582 * wait for them and then repeat this pipeline stage. 
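	 * zio_wait_for_children() registers us as the stalled waiter and
	 * returns B_TRUE while any child of the given type is still
	 * outstanding; the last such child to complete re-dispatches
	 * this zio.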
3583 */ 3584 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3585 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3586 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3587 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3588 return (ZIO_PIPELINE_STOP); 3589 3590 /* 3591 * If the allocation throttle is enabled, then update the accounting. 3592 * We only track child I/Os that are part of an allocating async 3593 * write. We must do this since the allocation is performed 3594 * by the logical I/O but the actual write is done by child I/Os. 3595 */ 3596 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && 3597 zio->io_child_type == ZIO_CHILD_VDEV) { 3598 ASSERT(mc->mc_alloc_throttle_enabled); 3599 zio_dva_throttle_done(zio); 3600 } 3601 3602 /* 3603 * If the allocation throttle is enabled, verify that 3604 * we have decremented the refcounts for every I/O that was throttled. 3605 */ 3606 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 3607 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3608 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 3609 ASSERT(bp != NULL); 3610 metaslab_group_alloc_verify(spa, zio->io_bp, zio); 3611 VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); 3612 } 3613 3614 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3615 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3616 ASSERT(zio->io_children[c][w] == 0); 3617 3618 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3619 ASSERT(bp->blk_pad[0] == 0); 3620 ASSERT(bp->blk_pad[1] == 0); 3621 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3622 (bp == zio_unique_parent(zio)->io_bp)); 3623 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3624 zio->io_bp_override == NULL && 3625 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3626 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3627 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3628 ASSERT(BP_COUNT_GANG(bp) == 0 || 3629 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3630 } 3631 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3632 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3633 } 3634 3635 /* 3636 * If there were child vdev/gang/ddt errors, they apply to us now. 3637 */ 3638 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3639 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3640 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3641 3642 /* 3643 * If the I/O on the transformed data was successful, generate any 3644 * checksum reports now while we still have the transformed data. 3645 */ 3646 if (zio->io_error == 0) { 3647 while (zio->io_cksum_report != NULL) { 3648 zio_cksum_report_t *zcr = zio->io_cksum_report; 3649 uint64_t align = zcr->zcr_align; 3650 uint64_t asize = P2ROUNDUP(psize, align); 3651 char *abuf = zio->io_data; 3652 3653 if (asize != psize) { 3654 abuf = zio_buf_alloc(asize); 3655 bcopy(zio->io_data, abuf, psize); 3656 bzero(abuf + psize, asize - psize); 3657 } 3658 3659 zio->io_cksum_report = zcr->zcr_next; 3660 zcr->zcr_next = NULL; 3661 zcr->zcr_finish(zcr, abuf); 3662 zfs_ereport_free_checksum(zcr); 3663 3664 if (asize != psize) 3665 zio_buf_free(abuf, asize); 3666 } 3667 } 3668 3669 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3670 3671 vdev_stat_update(zio, psize); 3672 3673 if (zio->io_error) { 3674 /* 3675 * If this I/O is attached to a particular vdev, 3676 * generate an error message describing the I/O failure 3677 * at the block level. We ignore these errors if the 3678 * device is currently unavailable. 
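	 * (ECKSUM is excluded below because checksum failures were
	 * already reported in more detail by zio_checksum_verify() via
	 * zfs_ereport_start_checksum().)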
3679 */ 3680 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3681 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3682 3683 if ((zio->io_error == EIO || !(zio->io_flags & 3684 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3685 zio == lio) { 3686 /* 3687 * For logical I/O requests, tell the SPA to log the 3688 * error and generate a logical data ereport. 3689 */ 3690 spa_log_error(spa, zio); 3691 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3692 0, 0); 3693 } 3694 } 3695 3696 if (zio->io_error && zio == lio) { 3697 /* 3698 * Determine whether zio should be reexecuted. This will 3699 * propagate all the way to the root via zio_notify_parent(). 3700 */ 3701 ASSERT(vd == NULL && bp != NULL); 3702 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3703 3704 if (IO_IS_ALLOCATING(zio) && 3705 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3706 if (zio->io_error != ENOSPC) 3707 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3708 else 3709 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3710 } 3711 3712 if ((zio->io_type == ZIO_TYPE_READ || 3713 zio->io_type == ZIO_TYPE_FREE) && 3714 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3715 zio->io_error == ENXIO && 3716 spa_load_state(spa) == SPA_LOAD_NONE && 3717 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3718 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3719 3720 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3721 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3722 3723 /* 3724 * This would be a reasonable place to attempt either 3725 * combinatorial reconstruction or checksum-based error 3726 * correction. It might also be a good place to send out 3727 * preliminary ereports before we suspend 3728 * processing. 3729 */ 3730 } 3731 3732 /* 3733 * If there were logical child errors, they apply to us now. 3734 * We defer this until now to avoid conflating logical child 3735 * errors with errors that happened to the zio itself when 3736 * updating vdev stats and reporting FMA events above. 3737 */ 3738 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3739 3740 if ((zio->io_error || zio->io_reexecute) && 3741 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3742 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3743 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3744 3745 zio_gang_tree_free(&zio->io_gang_tree); 3746 3747 /* 3748 * Godfather I/Os should never suspend. 3749 */ 3750 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3751 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3752 zio->io_reexecute = 0; 3753 3754 if (zio->io_reexecute) { 3755 /* 3756 * This is a logical I/O that wants to reexecute. 3757 * 3758 * Reexecute is top-down. When an i/o fails, if it's not 3759 * the root, it simply notifies its parent and sticks around. 3760 * The parent, seeing that it still has children in zio_done(), 3761 * does the same. This percolates all the way up to the root. 3762 * The root i/o will reexecute or suspend the entire tree. 3763 * 3764 * This approach ensures that zio_reexecute() honors 3765 * all the original i/o dependency relationships, e.g. 3766 * parents not executing until children are ready. 3767 */ 3768 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3769 3770 zio->io_gang_leader = NULL; 3771 3772 mutex_enter(&zio->io_lock); 3773 zio->io_state[ZIO_WAIT_DONE] = 1; 3774 mutex_exit(&zio->io_lock); 3775 3776 /* 3777 * "The Godfather" I/O monitors its children but is 3778 * not a true parent to them.
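 * (Godfather zios are created with ZIO_FLAG_GODFATHER; for example,
 * the per-CPU spa_async_zio_root zios set up in spa_activate().)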
It will track them through 3779 * the pipeline but severs its ties whenever they get into 3780 * trouble (e.g. suspended). This allows "The Godfather" 3781 * I/O to return status without blocking. 3782 */ 3783 zl = NULL; 3784 for (pio = zio_walk_parents(zio, &zl); pio != NULL; 3785 pio = pio_next) { 3786 zio_link_t *remove_zl = zl; 3787 pio_next = zio_walk_parents(zio, &zl); 3788 3789 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3790 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3791 zio_remove_child(pio, zio, remove_zl); 3792 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3793 } 3794 } 3795 3796 if ((pio = zio_unique_parent(zio)) != NULL) { 3797 /* 3798 * We're not a root i/o, so there's nothing to do 3799 * but notify our parent. Don't propagate errors 3800 * upward since we haven't permanently failed yet. 3801 */ 3802 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3803 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3804 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3805 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3806 /* 3807 * We'd fail again if we reexecuted now, so suspend 3808 * until conditions improve (e.g. device comes online). 3809 */ 3810 zio_suspend(spa, zio); 3811 } else { 3812 /* 3813 * Reexecution is potentially a huge amount of work. 3814 * Hand it off to the otherwise-unused claim taskq. 3815 */ 3816#if defined(illumos) || !defined(_KERNEL) 3817 ASSERT(zio->io_tqent.tqent_next == NULL); 3818#else 3819 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3820#endif 3821 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3822 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3823 0, &zio->io_tqent); 3824 } 3825 return (ZIO_PIPELINE_STOP); 3826 } 3827 3828 ASSERT(zio->io_child_count == 0); 3829 ASSERT(zio->io_reexecute == 0); 3830 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3831 3832 /* 3833 * Report any checksum errors, since the I/O is complete. 3834 */ 3835 while (zio->io_cksum_report != NULL) { 3836 zio_cksum_report_t *zcr = zio->io_cksum_report; 3837 zio->io_cksum_report = zcr->zcr_next; 3838 zcr->zcr_next = NULL; 3839 zcr->zcr_finish(zcr, NULL); 3840 zfs_ereport_free_checksum(zcr); 3841 } 3842 3843 /* 3844 * It is the responsibility of the done callback to ensure that this 3845 * particular zio is no longer discoverable for adoption, and as 3846 * such, cannot acquire any new parents. 
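 * Once io_done runs, the zio may be destroyed below, so any late
 * zio_add_child() against it would be a use-after-free.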
3847 */ 3848 if (zio->io_done) 3849 zio->io_done(zio); 3850 3851 mutex_enter(&zio->io_lock); 3852 zio->io_state[ZIO_WAIT_DONE] = 1; 3853 mutex_exit(&zio->io_lock); 3854 3855 zl = NULL; 3856 for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { 3857 zio_link_t *remove_zl = zl; 3858 pio_next = zio_walk_parents(zio, &zl); 3859 zio_remove_child(pio, zio, remove_zl); 3860 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3861 } 3862 3863 if (zio->io_waiter != NULL) { 3864 mutex_enter(&zio->io_lock); 3865 zio->io_executor = NULL; 3866 cv_broadcast(&zio->io_cv); 3867 mutex_exit(&zio->io_lock); 3868 } else { 3869 zio_destroy(zio); 3870 } 3871 3872 return (ZIO_PIPELINE_STOP); 3873} 3874 3875/* 3876 * ========================================================================== 3877 * I/O pipeline definition 3878 * ========================================================================== 3879 */ 3880static zio_pipe_stage_t *zio_pipeline[] = { 3881 NULL, 3882 zio_read_bp_init, 3883 zio_write_bp_init, 3884 zio_free_bp_init, 3885 zio_issue_async, 3886 zio_write_compress, 3887 zio_checksum_generate, 3888 zio_nop_write, 3889 zio_ddt_read_start, 3890 zio_ddt_read_done, 3891 zio_ddt_write, 3892 zio_ddt_free, 3893 zio_gang_assemble, 3894 zio_gang_issue, 3895 zio_dva_throttle, 3896 zio_dva_allocate, 3897 zio_dva_free, 3898 zio_dva_claim, 3899 zio_ready, 3900 zio_vdev_io_start, 3901 zio_vdev_io_done, 3902 zio_vdev_io_assess, 3903 zio_checksum_verify, 3904 zio_done 3905}; 3906 3907 3908 3909 3910/* 3911 * Compare two zbookmark_phys_t's to see which we would reach first in a 3912 * pre-order traversal of the object tree. 3913 * 3914 * This is simple in every case aside from the meta-dnode object. For all other 3915 * objects, we traverse them in order (object 1 before object 2, and so on). 3916 * However, all of these objects are traversed while traversing object 0, since 3917 * the data it points to is the list of objects. Thus, we need to convert to a 3918 * canonical representation so we can compare meta-dnode bookmarks to 3919 * non-meta-dnode bookmarks. 3920 * 3921 * We do this by calculating "equivalents" for each field of the zbookmark. 3922 * zbookmarks outside of the meta-dnode use their own object and level, and 3923 * calculate the level 0 equivalent (the first L0 blkid that is contained in the 3924 * blocks this bookmark refers to) by multiplying their blkid by their span 3925 * (the number of L0 blocks contained within one block at their level). 3926 * zbookmarks inside the meta-dnode calculate their object equivalent 3927 * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use 3928 * level + 1<<31 (a value larger than any level could ever be) for their level. 3929 * This causes them to always compare before a bookmark in their object 3930 * equivalent, to compare appropriately to bookmarks in other objects, and to 3931 * compare appropriately to other bookmarks in the meta-dnode. 3932 */ 3933int 3934zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, 3935 const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) 3936{ 3937 /* 3938 * These variables represent the "equivalent" values for the zbookmark, 3939 * after converting zbookmarks inside the meta dnode to their 3940 * normal-object equivalents.
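 *
 * Worked example (a sketch assuming 512-byte dnodes and a 16K
 * meta-dnode data block, i.e. 32 dnodes per block): a level-0
 * meta-dnode bookmark with zb_blkid == 2 covers the block holding
 * objects 64..95. Its object equivalent is 2 * 32 == 64, its L0
 * equivalent is 0, and its level equivalent is 0 + COMPARE_META_LEVEL,
 * so it sorts immediately before object 64's own bookmarks.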
3941 */ 3942 uint64_t zb1obj, zb2obj; 3943 uint64_t zb1L0, zb2L0; 3944 uint64_t zb1level, zb2level; 3945 3946 if (zb1->zb_object == zb2->zb_object && 3947 zb1->zb_level == zb2->zb_level && 3948 zb1->zb_blkid == zb2->zb_blkid) 3949 return (0); 3950 3951 /* 3952 * BP_SPANB calculates the span in blocks. 3953 */ 3954 zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); 3955 zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); 3956 3957 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3958 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 3959 zb1L0 = 0; 3960 zb1level = zb1->zb_level + COMPARE_META_LEVEL; 3961 } else { 3962 zb1obj = zb1->zb_object; 3963 zb1level = zb1->zb_level; 3964 } 3965 3966 if (zb2->zb_object == DMU_META_DNODE_OBJECT) { 3967 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 3968 zb2L0 = 0; 3969 zb2level = zb2->zb_level + COMPARE_META_LEVEL; 3970 } else { 3971 zb2obj = zb2->zb_object; 3972 zb2level = zb2->zb_level; 3973 } 3974 3975 /* Now that we have a canonical representation, do the comparison. */ 3976 if (zb1obj != zb2obj) 3977 return (zb1obj < zb2obj ? -1 : 1); 3978 else if (zb1L0 != zb2L0) 3979 return (zb1L0 < zb2L0 ? -1 : 1); 3980 else if (zb1level != zb2level) 3981 return (zb1level > zb2level ? -1 : 1); 3982 /* 3983 * This can (theoretically) happen if the bookmarks have the same object 3984 * and level but different blkids, and the block sizes are not the same. 3985 * There is presently no way to change the indirect block size. 3986 */ 3987 return (0); 3988} 3989 3990/* 3991 * This function answers the following question: given that last_block is the 3992 * place where our traversal stopped last time, does that guarantee that we've 3993 * visited every node under subtree_root? Answering this takes more than the 3994 * raw output of zbookmark_compare. We have to pass in a modified version of 3995 * subtree_root: by incrementing the block id, and then checking whether 3996 * last_block is before or equal to that, we can tell whether or not having 3997 * visited last_block implies that all of subtree_root's children have been 3998 * visited. 3999 */ 4000boolean_t 4001zbookmark_subtree_completed(const dnode_phys_t *dnp, 4002 const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) 4003{ 4004 zbookmark_phys_t mod_zb = *subtree_root; 4005 mod_zb.zb_blkid++; 4006 ASSERT(last_block->zb_level == 0); 4007 4008 /* The objset_phys_t isn't before anything. */ 4009 if (dnp == NULL) 4010 return (B_FALSE); 4011 4012 /* 4013 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the 4014 * data block size in sectors, because that variable is only used if 4015 * the bookmark refers to a block in the meta-dnode. Since we don't 4016 * know without examining it what object it refers to, and there's no 4017 * harm in passing in this value in other cases, we always pass it in. 4018 * 4019 * We pass in 0 for the indirect block size shift because zb2 must be 4020 * level 0. The indirect block size is only used to calculate the span 4021 * of the bookmark, but since the bookmark must be level 0, the span is 4022 * always 1, so the math works out. 4023 * 4024 * If you make changes to how the zbookmark_compare code works, be sure 4025 * to check that this code still works afterwards. 4026 */ 4027 return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 4028 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, 4029 last_block) <= 0); 4030} 4031
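/*
 * Usage sketch (the caller and variable names here are illustrative,
 * not taken from this file): a resumable traversal that remembers the
 * last bookmark it visited can skip an entire subtree once every block
 * beneath it has been covered:
 *
 *	zbookmark_phys_t subtree_root, last_visited;
 *	...
 *	if (zbookmark_subtree_completed(dnp, &subtree_root, &last_visited))
 *		return;
 *
 * The scrub/resilver resume check in dsl_scan uses this function in
 * essentially this way.
 */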