/* zio.c revision 269733 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN,
    &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
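
/*
 * These knobs are registered with CTLFLAG_RDTUN, so they are read-only at
 * runtime and take effect when set as loader tunables.  A minimal sketch
 * (the values shown are just the defaults above, not recommendations):
 *
 *	# /boot/loader.conf
 *	vfs.zfs.sync_pass_deferred_free=2
 *	vfs.zfs.sync_pass_dont_compress=5
 *	vfs.zfs.sync_pass_rewrite=2
 */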

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own
		 * page, to eliminate the performance overhead of trapping
		 * to the kernel when modifying a non-watched buffer that
		 * shares the page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}
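
		/*
		 * Worked example (illustrative, assuming a 4K PAGESIZE):
		 * for size = 10K, p2 rounds down to 8K; 10K is neither small
		 * enough for the first case nor page-aligned, but it is a
		 * multiple of p2 >> 2 = 2K, so align becomes 2K -- a
		 * quarter-power-of-2 boundary.
		 */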

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (zio_trim_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}
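
/*
 * Index arithmetic sketch: the caches are sized in SPA_MINBLOCKSIZE steps
 * (512 bytes, i.e. a shift of 9, under the usual definitions), so for
 * size = 4096 the lookup above computes c = (4096 - 1) >> 9 = 7, and
 * zio_buf_cache[7] is the cache created in zio_init() for buffers of
 * (7 + 1) << 9 = 4096 bytes.
 */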

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump, thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
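/*
 * A minimal sketch of the expected pattern (this mirrors how
 * zio_reexecute() below walks children):
 *
 *	for (pio = zio_walk_parents(cio); pio != NULL;
 *	    pio = zio_walk_parents(cio))
 *		do_something(pio);
 */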
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}
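
/*
 * Sketch of how a pipeline stage uses the interlock above (this is the
 * pattern zio_write_bp_init() and zio_gang_issue() follow): the stage parks
 * itself with zio_wait_for_children() and returns ZIO_PIPELINE_STOP; when
 * the last outstanding child calls zio_notify_parent(), the stalled count
 * drops to zero and the parent is re-driven via zio_execute().
 *
 *	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
 *		return (ZIO_PIPELINE_STOP);
 */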

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}
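
/*
 * Typical usage (a sketch; the flags, priority, and bookmark here are
 * illustrative): a caller creates a root zio to collect the status of a
 * batch of asynchronous children, then waits on the whole tree; the worst
 * child error is propagated back through zio_wait().
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	zio_nowait(zio_read(rio, spa, bp, buf, size, NULL, NULL,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 *	error = zio_wait(rio);
 */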

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private,
    enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
	    NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t **ziop)
{
	zio_t *zio = *ziop;
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize =
		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}
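
	/*
	 * Sketch of the transform set up above: io_data now points at cbuf
	 * (psize bytes) for the duration of the device read; at completion,
	 * zio_pop_transforms() invokes zio_decompress() to inflate cbuf into
	 * the caller's original buffer and then frees cbuf.
	 */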

	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		decode_embedded_bp_compressed(bp, zio->io_data);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t **ziop)
{
	zio_t *zio = *ziop;
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_EMBEDDED(bp))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round up compressed size to MINBLOCKSIZE and
			 * zero the tail.
			 */
			size_t rounded =
			    P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
			if (rounded > psize) {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
			}
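
			/*
			 * Worked example (illustrative, with a 512-byte
			 * SPA_MINBLOCKSIZE): a 3000-byte compressed result
			 * rounds up to P2ROUNDUP(3000, 512) = 3072, and
			 * bytes 3000..3071 of cbuf are zeroed so the
			 * padding is deterministic.
			 */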
			if (psize == lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
			} else {
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t **ziop)
{
	zio_t *zio = *ziop;
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
#if defined(illumos) || !defined(_KERNEL)
	ASSERT(zio->io_tqent.tqent_next == NULL);
#else
	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t **ziop)
{
	zio_t *zio = *ziop;

	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];
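
/*
 * A sketch of how zio_execute() advances through the pipeline: io_pipeline
 * is a bitmask of enabled stages and io_stage is the last stage executed,
 * so the next stage is found by shifting left until a set bit is hit.  For
 * example, assuming an interlock pipeline consisting of only the READY and
 * DONE stage bits, an I/O sitting at ZIO_STAGE_OPEN shifts past every
 * disabled stage straight to ZIO_STAGE_READY, and
 * zio_pipeline[highbit64(stage) - 1] is the handler invoked for that stage.
 */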

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit64(stage) - 1](&zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */
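
/*
 * Illustrative shape of a nested gang tree (a sketch, not to scale): the
 * gang leader's bp points at a one-sector gang header, whose bps point at
 * data blocks or at further gang headers.
 *
 *	           gang leader bp
 *	                 |
 *	            gang header
 *	           /     |      \
 *	       data    data    gang header
 *	                        /       \
 *	                     data      data
 */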
1612 * 1613 * To perform any operation (read, rewrite, free, claim) on a gang block, 1614 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1615 * in the io_gang_tree field of the original logical i/o by recursively 1616 * reading the gang leader and all gang headers below it. This yields 1617 * an in-core tree containing the contents of every gang header and the 1618 * bps for every constituent of the gang block. 1619 * 1620 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1621 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1622 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1623 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1624 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1625 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1626 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1627 * of the gang header plus zio_checksum_compute() of the data to update the 1628 * gang header's blk_cksum as described above. 1629 * 1630 * The two-phase assemble/issue model solves the problem of partial failure -- 1631 * what if you'd freed part of a gang block but then couldn't read the 1632 * gang header for another part? Assembling the entire gang tree first 1633 * ensures that all the necessary gang header I/O has succeeded before 1634 * starting the actual work of free, claim, or write. Once the gang tree 1635 * is assembled, free and claim are in-memory operations that cannot fail. 1636 * 1637 * In the event that a gang write fails, zio_dva_unallocate() walks the 1638 * gang tree to immediately free (i.e. insert back into the space map) 1639 * everything we've allocated. This ensures that we don't get ENOSPC 1640 * errors during repeated suspend/resume cycles due to a flaky device. 1641 * 1642 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1643 * the gang tree, we won't modify the block, so we can safely defer the free 1644 * (knowing that the block is still intact). If we *can* assemble the gang 1645 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1646 * each constituent bp and we can allocate a new block on the next sync pass. 1647 * 1648 * In all cases, the gang tree allows complete recovery from partial failure. 1649 * ========================================================================== 1650 */ 1651 1652static zio_t * 1653zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1654{ 1655 if (gn != NULL) 1656 return (pio); 1657 1658 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1659 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1660 &pio->io_bookmark)); 1661} 1662 1663zio_t * 1664zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1665{ 1666 zio_t *zio; 1667 1668 if (gn != NULL) { 1669 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1670 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1671 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1672 /* 1673 * As we rewrite each gang header, the pipeline will compute 1674 * a new gang block header checksum for it; but no one will 1675 * compute a new data checksum, so we do that here. The one 1676 * exception is the gang leader: the pipeline already computed 1677 * its data checksum because that stage precedes gang assembly. 

/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree && gio->io_data != NULL)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}
&gn->gn_child[g]); 1807 } 1808} 1809 1810static void 1811zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1812{ 1813 zio_t *gio = pio->io_gang_leader; 1814 zio_t *zio; 1815 1816 ASSERT(BP_IS_GANG(bp) == !!gn); 1817 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1818 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1819 1820 /* 1821 * If you're a gang header, your data is in gn->gn_gbh. 1822 * If you're a gang member, your data is in 'data' and gn == NULL. 1823 */ 1824 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1825 1826 if (gn != NULL) { 1827 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1828 1829 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1830 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1831 if (BP_IS_HOLE(gbp)) 1832 continue; 1833 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1834 data = (char *)data + BP_GET_PSIZE(gbp); 1835 } 1836 } 1837 1838 if (gn == gio->io_gang_tree && gio->io_data != NULL) 1839 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1840 1841 if (zio != pio) 1842 zio_nowait(zio); 1843} 1844 1845static int 1846zio_gang_assemble(zio_t **ziop) 1847{ 1848 zio_t *zio = *ziop; 1849 blkptr_t *bp = zio->io_bp; 1850 1851 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1852 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1853 1854 zio->io_gang_leader = zio; 1855 1856 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1857 1858 return (ZIO_PIPELINE_CONTINUE); 1859} 1860 1861static int 1862zio_gang_issue(zio_t **ziop) 1863{ 1864 zio_t *zio = *ziop; 1865 blkptr_t *bp = zio->io_bp; 1866 1867 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1868 return (ZIO_PIPELINE_STOP); 1869 1870 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1871 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1872 1873 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1874 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1875 else 1876 zio_gang_tree_free(&zio->io_gang_tree); 1877 1878 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1879 1880 return (ZIO_PIPELINE_CONTINUE); 1881} 1882 1883static void 1884zio_write_gang_member_ready(zio_t *zio) 1885{ 1886 zio_t *pio = zio_unique_parent(zio); 1887 zio_t *gio = zio->io_gang_leader; 1888 dva_t *cdva = zio->io_bp->blk_dva; 1889 dva_t *pdva = pio->io_bp->blk_dva; 1890 uint64_t asize; 1891 1892 if (BP_IS_HOLE(zio->io_bp)) 1893 return; 1894 1895 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1896 1897 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1898 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1899 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1900 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1901 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1902 1903 mutex_enter(&pio->io_lock); 1904 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1905 ASSERT(DVA_GET_GANG(&pdva[d])); 1906 asize = DVA_GET_ASIZE(&pdva[d]); 1907 asize += DVA_GET_ASIZE(&cdva[d]); 1908 DVA_SET_ASIZE(&pdva[d], asize); 1909 } 1910 mutex_exit(&pio->io_lock); 1911} 1912 1913static int 1914zio_write_gang_block(zio_t *pio) 1915{ 1916 spa_t *spa = pio->io_spa; 1917 blkptr_t *bp = pio->io_bp; 1918 zio_t *gio = pio->io_gang_leader; 1919 zio_t *zio; 1920 zio_gang_node_t *gn, **gnpp; 1921 zio_gbh_phys_t *gbh; 1922 uint64_t txg = pio->io_txg; 1923 uint64_t resid = pio->io_size; 1924 uint64_t lsize; 1925 int copies = gio->io_prop.zp_copies; 1926 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1927 
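	/*
	 * The gang header gets one more copy than the data it describes
	 * (capped at the pool's replication limit): losing a header would
	 * orphan every constituent below it, so e.g. zp_copies == 2
	 * yields gbh_copies == 3.
	 */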
zio_prop_t zp; 1928 int error; 1929 1930 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1931 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1932 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1933 if (error) { 1934 pio->io_error = error; 1935 return (ZIO_PIPELINE_CONTINUE); 1936 } 1937 1938 if (pio == gio) { 1939 gnpp = &gio->io_gang_tree; 1940 } else { 1941 gnpp = pio->io_private; 1942 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1943 } 1944 1945 gn = zio_gang_node_alloc(gnpp); 1946 gbh = gn->gn_gbh; 1947 bzero(gbh, SPA_GANGBLOCKSIZE); 1948 1949 /* 1950 * Create the gang header. 1951 */ 1952 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1953 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1954 1955 /* 1956 * Create and nowait the gang children. 1957 */ 1958 for (int g = 0; resid != 0; resid -= lsize, g++) { 1959 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1960 SPA_MINBLOCKSIZE); 1961 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1962 1963 zp.zp_checksum = gio->io_prop.zp_checksum; 1964 zp.zp_compress = ZIO_COMPRESS_OFF; 1965 zp.zp_type = DMU_OT_NONE; 1966 zp.zp_level = 0; 1967 zp.zp_copies = gio->io_prop.zp_copies; 1968 zp.zp_dedup = B_FALSE; 1969 zp.zp_dedup_verify = B_FALSE; 1970 zp.zp_nopwrite = B_FALSE; 1971 1972 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1973 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1974 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], 1975 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1976 &pio->io_bookmark)); 1977 } 1978 1979 /* 1980 * Set pio's pipeline to just wait for zio to finish. 1981 */ 1982 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1983 1984 zio_nowait(zio); 1985 1986 return (ZIO_PIPELINE_CONTINUE); 1987} 1988 1989/* 1990 * The zio_nop_write stage in the pipeline determines if allocating 1991 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1992 * such as SHA256, we can compare the checksums of the new data and the old 1993 * to determine if allocating a new block is required. The nopwrite 1994 * feature can handle writes in either syncing or open context (i.e. zil 1995 * writes) and as a result is mutually exclusive with dedup. 1996 */ 1997static int 1998zio_nop_write(zio_t **ziop) 1999{ 2000 zio_t *zio = *ziop; 2001 blkptr_t *bp = zio->io_bp; 2002 blkptr_t *bp_orig = &zio->io_bp_orig; 2003 zio_prop_t *zp = &zio->io_prop; 2004 2005 ASSERT(BP_GET_LEVEL(bp) == 0); 2006 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2007 ASSERT(zp->zp_nopwrite); 2008 ASSERT(!zp->zp_dedup); 2009 ASSERT(zio->io_bp_override == NULL); 2010 ASSERT(IO_IS_ALLOCATING(zio)); 2011 2012 /* 2013 * Check to see if the original bp and the new bp have matching 2014 * characteristics (i.e. same checksum, compression algorithms, etc). 2015 * If they don't then just continue with the pipeline which will 2016 * allocate a new bp. 2017 */ 2018 if (BP_IS_HOLE(bp_orig) || 2019 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 2020 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 2021 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 2022 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 2023 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2024 return (ZIO_PIPELINE_CONTINUE); 2025 2026 /* 2027 * If the checksums match then reset the pipeline so that we 2028 * avoid allocating a new bp and issuing any I/O. 
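	 * Concretely, "*bp = *bp_orig" below re-points this write at the
	 * existing on-disk block, and ZIO_FLAG_NOPWRITE tells the rest of
	 * the pipeline that no new allocation took place.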
2029 */ 2030 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2031 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 2032 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2033 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2034 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2035 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2036 sizeof (uint64_t)) == 0); 2037 2038 *bp = *bp_orig; 2039 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2040 zio->io_flags |= ZIO_FLAG_NOPWRITE; 2041 } 2042 2043 return (ZIO_PIPELINE_CONTINUE); 2044} 2045 2046/* 2047 * ========================================================================== 2048 * Dedup 2049 * ========================================================================== 2050 */ 2051static void 2052zio_ddt_child_read_done(zio_t *zio) 2053{ 2054 blkptr_t *bp = zio->io_bp; 2055 ddt_entry_t *dde = zio->io_private; 2056 ddt_phys_t *ddp; 2057 zio_t *pio = zio_unique_parent(zio); 2058 2059 mutex_enter(&pio->io_lock); 2060 ddp = ddt_phys_select(dde, bp); 2061 if (zio->io_error == 0) 2062 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2063 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2064 dde->dde_repair_data = zio->io_data; 2065 else 2066 zio_buf_free(zio->io_data, zio->io_size); 2067 mutex_exit(&pio->io_lock); 2068} 2069 2070static int 2071zio_ddt_read_start(zio_t **ziop) 2072{ 2073 zio_t *zio = *ziop; 2074 blkptr_t *bp = zio->io_bp; 2075 2076 ASSERT(BP_GET_DEDUP(bp)); 2077 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2078 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2079 2080 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2081 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2082 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2083 ddt_phys_t *ddp = dde->dde_phys; 2084 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2085 blkptr_t blk; 2086 2087 ASSERT(zio->io_vsd == NULL); 2088 zio->io_vsd = dde; 2089 2090 if (ddp_self == NULL) 2091 return (ZIO_PIPELINE_CONTINUE); 2092 2093 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2094 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2095 continue; 2096 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2097 &blk); 2098 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2099 zio_buf_alloc(zio->io_size), zio->io_size, 2100 zio_ddt_child_read_done, dde, zio->io_priority, 2101 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2102 &zio->io_bookmark)); 2103 } 2104 return (ZIO_PIPELINE_CONTINUE); 2105 } 2106 2107 zio_nowait(zio_read(zio, zio->io_spa, bp, 2108 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2109 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2110 2111 return (ZIO_PIPELINE_CONTINUE); 2112} 2113 2114static int 2115zio_ddt_read_done(zio_t **ziop) 2116{ 2117 zio_t *zio = *ziop; 2118 blkptr_t *bp = zio->io_bp; 2119 2120 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2121 return (ZIO_PIPELINE_STOP); 2122 2123 ASSERT(BP_GET_DEDUP(bp)); 2124 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2125 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2126 2127 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2128 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2129 ddt_entry_t *dde = zio->io_vsd; 2130 if (ddt == NULL) { 2131 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2132 return (ZIO_PIPELINE_CONTINUE); 2133 } 2134 if (dde == NULL) { 2135 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2136 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2137 return (ZIO_PIPELINE_STOP); 2138 } 2139 if (dde->dde_repair_data != NULL) { 2140 bcopy(dde->dde_repair_data, zio->io_data, 
zio->io_size); 2141 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2142 } 2143 ddt_repair_done(ddt, dde); 2144 zio->io_vsd = NULL; 2145 } 2146 2147 ASSERT(zio->io_vsd == NULL); 2148 2149 return (ZIO_PIPELINE_CONTINUE); 2150} 2151 2152static boolean_t 2153zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2154{ 2155 spa_t *spa = zio->io_spa; 2156 2157 /* 2158 * Note: we compare the original data, not the transformed data, 2159 * because when zio->io_bp is an override bp, we will not have 2160 * pushed the I/O transforms. That's an important optimization 2161 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2162 */ 2163 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2164 zio_t *lio = dde->dde_lead_zio[p]; 2165 2166 if (lio != NULL) { 2167 return (lio->io_orig_size != zio->io_orig_size || 2168 bcmp(zio->io_orig_data, lio->io_orig_data, 2169 zio->io_orig_size) != 0); 2170 } 2171 } 2172 2173 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2174 ddt_phys_t *ddp = &dde->dde_phys[p]; 2175 2176 if (ddp->ddp_phys_birth != 0) { 2177 arc_buf_t *abuf = NULL; 2178 uint32_t aflags = ARC_WAIT; 2179 blkptr_t blk = *zio->io_bp; 2180 int error; 2181 2182 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2183 2184 ddt_exit(ddt); 2185 2186 error = arc_read(NULL, spa, &blk, 2187 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2188 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2189 &aflags, &zio->io_bookmark); 2190 2191 if (error == 0) { 2192 if (arc_buf_size(abuf) != zio->io_orig_size || 2193 bcmp(abuf->b_data, zio->io_orig_data, 2194 zio->io_orig_size) != 0) 2195 error = SET_ERROR(EEXIST); 2196 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2197 } 2198 2199 ddt_enter(ddt); 2200 return (error != 0); 2201 } 2202 } 2203 2204 return (B_FALSE); 2205} 2206 2207static void 2208zio_ddt_child_write_ready(zio_t *zio) 2209{ 2210 int p = zio->io_prop.zp_copies; 2211 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2212 ddt_entry_t *dde = zio->io_private; 2213 ddt_phys_t *ddp = &dde->dde_phys[p]; 2214 zio_t *pio; 2215 2216 if (zio->io_error) 2217 return; 2218 2219 ddt_enter(ddt); 2220 2221 ASSERT(dde->dde_lead_zio[p] == zio); 2222 2223 ddt_phys_fill(ddp, zio->io_bp); 2224 2225 while ((pio = zio_walk_parents(zio)) != NULL) 2226 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2227 2228 ddt_exit(ddt); 2229} 2230 2231static void 2232zio_ddt_child_write_done(zio_t *zio) 2233{ 2234 int p = zio->io_prop.zp_copies; 2235 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2236 ddt_entry_t *dde = zio->io_private; 2237 ddt_phys_t *ddp = &dde->dde_phys[p]; 2238 2239 ddt_enter(ddt); 2240 2241 ASSERT(ddp->ddp_refcnt == 0); 2242 ASSERT(dde->dde_lead_zio[p] == zio); 2243 dde->dde_lead_zio[p] = NULL; 2244 2245 if (zio->io_error == 0) { 2246 while (zio_walk_parents(zio) != NULL) 2247 ddt_phys_addref(ddp); 2248 } else { 2249 ddt_phys_clear(ddp); 2250 } 2251 2252 ddt_exit(ddt); 2253} 2254 2255static void 2256zio_ddt_ditto_write_done(zio_t *zio) 2257{ 2258 int p = DDT_PHYS_DITTO; 2259 zio_prop_t *zp = &zio->io_prop; 2260 blkptr_t *bp = zio->io_bp; 2261 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2262 ddt_entry_t *dde = zio->io_private; 2263 ddt_phys_t *ddp = &dde->dde_phys[p]; 2264 ddt_key_t *ddk = &dde->dde_key; 2265 2266 ddt_enter(ddt); 2267 2268 ASSERT(ddp->ddp_refcnt == 0); 2269 ASSERT(dde->dde_lead_zio[p] == zio); 2270 dde->dde_lead_zio[p] = NULL; 2271 2272 if (zio->io_error == 0) { 2273 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2274 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2275 ASSERT(zp->zp_copies == 
BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2276 if (ddp->ddp_phys_birth != 0) 2277 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2278 ddt_phys_fill(ddp, bp); 2279 } 2280 2281 ddt_exit(ddt); 2282} 2283 2284static int 2285zio_ddt_write(zio_t **ziop) 2286{ 2287 zio_t *zio = *ziop; 2288 spa_t *spa = zio->io_spa; 2289 blkptr_t *bp = zio->io_bp; 2290 uint64_t txg = zio->io_txg; 2291 zio_prop_t *zp = &zio->io_prop; 2292 int p = zp->zp_copies; 2293 int ditto_copies; 2294 zio_t *cio = NULL; 2295 zio_t *dio = NULL; 2296 ddt_t *ddt = ddt_select(spa, bp); 2297 ddt_entry_t *dde; 2298 ddt_phys_t *ddp; 2299 2300 ASSERT(BP_GET_DEDUP(bp)); 2301 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2302 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2303 2304 ddt_enter(ddt); 2305 dde = ddt_lookup(ddt, bp, B_TRUE); 2306 ddp = &dde->dde_phys[p]; 2307 2308 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2309 /* 2310 * If we're using a weak checksum, upgrade to a strong checksum 2311 * and try again. If we're already using a strong checksum, 2312 * we can't resolve it, so just convert to an ordinary write. 2313 * (And automatically e-mail a paper to Nature?) 2314 */ 2315 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2316 zp->zp_checksum = spa_dedup_checksum(spa); 2317 zio_pop_transforms(zio); 2318 zio->io_stage = ZIO_STAGE_OPEN; 2319 BP_ZERO(bp); 2320 } else { 2321 zp->zp_dedup = B_FALSE; 2322 } 2323 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2324 ddt_exit(ddt); 2325 return (ZIO_PIPELINE_CONTINUE); 2326 } 2327 2328 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2329 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2330 2331 if (ditto_copies > ddt_ditto_copies_present(dde) && 2332 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2333 zio_prop_t czp = *zp; 2334 2335 czp.zp_copies = ditto_copies; 2336 2337 /* 2338 * If we arrived here with an override bp, we won't have run 2339 * the transform stack, so we won't have the data we need to 2340 * generate a child i/o. So, toss the override bp and restart. 2341 * This is safe, because using the override bp is just an 2342 * optimization; and it's rare, so the cost doesn't matter. 
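		 * The restart below rewinds io_stage to ZIO_STAGE_OPEN,
		 * clears the override bp and zeroes the bp, so the write
		 * re-runs the full pipeline and arrives here again with
		 * the transform stack having been run.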
2343 */ 2344 if (zio->io_bp_override) { 2345 zio_pop_transforms(zio); 2346 zio->io_stage = ZIO_STAGE_OPEN; 2347 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2348 zio->io_bp_override = NULL; 2349 BP_ZERO(bp); 2350 ddt_exit(ddt); 2351 return (ZIO_PIPELINE_CONTINUE); 2352 } 2353 2354 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2355 zio->io_orig_size, &czp, NULL, NULL, 2356 zio_ddt_ditto_write_done, dde, zio->io_priority, 2357 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2358 2359 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2360 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2361 } 2362 2363 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2364 if (ddp->ddp_phys_birth != 0) 2365 ddt_bp_fill(ddp, bp, txg); 2366 if (dde->dde_lead_zio[p] != NULL) 2367 zio_add_child(zio, dde->dde_lead_zio[p]); 2368 else 2369 ddt_phys_addref(ddp); 2370 } else if (zio->io_bp_override) { 2371 ASSERT(bp->blk_birth == txg); 2372 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2373 ddt_phys_fill(ddp, bp); 2374 ddt_phys_addref(ddp); 2375 } else { 2376 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2377 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2378 zio_ddt_child_write_done, dde, zio->io_priority, 2379 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2380 2381 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2382 dde->dde_lead_zio[p] = cio; 2383 } 2384 2385 ddt_exit(ddt); 2386 2387 if (cio) 2388 zio_nowait(cio); 2389 if (dio) 2390 zio_nowait(dio); 2391 2392 return (ZIO_PIPELINE_CONTINUE); 2393} 2394 2395ddt_entry_t *freedde; /* for debugging */ 2396 2397static int 2398zio_ddt_free(zio_t **ziop) 2399{ 2400 zio_t *zio = *ziop; 2401 spa_t *spa = zio->io_spa; 2402 blkptr_t *bp = zio->io_bp; 2403 ddt_t *ddt = ddt_select(spa, bp); 2404 ddt_entry_t *dde; 2405 ddt_phys_t *ddp; 2406 2407 ASSERT(BP_GET_DEDUP(bp)); 2408 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2409 2410 ddt_enter(ddt); 2411 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2412 ddp = ddt_phys_select(dde, bp); 2413 ddt_phys_decref(ddp); 2414 ddt_exit(ddt); 2415 2416 return (ZIO_PIPELINE_CONTINUE); 2417} 2418 2419/* 2420 * ========================================================================== 2421 * Allocate and free blocks 2422 * ========================================================================== 2423 */ 2424static int 2425zio_dva_allocate(zio_t **ziop) 2426{ 2427 zio_t *zio = *ziop; 2428 spa_t *spa = zio->io_spa; 2429 metaslab_class_t *mc = spa_normal_class(spa); 2430 blkptr_t *bp = zio->io_bp; 2431 int error; 2432 int flags = 0; 2433 2434 if (zio->io_gang_leader == NULL) { 2435 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2436 zio->io_gang_leader = zio; 2437 } 2438 2439 ASSERT(BP_IS_HOLE(bp)); 2440 ASSERT0(BP_GET_NDVAS(bp)); 2441 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2442 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2443 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2444 2445 /* 2446 * The dump device does not support gang blocks so allocation on 2447 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2448 * the "fast" gang feature. 2449 */ 2450 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2451 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2452 METASLAB_GANG_CHILD : 0; 2453 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2454 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2455 2456 if (error) { 2457 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2458 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2459 error); 2460 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2461 return (zio_write_gang_block(zio)); 2462 zio->io_error = error; 2463 } 2464 2465 return (ZIO_PIPELINE_CONTINUE); 2466} 2467 2468static int 2469zio_dva_free(zio_t **ziop) 2470{ 2471 zio_t *zio = *ziop; 2472 2473 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2474 2475 return (ZIO_PIPELINE_CONTINUE); 2476} 2477 2478static int 2479zio_dva_claim(zio_t **ziop) 2480{ 2481 zio_t *zio = *ziop; 2482 int error; 2483 2484 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2485 if (error) 2486 zio->io_error = error; 2487 2488 return (ZIO_PIPELINE_CONTINUE); 2489} 2490 2491/* 2492 * Undo an allocation. This is used by zio_done() when an I/O fails 2493 * and we want to give back the block we just allocated. 2494 * This handles both normal blocks and gang blocks. 2495 */ 2496static void 2497zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2498{ 2499 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2500 ASSERT(zio->io_bp_override == NULL); 2501 2502 if (!BP_IS_HOLE(bp)) 2503 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2504 2505 if (gn != NULL) { 2506 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2507 zio_dva_unallocate(zio, gn->gn_child[g], 2508 &gn->gn_gbh->zg_blkptr[g]); 2509 } 2510 } 2511} 2512 2513/* 2514 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2515 */ 2516int 2517zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2518 uint64_t size, boolean_t use_slog) 2519{ 2520 int error = 1; 2521 2522 ASSERT(txg > spa_syncing_txg(spa)); 2523 2524 /* 2525 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2526 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2527 * when allocating them. 2528 */ 2529 if (use_slog) { 2530 error = metaslab_alloc(spa, spa_log_class(spa), size, 2531 new_bp, 1, txg, old_bp, 2532 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2533 } 2534 2535 if (error) { 2536 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2537 new_bp, 1, txg, old_bp, 2538 METASLAB_HINTBP_AVOID); 2539 } 2540 2541 if (error == 0) { 2542 BP_SET_LSIZE(new_bp, size); 2543 BP_SET_PSIZE(new_bp, size); 2544 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2545 BP_SET_CHECKSUM(new_bp, 2546 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2547 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2548 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2549 BP_SET_LEVEL(new_bp, 0); 2550 BP_SET_DEDUP(new_bp, 0); 2551 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2552 } 2553 2554 return (error); 2555} 2556 2557/* 2558 * Free an intent log block. 
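 * ZIL blocks are never gang blocks (see zio_alloc_zil() above) and
 * always carry the DMU_OT_INTENT_LOG type, hence the assertions below.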
2559 */ 2560void 2561zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2562{ 2563 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2564 ASSERT(!BP_IS_GANG(bp)); 2565 2566 zio_free(spa, txg, bp); 2567} 2568 2569/* 2570 * ========================================================================== 2571 * Read, write and delete to physical devices 2572 * ========================================================================== 2573 */ 2574static int 2575zio_vdev_io_start(zio_t **ziop) 2576{ 2577 zio_t *zio = *ziop; 2578 vdev_t *vd = zio->io_vd; 2579 uint64_t align; 2580 spa_t *spa = zio->io_spa; 2581 2582 ASSERT(zio->io_error == 0); 2583 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2584 2585 if (vd == NULL) { 2586 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2587 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2588 2589 /* 2590 * The mirror_ops handle multiple DVAs in a single BP. 2591 */ 2592 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2593 } 2594 2595 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { 2596 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2597 return (ZIO_PIPELINE_CONTINUE); 2598 } 2599 2600 /* 2601 * We keep track of time-sensitive I/Os so that the scan thread 2602 * can quickly react to certain workloads. In particular, we care 2603 * about non-scrubbing, top-level reads and writes with the following 2604 * characteristics: 2605 * - synchronous writes of user data to non-slog devices 2606 * - any reads of user data 2607 * When these conditions are met, adjust the timestamp of spa_last_io 2608 * which allows the scan thread to adjust its workload accordingly. 2609 */ 2610 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2611 vd == vd->vdev_top && !vd->vdev_islog && 2612 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2613 zio->io_txg != spa_syncing_txg(spa)) { 2614 uint64_t old = spa->spa_last_io; 2615 uint64_t new = ddi_get_lbolt64(); 2616 if (old != new) 2617 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2618 } 2619 2620 align = 1ULL << vd->vdev_top->vdev_ashift; 2621 2622 if ((!(zio->io_flags & ZIO_FLAG_PHYSICAL) || 2623 (vd->vdev_top->vdev_physical_ashift > SPA_MINBLOCKSHIFT)) && 2624 P2PHASE(zio->io_size, align) != 0) { 2625 /* Transform logical writes to be a full physical block size. */ 2626 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2627 char *abuf = NULL; 2628 if (zio->io_type == ZIO_TYPE_READ || 2629 zio->io_type == ZIO_TYPE_WRITE) 2630 abuf = zio_buf_alloc(asize); 2631 ASSERT(vd == vd->vdev_top); 2632 if (zio->io_type == ZIO_TYPE_WRITE) { 2633 bcopy(zio->io_data, abuf, zio->io_size); 2634 bzero(abuf + zio->io_size, asize - zio->io_size); 2635 } 2636 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2637 zio_subblock); 2638 } 2639 2640 /* 2641 * If this is not a physical io, make sure that it is properly aligned 2642 * before proceeding. 2643 */ 2644 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 2645 ASSERT0(P2PHASE(zio->io_offset, align)); 2646 ASSERT0(P2PHASE(zio->io_size, align)); 2647 } else { 2648 /* 2649 * For physical writes, we allow 512b aligned writes and assume 2650 * the device will perform a read-modify-write as necessary. 
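		 * E.g. a 1024-byte label write at offset 512 satisfies
		 * P2PHASE(512, SPA_MINBLOCKSIZE) == 0 and
		 * P2PHASE(1024, SPA_MINBLOCKSIZE) == 0, so it is issued
		 * as-is.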
2651 */ 2652 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); 2653 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); 2654 } 2655 2656 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2657 2658 /* 2659 * If this is a repair I/O, and there's no self-healing involved -- 2660 * that is, we're just resilvering what we expect to resilver -- 2661 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2662 * This prevents spurious resilvering with nested replication. 2663 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2664 * A is out of date, we'll read from C+D, then use the data to 2665 * resilver A+B -- but we don't actually want to resilver B, just A. 2666 * The top-level mirror has no way to know this, so instead we just 2667 * discard unnecessary repairs as we work our way down the vdev tree. 2668 * The same logic applies to any form of nested replication: 2669 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2670 */ 2671 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2672 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2673 zio->io_txg != 0 && /* not a delegated i/o */ 2674 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2675 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2676 zio_vdev_io_bypass(zio); 2677 return (ZIO_PIPELINE_CONTINUE); 2678 } 2679 2680 if (vd->vdev_ops->vdev_op_leaf && 2681 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2682 2683 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) 2684 return (ZIO_PIPELINE_CONTINUE); 2685 2686 if ((zio = vdev_queue_io(zio)) == NULL) 2687 return (ZIO_PIPELINE_STOP); 2688 *ziop = zio; 2689 2690 if (!vdev_accessible(vd, zio)) { 2691 zio->io_error = SET_ERROR(ENXIO); 2692 zio_interrupt(zio); 2693 return (ZIO_PIPELINE_STOP); 2694 } 2695 } 2696 2697 /* 2698 * Note that we ignore repair writes for TRIM because they can conflict 2699 * with normal writes. This isn't an issue because, by definition, we 2700 * only repair blocks that aren't freed. 2701 */ 2702 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE && 2703 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2704 if (!trim_map_write_start(zio)) 2705 return (ZIO_PIPELINE_STOP); 2706 } 2707 2708 return (vd->vdev_ops->vdev_op_io_start(zio)); 2709} 2710 2711static int 2712zio_vdev_io_done(zio_t **ziop) 2713{ 2714 zio_t *zio = *ziop; 2715 vdev_t *vd = zio->io_vd; 2716 vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 2717 boolean_t unexpected_error = B_FALSE; 2718 2719 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2720 return (ZIO_PIPELINE_STOP); 2721 2722 ASSERT(zio->io_type == ZIO_TYPE_READ || 2723 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2724 2725 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2726 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2727 2728 if (zio->io_type == ZIO_TYPE_WRITE && 2729 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2730 trim_map_write_done(zio); 2731 2732 vdev_queue_io_done(zio); 2733 2734 if (zio->io_type == ZIO_TYPE_WRITE) 2735 vdev_cache_write(zio); 2736 2737 if (zio_injection_enabled && zio->io_error == 0) 2738 zio->io_error = zio_handle_device_injection(vd, 2739 zio, EIO); 2740 2741 if (zio_injection_enabled && zio->io_error == 0) 2742 zio->io_error = zio_handle_label_injection(zio, EIO); 2743 2744 if (zio->io_error) { 2745 if (!vdev_accessible(vd, zio)) { 2746 zio->io_error = SET_ERROR(ENXIO); 2747 } else { 2748 unexpected_error = B_TRUE; 2749 } 2750 } 2751 } 2752 2753 ops->vdev_op_io_done(zio); 2754 2755 if (unexpected_error) 2756 VERIFY(vdev_probe(vd, zio) == NULL); 2757 2758 return (ZIO_PIPELINE_CONTINUE); 2759} 2760 2761/* 2762 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2763 * disk, and use that to finish the checksum ereport later. 2764 */ 2765static void 2766zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2767 const void *good_buf) 2768{ 2769 /* no processing needed */ 2770 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2771} 2772 2773/*ARGSUSED*/ 2774void 2775zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2776{ 2777 void *buf = zio_buf_alloc(zio->io_size); 2778 2779 bcopy(zio->io_data, buf, zio->io_size); 2780 2781 zcr->zcr_cbinfo = zio->io_size; 2782 zcr->zcr_cbdata = buf; 2783 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2784 zcr->zcr_free = zio_buf_free; 2785} 2786 2787static int 2788zio_vdev_io_assess(zio_t **ziop) 2789{ 2790 zio_t *zio = *ziop; 2791 vdev_t *vd = zio->io_vd; 2792 2793 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2794 return (ZIO_PIPELINE_STOP); 2795 2796 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2797 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2798 2799 if (zio->io_vsd != NULL) { 2800 zio->io_vsd_ops->vsd_free(zio); 2801 zio->io_vsd = NULL; 2802 } 2803 2804 if (zio_injection_enabled && zio->io_error == 0) 2805 zio->io_error = zio_handle_fault_injection(zio, EIO); 2806 2807 if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) 2808 switch (zio->io_error) { 2809 case 0: 2810 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2811 ZIO_TRIM_STAT_BUMP(success); 2812 break; 2813 case EOPNOTSUPP: 2814 ZIO_TRIM_STAT_BUMP(unsupported); 2815 break; 2816 default: 2817 ZIO_TRIM_STAT_BUMP(failed); 2818 break; 2819 } 2820 2821 /* 2822 * If the I/O failed, determine whether we should attempt to retry it. 2823 * 2824 * On retry, we cut in line in the issue queue, since we don't want 2825 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 
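	 * (The cut-in-line behavior comes from passing
	 * zio_requeue_io_start_cut_in_line to zio_taskq_dispatch() below.)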
2826 */ 2827 if (zio->io_error && vd == NULL && 2828 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2829 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2830 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2831 zio->io_error = 0; 2832 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2833 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2834 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2835 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2836 zio_requeue_io_start_cut_in_line); 2837 return (ZIO_PIPELINE_STOP); 2838 } 2839 2840 /* 2841 * If we got an error on a leaf device, convert it to ENXIO 2842 * if the device is not accessible at all. 2843 */ 2844 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2845 !vdev_accessible(vd, zio)) 2846 zio->io_error = SET_ERROR(ENXIO); 2847 2848 /* 2849 * If we can't write to an interior vdev (mirror or RAID-Z), 2850 * set vdev_cant_write so that we stop trying to allocate from it. 2851 */ 2852 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2853 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2854 vd->vdev_cant_write = B_TRUE; 2855 } 2856 2857 if (zio->io_error) 2858 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2859 2860 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2861 zio->io_physdone != NULL) { 2862 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2863 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2864 zio->io_physdone(zio->io_logical); 2865 } 2866 2867 return (ZIO_PIPELINE_CONTINUE); 2868} 2869 2870void 2871zio_vdev_io_reissue(zio_t *zio) 2872{ 2873 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2874 ASSERT(zio->io_error == 0); 2875 2876 zio->io_stage >>= 1; 2877} 2878 2879void 2880zio_vdev_io_redone(zio_t *zio) 2881{ 2882 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2883 2884 zio->io_stage >>= 1; 2885} 2886 2887void 2888zio_vdev_io_bypass(zio_t *zio) 2889{ 2890 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2891 ASSERT(zio->io_error == 0); 2892 2893 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2894 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2895} 2896 2897/* 2898 * ========================================================================== 2899 * Generate and verify checksums 2900 * ========================================================================== 2901 */ 2902static int 2903zio_checksum_generate(zio_t **ziop) 2904{ 2905 zio_t *zio = *ziop; 2906 blkptr_t *bp = zio->io_bp; 2907 enum zio_checksum checksum; 2908 2909 if (bp == NULL) { 2910 /* 2911 * This is zio_write_phys(). 2912 * We're either generating a label checksum, or none at all. 2913 */ 2914 checksum = zio->io_prop.zp_checksum; 2915 2916 if (checksum == ZIO_CHECKSUM_OFF) 2917 return (ZIO_PIPELINE_CONTINUE); 2918 2919 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2920 } else { 2921 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2922 ASSERT(!IO_IS_ALLOCATING(zio)); 2923 checksum = ZIO_CHECKSUM_GANG_HEADER; 2924 } else { 2925 checksum = BP_GET_CHECKSUM(bp); 2926 } 2927 } 2928 2929 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2930 2931 return (ZIO_PIPELINE_CONTINUE); 2932} 2933 2934static int 2935zio_checksum_verify(zio_t **ziop) 2936{ 2937 zio_t *zio = *ziop; 2938 zio_bad_cksum_t info; 2939 blkptr_t *bp = zio->io_bp; 2940 int error; 2941 2942 ASSERT(zio->io_vd != NULL); 2943 2944 if (bp == NULL) { 2945 /* 2946 * This is zio_read_phys(). 2947 * We're either verifying a label checksum, or nothing at all. 
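		 * Label reads supply ZIO_CHECKSUM_LABEL via zp_checksum;
		 * any other bp-less read must have checksums disabled
		 * entirely.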
2948 */ 2949 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2950 return (ZIO_PIPELINE_CONTINUE); 2951 2952 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2953 } 2954 2955 if ((error = zio_checksum_error(zio, &info)) != 0) { 2956 zio->io_error = error; 2957 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2958 zfs_ereport_start_checksum(zio->io_spa, 2959 zio->io_vd, zio, zio->io_offset, 2960 zio->io_size, NULL, &info); 2961 } 2962 } 2963 2964 return (ZIO_PIPELINE_CONTINUE); 2965} 2966 2967/* 2968 * Called by RAID-Z to ensure we don't compute the checksum twice. 2969 */ 2970void 2971zio_checksum_verified(zio_t *zio) 2972{ 2973 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2974} 2975 2976/* 2977 * ========================================================================== 2978 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2979 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2980 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 2981 * indicate errors that are specific to one I/O, and most likely permanent. 2982 * Any other error is presumed to be worse because we weren't expecting it. 2983 * ========================================================================== 2984 */ 2985int 2986zio_worst_error(int e1, int e2) 2987{ 2988 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2989 int r1, r2; 2990 2991 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2992 if (e1 == zio_error_rank[r1]) 2993 break; 2994 2995 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2996 if (e2 == zio_error_rank[r2]) 2997 break; 2998 2999 return (r1 > r2 ? e1 : e2); 3000} 3001 3002/* 3003 * ========================================================================== 3004 * I/O completion 3005 * ========================================================================== 3006 */ 3007static int 3008zio_ready(zio_t **ziop) 3009{ 3010 zio_t *zio = *ziop; 3011 blkptr_t *bp = zio->io_bp; 3012 zio_t *pio, *pio_next; 3013 3014 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 3015 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 3016 return (ZIO_PIPELINE_STOP); 3017 3018 if (zio->io_ready) { 3019 ASSERT(IO_IS_ALLOCATING(zio)); 3020 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 3021 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3022 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3023 3024 zio->io_ready(zio); 3025 } 3026 3027 if (bp != NULL && bp != &zio->io_bp_copy) 3028 zio->io_bp_copy = *bp; 3029 3030 if (zio->io_error) 3031 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3032 3033 mutex_enter(&zio->io_lock); 3034 zio->io_state[ZIO_WAIT_READY] = 1; 3035 pio = zio_walk_parents(zio); 3036 mutex_exit(&zio->io_lock); 3037 3038 /* 3039 * As we notify zio's parents, new parents could be added. 3040 * New parents go to the head of zio's io_parent_list, however, 3041 * so we will (correctly) not notify them. The remainder of zio's 3042 * io_parent_list, from 'pio_next' onward, cannot change because 3043 * all parents must wait for us to be done before they can be done.
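	 * (A parent added after this point sees io_state[ZIO_WAIT_READY]
	 * already set in zio_add_child() and so never waits on us.)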
3044 */ 3045 for (; pio != NULL; pio = pio_next) { 3046 pio_next = zio_walk_parents(zio); 3047 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3048 } 3049 3050 if (zio->io_flags & ZIO_FLAG_NODATA) { 3051 if (BP_IS_GANG(bp)) { 3052 zio->io_flags &= ~ZIO_FLAG_NODATA; 3053 } else { 3054 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 3055 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3056 } 3057 } 3058 3059 if (zio_injection_enabled && 3060 zio->io_spa->spa_syncing_txg == zio->io_txg) 3061 zio_handle_ignored_writes(zio); 3062 3063 return (ZIO_PIPELINE_CONTINUE); 3064} 3065 3066static int 3067zio_done(zio_t **ziop) 3068{ 3069 zio_t *zio = *ziop; 3070 spa_t *spa = zio->io_spa; 3071 zio_t *lio = zio->io_logical; 3072 blkptr_t *bp = zio->io_bp; 3073 vdev_t *vd = zio->io_vd; 3074 uint64_t psize = zio->io_size; 3075 zio_t *pio, *pio_next; 3076 3077 /* 3078 * If our children haven't all completed, 3079 * wait for them and then repeat this pipeline stage. 3080 */ 3081 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3082 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3083 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3084 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3085 return (ZIO_PIPELINE_STOP); 3086 3087 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3088 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3089 ASSERT(zio->io_children[c][w] == 0); 3090 3091 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3092 ASSERT(bp->blk_pad[0] == 0); 3093 ASSERT(bp->blk_pad[1] == 0); 3094 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3095 (bp == zio_unique_parent(zio)->io_bp)); 3096 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3097 zio->io_bp_override == NULL && 3098 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3099 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3100 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3101 ASSERT(BP_COUNT_GANG(bp) == 0 || 3102 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3103 } 3104 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3105 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3106 } 3107 3108 /* 3109 * If there were child vdev/gang/ddt errors, they apply to us now. 3110 */ 3111 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3112 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3113 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3114 3115 /* 3116 * If the I/O on the transformed data was successful, generate any 3117 * checksum reports now while we still have the transformed data. 3118 */ 3119 if (zio->io_error == 0) { 3120 while (zio->io_cksum_report != NULL) { 3121 zio_cksum_report_t *zcr = zio->io_cksum_report; 3122 uint64_t align = zcr->zcr_align; 3123 uint64_t asize = P2ROUNDUP(psize, align); 3124 char *abuf = zio->io_data; 3125 3126 if (asize != psize) { 3127 abuf = zio_buf_alloc(asize); 3128 bcopy(zio->io_data, abuf, psize); 3129 bzero(abuf + psize, asize - psize); 3130 } 3131 3132 zio->io_cksum_report = zcr->zcr_next; 3133 zcr->zcr_next = NULL; 3134 zcr->zcr_finish(zcr, abuf); 3135 zfs_ereport_free_checksum(zcr); 3136 3137 if (asize != psize) 3138 zio_buf_free(abuf, asize); 3139 } 3140 } 3141 3142 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3143 3144 vdev_stat_update(zio, psize); 3145 3146 if (zio->io_error) { 3147 /* 3148 * If this I/O is attached to a particular vdev, 3149 * generate an error message describing the I/O failure 3150 * at the block level. We ignore these errors if the 3151 * device is currently unavailable. 
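		 * Checksum failures are excluded here (io_error != ECKSUM
		 * below); they are reported through the dedicated checksum
		 * ereports generated earlier in this function.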
3152 */ 3153 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3154 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3155 3156 if ((zio->io_error == EIO || !(zio->io_flags & 3157 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3158 zio == lio) { 3159 /* 3160 * For logical I/O requests, tell the SPA to log the 3161 * error and generate a logical data ereport. 3162 */ 3163 spa_log_error(spa, zio); 3164 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3165 0, 0); 3166 } 3167 } 3168 3169 if (zio->io_error && zio == lio) { 3170 /* 3171 * Determine whether zio should be reexecuted. This will 3172 * propagate all the way to the root via zio_notify_parent(). 3173 */ 3174 ASSERT(vd == NULL && bp != NULL); 3175 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3176 3177 if (IO_IS_ALLOCATING(zio) && 3178 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3179 if (zio->io_error != ENOSPC) 3180 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3181 else 3182 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3183 } 3184 3185 if ((zio->io_type == ZIO_TYPE_READ || 3186 zio->io_type == ZIO_TYPE_FREE) && 3187 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3188 zio->io_error == ENXIO && 3189 spa_load_state(spa) == SPA_LOAD_NONE && 3190 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3191 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3192 3193 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3194 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3195 3196 /* 3197 * Here is a possibly good place to attempt to do 3198 * either combinatorial reconstruction or error correction 3199 * based on checksums. It also might be a good place 3200 * to send out preliminary ereports before we suspend 3201 * processing. 3202 */ 3203 } 3204 3205 /* 3206 * If there were logical child errors, they apply to us now. 3207 * We defer this until now to avoid conflating logical child 3208 * errors with errors that happened to the zio itself when 3209 * updating vdev stats and reporting FMA events above. 3210 */ 3211 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3212 3213 if ((zio->io_error || zio->io_reexecute) && 3214 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3215 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3216 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3217 3218 zio_gang_tree_free(&zio->io_gang_tree); 3219 3220 /* 3221 * Godfather I/Os should never suspend. 3222 */ 3223 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3224 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3225 zio->io_reexecute = 0; 3226 3227 if (zio->io_reexecute) { 3228 /* 3229 * This is a logical I/O that wants to reexecute. 3230 * 3231 * Reexecute is top-down. When an i/o fails, if it's not 3232 * the root, it simply notifies its parent and sticks around. 3233 * The parent, seeing that it still has children in zio_done(), 3234 * does the same. This percolates all the way up to the root. 3235 * The root i/o will reexecute or suspend the entire tree. 3236 * 3237 * This approach ensures that zio_reexecute() honors 3238 * all the original i/o dependency relationships, e.g. 3239 * parents not executing until children are ready. 3240 */ 3241 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3242 3243 zio->io_gang_leader = NULL; 3244 3245 mutex_enter(&zio->io_lock); 3246 zio->io_state[ZIO_WAIT_DONE] = 1; 3247 mutex_exit(&zio->io_lock); 3248 3249 /* 3250 * "The Godfather" I/O monitors its children but is 3251 * not a true parent to them. 
It will track them through 3252 * the pipeline but severs its ties whenever they get into 3253 * trouble (e.g. suspended). This allows "The Godfather" 3254 * I/O to return status without blocking. 3255 */ 3256 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3257 zio_link_t *zl = zio->io_walk_link; 3258 pio_next = zio_walk_parents(zio); 3259 3260 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3261 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3262 zio_remove_child(pio, zio, zl); 3263 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3264 } 3265 } 3266 3267 if ((pio = zio_unique_parent(zio)) != NULL) { 3268 /* 3269 * We're not a root i/o, so there's nothing to do 3270 * but notify our parent. Don't propagate errors 3271 * upward since we haven't permanently failed yet. 3272 */ 3273 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3274 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3275 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3276 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3277 /* 3278 * We'd fail again if we reexecuted now, so suspend 3279 * until conditions improve (e.g. device comes online). 3280 */ 3281 zio_suspend(spa, zio); 3282 } else { 3283 /* 3284 * Reexecution is potentially a huge amount of work. 3285 * Hand it off to the otherwise-unused claim taskq. 3286 */ 3287#if defined(illumos) || !defined(_KERNEL) 3288 ASSERT(zio->io_tqent.tqent_next == NULL); 3289#else 3290 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3291#endif 3292 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3293 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3294 0, &zio->io_tqent); 3295 } 3296 return (ZIO_PIPELINE_STOP); 3297 } 3298 3299 ASSERT(zio->io_child_count == 0); 3300 ASSERT(zio->io_reexecute == 0); 3301 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3302 3303 /* 3304 * Report any checksum errors, since the I/O is complete. 3305 */ 3306 while (zio->io_cksum_report != NULL) { 3307 zio_cksum_report_t *zcr = zio->io_cksum_report; 3308 zio->io_cksum_report = zcr->zcr_next; 3309 zcr->zcr_next = NULL; 3310 zcr->zcr_finish(zcr, NULL); 3311 zfs_ereport_free_checksum(zcr); 3312 } 3313 3314 /* 3315 * It is the responsibility of the done callback to ensure that this 3316 * particular zio is no longer discoverable for adoption, and as 3317 * such, cannot acquire any new parents. 
3318 */ 3319 if (zio->io_done) 3320 zio->io_done(zio); 3321 3322 mutex_enter(&zio->io_lock); 3323 zio->io_state[ZIO_WAIT_DONE] = 1; 3324 mutex_exit(&zio->io_lock); 3325 3326 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3327 zio_link_t *zl = zio->io_walk_link; 3328 pio_next = zio_walk_parents(zio); 3329 zio_remove_child(pio, zio, zl); 3330 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3331 } 3332 3333 if (zio->io_waiter != NULL) { 3334 mutex_enter(&zio->io_lock); 3335 zio->io_executor = NULL; 3336 cv_broadcast(&zio->io_cv); 3337 mutex_exit(&zio->io_lock); 3338 } else { 3339 zio_destroy(zio); 3340 } 3341 3342 return (ZIO_PIPELINE_STOP); 3343} 3344 3345/* 3346 * ========================================================================== 3347 * I/O pipeline definition 3348 * ========================================================================== 3349 */ 3350static zio_pipe_stage_t *zio_pipeline[] = { 3351 NULL, 3352 zio_read_bp_init, 3353 zio_free_bp_init, 3354 zio_issue_async, 3355 zio_write_bp_init, 3356 zio_checksum_generate, 3357 zio_nop_write, 3358 zio_ddt_read_start, 3359 zio_ddt_read_done, 3360 zio_ddt_write, 3361 zio_ddt_free, 3362 zio_gang_assemble, 3363 zio_gang_issue, 3364 zio_dva_allocate, 3365 zio_dva_free, 3366 zio_dva_claim, 3367 zio_ready, 3368 zio_vdev_io_start, 3369 zio_vdev_io_done, 3370 zio_vdev_io_assess, 3371 zio_checksum_verify, 3372 zio_done 3373}; 3374 3375/* dnp is the dnode for zb1->zb_object */ 3376boolean_t 3377zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, 3378 const zbookmark_phys_t *zb2) 3379{ 3380 uint64_t zb1nextL0, zb2thisobj; 3381 3382 ASSERT(zb1->zb_objset == zb2->zb_objset); 3383 ASSERT(zb2->zb_level == 0); 3384 3385 /* The objset_phys_t isn't before anything. */ 3386 if (dnp == NULL) 3387 return (B_FALSE); 3388 3389 zb1nextL0 = (zb1->zb_blkid + 1) << 3390 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3391 3392 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3393 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3394 3395 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3396 uint64_t nextobj = zb1nextL0 * 3397 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3398 return (nextobj <= zb2thisobj); 3399 } 3400 3401 if (zb1->zb_object < zb2thisobj) 3402 return (B_TRUE); 3403 if (zb1->zb_object > zb2thisobj) 3404 return (B_FALSE); 3405 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3406 return (B_FALSE); 3407 return (zb1nextL0 <= zb2->zb_blkid); 3408} 3409
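
#if 0
/*
 * A minimal, self-contained user-space sketch of the error-ranking
 * logic implemented by zio_worst_error() above, kept under "#if 0"
 * because it is illustrative only and not part of the pipeline.
 * ECKSUM is ZFS-private, so an arbitrary stand-in value is assumed
 * here; everything else uses standard errno values.
 */
#include <errno.h>
#include <stdio.h>

#ifndef ECKSUM
#define	ECKSUM	(-122)		/* stand-in for the ZFS-private errno */
#endif

static int
worst_error(int e1, int e2)
{
	/* Same table and scan as zio_worst_error(): a later rank is worse. */
	static const int rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	for (r1 = 0; r1 < sizeof (rank) / sizeof (int); r1++)
		if (e1 == rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (rank) / sizeof (int); r2++)
		if (e2 == rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}

int
main(void)
{
	/* EIO outranks ENXIO: a per-I/O error is most likely permanent. */
	printf("worst(EIO, ENXIO) == EIO: %d\n",
	    worst_error(EIO, ENXIO) == EIO);

	/* An unexpected errno falls off the table and ranks worst of all. */
	printf("worst(EPERM, EIO) == EPERM: %d\n",
	    worst_error(EPERM, EIO) == EPERM);

	return (0);
}
#endif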