/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25168404Spjd */ 26168404Spjd 27277547Sdelphij#include <sys/sysmacros.h> 28168404Spjd#include <sys/zfs_context.h> 29168404Spjd#include <sys/fm/fs/zfs.h> 30168404Spjd#include <sys/spa.h> 31168404Spjd#include <sys/txg.h> 32168404Spjd#include <sys/spa_impl.h> 33168404Spjd#include <sys/vdev_impl.h> 34168404Spjd#include <sys/zio_impl.h> 35168404Spjd#include <sys/zio_compress.h> 36168404Spjd#include <sys/zio_checksum.h> 37219089Spjd#include <sys/dmu_objset.h> 38219089Spjd#include <sys/arc.h> 39219089Spjd#include <sys/ddt.h> 40240868Spjd#include <sys/trim_map.h> 41268649Sdelphij#include <sys/blkptr.h> 42263397Sdelphij#include <sys/zfeature.h> 43168404Spjd 44208148SpjdSYSCTL_DECL(_vfs_zfs); 45208148SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 46260338Smav#if defined(__amd64__) 47260338Smavstatic int zio_use_uma = 1; 48260338Smav#else 49209261Spjdstatic int zio_use_uma = 0; 50260338Smav#endif 51208148SpjdTUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma); 52208148SpjdSYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, 53208148Spjd "Use uma(9) for ZIO allocations"); 54230647Skmacystatic int zio_exclude_metadata = 0; 55230647SkmacyTUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata); 56230647SkmacySYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, 57230647Skmacy "Exclude metadata buffers from dumps as well"); 58208148Spjd 59240868Spjdzio_trim_stats_t zio_trim_stats = { 60244155Ssmh { "bytes", KSTAT_DATA_UINT64, 61244155Ssmh "Number of bytes successfully TRIMmed" }, 62244155Ssmh { "success", KSTAT_DATA_UINT64, 63244155Ssmh "Number of successful TRIM requests" }, 64244155Ssmh { "unsupported", KSTAT_DATA_UINT64, 65244155Ssmh "Number of TRIM requests that failed because TRIM is not supported" }, 66244155Ssmh { "failed", KSTAT_DATA_UINT64, 67244155Ssmh "Number of TRIM requests that failed for reasons other than not supported" }, 68240868Spjd}; 69240868Spjd 70240868Spjdstatic 
kstat_t *zio_trim_ksp; 71240868Spjd 72240868Spjd/* 73168404Spjd * ========================================================================== 74168404Spjd * I/O type descriptions 75168404Spjd * ========================================================================== 76168404Spjd */ 77260763Savgconst char *zio_type_name[ZIO_TYPES] = { 78211931Smm "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 79211931Smm "zio_ioctl" 80211931Smm}; 81168404Spjd 82168404Spjd/* 83168404Spjd * ========================================================================== 84168404Spjd * I/O kmem caches 85168404Spjd * ========================================================================== 86168404Spjd */ 87168926Spjdkmem_cache_t *zio_cache; 88209962Smmkmem_cache_t *zio_link_cache; 89168404Spjdkmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 90168404Spjdkmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 91168404Spjd 92168404Spjd#ifdef _KERNEL 93168404Spjdextern vmem_t *zio_alloc_arena; 94168404Spjd#endif 95168404Spjd 96185029Spjd/* 97243503Smm * The following actions directly effect the spa's sync-to-convergence logic. 98243503Smm * The values below define the sync pass when we start performing the action. 99243503Smm * Care should be taken when changing these values as they directly impact 100243503Smm * spa_sync() performance. Tuning these values may introduce subtle performance 101243503Smm * pathologies and should only be done in the context of performance analysis. 102243503Smm * These tunables will eventually be removed and replaced with #defines once 103243503Smm * enough analysis has been done to determine optimal values. 104243503Smm * 105243503Smm * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that 106243503Smm * regular blocks are not deferred. 
107243503Smm */ 108243503Smmint zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ 109243503SmmTUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free); 110243503SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, 111243503Smm &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); 112243503Smmint zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ 113243503SmmTUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress); 114243503SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, 115243503Smm &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); 116243503Smmint zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ 117243503SmmTUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite); 118243503SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, 119243503Smm &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); 120243503Smm 121243503Smm/* 122185029Spjd * An allocating zio is one that either currently has the DVA allocate 123185029Spjd * stage set or will have it later in its lifetime. 
124185029Spjd */ 125219089Spjd#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 126185029Spjd 127219089Spjdboolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 128219089Spjd 129219089Spjd#ifdef ZFS_DEBUG 130219089Spjdint zio_buf_debug_limit = 16384; 131219089Spjd#else 132219089Spjdint zio_buf_debug_limit = 0; 133219089Spjd#endif 134219089Spjd 135168404Spjdvoid 136168404Spjdzio_init(void) 137168404Spjd{ 138168404Spjd size_t c; 139209962Smm zio_cache = kmem_cache_create("zio_cache", 140209962Smm sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 141209962Smm zio_link_cache = kmem_cache_create("zio_link_cache", 142209962Smm sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 143250149Sdavide if (!zio_use_uma) 144250149Sdavide goto out; 145168926Spjd 146168404Spjd /* 147168404Spjd * For small buffers, we want a cache for each multiple of 148276081Sdelphij * SPA_MINBLOCKSIZE. For larger buffers, we want a cache 149276081Sdelphij * for each quarter-power of 2. 150168404Spjd */ 151168404Spjd for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 152168404Spjd size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 153168404Spjd size_t p2 = size; 154168404Spjd size_t align = 0; 155219089Spjd size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; 156168404Spjd 157277547Sdelphij while (!ISP2(p2)) 158168404Spjd p2 &= p2 - 1; 159168404Spjd 160240133Smm#ifdef illumos 161240133Smm#ifndef _KERNEL 162240133Smm /* 163240133Smm * If we are using watchpoints, put each buffer on its own page, 164240133Smm * to eliminate the performance overhead of trapping to the 165240133Smm * kernel when modifying a non-watched buffer that shares the 166240133Smm * page with a watched buffer. 
167240133Smm */ 168240133Smm if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 169240133Smm continue; 170240133Smm#endif 171240133Smm#endif /* illumos */ 172168404Spjd if (size <= 4 * SPA_MINBLOCKSIZE) { 173168404Spjd align = SPA_MINBLOCKSIZE; 174240133Smm } else if (IS_P2ALIGNED(size, p2 >> 2)) { 175276081Sdelphij align = MIN(p2 >> 2, PAGESIZE); 176168404Spjd } 177168404Spjd 178168404Spjd if (align != 0) { 179168404Spjd char name[36]; 180168404Spjd (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 181168404Spjd zio_buf_cache[c] = kmem_cache_create(name, size, 182219089Spjd align, NULL, NULL, NULL, NULL, NULL, cflags); 183168404Spjd 184219089Spjd /* 185219089Spjd * Since zio_data bufs do not appear in crash dumps, we 186219089Spjd * pass KMC_NOTOUCH so that no allocator metadata is 187219089Spjd * stored with the buffers. 188219089Spjd */ 189168404Spjd (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 190168404Spjd zio_data_buf_cache[c] = kmem_cache_create(name, size, 191219089Spjd align, NULL, NULL, NULL, NULL, NULL, 192230689Skmacy cflags | KMC_NOTOUCH | KMC_NODEBUG); 193168404Spjd } 194168404Spjd } 195168404Spjd 196168404Spjd while (--c != 0) { 197168404Spjd ASSERT(zio_buf_cache[c] != NULL); 198168404Spjd if (zio_buf_cache[c - 1] == NULL) 199168404Spjd zio_buf_cache[c - 1] = zio_buf_cache[c]; 200168404Spjd 201168404Spjd ASSERT(zio_data_buf_cache[c] != NULL); 202168404Spjd if (zio_data_buf_cache[c - 1] == NULL) 203168404Spjd zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 204168404Spjd } 205250149Sdavideout: 206208458Spjd 207168404Spjd zio_inject_init(); 208240868Spjd 209240868Spjd zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", 210240868Spjd KSTAT_TYPE_NAMED, 211240868Spjd sizeof(zio_trim_stats) / sizeof(kstat_named_t), 212240868Spjd KSTAT_FLAG_VIRTUAL); 213240868Spjd 214240868Spjd if (zio_trim_ksp != NULL) { 215240868Spjd zio_trim_ksp->ks_data = &zio_trim_stats; 216240868Spjd kstat_install(zio_trim_ksp); 217240868Spjd } 218168404Spjd} 
219168404Spjd 220168404Spjdvoid 221168404Spjdzio_fini(void) 222168404Spjd{ 223168404Spjd size_t c; 224168404Spjd kmem_cache_t *last_cache = NULL; 225168404Spjd kmem_cache_t *last_data_cache = NULL; 226168404Spjd 227168404Spjd for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 228168404Spjd if (zio_buf_cache[c] != last_cache) { 229168404Spjd last_cache = zio_buf_cache[c]; 230168404Spjd kmem_cache_destroy(zio_buf_cache[c]); 231168404Spjd } 232168404Spjd zio_buf_cache[c] = NULL; 233168404Spjd 234168404Spjd if (zio_data_buf_cache[c] != last_data_cache) { 235168404Spjd last_data_cache = zio_data_buf_cache[c]; 236168404Spjd kmem_cache_destroy(zio_data_buf_cache[c]); 237168404Spjd } 238168404Spjd zio_data_buf_cache[c] = NULL; 239168404Spjd } 240168404Spjd 241209962Smm kmem_cache_destroy(zio_link_cache); 242168926Spjd kmem_cache_destroy(zio_cache); 243168926Spjd 244168404Spjd zio_inject_fini(); 245240868Spjd 246240868Spjd if (zio_trim_ksp != NULL) { 247240868Spjd kstat_delete(zio_trim_ksp); 248240868Spjd zio_trim_ksp = NULL; 249240868Spjd } 250168404Spjd} 251168404Spjd 252168404Spjd/* 253168404Spjd * ========================================================================== 254168404Spjd * Allocate and free I/O buffers 255168404Spjd * ========================================================================== 256168404Spjd */ 257168404Spjd 258168404Spjd/* 259168404Spjd * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 260168404Spjd * crashdump if the kernel panics, so use it judiciously. Obviously, it's 261168404Spjd * useful to inspect ZFS metadata, but if possible, we should avoid keeping 262168404Spjd * excess / transient data in-core during a crashdump. 263168404Spjd */ 264168404Spjdvoid * 265168404Spjdzio_buf_alloc(size_t size) 266168404Spjd{ 267168404Spjd size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 268230647Skmacy int flags = zio_exclude_metadata ? 
KM_NODEBUG : 0; 269168404Spjd 270277582Sdelphij VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 271168404Spjd 272208148Spjd if (zio_use_uma) 273208148Spjd return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 274208148Spjd else 275230647Skmacy return (kmem_alloc(size, KM_SLEEP|flags)); 276168404Spjd} 277168404Spjd 278168404Spjd/* 279168404Spjd * Use zio_data_buf_alloc to allocate data. The data will not appear in a 280168404Spjd * crashdump if the kernel panics. This exists so that we will limit the amount 281168404Spjd * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 282168404Spjd * of kernel heap dumped to disk when the kernel panics) 283168404Spjd */ 284168404Spjdvoid * 285168404Spjdzio_data_buf_alloc(size_t size) 286168404Spjd{ 287168404Spjd size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 288168404Spjd 289277582Sdelphij VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 290168404Spjd 291208148Spjd if (zio_use_uma) 292208148Spjd return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 293208148Spjd else 294230623Skmacy return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); 295168404Spjd} 296168404Spjd 297168404Spjdvoid 298168404Spjdzio_buf_free(void *buf, size_t size) 299168404Spjd{ 300168404Spjd size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 301168404Spjd 302277582Sdelphij VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 303168404Spjd 304208148Spjd if (zio_use_uma) 305208148Spjd kmem_cache_free(zio_buf_cache[c], buf); 306208148Spjd else 307208148Spjd kmem_free(buf, size); 308168404Spjd} 309168404Spjd 310168404Spjdvoid 311168404Spjdzio_data_buf_free(void *buf, size_t size) 312168404Spjd{ 313168404Spjd size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 314168404Spjd 315277582Sdelphij VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 316168404Spjd 317208148Spjd if (zio_use_uma) 318208148Spjd kmem_cache_free(zio_data_buf_cache[c], buf); 319208148Spjd else 320208148Spjd kmem_free(buf, size); 321168404Spjd} 322168404Spjd 
323168404Spjd/* 324168404Spjd * ========================================================================== 325168404Spjd * Push and pop I/O transform buffers 326168404Spjd * ========================================================================== 327168404Spjd */ 328168404Spjdstatic void 329185029Spjdzio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 330185029Spjd zio_transform_func_t *transform) 331168404Spjd{ 332168404Spjd zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 333168404Spjd 334185029Spjd zt->zt_orig_data = zio->io_data; 335185029Spjd zt->zt_orig_size = zio->io_size; 336168404Spjd zt->zt_bufsize = bufsize; 337185029Spjd zt->zt_transform = transform; 338168404Spjd 339168404Spjd zt->zt_next = zio->io_transform_stack; 340168404Spjd zio->io_transform_stack = zt; 341168404Spjd 342168404Spjd zio->io_data = data; 343168404Spjd zio->io_size = size; 344168404Spjd} 345168404Spjd 346168404Spjdstatic void 347185029Spjdzio_pop_transforms(zio_t *zio) 348168404Spjd{ 349185029Spjd zio_transform_t *zt; 350168404Spjd 351185029Spjd while ((zt = zio->io_transform_stack) != NULL) { 352185029Spjd if (zt->zt_transform != NULL) 353185029Spjd zt->zt_transform(zio, 354185029Spjd zt->zt_orig_data, zt->zt_orig_size); 355168404Spjd 356219089Spjd if (zt->zt_bufsize != 0) 357219089Spjd zio_buf_free(zio->io_data, zt->zt_bufsize); 358168404Spjd 359185029Spjd zio->io_data = zt->zt_orig_data; 360185029Spjd zio->io_size = zt->zt_orig_size; 361185029Spjd zio->io_transform_stack = zt->zt_next; 362185029Spjd 363185029Spjd kmem_free(zt, sizeof (zio_transform_t)); 364168404Spjd } 365168404Spjd} 366168404Spjd 367185029Spjd/* 368185029Spjd * ========================================================================== 369185029Spjd * I/O transform callbacks for subblocks and decompression 370185029Spjd * ========================================================================== 371185029Spjd */ 372168404Spjdstatic void 373185029Spjdzio_subblock(zio_t 
*zio, void *data, uint64_t size) 374168404Spjd{ 375185029Spjd ASSERT(zio->io_size > size); 376168404Spjd 377185029Spjd if (zio->io_type == ZIO_TYPE_READ) 378185029Spjd bcopy(zio->io_data, data, size); 379185029Spjd} 380168404Spjd 381185029Spjdstatic void 382185029Spjdzio_decompress(zio_t *zio, void *data, uint64_t size) 383185029Spjd{ 384185029Spjd if (zio->io_error == 0 && 385185029Spjd zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 386219089Spjd zio->io_data, data, zio->io_size, size) != 0) 387249195Smm zio->io_error = SET_ERROR(EIO); 388185029Spjd} 389185029Spjd 390185029Spjd/* 391185029Spjd * ========================================================================== 392185029Spjd * I/O parent/child relationships and pipeline interlocks 393185029Spjd * ========================================================================== 394185029Spjd */ 395209962Smm/* 396209962Smm * NOTE - Callers to zio_walk_parents() and zio_walk_children must 397209962Smm * continue calling these functions until they return NULL. 398209962Smm * Otherwise, the next caller will pick up the list walk in 399209962Smm * some indeterminate state. (Otherwise every caller would 400209962Smm * have to pass in a cookie to keep the state represented by 401209962Smm * io_walk_link, which gets annoying.) 402209962Smm */ 403209962Smmzio_t * 404209962Smmzio_walk_parents(zio_t *cio) 405209962Smm{ 406209962Smm zio_link_t *zl = cio->io_walk_link; 407209962Smm list_t *pl = &cio->io_parent_list; 408185029Spjd 409209962Smm zl = (zl == NULL) ? 
list_head(pl) : list_next(pl, zl); 410209962Smm cio->io_walk_link = zl; 411209962Smm 412209962Smm if (zl == NULL) 413209962Smm return (NULL); 414209962Smm 415209962Smm ASSERT(zl->zl_child == cio); 416209962Smm return (zl->zl_parent); 417209962Smm} 418209962Smm 419209962Smmzio_t * 420209962Smmzio_walk_children(zio_t *pio) 421185029Spjd{ 422209962Smm zio_link_t *zl = pio->io_walk_link; 423209962Smm list_t *cl = &pio->io_child_list; 424209962Smm 425209962Smm zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 426209962Smm pio->io_walk_link = zl; 427209962Smm 428209962Smm if (zl == NULL) 429209962Smm return (NULL); 430209962Smm 431209962Smm ASSERT(zl->zl_parent == pio); 432209962Smm return (zl->zl_child); 433209962Smm} 434209962Smm 435209962Smmzio_t * 436209962Smmzio_unique_parent(zio_t *cio) 437209962Smm{ 438209962Smm zio_t *pio = zio_walk_parents(cio); 439209962Smm 440209962Smm VERIFY(zio_walk_parents(cio) == NULL); 441209962Smm return (pio); 442209962Smm} 443209962Smm 444209962Smmvoid 445209962Smmzio_add_child(zio_t *pio, zio_t *cio) 446209962Smm{ 447209962Smm zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 448209962Smm 449209962Smm /* 450209962Smm * Logical I/Os can have logical, gang, or vdev children. 451209962Smm * Gang I/Os can have gang or vdev children. 452209962Smm * Vdev I/Os can only have vdev children. 453209962Smm * The following ASSERT captures all of these constraints. 
454209962Smm */ 455209962Smm ASSERT(cio->io_child_type <= pio->io_child_type); 456209962Smm 457209962Smm zl->zl_parent = pio; 458209962Smm zl->zl_child = cio; 459209962Smm 460209962Smm mutex_enter(&cio->io_lock); 461185029Spjd mutex_enter(&pio->io_lock); 462209962Smm 463209962Smm ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 464209962Smm 465209962Smm for (int w = 0; w < ZIO_WAIT_TYPES; w++) 466209962Smm pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 467209962Smm 468209962Smm list_insert_head(&pio->io_child_list, zl); 469209962Smm list_insert_head(&cio->io_parent_list, zl); 470209962Smm 471219089Spjd pio->io_child_count++; 472219089Spjd cio->io_parent_count++; 473219089Spjd 474185029Spjd mutex_exit(&pio->io_lock); 475209962Smm mutex_exit(&cio->io_lock); 476185029Spjd} 477185029Spjd 478185029Spjdstatic void 479209962Smmzio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 480185029Spjd{ 481209962Smm ASSERT(zl->zl_parent == pio); 482209962Smm ASSERT(zl->zl_child == cio); 483185029Spjd 484209962Smm mutex_enter(&cio->io_lock); 485209962Smm mutex_enter(&pio->io_lock); 486185029Spjd 487209962Smm list_remove(&pio->io_child_list, zl); 488209962Smm list_remove(&cio->io_parent_list, zl); 489209962Smm 490219089Spjd pio->io_child_count--; 491219089Spjd cio->io_parent_count--; 492219089Spjd 493185029Spjd mutex_exit(&pio->io_lock); 494209962Smm mutex_exit(&cio->io_lock); 495209962Smm 496209962Smm kmem_cache_free(zio_link_cache, zl); 497185029Spjd} 498185029Spjd 499185029Spjdstatic boolean_t 500185029Spjdzio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 501185029Spjd{ 502185029Spjd uint64_t *countp = &zio->io_children[child][wait]; 503185029Spjd boolean_t waiting = B_FALSE; 504185029Spjd 505185029Spjd mutex_enter(&zio->io_lock); 506185029Spjd ASSERT(zio->io_stall == NULL); 507185029Spjd if (*countp != 0) { 508219089Spjd zio->io_stage >>= 1; 509185029Spjd zio->io_stall = countp; 510185029Spjd waiting = B_TRUE; 511168404Spjd } 
512185029Spjd mutex_exit(&zio->io_lock); 513185029Spjd 514185029Spjd return (waiting); 515168404Spjd} 516168404Spjd 517185029Spjdstatic void 518185029Spjdzio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 519185029Spjd{ 520185029Spjd uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 521185029Spjd int *errorp = &pio->io_child_error[zio->io_child_type]; 522185029Spjd 523185029Spjd mutex_enter(&pio->io_lock); 524185029Spjd if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 525185029Spjd *errorp = zio_worst_error(*errorp, zio->io_error); 526185029Spjd pio->io_reexecute |= zio->io_reexecute; 527185029Spjd ASSERT3U(*countp, >, 0); 528260763Savg 529260763Savg (*countp)--; 530260763Savg 531260763Savg if (*countp == 0 && pio->io_stall == countp) { 532185029Spjd pio->io_stall = NULL; 533185029Spjd mutex_exit(&pio->io_lock); 534185029Spjd zio_execute(pio); 535185029Spjd } else { 536185029Spjd mutex_exit(&pio->io_lock); 537185029Spjd } 538185029Spjd} 539185029Spjd 540185029Spjdstatic void 541185029Spjdzio_inherit_child_errors(zio_t *zio, enum zio_child c) 542185029Spjd{ 543185029Spjd if (zio->io_child_error[c] != 0 && zio->io_error == 0) 544185029Spjd zio->io_error = zio->io_child_error[c]; 545185029Spjd} 546185029Spjd 547168404Spjd/* 548168404Spjd * ========================================================================== 549185029Spjd * Create the various types of I/O (read, write, free, etc) 550168404Spjd * ========================================================================== 551168404Spjd */ 552168404Spjdstatic zio_t * 553219089Spjdzio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 554168404Spjd void *data, uint64_t size, zio_done_func_t *done, void *private, 555260763Savg zio_type_t type, zio_priority_t priority, enum zio_flag flags, 556268657Sdelphij vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, 557219089Spjd enum zio_stage stage, enum zio_stage pipeline) 558168404Spjd{ 559168404Spjd zio_t 
*zio; 560168404Spjd 561240868Spjd ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); 562168404Spjd ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 563185029Spjd ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 564168404Spjd 565185029Spjd ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 566185029Spjd ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 567185029Spjd ASSERT(vd || stage == ZIO_STAGE_OPEN); 568185029Spjd 569168926Spjd zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 570168926Spjd bzero(zio, sizeof (zio_t)); 571185029Spjd 572185029Spjd mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 573185029Spjd cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 574185029Spjd 575209962Smm list_create(&zio->io_parent_list, sizeof (zio_link_t), 576209962Smm offsetof(zio_link_t, zl_parent_node)); 577209962Smm list_create(&zio->io_child_list, sizeof (zio_link_t), 578209962Smm offsetof(zio_link_t, zl_child_node)); 579209962Smm 580185029Spjd if (vd != NULL) 581185029Spjd zio->io_child_type = ZIO_CHILD_VDEV; 582185029Spjd else if (flags & ZIO_FLAG_GANG_CHILD) 583185029Spjd zio->io_child_type = ZIO_CHILD_GANG; 584219089Spjd else if (flags & ZIO_FLAG_DDT_CHILD) 585219089Spjd zio->io_child_type = ZIO_CHILD_DDT; 586185029Spjd else 587185029Spjd zio->io_child_type = ZIO_CHILD_LOGICAL; 588185029Spjd 589168404Spjd if (bp != NULL) { 590219089Spjd zio->io_bp = (blkptr_t *)bp; 591168404Spjd zio->io_bp_copy = *bp; 592168404Spjd zio->io_bp_orig = *bp; 593219089Spjd if (type != ZIO_TYPE_WRITE || 594219089Spjd zio->io_child_type == ZIO_CHILD_DDT) 595185029Spjd zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 596209962Smm if (zio->io_child_type == ZIO_CHILD_LOGICAL) 597185029Spjd zio->io_logical = zio; 598209962Smm if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 599209962Smm pipeline |= ZIO_GANG_STAGES; 600168404Spjd } 601185029Spjd 602185029Spjd zio->io_spa = spa; 603185029Spjd zio->io_txg = txg; 604168404Spjd zio->io_done = done; 605168404Spjd 
zio->io_private = private; 606168404Spjd zio->io_type = type; 607168404Spjd zio->io_priority = priority; 608185029Spjd zio->io_vd = vd; 609185029Spjd zio->io_offset = offset; 610219089Spjd zio->io_orig_data = zio->io_data = data; 611219089Spjd zio->io_orig_size = zio->io_size = size; 612185029Spjd zio->io_orig_flags = zio->io_flags = flags; 613185029Spjd zio->io_orig_stage = zio->io_stage = stage; 614185029Spjd zio->io_orig_pipeline = zio->io_pipeline = pipeline; 615168404Spjd 616209962Smm zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 617209962Smm zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 618209962Smm 619185029Spjd if (zb != NULL) 620185029Spjd zio->io_bookmark = *zb; 621185029Spjd 622185029Spjd if (pio != NULL) { 623185029Spjd if (zio->io_logical == NULL) 624168404Spjd zio->io_logical = pio->io_logical; 625209962Smm if (zio->io_child_type == ZIO_CHILD_GANG) 626209962Smm zio->io_gang_leader = pio->io_gang_leader; 627185029Spjd zio_add_child(pio, zio); 628168404Spjd } 629168404Spjd 630168404Spjd return (zio); 631168404Spjd} 632168404Spjd 633185029Spjdstatic void 634185029Spjdzio_destroy(zio_t *zio) 635185029Spjd{ 636209962Smm list_destroy(&zio->io_parent_list); 637209962Smm list_destroy(&zio->io_child_list); 638185029Spjd mutex_destroy(&zio->io_lock); 639185029Spjd cv_destroy(&zio->io_cv); 640185029Spjd kmem_cache_free(zio_cache, zio); 641185029Spjd} 642185029Spjd 643168404Spjdzio_t * 644209962Smmzio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 645219089Spjd void *private, enum zio_flag flags) 646168404Spjd{ 647168404Spjd zio_t *zio; 648168404Spjd 649168404Spjd zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 650209962Smm ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 651185029Spjd ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 652168404Spjd 653168404Spjd return (zio); 654168404Spjd} 655168404Spjd 656168404Spjdzio_t * 657219089Spjdzio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag 
flags) 658168404Spjd{ 659209962Smm return (zio_null(NULL, spa, NULL, done, private, flags)); 660168404Spjd} 661168404Spjd 662277582Sdelphijvoid 663277582Sdelphijzfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) 664277582Sdelphij{ 665277582Sdelphij if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { 666277582Sdelphij zfs_panic_recover("blkptr at %p has invalid TYPE %llu", 667277582Sdelphij bp, (longlong_t)BP_GET_TYPE(bp)); 668277582Sdelphij } 669277582Sdelphij if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || 670277582Sdelphij BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { 671277582Sdelphij zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", 672277582Sdelphij bp, (longlong_t)BP_GET_CHECKSUM(bp)); 673277582Sdelphij } 674277582Sdelphij if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || 675277582Sdelphij BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { 676277582Sdelphij zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", 677277582Sdelphij bp, (longlong_t)BP_GET_COMPRESS(bp)); 678277582Sdelphij } 679277582Sdelphij if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { 680277582Sdelphij zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", 681277582Sdelphij bp, (longlong_t)BP_GET_LSIZE(bp)); 682277582Sdelphij } 683277582Sdelphij if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { 684277582Sdelphij zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", 685277582Sdelphij bp, (longlong_t)BP_GET_PSIZE(bp)); 686277582Sdelphij } 687277582Sdelphij 688277582Sdelphij if (BP_IS_EMBEDDED(bp)) { 689277582Sdelphij if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { 690277582Sdelphij zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", 691277582Sdelphij bp, (longlong_t)BPE_GET_ETYPE(bp)); 692277582Sdelphij } 693277582Sdelphij } 694277582Sdelphij 695277582Sdelphij /* 696277582Sdelphij * Pool-specific checks. 697277582Sdelphij * 698277582Sdelphij * Note: it would be nice to verify that the blk_birth and 699277582Sdelphij * BP_PHYSICAL_BIRTH() are not too large. 
However, spa_freeze() 700277582Sdelphij * allows the birth time of log blocks (and dmu_sync()-ed blocks 701277582Sdelphij * that are in the log) to be arbitrarily large. 702277582Sdelphij */ 703277582Sdelphij for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 704277582Sdelphij uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); 705277582Sdelphij if (vdevid >= spa->spa_root_vdev->vdev_children) { 706277582Sdelphij zfs_panic_recover("blkptr at %p DVA %u has invalid " 707277582Sdelphij "VDEV %llu", 708277582Sdelphij bp, i, (longlong_t)vdevid); 709277618Sdelphij continue; 710277582Sdelphij } 711277582Sdelphij vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; 712277582Sdelphij if (vd == NULL) { 713277582Sdelphij zfs_panic_recover("blkptr at %p DVA %u has invalid " 714277582Sdelphij "VDEV %llu", 715277582Sdelphij bp, i, (longlong_t)vdevid); 716277618Sdelphij continue; 717277582Sdelphij } 718277582Sdelphij if (vd->vdev_ops == &vdev_hole_ops) { 719277582Sdelphij zfs_panic_recover("blkptr at %p DVA %u has hole " 720277582Sdelphij "VDEV %llu", 721277582Sdelphij bp, i, (longlong_t)vdevid); 722277618Sdelphij continue; 723277582Sdelphij } 724277582Sdelphij if (vd->vdev_ops == &vdev_missing_ops) { 725277582Sdelphij /* 726277582Sdelphij * "missing" vdevs are valid during import, but we 727277582Sdelphij * don't have their detailed info (e.g. asize), so 728277582Sdelphij * we can't perform any more checks on them. 
729277582Sdelphij */ 730277582Sdelphij continue; 731277582Sdelphij } 732277582Sdelphij uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 733277582Sdelphij uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); 734277582Sdelphij if (BP_IS_GANG(bp)) 735277582Sdelphij asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 736277582Sdelphij if (offset + asize > vd->vdev_asize) { 737277582Sdelphij zfs_panic_recover("blkptr at %p DVA %u has invalid " 738277582Sdelphij "OFFSET %llu", 739277582Sdelphij bp, i, (longlong_t)offset); 740277582Sdelphij } 741277582Sdelphij } 742277582Sdelphij} 743277582Sdelphij 744168404Spjdzio_t * 745185029Spjdzio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 746185029Spjd void *data, uint64_t size, zio_done_func_t *done, void *private, 747268657Sdelphij zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) 748168404Spjd{ 749168404Spjd zio_t *zio; 750168404Spjd 751277582Sdelphij zfs_blkptr_verify(spa, bp); 752277582Sdelphij 753219089Spjd zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 754185029Spjd data, size, done, private, 755185029Spjd ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 756219089Spjd ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
757219089Spjd ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 758168404Spjd 759168404Spjd return (zio); 760168404Spjd} 761168404Spjd 762168404Spjdzio_t * 763185029Spjdzio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 764219089Spjd void *data, uint64_t size, const zio_prop_t *zp, 765260763Savg zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, 766260763Savg void *private, 767268657Sdelphij zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) 768168404Spjd{ 769168404Spjd zio_t *zio; 770168404Spjd 771185029Spjd ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 772185029Spjd zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 773185029Spjd zp->zp_compress >= ZIO_COMPRESS_OFF && 774185029Spjd zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 775236884Smm DMU_OT_IS_VALID(zp->zp_type) && 776185029Spjd zp->zp_level < 32 && 777219089Spjd zp->zp_copies > 0 && 778243524Smm zp->zp_copies <= spa_max_replication(spa)); 779168404Spjd 780168404Spjd zio = zio_create(pio, spa, txg, bp, data, size, done, private, 781185029Spjd ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 782219089Spjd ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 783219089Spjd ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 784168404Spjd 785168404Spjd zio->io_ready = ready; 786260763Savg zio->io_physdone = physdone; 787185029Spjd zio->io_prop = *zp; 788168404Spjd 789268649Sdelphij /* 790268649Sdelphij * Data can be NULL if we are going to call zio_write_override() to 791268649Sdelphij * provide the already-allocated BP. But we may need the data to 792268649Sdelphij * verify a dedup hit (if requested). In this case, don't try to 793268649Sdelphij * dedup (just take the already-allocated BP verbatim). 
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

/*
 * Create a zio that rewrites 'data' over the existing block 'bp' in place
 * (rewrite pipeline: no new allocation is performed).
 */
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

/*
 * Supply an already-allocated bp (from dmu_sync()) for an open write zio;
 * zio_write_bp_init() will then use 'bp' instead of allocating a new block.
 */
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

/*
 * Free 'bp' in 'txg': either synchronously right here via zio_free_sync(),
 * or by appending it to the per-txg deferred free list for later processing
 * in spa_sync().
 */
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

/*
 * Create a zio that frees 'bp' immediately (must be in the currently
 * syncing txg, before the deferred-free sync pass).  Extra pipeline
 * stages are enabled when the free needs vdev I/O (TRIM) or may need a
 * read (gang/dedup blocks) -- see the comments below.
 */
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	/* Embedded blocks occupy no space on disk -- nothing to free. */
	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/* With TRIM enabled the free must reach the vdev I/O stages. */
	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	flags |= ZIO_FLAG_DONT_QUEUE;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

/*
 * Create a zio that claims (re-allocates) the specific block 'bp' -- see
 * the intent-log discussion below.
 */
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	/* Embedded blocks occupy no space on disk -- nothing to claim. */
	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

/*
 * Create an ioctl zio (e.g. DKIOCFLUSHWRITECACHE) for vdev 'vd'.  For an
 * interior vdev, recursively fan the ioctl out to every child under a
 * null parent zio.
 */
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

/*
 * Raw physical read from leaf vdev 'vd' at 'offset', bypassing the block
 * pointer layer.  'labels' permits I/O to the vdev label regions at the
 * ends of the device.
 */
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

/*
 * Raw physical write to leaf vdev 'vd' at 'offset' -- the counterpart of
 * zio_read_phys() above.
 */
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* Not all IO types require vdev io done stage e.g. free */
	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

/*
 * Create a parentless I/O issued directly against leaf vdev 'vd'
 * (ZIO_FLAG_DELEGATED), starting at the vdev I/O stage.
 */
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

/*
 * Issue an asynchronous write-cache flush ioctl to 'vd' (and, via
 * zio_ioctl(), to all of its children).
 */
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

/*
 * Create a TRIM zio (a ZIO_TYPE_FREE through the physical free pipeline)
 * for [offset, offset + size) on leaf vdev 'vd'.
 */
zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
}

/*
 * Shrink an as-yet-unissued zio to 'size'.  Not applied to raidz -- see
 * the comment below.
 */
void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

/*
 * Read pipeline stage: install the decompression transform, decode
 * embedded-data bps, and route dedup'd logical reads through the DDT
 * read pipeline.
 */
static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize =
		    BP_IS_EMBEDDED(bp) ?
		    BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
		/* Data lives in the bp itself; no device I/O is needed. */
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		decode_embedded_bp_compressed(bp, zio->io_data);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Write pipeline stage: resolve bp overrides (dmu_sync/nopwrite/dedup),
 * perform compression (possibly producing an embedded-data bp), decide
 * between rewrite-in-place and fresh allocation, and fill in the block
 * pointer fields.
 */
static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_EMBEDDED(bp))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		/* Checksum matches the dedup policy: send it to the DDT. */
		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			/* Incompressible: store the data uncompressed. */
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			/* Small enough to embed directly in the bp. */
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round up compressed size up to the ashift
			 * of the smallest-ashift device, and zero the tail.
			 * This ensures that the compressed size of the BP
			 * (and thus compressratio property) are correct,
			 * in that we charge for the padding used to fill out
			 * the last sector.
			 */
			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
			size_t rounded = (size_t)P2ROUNDUP(psize,
			    1ULL << spa->spa_min_ashift);
			if (rounded >= lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
				psize = lsize;
			} else {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		/* A hole: record the birth txg if hole birth is active. */
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Free pipeline stage: route dedup'd logical frees through the DDT free
 * pipeline so the DDT refcount can be decremented.
 */
static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

/*
 * Hand 'zio' off to the appropriate spa taskq ('q' is ISSUE or INTERRUPT);
 * 'cutinline' dispatches it to the front of the queue.
 */
static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
#if defined(illumos) || !defined(_KERNEL)
	ASSERT(zio->io_tqent.tqent_next == NULL);
#else
	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

/*
 * Return B_TRUE if the thread executing 'zio' is a member of any of the
 * spa's taskqs of type 'q' (for any zio type).
 */
static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * Pipeline stage that re-dispatches the zio to an issue taskq, stopping
 * the pipeline in the current thread.
 */
static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

/* Continue pipeline execution of 'zio' on an interrupt taskq thread. */
void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		/* Advance to the next stage present in this zio's pipeline. */
		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit64(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */

/*
 * Execute 'zio' and block until its pipeline completes; return io_error
 * and destroy the zio.  The zio must not have been issued yet.
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

/*
 * Execute 'zio' asynchronously; the caller does not wait for completion.
 * Parentless logical zios are adopted by a per-CPU "Godfather" root zio
 * so the pool cannot unload before they finish.
 */
void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

/*
 * Reset a failed logical zio (and, recursively, its children) back to its
 * original flags, stage and pipeline, and run it again.
 */
static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

/*
 * Suspend pool I/O after an uncorrectable failure: post an FMA ereport
 * (or panic, per the pool's failmode property) and park failed zios under
 * the suspend root zio until the pool is resumed.
 */
void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio_unique_parent(zio) == NULL); 1636185029Spjd ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1637185029Spjd zio_add_child(spa->spa_suspend_zio_root, zio); 1638168404Spjd } 1639168404Spjd 1640185029Spjd mutex_exit(&spa->spa_suspend_lock); 1641168404Spjd} 1642168404Spjd 1643209962Smmint 1644185029Spjdzio_resume(spa_t *spa) 1645168404Spjd{ 1646209962Smm zio_t *pio; 1647168404Spjd 1648185029Spjd /* 1649185029Spjd * Reexecute all previously suspended i/o. 1650185029Spjd */ 1651185029Spjd mutex_enter(&spa->spa_suspend_lock); 1652185029Spjd spa->spa_suspended = B_FALSE; 1653185029Spjd cv_broadcast(&spa->spa_suspend_cv); 1654185029Spjd pio = spa->spa_suspend_zio_root; 1655185029Spjd spa->spa_suspend_zio_root = NULL; 1656185029Spjd mutex_exit(&spa->spa_suspend_lock); 1657168404Spjd 1658185029Spjd if (pio == NULL) 1659209962Smm return (0); 1660168404Spjd 1661209962Smm zio_reexecute(pio); 1662209962Smm return (zio_wait(pio)); 1663168404Spjd} 1664168404Spjd 1665185029Spjdvoid 1666185029Spjdzio_resume_wait(spa_t *spa) 1667185029Spjd{ 1668185029Spjd mutex_enter(&spa->spa_suspend_lock); 1669185029Spjd while (spa_suspended(spa)) 1670185029Spjd cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1671185029Spjd mutex_exit(&spa->spa_suspend_lock); 1672185029Spjd} 1673185029Spjd 1674168404Spjd/* 1675168404Spjd * ========================================================================== 1676185029Spjd * Gang blocks. 1677185029Spjd * 1678185029Spjd * A gang block is a collection of small blocks that looks to the DMU 1679185029Spjd * like one large block. When zio_dva_allocate() cannot find a block 1680185029Spjd * of the requested size, due to either severe fragmentation or the pool 1681185029Spjd * being nearly full, it calls zio_write_gang_block() to construct the 1682185029Spjd * block from smaller fragments. 1683185029Spjd * 1684185029Spjd * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1685185029Spjd * three (SPA_GBH_NBLKPTRS) gang members. 
The gang header is just like 1686185029Spjd * an indirect block: it's an array of block pointers. It consumes 1687185029Spjd * only one sector and hence is allocatable regardless of fragmentation. 1688185029Spjd * The gang header's bps point to its gang members, which hold the data. 1689185029Spjd * 1690185029Spjd * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1691185029Spjd * as the verifier to ensure uniqueness of the SHA256 checksum. 1692185029Spjd * Critically, the gang block bp's blk_cksum is the checksum of the data, 1693185029Spjd * not the gang header. This ensures that data block signatures (needed for 1694185029Spjd * deduplication) are independent of how the block is physically stored. 1695185029Spjd * 1696185029Spjd * Gang blocks can be nested: a gang member may itself be a gang block. 1697185029Spjd * Thus every gang block is a tree in which root and all interior nodes are 1698185029Spjd * gang headers, and the leaves are normal blocks that contain user data. 1699185029Spjd * The root of the gang tree is called the gang leader. 1700185029Spjd * 1701185029Spjd * To perform any operation (read, rewrite, free, claim) on a gang block, 1702185029Spjd * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1703185029Spjd * in the io_gang_tree field of the original logical i/o by recursively 1704185029Spjd * reading the gang leader and all gang headers below it. This yields 1705185029Spjd * an in-core tree containing the contents of every gang header and the 1706185029Spjd * bps for every constituent of the gang block. 1707185029Spjd * 1708185029Spjd * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1709185029Spjd * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1710185029Spjd * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1711185029Spjd * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 
1712185029Spjd * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1713185029Spjd * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1714185029Spjd * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1715185029Spjd * of the gang header plus zio_checksum_compute() of the data to update the 1716185029Spjd * gang header's blk_cksum as described above. 1717185029Spjd * 1718185029Spjd * The two-phase assemble/issue model solves the problem of partial failure -- 1719185029Spjd * what if you'd freed part of a gang block but then couldn't read the 1720185029Spjd * gang header for another part? Assembling the entire gang tree first 1721185029Spjd * ensures that all the necessary gang header I/O has succeeded before 1722185029Spjd * starting the actual work of free, claim, or write. Once the gang tree 1723185029Spjd * is assembled, free and claim are in-memory operations that cannot fail. 1724185029Spjd * 1725185029Spjd * In the event that a gang write fails, zio_dva_unallocate() walks the 1726185029Spjd * gang tree to immediately free (i.e. insert back into the space map) 1727185029Spjd * everything we've allocated. This ensures that we don't get ENOSPC 1728185029Spjd * errors during repeated suspend/resume cycles due to a flaky device. 1729185029Spjd * 1730185029Spjd * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1731185029Spjd * the gang tree, we won't modify the block, so we can safely defer the free 1732185029Spjd * (knowing that the block is still intact). If we *can* assemble the gang 1733185029Spjd * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1734185029Spjd * each constituent bp and we can allocate a new block on the next sync pass. 1735185029Spjd * 1736185029Spjd * In all cases, the gang tree allows complete recovery from partial failure. 
 * ==========================================================================
 */

/*
 * Gang-issue callback for reads.  Gang headers were already read during
 * tree assembly (gn != NULL), so only leaf data blocks generate child
 * reads here.
 */
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

/*
 * Gang-issue callback for writes during sync-to-convergence: rewrite the
 * gang header (from gn->gn_gbh) or the member data in place.
 */
zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/*
 * Gang-issue callback for frees: a trivial wrapper around zio_free_sync().
 */
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/*
 * Gang-issue callback for claims: a trivial wrapper around zio_claim().
 */
/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

/*
 * Per-bp callback applied while walking the gang tree, indexed by the
 * gang leader's I/O type (read/write/free/claim).
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

/*
 * Allocate one in-core gang tree node (with a buffer for its gang header)
 * and link it in at *gnpp, which must currently be empty.
 */
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

/*
 * Free a single gang tree node; all of its children must already be gone.
 */
static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

/*
 * Recursively free an entire in-core gang tree rooted at *gnpp.
 */
static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

/*
 * Kick off an async read of the gang header at 'bp'; its done callback
 * (zio_gang_tree_assemble_done) recurses into nested gang blocks, building
 * the in-core tree under *gnpp.
 */
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

/*
 * Done callback for a gang-header read issued by zio_gang_tree_assemble():
 * byteswap if needed, validate the header, and assemble any nested gang
 * blocks it points to.  On read error, simply return; the error propagates
 * to the gang leader through the normal child-error mechanism.
 */
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

/*
 * Walk the assembled gang tree depth-first, invoking the leader's
 * per-type issue callback on every bp.  'data' tracks the current
 * position within the leader's buffer as leaves are consumed.
 */
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	/* At the root, the leaves must have consumed the whole buffer. */
	if (gn == gio->io_gang_tree && gio->io_data != NULL)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

/*
 * Pipeline stage: start assembling the in-core gang tree for this zio,
 * which becomes its own gang leader.
 */
static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: once all assembly reads are done, either issue the
 * operation across the gang tree or, on assembly error, tear the tree down.
 */
static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline
 = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Ready callback for a gang member write: fold the child's allocated
 * asize into the matching DVAs of the parent gang header's bp, under
 * the parent's io_lock (members complete concurrently).
 */
static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

/*
 * Split pio's write into a gang block: allocate and write a gang header
 * (with one extra copy for resilience, capped at the pool maximum) and
 * issue up to SPA_GBH_NBLKPTRS member writes that divide pio's data.
 * On allocation failure the error is recorded on pio and the pipeline
 * continues (zio_done handles it).
 */
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;	/* bytes still to be carved up */
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Hang the new node off the leader's tree root, or — for a nested
	 * gang — off the slot the parent member write passed in io_private.
	 */
	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		/* Spread the remainder evenly over the remaining slots. */
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		/* Members inherit checksum/copies but are never compressed
		 * or deduped themselves. */
		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * The zio_nop_write stage in the pipeline determines if allocating
 * a new bp is necessary.  By leveraging a cryptographically secure checksum,
 * such as SHA256, we can compare the checksums of the new data and the old
 * to determine if allocating a new block is required.  The nopwrite
 * feature can handle writes in either syncing or open context (i.e. zil
 * writes) and as a result is mutually exclusive with dedup.
 */
static int
zio_nop_write(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	zio_prop_t *zp = &zio->io_prop;

	ASSERT(BP_GET_LEVEL(bp) == 0);
	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
	ASSERT(zp->zp_nopwrite);
	ASSERT(!zp->zp_dedup);
	ASSERT(zio->io_bp_override == NULL);
	ASSERT(IO_IS_ALLOCATING(zio));

	/*
	 * Check to see if the original bp and the new bp have matching
	 * characteristics (i.e. same checksum, compression algorithms, etc).
	 * If they don't then just continue with the pipeline which will
	 * allocate a new bp.
	 */
	if (BP_IS_HOLE(bp_orig) ||
	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
		return (ZIO_PIPELINE_CONTINUE);

	/*
	 * If the checksums match then reset the pipeline so that we
	 * avoid allocating a new bp and issuing any I/O.
	 */
	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
		    sizeof (uint64_t)) == 0);

		/* Reuse the old block: keep bp_orig, skip DVA alloc and I/O. */
		*bp = *bp_orig;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		zio->io_flags |= ZIO_FLAG_NOPWRITE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Dedup
 * ==========================================================================
 */
/*
 * Done callback for a repair read of one DDT copy: keep the first good
 * buffer as repair data (under the parent's io_lock) and mark that copy
 * as not needing repair; otherwise free the buffer.
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}

/*
 * Pipeline stage: issue the read of a dedup block.  On a retry after a
 * failed read (child DDT error set), instead read every other physical
 * copy recorded in the DDT so one of them can repair the data.
 */
static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type ==
ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		/* Retry path: read all other copies for possible repair. */
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	/* Normal path: a single child read of the block itself. */
	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: finish a dedup read.  On a child DDT error, either
 * restart at DDT_READ_START to attempt repair (if no repair state yet)
 * or apply any repair data gathered by the alternate-copy reads.
 */
static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			/* No DDT yet; only possible during pool load. */
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			/* First failure: rewind to just before DDT_READ_START
			 * and requeue so the repair path runs. */
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			/* A good alternate copy was found; use it. */
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Determine whether zio's data actually differs from the data already
 * stored under the same checksum (a true checksum collision), for
 * dedup=verify.  Compares against any in-flight lead zio first, then
 * reads an existing copy via the ARC.  Returns B_TRUE on mismatch.
 * Called with the DDT entered; drops and re-enters it around arc_read().
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			arc_flags_t aflags = ARC_FLAG_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			/* Drop the DDT lock across the blocking read. */
			ddt_exit(ddt);

			error = arc_read(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = SET_ERROR(EEXIST);
				VERIFY(arc_buf_remove_ref(abuf, &abuf));
			}

			ddt_enter(ddt);
			return (error != 0);
		}
	}

	return (B_FALSE);
}

/*
 * Ready callback for a dedup child write: record the newly written bp in
 * the DDT entry and propagate it into every parent's bp.
 */
static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	if (zio->io_error)
2300219089Spjd return; 2301219089Spjd 2302219089Spjd ddt_enter(ddt); 2303219089Spjd 2304219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2305219089Spjd 2306219089Spjd ddt_phys_fill(ddp, zio->io_bp); 2307219089Spjd 2308219089Spjd while ((pio = zio_walk_parents(zio)) != NULL) 2309219089Spjd ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2310219089Spjd 2311219089Spjd ddt_exit(ddt); 2312219089Spjd} 2313219089Spjd 2314219089Spjdstatic void 2315219089Spjdzio_ddt_child_write_done(zio_t *zio) 2316219089Spjd{ 2317219089Spjd int p = zio->io_prop.zp_copies; 2318219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2319219089Spjd ddt_entry_t *dde = zio->io_private; 2320219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2321219089Spjd 2322219089Spjd ddt_enter(ddt); 2323219089Spjd 2324219089Spjd ASSERT(ddp->ddp_refcnt == 0); 2325219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2326219089Spjd dde->dde_lead_zio[p] = NULL; 2327219089Spjd 2328219089Spjd if (zio->io_error == 0) { 2329219089Spjd while (zio_walk_parents(zio) != NULL) 2330219089Spjd ddt_phys_addref(ddp); 2331219089Spjd } else { 2332219089Spjd ddt_phys_clear(ddp); 2333219089Spjd } 2334219089Spjd 2335219089Spjd ddt_exit(ddt); 2336219089Spjd} 2337219089Spjd 2338219089Spjdstatic void 2339219089Spjdzio_ddt_ditto_write_done(zio_t *zio) 2340219089Spjd{ 2341219089Spjd int p = DDT_PHYS_DITTO; 2342219089Spjd zio_prop_t *zp = &zio->io_prop; 2343219089Spjd blkptr_t *bp = zio->io_bp; 2344219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, bp); 2345219089Spjd ddt_entry_t *dde = zio->io_private; 2346219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2347219089Spjd ddt_key_t *ddk = &dde->dde_key; 2348219089Spjd 2349219089Spjd ddt_enter(ddt); 2350219089Spjd 2351219089Spjd ASSERT(ddp->ddp_refcnt == 0); 2352219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2353219089Spjd dde->dde_lead_zio[p] = NULL; 2354219089Spjd 2355219089Spjd if (zio->io_error == 0) { 2356219089Spjd ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2357219089Spjd 
ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2358219089Spjd ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2359219089Spjd if (ddp->ddp_phys_birth != 0) 2360219089Spjd ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2361219089Spjd ddt_phys_fill(ddp, bp); 2362219089Spjd } 2363219089Spjd 2364219089Spjd ddt_exit(ddt); 2365219089Spjd} 2366219089Spjd 2367219089Spjdstatic int 2368270312Ssmhzio_ddt_write(zio_t *zio) 2369219089Spjd{ 2370219089Spjd spa_t *spa = zio->io_spa; 2371219089Spjd blkptr_t *bp = zio->io_bp; 2372219089Spjd uint64_t txg = zio->io_txg; 2373219089Spjd zio_prop_t *zp = &zio->io_prop; 2374219089Spjd int p = zp->zp_copies; 2375219089Spjd int ditto_copies; 2376219089Spjd zio_t *cio = NULL; 2377219089Spjd zio_t *dio = NULL; 2378219089Spjd ddt_t *ddt = ddt_select(spa, bp); 2379219089Spjd ddt_entry_t *dde; 2380219089Spjd ddt_phys_t *ddp; 2381219089Spjd 2382219089Spjd ASSERT(BP_GET_DEDUP(bp)); 2383219089Spjd ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2384219089Spjd ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2385219089Spjd 2386219089Spjd ddt_enter(ddt); 2387219089Spjd dde = ddt_lookup(ddt, bp, B_TRUE); 2388219089Spjd ddp = &dde->dde_phys[p]; 2389219089Spjd 2390219089Spjd if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2391219089Spjd /* 2392219089Spjd * If we're using a weak checksum, upgrade to a strong checksum 2393219089Spjd * and try again. If we're already using a strong checksum, 2394219089Spjd * we can't resolve it, so just convert to an ordinary write. 2395219089Spjd * (And automatically e-mail a paper to Nature?) 
2396219089Spjd */ 2397219089Spjd if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2398219089Spjd zp->zp_checksum = spa_dedup_checksum(spa); 2399219089Spjd zio_pop_transforms(zio); 2400219089Spjd zio->io_stage = ZIO_STAGE_OPEN; 2401219089Spjd BP_ZERO(bp); 2402219089Spjd } else { 2403243524Smm zp->zp_dedup = B_FALSE; 2404219089Spjd } 2405219089Spjd zio->io_pipeline = ZIO_WRITE_PIPELINE; 2406219089Spjd ddt_exit(ddt); 2407219089Spjd return (ZIO_PIPELINE_CONTINUE); 2408219089Spjd } 2409219089Spjd 2410219089Spjd ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2411219089Spjd ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2412219089Spjd 2413219089Spjd if (ditto_copies > ddt_ditto_copies_present(dde) && 2414219089Spjd dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2415219089Spjd zio_prop_t czp = *zp; 2416219089Spjd 2417219089Spjd czp.zp_copies = ditto_copies; 2418219089Spjd 2419219089Spjd /* 2420219089Spjd * If we arrived here with an override bp, we won't have run 2421219089Spjd * the transform stack, so we won't have the data we need to 2422219089Spjd * generate a child i/o. So, toss the override bp and restart. 2423219089Spjd * This is safe, because using the override bp is just an 2424219089Spjd * optimization; and it's rare, so the cost doesn't matter. 
2425219089Spjd */ 2426219089Spjd if (zio->io_bp_override) { 2427219089Spjd zio_pop_transforms(zio); 2428219089Spjd zio->io_stage = ZIO_STAGE_OPEN; 2429219089Spjd zio->io_pipeline = ZIO_WRITE_PIPELINE; 2430219089Spjd zio->io_bp_override = NULL; 2431219089Spjd BP_ZERO(bp); 2432219089Spjd ddt_exit(ddt); 2433219089Spjd return (ZIO_PIPELINE_CONTINUE); 2434219089Spjd } 2435219089Spjd 2436219089Spjd dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2437260763Savg zio->io_orig_size, &czp, NULL, NULL, 2438219089Spjd zio_ddt_ditto_write_done, dde, zio->io_priority, 2439219089Spjd ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2440219089Spjd 2441219089Spjd zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2442219089Spjd dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2443219089Spjd } 2444219089Spjd 2445219089Spjd if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2446219089Spjd if (ddp->ddp_phys_birth != 0) 2447219089Spjd ddt_bp_fill(ddp, bp, txg); 2448219089Spjd if (dde->dde_lead_zio[p] != NULL) 2449219089Spjd zio_add_child(zio, dde->dde_lead_zio[p]); 2450219089Spjd else 2451219089Spjd ddt_phys_addref(ddp); 2452219089Spjd } else if (zio->io_bp_override) { 2453219089Spjd ASSERT(bp->blk_birth == txg); 2454219089Spjd ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2455219089Spjd ddt_phys_fill(ddp, bp); 2456219089Spjd ddt_phys_addref(ddp); 2457219089Spjd } else { 2458219089Spjd cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2459260763Savg zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2460219089Spjd zio_ddt_child_write_done, dde, zio->io_priority, 2461219089Spjd ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2462219089Spjd 2463219089Spjd zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2464219089Spjd dde->dde_lead_zio[p] = cio; 2465219089Spjd } 2466219089Spjd 2467219089Spjd ddt_exit(ddt); 2468219089Spjd 2469219089Spjd if (cio) 2470219089Spjd zio_nowait(cio); 2471219089Spjd if (dio) 2472219089Spjd zio_nowait(dio); 2473219089Spjd 
2474219089Spjd return (ZIO_PIPELINE_CONTINUE); 2475219089Spjd} 2476219089Spjd 2477219089Spjdddt_entry_t *freedde; /* for debugging */ 2478219089Spjd 2479219089Spjdstatic int 2480270312Ssmhzio_ddt_free(zio_t *zio) 2481219089Spjd{ 2482219089Spjd spa_t *spa = zio->io_spa; 2483219089Spjd blkptr_t *bp = zio->io_bp; 2484219089Spjd ddt_t *ddt = ddt_select(spa, bp); 2485219089Spjd ddt_entry_t *dde; 2486219089Spjd ddt_phys_t *ddp; 2487219089Spjd 2488219089Spjd ASSERT(BP_GET_DEDUP(bp)); 2489219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2490219089Spjd 2491219089Spjd ddt_enter(ddt); 2492219089Spjd freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2493219089Spjd ddp = ddt_phys_select(dde, bp); 2494219089Spjd ddt_phys_decref(ddp); 2495219089Spjd ddt_exit(ddt); 2496219089Spjd 2497219089Spjd return (ZIO_PIPELINE_CONTINUE); 2498219089Spjd} 2499219089Spjd 2500219089Spjd/* 2501219089Spjd * ========================================================================== 2502219089Spjd * Allocate and free blocks 2503219089Spjd * ========================================================================== 2504219089Spjd */ 2505219089Spjdstatic int 2506270312Ssmhzio_dva_allocate(zio_t *zio) 2507168404Spjd{ 2508185029Spjd spa_t *spa = zio->io_spa; 2509219089Spjd metaslab_class_t *mc = spa_normal_class(spa); 2510168404Spjd blkptr_t *bp = zio->io_bp; 2511168404Spjd int error; 2512224177Smm int flags = 0; 2513168404Spjd 2514209962Smm if (zio->io_gang_leader == NULL) { 2515209962Smm ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2516209962Smm zio->io_gang_leader = zio; 2517209962Smm } 2518209962Smm 2519168404Spjd ASSERT(BP_IS_HOLE(bp)); 2520240415Smm ASSERT0(BP_GET_NDVAS(bp)); 2521219089Spjd ASSERT3U(zio->io_prop.zp_copies, >, 0); 2522219089Spjd ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2523168404Spjd ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2524168404Spjd 2525224177Smm /* 2526224177Smm * The dump device does not support gang blocks so allocation on 2527224177Smm * 
behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2528224177Smm * the "fast" gang feature. 2529224177Smm */ 2530224177Smm flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2531224177Smm flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 2532224177Smm METASLAB_GANG_CHILD : 0; 2533185029Spjd error = metaslab_alloc(spa, mc, zio->io_size, bp, 2534224177Smm zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2535168404Spjd 2536185029Spjd if (error) { 2537224177Smm spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2538224177Smm "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2539224177Smm error); 2540185029Spjd if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2541185029Spjd return (zio_write_gang_block(zio)); 2542168404Spjd zio->io_error = error; 2543168404Spjd } 2544185029Spjd 2545185029Spjd return (ZIO_PIPELINE_CONTINUE); 2546168404Spjd} 2547168404Spjd 2548185029Spjdstatic int 2549270312Ssmhzio_dva_free(zio_t *zio) 2550168404Spjd{ 2551185029Spjd metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2552168404Spjd 2553185029Spjd return (ZIO_PIPELINE_CONTINUE); 2554185029Spjd} 2555168404Spjd 2556185029Spjdstatic int 2557270312Ssmhzio_dva_claim(zio_t *zio) 2558185029Spjd{ 2559185029Spjd int error; 2560168404Spjd 2561185029Spjd error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2562185029Spjd if (error) 2563185029Spjd zio->io_error = error; 2564185029Spjd 2565185029Spjd return (ZIO_PIPELINE_CONTINUE); 2566168404Spjd} 2567168404Spjd 2568185029Spjd/* 2569185029Spjd * Undo an allocation. This is used by zio_done() when an I/O fails 2570185029Spjd * and we want to give back the block we just allocated. 2571185029Spjd * This handles both normal blocks and gang blocks. 
2572185029Spjd */ 2573168404Spjdstatic void 2574185029Spjdzio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2575168404Spjd{ 2576185029Spjd ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2577219089Spjd ASSERT(zio->io_bp_override == NULL); 2578185029Spjd 2579185029Spjd if (!BP_IS_HOLE(bp)) 2580219089Spjd metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2581185029Spjd 2582185029Spjd if (gn != NULL) { 2583185029Spjd for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2584185029Spjd zio_dva_unallocate(zio, gn->gn_child[g], 2585185029Spjd &gn->gn_gbh->zg_blkptr[g]); 2586185029Spjd } 2587185029Spjd } 2588168404Spjd} 2589168404Spjd 2590168404Spjd/* 2591185029Spjd * Try to allocate an intent log block. Return 0 on success, errno on failure. 2592185029Spjd */ 2593185029Spjdint 2594219089Spjdzio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2595219089Spjd uint64_t size, boolean_t use_slog) 2596185029Spjd{ 2597219089Spjd int error = 1; 2598185029Spjd 2599219089Spjd ASSERT(txg > spa_syncing_txg(spa)); 2600185029Spjd 2601230514Smm /* 2602230514Smm * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2603230514Smm * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2604230514Smm * when allocating them. 
2605230514Smm */ 2606230514Smm if (use_slog) { 2607219089Spjd error = metaslab_alloc(spa, spa_log_class(spa), size, 2608230514Smm new_bp, 1, txg, old_bp, 2609230514Smm METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2610230514Smm } 2611219089Spjd 2612230514Smm if (error) { 2613219089Spjd error = metaslab_alloc(spa, spa_normal_class(spa), size, 2614230514Smm new_bp, 1, txg, old_bp, 2615260768Savg METASLAB_HINTBP_AVOID); 2616230514Smm } 2617185029Spjd 2618185029Spjd if (error == 0) { 2619185029Spjd BP_SET_LSIZE(new_bp, size); 2620185029Spjd BP_SET_PSIZE(new_bp, size); 2621185029Spjd BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2622219089Spjd BP_SET_CHECKSUM(new_bp, 2623219089Spjd spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2624219089Spjd ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2625185029Spjd BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2626185029Spjd BP_SET_LEVEL(new_bp, 0); 2627219089Spjd BP_SET_DEDUP(new_bp, 0); 2628185029Spjd BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2629185029Spjd } 2630185029Spjd 2631185029Spjd return (error); 2632185029Spjd} 2633185029Spjd 2634185029Spjd/* 2635219089Spjd * Free an intent log block. 
2636185029Spjd */ 2637185029Spjdvoid 2638219089Spjdzio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2639185029Spjd{ 2640219089Spjd ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2641185029Spjd ASSERT(!BP_IS_GANG(bp)); 2642185029Spjd 2643219089Spjd zio_free(spa, txg, bp); 2644185029Spjd} 2645185029Spjd 2646185029Spjd/* 2647168404Spjd * ========================================================================== 2648244187Ssmh * Read, write and delete to physical devices 2649168404Spjd * ========================================================================== 2650168404Spjd */ 2651185029Spjdstatic int 2652270312Ssmhzio_vdev_io_start(zio_t *zio) 2653168404Spjd{ 2654168404Spjd vdev_t *vd = zio->io_vd; 2655168404Spjd uint64_t align; 2656185029Spjd spa_t *spa = zio->io_spa; 2657270312Ssmh int ret; 2658168404Spjd 2659185029Spjd ASSERT(zio->io_error == 0); 2660185029Spjd ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2661185029Spjd 2662168404Spjd if (vd == NULL) { 2663185029Spjd if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2664185029Spjd spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2665185029Spjd 2666185029Spjd /* 2667185029Spjd * The mirror_ops handle multiple DVAs in a single BP. 2668185029Spjd */ 2669185029Spjd return (vdev_mirror_ops.vdev_op_io_start(zio)); 2670168404Spjd } 2671168404Spjd 2672270312Ssmh if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 2673270312Ssmh zio->io_priority == ZIO_PRIORITY_NOW) { 2674248574Ssmh trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2675240868Spjd return (ZIO_PIPELINE_CONTINUE); 2676240868Spjd } 2677240868Spjd 2678219089Spjd /* 2679219089Spjd * We keep track of time-sensitive I/Os so that the scan thread 2680219089Spjd * can quickly react to certain workloads. 
In particular, we care 2681219089Spjd * about non-scrubbing, top-level reads and writes with the following 2682219089Spjd * characteristics: 2683219089Spjd * - synchronous writes of user data to non-slog devices 2684219089Spjd * - any reads of user data 2685219089Spjd * When these conditions are met, adjust the timestamp of spa_last_io 2686219089Spjd * which allows the scan thread to adjust its workload accordingly. 2687219089Spjd */ 2688219089Spjd if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2689219089Spjd vd == vd->vdev_top && !vd->vdev_islog && 2690219089Spjd zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2691219089Spjd zio->io_txg != spa_syncing_txg(spa)) { 2692219089Spjd uint64_t old = spa->spa_last_io; 2693219089Spjd uint64_t new = ddi_get_lbolt64(); 2694219089Spjd if (old != new) 2695219089Spjd (void) atomic_cas_64(&spa->spa_last_io, old, new); 2696219089Spjd } 2697219089Spjd 2698185029Spjd align = 1ULL << vd->vdev_top->vdev_ashift; 2699168404Spjd 2700269733Sdelphij if ((!(zio->io_flags & ZIO_FLAG_PHYSICAL) || 2701269733Sdelphij (vd->vdev_top->vdev_physical_ashift > SPA_MINBLOCKSHIFT)) && 2702269416Sdelphij P2PHASE(zio->io_size, align) != 0) { 2703269416Sdelphij /* Transform logical writes to be a full physical block size. */ 2704168404Spjd uint64_t asize = P2ROUNDUP(zio->io_size, align); 2705240868Spjd char *abuf = NULL; 2706240868Spjd if (zio->io_type == ZIO_TYPE_READ || 2707240868Spjd zio->io_type == ZIO_TYPE_WRITE) 2708240868Spjd abuf = zio_buf_alloc(asize); 2709185029Spjd ASSERT(vd == vd->vdev_top); 2710168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 2711168404Spjd bcopy(zio->io_data, abuf, zio->io_size); 2712168404Spjd bzero(abuf + zio->io_size, asize - zio->io_size); 2713168404Spjd } 2714240868Spjd zio_push_transform(zio, abuf, asize, abuf ? 
asize : 0, 2715240868Spjd zio_subblock); 2716168404Spjd } 2717168404Spjd 2718269416Sdelphij /* 2719269416Sdelphij * If this is not a physical io, make sure that it is properly aligned 2720269416Sdelphij * before proceeding. 2721269416Sdelphij */ 2722269416Sdelphij if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 2723269416Sdelphij ASSERT0(P2PHASE(zio->io_offset, align)); 2724269416Sdelphij ASSERT0(P2PHASE(zio->io_size, align)); 2725269416Sdelphij } else { 2726269416Sdelphij /* 2727269416Sdelphij * For physical writes, we allow 512b aligned writes and assume 2728269416Sdelphij * the device will perform a read-modify-write as necessary. 2729269416Sdelphij */ 2730269416Sdelphij ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); 2731269416Sdelphij ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); 2732269416Sdelphij } 2733269416Sdelphij 2734240868Spjd VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2735168404Spjd 2736209962Smm /* 2737209962Smm * If this is a repair I/O, and there's no self-healing involved -- 2738209962Smm * that is, we're just resilvering what we expect to resilver -- 2739209962Smm * then don't do the I/O unless zio's txg is actually in vd's DTL. 2740209962Smm * This prevents spurious resilvering with nested replication. 2741209962Smm * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2742209962Smm * A is out of date, we'll read from C+D, then use the data to 2743209962Smm * resilver A+B -- but we don't actually want to resilver B, just A. 2744209962Smm * The top-level mirror has no way to know this, so instead we just 2745209962Smm * discard unnecessary repairs as we work our way down the vdev tree. 2746209962Smm * The same logic applies to any form of nested replication: 2747209962Smm * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
2748209962Smm */ 2749209962Smm if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2750209962Smm !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2751209962Smm zio->io_txg != 0 && /* not a delegated i/o */ 2752209962Smm !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2753209962Smm ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2754209962Smm zio_vdev_io_bypass(zio); 2755209962Smm return (ZIO_PIPELINE_CONTINUE); 2756209962Smm } 2757209962Smm 2758270312Ssmh if (vd->vdev_ops->vdev_op_leaf) { 2759270312Ssmh switch (zio->io_type) { 2760270312Ssmh case ZIO_TYPE_READ: 2761270312Ssmh if (vdev_cache_read(zio)) 2762270312Ssmh return (ZIO_PIPELINE_CONTINUE); 2763270312Ssmh /* FALLTHROUGH */ 2764270312Ssmh case ZIO_TYPE_WRITE: 2765270312Ssmh case ZIO_TYPE_FREE: 2766270312Ssmh if ((zio = vdev_queue_io(zio)) == NULL) 2767270312Ssmh return (ZIO_PIPELINE_STOP); 2768168404Spjd 2769270312Ssmh if (!vdev_accessible(vd, zio)) { 2770270312Ssmh zio->io_error = SET_ERROR(ENXIO); 2771270312Ssmh zio_interrupt(zio); 2772270312Ssmh return (ZIO_PIPELINE_STOP); 2773270312Ssmh } 2774270312Ssmh break; 2775185029Spjd } 2776270312Ssmh /* 2777270312Ssmh * Note that we ignore repair writes for TRIM because they can 2778270312Ssmh * conflict with normal writes. This isn't an issue because, by 2779270312Ssmh * definition, we only repair blocks that aren't freed. 2780270312Ssmh */ 2781270312Ssmh if (zio->io_type == ZIO_TYPE_WRITE && 2782270312Ssmh !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2783270312Ssmh !trim_map_write_start(zio)) 2784240868Spjd return (ZIO_PIPELINE_STOP); 2785240868Spjd } 2786240868Spjd 2787270312Ssmh ret = vd->vdev_ops->vdev_op_io_start(zio); 2788270312Ssmh ASSERT(ret == ZIO_PIPELINE_STOP); 2789270312Ssmh 2790270312Ssmh return (ret); 2791168404Spjd} 2792168404Spjd 2793185029Spjdstatic int 2794270312Ssmhzio_vdev_io_done(zio_t *zio) 2795168404Spjd{ 2796168404Spjd vdev_t *vd = zio->io_vd; 2797185029Spjd vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 2798185029Spjd boolean_t unexpected_error = B_FALSE; 2799168404Spjd 2800185029Spjd if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2801185029Spjd return (ZIO_PIPELINE_STOP); 2802168404Spjd 2803240868Spjd ASSERT(zio->io_type == ZIO_TYPE_READ || 2804240868Spjd zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2805185029Spjd 2806240868Spjd if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2807270312Ssmh (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || 2808270312Ssmh zio->io_type == ZIO_TYPE_FREE)) { 2809240868Spjd 2810248573Ssmh if (zio->io_type == ZIO_TYPE_WRITE && 2811248573Ssmh !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2812248573Ssmh trim_map_write_done(zio); 2813248573Ssmh 2814185029Spjd vdev_queue_io_done(zio); 2815185029Spjd 2816185029Spjd if (zio->io_type == ZIO_TYPE_WRITE) 2817185029Spjd vdev_cache_write(zio); 2818185029Spjd 2819185029Spjd if (zio_injection_enabled && zio->io_error == 0) 2820213198Smm zio->io_error = zio_handle_device_injection(vd, 2821213198Smm zio, EIO); 2822185029Spjd 2823185029Spjd if (zio_injection_enabled && zio->io_error == 0) 2824185029Spjd zio->io_error = zio_handle_label_injection(zio, EIO); 2825185029Spjd 2826185029Spjd if (zio->io_error) { 2827271683Ssmh if (zio->io_error == ENOTSUP && 2828271683Ssmh zio->io_type == ZIO_TYPE_FREE) { 2829271683Ssmh /* Not all devices support TRIM. 
*/ 2830271683Ssmh } else if (!vdev_accessible(vd, zio)) { 2831249195Smm zio->io_error = SET_ERROR(ENXIO); 2832185029Spjd } else { 2833185029Spjd unexpected_error = B_TRUE; 2834185029Spjd } 2835185029Spjd } 2836185029Spjd } 2837185029Spjd 2838185029Spjd ops->vdev_op_io_done(zio); 2839185029Spjd 2840185029Spjd if (unexpected_error) 2841209962Smm VERIFY(vdev_probe(vd, zio) == NULL); 2842185029Spjd 2843185029Spjd return (ZIO_PIPELINE_CONTINUE); 2844168404Spjd} 2845168404Spjd 2846219089Spjd/* 2847219089Spjd * For non-raidz ZIOs, we can just copy aside the bad data read from the 2848219089Spjd * disk, and use that to finish the checksum ereport later. 2849219089Spjd */ 2850219089Spjdstatic void 2851219089Spjdzio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2852219089Spjd const void *good_buf) 2853219089Spjd{ 2854219089Spjd /* no processing needed */ 2855219089Spjd zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2856219089Spjd} 2857219089Spjd 2858219089Spjd/*ARGSUSED*/ 2859219089Spjdvoid 2860219089Spjdzio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2861219089Spjd{ 2862219089Spjd void *buf = zio_buf_alloc(zio->io_size); 2863219089Spjd 2864219089Spjd bcopy(zio->io_data, buf, zio->io_size); 2865219089Spjd 2866219089Spjd zcr->zcr_cbinfo = zio->io_size; 2867219089Spjd zcr->zcr_cbdata = buf; 2868219089Spjd zcr->zcr_finish = zio_vsd_default_cksum_finish; 2869219089Spjd zcr->zcr_free = zio_buf_free; 2870219089Spjd} 2871219089Spjd 2872185029Spjdstatic int 2873270312Ssmhzio_vdev_io_assess(zio_t *zio) 2874168404Spjd{ 2875168404Spjd vdev_t *vd = zio->io_vd; 2876168404Spjd 2877185029Spjd if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2878185029Spjd return (ZIO_PIPELINE_STOP); 2879168404Spjd 2880185029Spjd if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2881185029Spjd spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2882185029Spjd 2883185029Spjd if (zio->io_vsd != NULL) { 2884219089Spjd 
zio->io_vsd_ops->vsd_free(zio); 2885185029Spjd zio->io_vsd = NULL; 2886168404Spjd } 2887168404Spjd 2888185029Spjd if (zio_injection_enabled && zio->io_error == 0) 2889168404Spjd zio->io_error = zio_handle_fault_injection(zio, EIO); 2890168404Spjd 2891270312Ssmh if (zio->io_type == ZIO_TYPE_FREE && 2892270312Ssmh zio->io_priority != ZIO_PRIORITY_NOW) { 2893240868Spjd switch (zio->io_error) { 2894240868Spjd case 0: 2895244155Ssmh ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2896244155Ssmh ZIO_TRIM_STAT_BUMP(success); 2897240868Spjd break; 2898240868Spjd case EOPNOTSUPP: 2899244155Ssmh ZIO_TRIM_STAT_BUMP(unsupported); 2900240868Spjd break; 2901240868Spjd default: 2902244155Ssmh ZIO_TRIM_STAT_BUMP(failed); 2903240868Spjd break; 2904240868Spjd } 2905270312Ssmh } 2906240868Spjd 2907168404Spjd /* 2908168404Spjd * If the I/O failed, determine whether we should attempt to retry it. 2909219089Spjd * 2910219089Spjd * On retry, we cut in line in the issue queue, since we don't want 2911219089Spjd * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2912168404Spjd */ 2913185029Spjd if (zio->io_error && vd == NULL && 2914185029Spjd !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2915185029Spjd ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2916185029Spjd ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2917168404Spjd zio->io_error = 0; 2918185029Spjd zio->io_flags |= ZIO_FLAG_IO_RETRY | 2919185029Spjd ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2920219089Spjd zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2921219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2922219089Spjd zio_requeue_io_start_cut_in_line); 2923185029Spjd return (ZIO_PIPELINE_STOP); 2924185029Spjd } 2925168404Spjd 2926185029Spjd /* 2927185029Spjd * If we got an error on a leaf device, convert it to ENXIO 2928185029Spjd * if the device is not accessible at all. 
2929185029Spjd */ 2930185029Spjd if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2931185029Spjd !vdev_accessible(vd, zio)) 2932249195Smm zio->io_error = SET_ERROR(ENXIO); 2933168404Spjd 2934185029Spjd /* 2935185029Spjd * If we can't write to an interior vdev (mirror or RAID-Z), 2936185029Spjd * set vdev_cant_write so that we stop trying to allocate from it. 2937185029Spjd */ 2938185029Spjd if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2939248571Smm vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2940185029Spjd vd->vdev_cant_write = B_TRUE; 2941248571Smm } 2942168404Spjd 2943185029Spjd if (zio->io_error) 2944185029Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2945168404Spjd 2946260763Savg if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2947260763Savg zio->io_physdone != NULL) { 2948260763Savg ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2949260763Savg ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2950260763Savg zio->io_physdone(zio->io_logical); 2951260763Savg } 2952260763Savg 2953185029Spjd return (ZIO_PIPELINE_CONTINUE); 2954168404Spjd} 2955168404Spjd 2956168404Spjdvoid 2957168404Spjdzio_vdev_io_reissue(zio_t *zio) 2958168404Spjd{ 2959168404Spjd ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2960168404Spjd ASSERT(zio->io_error == 0); 2961168404Spjd 2962219089Spjd zio->io_stage >>= 1; 2963168404Spjd} 2964168404Spjd 2965168404Spjdvoid 2966168404Spjdzio_vdev_io_redone(zio_t *zio) 2967168404Spjd{ 2968168404Spjd ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2969168404Spjd 2970219089Spjd zio->io_stage >>= 1; 2971168404Spjd} 2972168404Spjd 2973168404Spjdvoid 2974168404Spjdzio_vdev_io_bypass(zio_t *zio) 2975168404Spjd{ 2976168404Spjd ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2977168404Spjd ASSERT(zio->io_error == 0); 2978168404Spjd 2979168404Spjd zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2980219089Spjd zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2981168404Spjd} 2982168404Spjd 2983168404Spjd/* 2984168404Spjd * 
========================================================================== 2985168404Spjd * Generate and verify checksums 2986168404Spjd * ========================================================================== 2987168404Spjd */ 2988185029Spjdstatic int 2989270312Ssmhzio_checksum_generate(zio_t *zio) 2990168404Spjd{ 2991168404Spjd blkptr_t *bp = zio->io_bp; 2992185029Spjd enum zio_checksum checksum; 2993168404Spjd 2994185029Spjd if (bp == NULL) { 2995185029Spjd /* 2996185029Spjd * This is zio_write_phys(). 2997185029Spjd * We're either generating a label checksum, or none at all. 2998185029Spjd */ 2999185029Spjd checksum = zio->io_prop.zp_checksum; 3000168404Spjd 3001185029Spjd if (checksum == ZIO_CHECKSUM_OFF) 3002185029Spjd return (ZIO_PIPELINE_CONTINUE); 3003168404Spjd 3004185029Spjd ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3005185029Spjd } else { 3006185029Spjd if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3007185029Spjd ASSERT(!IO_IS_ALLOCATING(zio)); 3008185029Spjd checksum = ZIO_CHECKSUM_GANG_HEADER; 3009185029Spjd } else { 3010185029Spjd checksum = BP_GET_CHECKSUM(bp); 3011185029Spjd } 3012185029Spjd } 3013168404Spjd 3014185029Spjd zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 3015185029Spjd 3016185029Spjd return (ZIO_PIPELINE_CONTINUE); 3017168404Spjd} 3018168404Spjd 3019185029Spjdstatic int 3020270312Ssmhzio_checksum_verify(zio_t *zio) 3021168404Spjd{ 3022219089Spjd zio_bad_cksum_t info; 3023185029Spjd blkptr_t *bp = zio->io_bp; 3024185029Spjd int error; 3025168404Spjd 3026219089Spjd ASSERT(zio->io_vd != NULL); 3027219089Spjd 3028185029Spjd if (bp == NULL) { 3029185029Spjd /* 3030185029Spjd * This is zio_read_phys(). 3031185029Spjd * We're either verifying a label checksum, or nothing at all. 
3032185029Spjd */ 3033185029Spjd if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3034185029Spjd return (ZIO_PIPELINE_CONTINUE); 3035168404Spjd 3036185029Spjd ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3037185029Spjd } 3038168404Spjd 3039219089Spjd if ((error = zio_checksum_error(zio, &info)) != 0) { 3040185029Spjd zio->io_error = error; 3041277575Sdelphij if (error == ECKSUM && 3042277575Sdelphij !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3043219089Spjd zfs_ereport_start_checksum(zio->io_spa, 3044219089Spjd zio->io_vd, zio, zio->io_offset, 3045219089Spjd zio->io_size, NULL, &info); 3046185029Spjd } 3047168404Spjd } 3048168404Spjd 3049185029Spjd return (ZIO_PIPELINE_CONTINUE); 3050168404Spjd} 3051168404Spjd 3052168404Spjd/* 3053168404Spjd * Called by RAID-Z to ensure we don't compute the checksum twice. 3054168404Spjd */ 3055168404Spjdvoid 3056168404Spjdzio_checksum_verified(zio_t *zio) 3057168404Spjd{ 3058219089Spjd zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3059168404Spjd} 3060168404Spjd 3061168404Spjd/* 3062185029Spjd * ========================================================================== 3063185029Spjd * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3064268649Sdelphij * An error of 0 indicates success. ENXIO indicates whole-device failure, 3065185029Spjd * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 3066185029Spjd * indicate errors that are specific to one I/O, and most likely permanent. 3067185029Spjd * Any other error is presumed to be worse because we weren't expecting it. 
3068185029Spjd * ========================================================================== 3069168404Spjd */ 3070185029Spjdint 3071185029Spjdzio_worst_error(int e1, int e2) 3072168404Spjd{ 3073185029Spjd static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3074185029Spjd int r1, r2; 3075168404Spjd 3076185029Spjd for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3077185029Spjd if (e1 == zio_error_rank[r1]) 3078185029Spjd break; 3079185029Spjd 3080185029Spjd for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3081185029Spjd if (e2 == zio_error_rank[r2]) 3082185029Spjd break; 3083185029Spjd 3084185029Spjd return (r1 > r2 ? e1 : e2); 3085168404Spjd} 3086168404Spjd 3087168404Spjd/* 3088168404Spjd * ========================================================================== 3089185029Spjd * I/O completion 3090168404Spjd * ========================================================================== 3091168404Spjd */ 3092185029Spjdstatic int 3093270312Ssmhzio_ready(zio_t *zio) 3094168404Spjd{ 3095185029Spjd blkptr_t *bp = zio->io_bp; 3096209962Smm zio_t *pio, *pio_next; 3097168404Spjd 3098219089Spjd if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 3099219089Spjd zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 3100209962Smm return (ZIO_PIPELINE_STOP); 3101209962Smm 3102185029Spjd if (zio->io_ready) { 3103185029Spjd ASSERT(IO_IS_ALLOCATING(zio)); 3104243524Smm ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 3105243524Smm (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3106185029Spjd ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3107168404Spjd 3108185029Spjd zio->io_ready(zio); 3109168404Spjd } 3110168404Spjd 3111185029Spjd if (bp != NULL && bp != &zio->io_bp_copy) 3112185029Spjd zio->io_bp_copy = *bp; 3113168404Spjd 3114185029Spjd if (zio->io_error) 3115185029Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3116168404Spjd 3117209962Smm mutex_enter(&zio->io_lock); 3118209962Smm 
zio->io_state[ZIO_WAIT_READY] = 1; 3119209962Smm pio = zio_walk_parents(zio); 3120209962Smm mutex_exit(&zio->io_lock); 3121209962Smm 3122209962Smm /* 3123209962Smm * As we notify zio's parents, new parents could be added. 3124209962Smm * New parents go to the head of zio's io_parent_list, however, 3125209962Smm * so we will (correctly) not notify them. The remainder of zio's 3126209962Smm * io_parent_list, from 'pio_next' onward, cannot change because 3127209962Smm * all parents must wait for us to be done before they can be done. 3128209962Smm */ 3129209962Smm for (; pio != NULL; pio = pio_next) { 3130209962Smm pio_next = zio_walk_parents(zio); 3131185029Spjd zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3132209962Smm } 3133185029Spjd 3134219089Spjd if (zio->io_flags & ZIO_FLAG_NODATA) { 3135219089Spjd if (BP_IS_GANG(bp)) { 3136219089Spjd zio->io_flags &= ~ZIO_FLAG_NODATA; 3137219089Spjd } else { 3138219089Spjd ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 3139219089Spjd zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3140219089Spjd } 3141219089Spjd } 3142219089Spjd 3143219089Spjd if (zio_injection_enabled && 3144219089Spjd zio->io_spa->spa_syncing_txg == zio->io_txg) 3145219089Spjd zio_handle_ignored_writes(zio); 3146219089Spjd 3147185029Spjd return (ZIO_PIPELINE_CONTINUE); 3148185029Spjd} 3149185029Spjd 3150185029Spjdstatic int 3151270312Ssmhzio_done(zio_t *zio) 3152185029Spjd{ 3153185029Spjd spa_t *spa = zio->io_spa; 3154185029Spjd zio_t *lio = zio->io_logical; 3155185029Spjd blkptr_t *bp = zio->io_bp; 3156185029Spjd vdev_t *vd = zio->io_vd; 3157185029Spjd uint64_t psize = zio->io_size; 3158209962Smm zio_t *pio, *pio_next; 3159185029Spjd 3160168404Spjd /* 3161209962Smm * If our children haven't all completed, 3162185029Spjd * wait for them and then repeat this pipeline stage. 
3163168404Spjd */ 3164185029Spjd if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3165185029Spjd zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3166219089Spjd zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3167185029Spjd zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3168185029Spjd return (ZIO_PIPELINE_STOP); 3169185029Spjd 3170185029Spjd for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3171185029Spjd for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3172185029Spjd ASSERT(zio->io_children[c][w] == 0); 3173185029Spjd 3174268649Sdelphij if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3175185029Spjd ASSERT(bp->blk_pad[0] == 0); 3176185029Spjd ASSERT(bp->blk_pad[1] == 0); 3177185029Spjd ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3178209962Smm (bp == zio_unique_parent(zio)->io_bp)); 3179185029Spjd if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3180219089Spjd zio->io_bp_override == NULL && 3181185029Spjd !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3182185029Spjd ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3183219089Spjd ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3184185029Spjd ASSERT(BP_COUNT_GANG(bp) == 0 || 3185185029Spjd (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3186185029Spjd } 3187243524Smm if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3188243524Smm VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3189168404Spjd } 3190168404Spjd 3191185029Spjd /* 3192219089Spjd * If there were child vdev/gang/ddt errors, they apply to us now. 3193185029Spjd */ 3194185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3195185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3196219089Spjd zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3197168404Spjd 3198219089Spjd /* 3199219089Spjd * If the I/O on the transformed data was successful, generate any 3200219089Spjd * checksum reports now while we still have the transformed data. 
3201219089Spjd */ 3202219089Spjd if (zio->io_error == 0) { 3203219089Spjd while (zio->io_cksum_report != NULL) { 3204219089Spjd zio_cksum_report_t *zcr = zio->io_cksum_report; 3205219089Spjd uint64_t align = zcr->zcr_align; 3206219089Spjd uint64_t asize = P2ROUNDUP(psize, align); 3207219089Spjd char *abuf = zio->io_data; 3208219089Spjd 3209219089Spjd if (asize != psize) { 3210219089Spjd abuf = zio_buf_alloc(asize); 3211219089Spjd bcopy(zio->io_data, abuf, psize); 3212219089Spjd bzero(abuf + psize, asize - psize); 3213219089Spjd } 3214219089Spjd 3215219089Spjd zio->io_cksum_report = zcr->zcr_next; 3216219089Spjd zcr->zcr_next = NULL; 3217219089Spjd zcr->zcr_finish(zcr, abuf); 3218219089Spjd zfs_ereport_free_checksum(zcr); 3219219089Spjd 3220219089Spjd if (asize != psize) 3221219089Spjd zio_buf_free(abuf, asize); 3222219089Spjd } 3223219089Spjd } 3224219089Spjd 3225185029Spjd zio_pop_transforms(zio); /* note: may set zio->io_error */ 3226168404Spjd 3227185029Spjd vdev_stat_update(zio, psize); 3228185029Spjd 3229168404Spjd if (zio->io_error) { 3230185029Spjd /* 3231185029Spjd * If this I/O is attached to a particular vdev, 3232185029Spjd * generate an error message describing the I/O failure 3233185029Spjd * at the block level. We ignore these errors if the 3234185029Spjd * device is currently unavailable. 3235185029Spjd */ 3236185029Spjd if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3237185029Spjd zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3238185029Spjd 3239219089Spjd if ((zio->io_error == EIO || !(zio->io_flags & 3240219089Spjd (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3241219089Spjd zio == lio) { 3242185029Spjd /* 3243185029Spjd * For logical I/O requests, tell the SPA to log the 3244185029Spjd * error and generate a logical data ereport. 
3245185029Spjd */ 3246185029Spjd spa_log_error(spa, zio); 3247185029Spjd zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3248185029Spjd 0, 0); 3249185029Spjd } 3250168404Spjd } 3251168404Spjd 3252185029Spjd if (zio->io_error && zio == lio) { 3253185029Spjd /* 3254185029Spjd * Determine whether zio should be reexecuted. This will 3255185029Spjd * propagate all the way to the root via zio_notify_parent(). 3256185029Spjd */ 3257185029Spjd ASSERT(vd == NULL && bp != NULL); 3258219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3259168404Spjd 3260219089Spjd if (IO_IS_ALLOCATING(zio) && 3261219089Spjd !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3262185029Spjd if (zio->io_error != ENOSPC) 3263185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3264185029Spjd else 3265185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3266219089Spjd } 3267168404Spjd 3268185029Spjd if ((zio->io_type == ZIO_TYPE_READ || 3269185029Spjd zio->io_type == ZIO_TYPE_FREE) && 3270219089Spjd !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3271185029Spjd zio->io_error == ENXIO && 3272219089Spjd spa_load_state(spa) == SPA_LOAD_NONE && 3273185029Spjd spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3274185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3275185029Spjd 3276185029Spjd if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3277185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3278219089Spjd 3279219089Spjd /* 3280219089Spjd * Here is a possibly good place to attempt to do 3281219089Spjd * either combinatorial reconstruction or error correction 3282219089Spjd * based on checksums. It also might be a good place 3283219089Spjd * to send out preliminary ereports before we suspend 3284219089Spjd * processing. 3285219089Spjd */ 3286185029Spjd } 3287185029Spjd 3288168404Spjd /* 3289185029Spjd * If there were logical child errors, they apply to us now. 
3290185029Spjd * We defer this until now to avoid conflating logical child 3291185029Spjd * errors with errors that happened to the zio itself when 3292185029Spjd * updating vdev stats and reporting FMA events above. 3293168404Spjd */ 3294185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3295185029Spjd 3296219089Spjd if ((zio->io_error || zio->io_reexecute) && 3297219089Spjd IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3298243524Smm !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3299209962Smm zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3300209962Smm 3301209962Smm zio_gang_tree_free(&zio->io_gang_tree); 3302209962Smm 3303209962Smm /* 3304209962Smm * Godfather I/Os should never suspend. 3305209962Smm */ 3306209962Smm if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3307209962Smm (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3308209962Smm zio->io_reexecute = 0; 3309209962Smm 3310185029Spjd if (zio->io_reexecute) { 3311185029Spjd /* 3312185029Spjd * This is a logical I/O that wants to reexecute. 3313185029Spjd * 3314185029Spjd * Reexecute is top-down. When an i/o fails, if it's not 3315185029Spjd * the root, it simply notifies its parent and sticks around. 3316185029Spjd * The parent, seeing that it still has children in zio_done(), 3317185029Spjd * does the same. This percolates all the way up to the root. 3318185029Spjd * The root i/o will reexecute or suspend the entire tree. 3319185029Spjd * 3320185029Spjd * This approach ensures that zio_reexecute() honors 3321185029Spjd * all the original i/o dependency relationships, e.g. 3322185029Spjd * parents not executing until children are ready. 
3323185029Spjd */ 3324185029Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3325185029Spjd 3326209962Smm zio->io_gang_leader = NULL; 3327185029Spjd 3328209962Smm mutex_enter(&zio->io_lock); 3329209962Smm zio->io_state[ZIO_WAIT_DONE] = 1; 3330209962Smm mutex_exit(&zio->io_lock); 3331185029Spjd 3332209962Smm /* 3333209962Smm * "The Godfather" I/O monitors its children but is 3334209962Smm * not a true parent to them. It will track them through 3335209962Smm * the pipeline but severs its ties whenever they get into 3336209962Smm * trouble (e.g. suspended). This allows "The Godfather" 3337209962Smm * I/O to return status without blocking. 3338209962Smm */ 3339209962Smm for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3340209962Smm zio_link_t *zl = zio->io_walk_link; 3341209962Smm pio_next = zio_walk_parents(zio); 3342209962Smm 3343209962Smm if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3344209962Smm (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3345209962Smm zio_remove_child(pio, zio, zl); 3346209962Smm zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3347209962Smm } 3348209962Smm } 3349209962Smm 3350209962Smm if ((pio = zio_unique_parent(zio)) != NULL) { 3351185029Spjd /* 3352185029Spjd * We're not a root i/o, so there's nothing to do 3353185029Spjd * but notify our parent. Don't propagate errors 3354185029Spjd * upward since we haven't permanently failed yet. 3355185029Spjd */ 3356209962Smm ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3357185029Spjd zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3358185029Spjd zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3359185029Spjd } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3360185029Spjd /* 3361185029Spjd * We'd fail again if we reexecuted now, so suspend 3362185029Spjd * until conditions improve (e.g. device comes online). 3363185029Spjd */ 3364185029Spjd zio_suspend(spa, zio); 3365185029Spjd } else { 3366185029Spjd /* 3367185029Spjd * Reexecution is potentially a huge amount of work. 
3368185029Spjd * Hand it off to the otherwise-unused claim taskq. 3369185029Spjd */ 3370260742Savg#if defined(illumos) || !defined(_KERNEL) 3371260742Savg ASSERT(zio->io_tqent.tqent_next == NULL); 3372216919Smm#else 3373260742Savg ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3374260742Savg#endif 3375260750Savg spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3376260750Savg ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3377260750Savg 0, &zio->io_tqent); 3378185029Spjd } 3379185029Spjd return (ZIO_PIPELINE_STOP); 3380168404Spjd } 3381168404Spjd 3382219089Spjd ASSERT(zio->io_child_count == 0); 3383185029Spjd ASSERT(zio->io_reexecute == 0); 3384185029Spjd ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3385168404Spjd 3386209962Smm /* 3387219089Spjd * Report any checksum errors, since the I/O is complete. 3388219089Spjd */ 3389219089Spjd while (zio->io_cksum_report != NULL) { 3390219089Spjd zio_cksum_report_t *zcr = zio->io_cksum_report; 3391219089Spjd zio->io_cksum_report = zcr->zcr_next; 3392219089Spjd zcr->zcr_next = NULL; 3393219089Spjd zcr->zcr_finish(zcr, NULL); 3394219089Spjd zfs_ereport_free_checksum(zcr); 3395219089Spjd } 3396219089Spjd 3397219089Spjd /* 3398209962Smm * It is the responsibility of the done callback to ensure that this 3399209962Smm * particular zio is no longer discoverable for adoption, and as 3400209962Smm * such, cannot acquire any new parents. 
3401209962Smm */ 3402185029Spjd if (zio->io_done) 3403185029Spjd zio->io_done(zio); 3404168404Spjd 3405209962Smm mutex_enter(&zio->io_lock); 3406209962Smm zio->io_state[ZIO_WAIT_DONE] = 1; 3407209962Smm mutex_exit(&zio->io_lock); 3408168404Spjd 3409209962Smm for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3410209962Smm zio_link_t *zl = zio->io_walk_link; 3411209962Smm pio_next = zio_walk_parents(zio); 3412209962Smm zio_remove_child(pio, zio, zl); 3413185029Spjd zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3414168404Spjd } 3415168404Spjd 3416185029Spjd if (zio->io_waiter != NULL) { 3417185029Spjd mutex_enter(&zio->io_lock); 3418185029Spjd zio->io_executor = NULL; 3419185029Spjd cv_broadcast(&zio->io_cv); 3420185029Spjd mutex_exit(&zio->io_lock); 3421185029Spjd } else { 3422185029Spjd zio_destroy(zio); 3423168404Spjd } 3424168404Spjd 3425185029Spjd return (ZIO_PIPELINE_STOP); 3426168404Spjd} 3427168404Spjd 3428168404Spjd/* 3429185029Spjd * ========================================================================== 3430185029Spjd * I/O pipeline definition 3431185029Spjd * ========================================================================== 3432168404Spjd */ 3433219089Spjdstatic zio_pipe_stage_t *zio_pipeline[] = { 3434185029Spjd NULL, 3435219089Spjd zio_read_bp_init, 3436219089Spjd zio_free_bp_init, 3437185029Spjd zio_issue_async, 3438185029Spjd zio_write_bp_init, 3439185029Spjd zio_checksum_generate, 3440243524Smm zio_nop_write, 3441219089Spjd zio_ddt_read_start, 3442219089Spjd zio_ddt_read_done, 3443219089Spjd zio_ddt_write, 3444219089Spjd zio_ddt_free, 3445185029Spjd zio_gang_assemble, 3446185029Spjd zio_gang_issue, 3447185029Spjd zio_dva_allocate, 3448185029Spjd zio_dva_free, 3449185029Spjd zio_dva_claim, 3450185029Spjd zio_ready, 3451185029Spjd zio_vdev_io_start, 3452185029Spjd zio_vdev_io_done, 3453185029Spjd zio_vdev_io_assess, 3454185029Spjd zio_checksum_verify, 3455185029Spjd zio_done 3456185029Spjd}; 3457236884Smm 3458236884Smm/* dnp is 
the dnode for zb1->zb_object */ 3459236884Smmboolean_t 3460268657Sdelphijzbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, 3461268657Sdelphij const zbookmark_phys_t *zb2) 3462236884Smm{ 3463236884Smm uint64_t zb1nextL0, zb2thisobj; 3464236884Smm 3465236884Smm ASSERT(zb1->zb_objset == zb2->zb_objset); 3466236884Smm ASSERT(zb2->zb_level == 0); 3467236884Smm 3468236884Smm /* The objset_phys_t isn't before anything. */ 3469236884Smm if (dnp == NULL) 3470236884Smm return (B_FALSE); 3471236884Smm 3472236884Smm zb1nextL0 = (zb1->zb_blkid + 1) << 3473236884Smm ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3474236884Smm 3475236884Smm zb2thisobj = zb2->zb_object ? zb2->zb_object : 3476236884Smm zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3477236884Smm 3478236884Smm if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3479236884Smm uint64_t nextobj = zb1nextL0 * 3480236884Smm (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3481236884Smm return (nextobj <= zb2thisobj); 3482236884Smm } 3483236884Smm 3484236884Smm if (zb1->zb_object < zb2thisobj) 3485236884Smm return (B_TRUE); 3486236884Smm if (zb1->zb_object > zb2thisobj) 3487236884Smm return (B_FALSE); 3488236884Smm if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3489236884Smm return (B_FALSE); 3490236884Smm return (zb1nextL0 <= zb2->zb_blkid); 3491236884Smm} 3492