Cross Reference: /freebsd-11-stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c

Deleted Added

sdiff udiff text old ( 332547 ) new ( 339034 )

full compact

zio.c (332547)	zio.c (339034)
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 / 21/ 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28#include <sys/sysmacros.h> 29#include <sys/zfs_context.h> 30#include <sys/fm/fs/zfs.h> 31#include <sys/spa.h> 32#include <sys/txg.h> 33#include <sys/spa_impl.h> 34#include <sys/vdev_impl.h> 35#include <sys/zio_impl.h> 36#include <sys/zio_compress.h> 37#include <sys/zio_checksum.h> 38#include <sys/dmu_objset.h> 39#include <sys/arc.h> 40#include <sys/ddt.h> 41#include <sys/trim_map.h> 42#include <sys/blkptr.h> 43#include <sys/zfeature.h>	1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 
10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 / 21/ 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28#include <sys/sysmacros.h> 29#include <sys/zfs_context.h> 30#include <sys/fm/fs/zfs.h> 31#include <sys/spa.h> 32#include <sys/txg.h> 33#include <sys/spa_impl.h> 34#include <sys/vdev_impl.h> 35#include <sys/zio_impl.h> 36#include <sys/zio_compress.h> 37#include <sys/zio_checksum.h> 38#include <sys/dmu_objset.h> 39#include <sys/arc.h> 40#include <sys/ddt.h> 41#include <sys/trim_map.h> 42#include <sys/blkptr.h> 43#include <sys/zfeature.h>
	44#include <sys/dsl_scan.h>
44#include <sys/metaslab_impl.h> 45#include <sys/abd.h> 46 47SYSCTL_DECL(_vfs_zfs); 48SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 49#if defined(__amd64__) 50static int zio_use_uma = 1; 51#else 52static int zio_use_uma = 0; 53#endif 54SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, 55 "Use uma(9) for ZIO allocations"); 56static int zio_exclude_metadata = 0; 57SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, 58 "Exclude metadata buffers from dumps as well"); 59 60zio_trim_stats_t zio_trim_stats = { 61 { "bytes", KSTAT_DATA_UINT64, 62 "Number of bytes successfully TRIMmed" }, 63 { "success", KSTAT_DATA_UINT64, 64 "Number of successful TRIM requests" }, 65 { "unsupported", KSTAT_DATA_UINT64, 66 "Number of TRIM requests that failed because TRIM is not supported" }, 67 { "failed", KSTAT_DATA_UINT64, 68 "Number of TRIM requests that failed for reasons other than not supported" }, 69}; 70 71static kstat_t zio_trim_ksp; 72 73/ 74 * ========================================================================== 75 * I/O type descriptions 76 * ========================================================================== 77 / 78const char zio_type_name[ZIO_TYPES] = { 79 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 80 "zio_ioctl" 81}; 82 83boolean_t zio_dva_throttle_enabled = B_TRUE; 84SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN, 85 &zio_dva_throttle_enabled, 0, ""); 86 87/* 88 * ========================================================================== 89 * I/O kmem caches 90 * ========================================================================== 91 / 92kmem_cache_t zio_cache; 93kmem_cache_t zio_link_cache; 94kmem_cache_t zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 95kmem_cache_t zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 96 97#ifdef _KERNEL 98extern vmem_t zio_alloc_arena; 99#endif 100 101#define ZIO_PIPELINE_CONTINUE 
0x100 102#define ZIO_PIPELINE_STOP 0x101 103 104#define BP_SPANB(indblkshift, level) \ 105 (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) 106#define COMPARE_META_LEVEL 0x80000000ul 107/* 108 * The following actions directly effect the spa's sync-to-convergence logic. 109 * The values below define the sync pass when we start performing the action. 110 * Care should be taken when changing these values as they directly impact 111 * spa_sync() performance. Tuning these values may introduce subtle performance 112 * pathologies and should only be done in the context of performance analysis. 113 * These tunables will eventually be removed and replaced with #defines once 114 * enough analysis has been done to determine optimal values. 115 * 116 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that 117 * regular blocks are not deferred. 118 / 119int zfs_sync_pass_deferred_free = 2; / defer frees starting in this pass / 120SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, 121* &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); 122int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass / 123SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, 124* &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); 125int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass / 126SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, 127* &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); 128 129/* 130 * An allocating zio is one that either currently has the DVA allocate 131 * stage set or will have it later in its lifetime. 
132 / 133#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 134* 135boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 136 137#ifdef illumos 138#ifdef ZFS_DEBUG 139int zio_buf_debug_limit = 16384; 140#else 141int zio_buf_debug_limit = 0; 142#endif 143#endif 144 145static void zio_taskq_dispatch(zio_t , zio_taskq_type_t, boolean_t); 146* 147void 148zio_init(void) 149{ 150 size_t c; 151 zio_cache = kmem_cache_create("zio_cache", 152 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 153 zio_link_cache = kmem_cache_create("zio_link_cache", 154 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 155 if (!zio_use_uma) 156 goto out; 157 158 /* 159 * For small buffers, we want a cache for each multiple of 160 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache 161 * for each quarter-power of 2. 162 / 163* for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 164 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 165 size_t p2 = size; 166 size_t align = 0; 167 int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0; 168 169 while (!ISP2(p2)) 170 p2 &= p2 - 1; 171 172#ifdef illumos 173#ifndef _KERNEL 174 /* 175 * If we are using watchpoints, put each buffer on its own page, 176 * to eliminate the performance overhead of trapping to the 177 * kernel when modifying a non-watched buffer that shares the 178 * page with a watched buffer. 
179 / 180* if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 181 continue; 182#endif 183#endif /* illumos / 184* if (size <= 4 * SPA_MINBLOCKSIZE) { 185 align = SPA_MINBLOCKSIZE; 186 } else if (IS_P2ALIGNED(size, p2 >> 2)) { 187 align = MIN(p2 >> 2, PAGESIZE); 188 } 189 190 if (align != 0) { 191 char name[36]; 192 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 193 zio_buf_cache[c] = kmem_cache_create(name, size, 194 align, NULL, NULL, NULL, NULL, NULL, cflags); 195 196 /* 197 * Since zio_data bufs do not appear in crash dumps, we 198 * pass KMC_NOTOUCH so that no allocator metadata is 199 * stored with the buffers. 200 / 201* (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 202 zio_data_buf_cache[c] = kmem_cache_create(name, size, 203 align, NULL, NULL, NULL, NULL, NULL, 204 cflags \| KMC_NOTOUCH \| KMC_NODEBUG); 205 } 206 } 207 208 while (--c != 0) { 209 ASSERT(zio_buf_cache[c] != NULL); 210 if (zio_buf_cache[c - 1] == NULL) 211 zio_buf_cache[c - 1] = zio_buf_cache[c]; 212 213 ASSERT(zio_data_buf_cache[c] != NULL); 214 if (zio_data_buf_cache[c - 1] == NULL) 215 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 216 } 217out: 218 219 zio_inject_init(); 220 221 zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", 222 KSTAT_TYPE_NAMED, 223 sizeof(zio_trim_stats) / sizeof(kstat_named_t), 224 KSTAT_FLAG_VIRTUAL); 225 226 if (zio_trim_ksp != NULL) { 227 zio_trim_ksp->ks_data = &zio_trim_stats; 228 kstat_install(zio_trim_ksp); 229 } 230} 231 232void 233zio_fini(void) 234{ 235 size_t c; 236 kmem_cache_t last_cache = NULL; 237* kmem_cache_t last_data_cache = NULL; 238* 239 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 240 if (zio_buf_cache[c] != last_cache) { 241 last_cache = zio_buf_cache[c]; 242 kmem_cache_destroy(zio_buf_cache[c]); 243 } 244 zio_buf_cache[c] = NULL; 245 246 if (zio_data_buf_cache[c] != last_data_cache) { 247 last_data_cache = zio_data_buf_cache[c]; 248 kmem_cache_destroy(zio_data_buf_cache[c]); 249 } 250 
zio_data_buf_cache[c] = NULL; 251 } 252 253 kmem_cache_destroy(zio_link_cache); 254 kmem_cache_destroy(zio_cache); 255 256 zio_inject_fini(); 257 258 if (zio_trim_ksp != NULL) { 259 kstat_delete(zio_trim_ksp); 260 zio_trim_ksp = NULL; 261 } 262} 263 264/* 265 * ========================================================================== 266 * Allocate and free I/O buffers 267 * ========================================================================== 268 / 269* 270/* 271 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 272 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 273 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 274 * excess / transient data in-core during a crashdump. 275 / 276void 277zio_buf_alloc(size_t size) 278{ 279 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 280 int flags = zio_exclude_metadata ? KM_NODEBUG : 0; 281 282 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 283 284 if (zio_use_uma) 285 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 286 else 287 return (kmem_alloc(size, KM_SLEEP\|flags)); 288} 289 290/* 291 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 292 * crashdump if the kernel panics. This exists so that we will limit the amount 293 * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount 294 * of kernel heap dumped to disk when the kernel panics) 295 / 296void 297zio_data_buf_alloc(size_t size) 298{ 299 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 300 301 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 302 303 if (zio_use_uma) 304 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 305 else 306 return (kmem_alloc(size, KM_SLEEP \| KM_NODEBUG)); 307} 308 309void 310zio_buf_free(void buf, size_t size) 311{ 312* size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 313 314 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 315 316 if (zio_use_uma) 317 kmem_cache_free(zio_buf_cache[c], buf); 318 else 319 kmem_free(buf, size); 320} 321 322void 323zio_data_buf_free(void buf, size_t size) 324{ 325* size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 326 327 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 328 329 if (zio_use_uma) 330 kmem_cache_free(zio_data_buf_cache[c], buf); 331 else 332 kmem_free(buf, size); 333} 334 335/* 336 * ========================================================================== 337 * Push and pop I/O transform buffers 338 * ========================================================================== 339 / 340void 341zio_push_transform(zio_t zio, abd_t data, uint64_t size, uint64_t bufsize, 342* zio_transform_func_t transform) 343{ 344* zio_transform_t zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 345* 346 /* 347 * Ensure that anyone expecting this zio to contain a linear ABD isn't 348 * going to get a nasty surprise when they try to access the data. 
349 / 350#ifdef illumos 351* IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); 352#else 353 IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd), 354 abd_is_linear(data)); 355#endif 356 357 zt->zt_orig_abd = zio->io_abd; 358 zt->zt_orig_size = zio->io_size; 359 zt->zt_bufsize = bufsize; 360 zt->zt_transform = transform; 361 362 zt->zt_next = zio->io_transform_stack; 363 zio->io_transform_stack = zt; 364 365 zio->io_abd = data; 366 zio->io_size = size; 367} 368 369void 370zio_pop_transforms(zio_t zio) 371{ 372* zio_transform_t zt; 373* 374 while ((zt = zio->io_transform_stack) != NULL) { 375 if (zt->zt_transform != NULL) 376 zt->zt_transform(zio, 377 zt->zt_orig_abd, zt->zt_orig_size); 378 379 if (zt->zt_bufsize != 0) 380 abd_free(zio->io_abd); 381 382 zio->io_abd = zt->zt_orig_abd; 383 zio->io_size = zt->zt_orig_size; 384 zio->io_transform_stack = zt->zt_next; 385 386 kmem_free(zt, sizeof (zio_transform_t)); 387 } 388} 389 390/* 391 * ========================================================================== 392 * I/O transform callbacks for subblocks and decompression 393 * ========================================================================== 394 / 395static void 396zio_subblock(zio_t zio, abd_t data, uint64_t size) 397{ 398* ASSERT(zio->io_size > size); 399 400 if (zio->io_type == ZIO_TYPE_READ) 401 abd_copy(data, zio->io_abd, size); 402} 403 404static void 405zio_decompress(zio_t zio, abd_t data, uint64_t size) 406{ 407 if (zio->io_error == 0) { 408 void tmp = abd_borrow_buf(data, size); 409* int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 410 zio->io_abd, tmp, zio->io_size, size); 411 abd_return_buf_copy(data, tmp, size); 412 413 if (ret != 0) 414 zio->io_error = SET_ERROR(EIO); 415 } 416} 417 418/* 419 * ========================================================================== 420 * I/O parent/child relationships and pipeline interlocks 421 * ========================================================================== 422 / 423zio_t 
424zio_walk_parents(zio_t cio, zio_link_t zl) 425{ 426* list_t pl = &cio->io_parent_list; 427* 428 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 429* if (zl == NULL) 430* return (NULL); 431 432 ASSERT((zl)->zl_child == cio); 433* return ((zl)->zl_parent); 434} 435* 436zio_t * 437zio_walk_children(zio_t pio, zio_link_t zl) 438{ 439* list_t cl = &pio->io_child_list; 440*	45#include <sys/metaslab_impl.h> 46#include <sys/abd.h> 47 48SYSCTL_DECL(_vfs_zfs); 49SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 50#if defined(__amd64__) 51static int zio_use_uma = 1; 52#else 53static int zio_use_uma = 0; 54#endif 55SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, 56 "Use uma(9) for ZIO allocations"); 57static int zio_exclude_metadata = 0; 58SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, 59 "Exclude metadata buffers from dumps as well"); 60 61zio_trim_stats_t zio_trim_stats = { 62 { "bytes", KSTAT_DATA_UINT64, 63 "Number of bytes successfully TRIMmed" }, 64 { "success", KSTAT_DATA_UINT64, 65 "Number of successful TRIM requests" }, 66 { "unsupported", KSTAT_DATA_UINT64, 67 "Number of TRIM requests that failed because TRIM is not supported" }, 68 { "failed", KSTAT_DATA_UINT64, 69 "Number of TRIM requests that failed for reasons other than not supported" }, 70}; 71 72static kstat_t zio_trim_ksp; 73 74/ 75 * ========================================================================== 76 * I/O type descriptions 77 * ========================================================================== 78 / 79const char zio_type_name[ZIO_TYPES] = { 80 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 81 "zio_ioctl" 82}; 83 84boolean_t zio_dva_throttle_enabled = B_TRUE; 85SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN, 86 &zio_dva_throttle_enabled, 0, ""); 87 88/* 89 * ========================================================================== 90 * I/O kmem caches 91 
* ========================================================================== 92 / 93kmem_cache_t zio_cache; 94kmem_cache_t zio_link_cache; 95kmem_cache_t zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 96kmem_cache_t zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 97 98#ifdef _KERNEL 99extern vmem_t zio_alloc_arena; 100#endif 101 102#define ZIO_PIPELINE_CONTINUE 0x100 103#define ZIO_PIPELINE_STOP 0x101 104 105#define BP_SPANB(indblkshift, level) \ 106 (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) 107#define COMPARE_META_LEVEL 0x80000000ul 108/* 109 * The following actions directly effect the spa's sync-to-convergence logic. 110 * The values below define the sync pass when we start performing the action. 111 * Care should be taken when changing these values as they directly impact 112 * spa_sync() performance. Tuning these values may introduce subtle performance 113 * pathologies and should only be done in the context of performance analysis. 114 * These tunables will eventually be removed and replaced with #defines once 115 * enough analysis has been done to determine optimal values. 116 * 117 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that 118 * regular blocks are not deferred. 
119 / 120int zfs_sync_pass_deferred_free = 2; / defer frees starting in this pass / 121SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, 122* &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); 123int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass / 124SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, 125* &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); 126int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass / 127SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, 128* &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); 129 130/* 131 * An allocating zio is one that either currently has the DVA allocate 132 * stage set or will have it later in its lifetime. 133 / 134#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 135* 136boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 137 138#ifdef illumos 139#ifdef ZFS_DEBUG 140int zio_buf_debug_limit = 16384; 141#else 142int zio_buf_debug_limit = 0; 143#endif 144#endif 145 146static void zio_taskq_dispatch(zio_t , zio_taskq_type_t, boolean_t); 147* 148void 149zio_init(void) 150{ 151 size_t c; 152 zio_cache = kmem_cache_create("zio_cache", 153 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 154 zio_link_cache = kmem_cache_create("zio_link_cache", 155 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 156 if (!zio_use_uma) 157 goto out; 158 159 /* 160 * For small buffers, we want a cache for each multiple of 161 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache 162 * for each quarter-power of 2. 163 / 164* for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 165 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 166 size_t p2 = size; 167 size_t align = 0; 168 int cflags = zio_exclude_metadata ? 
KMC_NODEBUG : 0; 169 170 while (!ISP2(p2)) 171 p2 &= p2 - 1; 172 173#ifdef illumos 174#ifndef _KERNEL 175 /* 176 * If we are using watchpoints, put each buffer on its own page, 177 * to eliminate the performance overhead of trapping to the 178 * kernel when modifying a non-watched buffer that shares the 179 * page with a watched buffer. 180 / 181* if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 182 continue; 183#endif 184#endif /* illumos / 185* if (size <= 4 * SPA_MINBLOCKSIZE) { 186 align = SPA_MINBLOCKSIZE; 187 } else if (IS_P2ALIGNED(size, p2 >> 2)) { 188 align = MIN(p2 >> 2, PAGESIZE); 189 } 190 191 if (align != 0) { 192 char name[36]; 193 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 194 zio_buf_cache[c] = kmem_cache_create(name, size, 195 align, NULL, NULL, NULL, NULL, NULL, cflags); 196 197 /* 198 * Since zio_data bufs do not appear in crash dumps, we 199 * pass KMC_NOTOUCH so that no allocator metadata is 200 * stored with the buffers. 201 / 202* (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 203 zio_data_buf_cache[c] = kmem_cache_create(name, size, 204 align, NULL, NULL, NULL, NULL, NULL, 205 cflags \| KMC_NOTOUCH \| KMC_NODEBUG); 206 } 207 } 208 209 while (--c != 0) { 210 ASSERT(zio_buf_cache[c] != NULL); 211 if (zio_buf_cache[c - 1] == NULL) 212 zio_buf_cache[c - 1] = zio_buf_cache[c]; 213 214 ASSERT(zio_data_buf_cache[c] != NULL); 215 if (zio_data_buf_cache[c - 1] == NULL) 216 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 217 } 218out: 219 220 zio_inject_init(); 221 222 zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", 223 KSTAT_TYPE_NAMED, 224 sizeof(zio_trim_stats) / sizeof(kstat_named_t), 225 KSTAT_FLAG_VIRTUAL); 226 227 if (zio_trim_ksp != NULL) { 228 zio_trim_ksp->ks_data = &zio_trim_stats; 229 kstat_install(zio_trim_ksp); 230 } 231} 232 233void 234zio_fini(void) 235{ 236 size_t c; 237 kmem_cache_t last_cache = NULL; 238* kmem_cache_t last_data_cache = NULL; 239* 240 for (c = 0; c < SPA_MAXBLOCKSIZE >> 
SPA_MINBLOCKSHIFT; c++) { 241 if (zio_buf_cache[c] != last_cache) { 242 last_cache = zio_buf_cache[c]; 243 kmem_cache_destroy(zio_buf_cache[c]); 244 } 245 zio_buf_cache[c] = NULL; 246 247 if (zio_data_buf_cache[c] != last_data_cache) { 248 last_data_cache = zio_data_buf_cache[c]; 249 kmem_cache_destroy(zio_data_buf_cache[c]); 250 } 251 zio_data_buf_cache[c] = NULL; 252 } 253 254 kmem_cache_destroy(zio_link_cache); 255 kmem_cache_destroy(zio_cache); 256 257 zio_inject_fini(); 258 259 if (zio_trim_ksp != NULL) { 260 kstat_delete(zio_trim_ksp); 261 zio_trim_ksp = NULL; 262 } 263} 264 265/* 266 * ========================================================================== 267 * Allocate and free I/O buffers 268 * ========================================================================== 269 / 270* 271/* 272 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 273 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 274 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 275 * excess / transient data in-core during a crashdump. 276 / 277void 278zio_buf_alloc(size_t size) 279{ 280 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 281 int flags = zio_exclude_metadata ? KM_NODEBUG : 0; 282 283 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 284 285 if (zio_use_uma) 286 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 287 else 288 return (kmem_alloc(size, KM_SLEEP\|flags)); 289} 290 291/* 292 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 293 * crashdump if the kernel panics. This exists so that we will limit the amount 294 * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount 295 * of kernel heap dumped to disk when the kernel panics) 296 / 297void 298zio_data_buf_alloc(size_t size) 299{ 300 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 301 302 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 303 304 if (zio_use_uma) 305 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 306 else 307 return (kmem_alloc(size, KM_SLEEP \| KM_NODEBUG)); 308} 309 310void 311zio_buf_free(void buf, size_t size) 312{ 313* size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 314 315 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 316 317 if (zio_use_uma) 318 kmem_cache_free(zio_buf_cache[c], buf); 319 else 320 kmem_free(buf, size); 321} 322 323void 324zio_data_buf_free(void buf, size_t size) 325{ 326* size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 327 328 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 329 330 if (zio_use_uma) 331 kmem_cache_free(zio_data_buf_cache[c], buf); 332 else 333 kmem_free(buf, size); 334} 335 336/* 337 * ========================================================================== 338 * Push and pop I/O transform buffers 339 * ========================================================================== 340 / 341void 342zio_push_transform(zio_t zio, abd_t data, uint64_t size, uint64_t bufsize, 343* zio_transform_func_t transform) 344{ 345* zio_transform_t zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 346* 347 /* 348 * Ensure that anyone expecting this zio to contain a linear ABD isn't 349 * going to get a nasty surprise when they try to access the data. 
350 / 351#ifdef illumos 352* IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); 353#else 354 IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd), 355 abd_is_linear(data)); 356#endif 357 358 zt->zt_orig_abd = zio->io_abd; 359 zt->zt_orig_size = zio->io_size; 360 zt->zt_bufsize = bufsize; 361 zt->zt_transform = transform; 362 363 zt->zt_next = zio->io_transform_stack; 364 zio->io_transform_stack = zt; 365 366 zio->io_abd = data; 367 zio->io_size = size; 368} 369 370void 371zio_pop_transforms(zio_t zio) 372{ 373* zio_transform_t zt; 374* 375 while ((zt = zio->io_transform_stack) != NULL) { 376 if (zt->zt_transform != NULL) 377 zt->zt_transform(zio, 378 zt->zt_orig_abd, zt->zt_orig_size); 379 380 if (zt->zt_bufsize != 0) 381 abd_free(zio->io_abd); 382 383 zio->io_abd = zt->zt_orig_abd; 384 zio->io_size = zt->zt_orig_size; 385 zio->io_transform_stack = zt->zt_next; 386 387 kmem_free(zt, sizeof (zio_transform_t)); 388 } 389} 390 391/* 392 * ========================================================================== 393 * I/O transform callbacks for subblocks and decompression 394 * ========================================================================== 395 / 396static void 397zio_subblock(zio_t zio, abd_t data, uint64_t size) 398{ 399* ASSERT(zio->io_size > size); 400 401 if (zio->io_type == ZIO_TYPE_READ) 402 abd_copy(data, zio->io_abd, size); 403} 404 405static void 406zio_decompress(zio_t zio, abd_t data, uint64_t size) 407{ 408 if (zio->io_error == 0) { 409 void tmp = abd_borrow_buf(data, size); 410* int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 411 zio->io_abd, tmp, zio->io_size, size); 412 abd_return_buf_copy(data, tmp, size); 413 414 if (ret != 0) 415 zio->io_error = SET_ERROR(EIO); 416 } 417} 418 419/* 420 * ========================================================================== 421 * I/O parent/child relationships and pipeline interlocks 422 * ========================================================================== 423 / 424zio_t 
425zio_walk_parents(zio_t cio, zio_link_t zl) 426{ 427* list_t pl = &cio->io_parent_list; 428* 429 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 430* if (zl == NULL) 431* return (NULL); 432 433 ASSERT((zl)->zl_child == cio); 434* return ((zl)->zl_parent); 435} 436* 437zio_t * 438zio_walk_children(zio_t pio, zio_link_t zl) 439{ 440* list_t cl = &pio->io_child_list; 441*
	442 ASSERT(MUTEX_HELD(&pio->io_lock)); 443
441 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 442* if (zl == NULL) 443* return (NULL); 444 445 ASSERT((zl)->zl_parent == pio); 446* return ((zl)->zl_child); 447} 448* 449zio_t * 450zio_unique_parent(zio_t cio) 451{ 452* zio_link_t zl = NULL; 453* zio_t pio = zio_walk_parents(cio, &zl); 454* 455 VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); 456 return (pio); 457} 458 459void 460zio_add_child(zio_t pio, zio_t cio) 461{ 462 zio_link_t zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 463* 464 /* 465 * Logical I/Os can have logical, gang, or vdev children. 466 * Gang I/Os can have gang or vdev children. 467 * Vdev I/Os can only have vdev children. 468 * The following ASSERT captures all of these constraints. 469 / 470* ASSERT3S(cio->io_child_type, <=, pio->io_child_type); 471 472 zl->zl_parent = pio; 473 zl->zl_child = cio; 474	444 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 445* if (zl == NULL) 446* return (NULL); 447 448 ASSERT((zl)->zl_parent == pio); 449* return ((zl)->zl_child); 450} 451* 452zio_t * 453zio_unique_parent(zio_t cio) 454{ 455* zio_link_t zl = NULL; 456* zio_t pio = zio_walk_parents(cio, &zl); 457* 458 VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); 459 return (pio); 460} 461 462void 463zio_add_child(zio_t pio, zio_t cio) 464{ 465 zio_link_t zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 466* 467 /* 468 * Logical I/Os can have logical, gang, or vdev children. 469 * Gang I/Os can have gang or vdev children. 470 * Vdev I/Os can only have vdev children. 471 * The following ASSERT captures all of these constraints. 472 / 473* ASSERT3S(cio->io_child_type, <=, pio->io_child_type); 474 475 zl->zl_parent = pio; 476 zl->zl_child = cio; 477
475 mutex_enter(&cio->io_lock);
476 mutex_enter(&pio->io_lock);	478 mutex_enter(&pio->io_lock);
	479 mutex_enter(&cio->io_lock);
477 478 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 479 480 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 481 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 482 483 list_insert_head(&pio->io_child_list, zl); 484 list_insert_head(&cio->io_parent_list, zl); 485 486 pio->io_child_count++; 487 cio->io_parent_count++; 488	480 481 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 482 483 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 484 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 485 486 list_insert_head(&pio->io_child_list, zl); 487 list_insert_head(&cio->io_parent_list, zl); 488 489 pio->io_child_count++; 490 cio->io_parent_count++; 491
489 mutex_exit(&pio->io_lock);
490 mutex_exit(&cio->io_lock);	492 mutex_exit(&cio->io_lock);
	493 mutex_exit(&pio->io_lock);
491} 492 493static void 494zio_remove_child(zio_t pio, zio_t cio, zio_link_t zl) 495{ 496* ASSERT(zl->zl_parent == pio); 497 ASSERT(zl->zl_child == cio); 498	494} 495 496static void 497zio_remove_child(zio_t pio, zio_t cio, zio_link_t zl) 498{ 499* ASSERT(zl->zl_parent == pio); 500 ASSERT(zl->zl_child == cio); 501
499 mutex_enter(&cio->io_lock);
500 mutex_enter(&pio->io_lock);	502 mutex_enter(&pio->io_lock);
	503 mutex_enter(&cio->io_lock);
501 502 list_remove(&pio->io_child_list, zl); 503 list_remove(&cio->io_parent_list, zl); 504 505 pio->io_child_count--; 506 cio->io_parent_count--; 507	504 505 list_remove(&pio->io_child_list, zl); 506 list_remove(&cio->io_parent_list, zl); 507 508 pio->io_child_count--; 509 cio->io_parent_count--; 510
508 mutex_exit(&pio->io_lock);
509 mutex_exit(&cio->io_lock);	511 mutex_exit(&cio->io_lock);
510	512 mutex_exit(&pio->io_lock);
511 kmem_cache_free(zio_link_cache, zl); 512} 513 514static boolean_t 515zio_wait_for_children(zio_t zio, uint8_t childbits, enum zio_wait_type wait) 516{ 517* boolean_t waiting = B_FALSE; 518 519 mutex_enter(&zio->io_lock); 520 ASSERT(zio->io_stall == NULL); 521 for (int c = 0; c < ZIO_CHILD_TYPES; c++) { 522 if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) 523 continue; 524 525 uint64_t countp = &zio->io_children[c][wait]; 526* if (countp != 0) { 527* zio->io_stage >>= 1; 528 ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); 529 zio->io_stall = countp; 530 waiting = B_TRUE; 531 break; 532 } 533 } 534 mutex_exit(&zio->io_lock); 535 return (waiting); 536} 537 538static void 539zio_notify_parent(zio_t pio, zio_t zio, enum zio_wait_type wait) 540{ 541 uint64_t countp = &pio->io_children[zio->io_child_type][wait]; 542* int errorp = &pio->io_child_error[zio->io_child_type]; 543* 544 mutex_enter(&pio->io_lock); 545 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 546 errorp = zio_worst_error(errorp, zio->io_error); 547 pio->io_reexecute \|= zio->io_reexecute; 548 ASSERT3U(countp, >, 0); 549* 550 (countp)--; 551* 552 if (countp == 0 && pio->io_stall == countp) { 553* zio_taskq_type_t type = 554 pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : 555 ZIO_TASKQ_INTERRUPT; 556 pio->io_stall = NULL; 557 mutex_exit(&pio->io_lock); 558 /* 559 * Dispatch the parent zio in its own taskq so that 560 * the child can continue to make progress. This also 561 * prevents overflowing the stack when we have deeply nested 562 * parent-child relationships. 
563 / 564* zio_taskq_dispatch(pio, type, B_FALSE); 565 } else { 566 mutex_exit(&pio->io_lock); 567 } 568} 569 570static void 571zio_inherit_child_errors(zio_t zio, enum zio_child c) 572{ 573* if (zio->io_child_error[c] != 0 && zio->io_error == 0) 574 zio->io_error = zio->io_child_error[c]; 575} 576 577int 578zio_bookmark_compare(const void x1, const void x2) 579{ 580 const zio_t z1 = x1; 581* const zio_t z2 = x2; 582* 583 if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) 584 return (-1); 585 if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) 586 return (1); 587 588 if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) 589 return (-1); 590 if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) 591 return (1); 592 593 if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) 594 return (-1); 595 if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) 596 return (1); 597 598 if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) 599 return (-1); 600 if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) 601 return (1); 602 603 if (z1 < z2) 604 return (-1); 605 if (z1 > z2) 606 return (1); 607 608 return (0); 609} 610 611/* 612 * ========================================================================== 613 * Create the various types of I/O (read, write, free, etc) 614 * ========================================================================== 615 / 616static zio_t 617zio_create(zio_t pio, spa_t spa, uint64_t txg, const blkptr_t bp, 618* abd_t data, uint64_t lsize, uint64_t psize, zio_done_func_t done, 619 void private, zio_type_t type, zio_priority_t priority, 620* enum zio_flag flags, vdev_t vd, uint64_t offset, 621* const zbookmark_phys_t zb, enum zio_stage stage, enum zio_stage pipeline) 622{ 623* zio_t zio; 624* 625 ASSERT3U(type == ZIO_TYPE_FREE \|\| psize, <=, SPA_MAXBLOCKSIZE); 626 ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); 627 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 628 629 ASSERT(!vd \|\| spa_config_held(spa, 
SCL_STATE_ALL, RW_READER)); 630 ASSERT(!bp \|\| !(flags & ZIO_FLAG_CONFIG_WRITER)); 631 ASSERT(vd \|\| stage == ZIO_STAGE_OPEN); 632 633 IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); 634 635 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 636 bzero(zio, sizeof (zio_t)); 637 638 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 639 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 640 641 list_create(&zio->io_parent_list, sizeof (zio_link_t), 642 offsetof(zio_link_t, zl_parent_node)); 643 list_create(&zio->io_child_list, sizeof (zio_link_t), 644 offsetof(zio_link_t, zl_child_node)); 645 metaslab_trace_init(&zio->io_alloc_list); 646 647 if (vd != NULL) 648 zio->io_child_type = ZIO_CHILD_VDEV; 649 else if (flags & ZIO_FLAG_GANG_CHILD) 650 zio->io_child_type = ZIO_CHILD_GANG; 651 else if (flags & ZIO_FLAG_DDT_CHILD) 652 zio->io_child_type = ZIO_CHILD_DDT; 653 else 654 zio->io_child_type = ZIO_CHILD_LOGICAL; 655 656 if (bp != NULL) { 657 zio->io_bp = (blkptr_t )bp; 658* zio->io_bp_copy = bp; 659* zio->io_bp_orig = bp; 660* if (type != ZIO_TYPE_WRITE \|\| 661 zio->io_child_type == ZIO_CHILD_DDT) 662 zio->io_bp = &zio->io_bp_copy; /* so caller can free / 663* if (zio->io_child_type == ZIO_CHILD_LOGICAL) 664 zio->io_logical = zio; 665 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 666 pipeline \|= ZIO_GANG_STAGES; 667 } 668 669 zio->io_spa = spa; 670 zio->io_txg = txg; 671 zio->io_done = done; 672 zio->io_private = private; 673 zio->io_type = type; 674 zio->io_priority = priority; 675 zio->io_vd = vd; 676 zio->io_offset = offset; 677 zio->io_orig_abd = zio->io_abd = data; 678 zio->io_orig_size = zio->io_size = psize; 679 zio->io_lsize = lsize; 680 zio->io_orig_flags = zio->io_flags = flags; 681 zio->io_orig_stage = zio->io_stage = stage; 682 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 683 zio->io_pipeline_trace = ZIO_STAGE_OPEN; 684 685 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 686 zio->io_state[ZIO_WAIT_DONE] = (stage >= 
ZIO_STAGE_DONE); 687 688 if (zb != NULL) 689 zio->io_bookmark = zb; 690* 691 if (pio != NULL) { 692 if (zio->io_logical == NULL) 693 zio->io_logical = pio->io_logical; 694 if (zio->io_child_type == ZIO_CHILD_GANG) 695 zio->io_gang_leader = pio->io_gang_leader; 696 zio_add_child(pio, zio); 697 } 698 699 return (zio); 700} 701 702static void 703zio_destroy(zio_t zio) 704{ 705* metaslab_trace_fini(&zio->io_alloc_list); 706 list_destroy(&zio->io_parent_list); 707 list_destroy(&zio->io_child_list); 708 mutex_destroy(&zio->io_lock); 709 cv_destroy(&zio->io_cv); 710 kmem_cache_free(zio_cache, zio); 711} 712 713zio_t * 714zio_null(zio_t pio, spa_t spa, vdev_t vd, zio_done_func_t done, 715 void private, enum zio_flag flags) 716{ 717* zio_t zio; 718* 719 zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, 720 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 721 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 722 723 return (zio); 724} 725 726zio_t * 727zio_root(spa_t spa, zio_done_func_t done, void private, enum zio_flag flags) 728{ 729* return (zio_null(NULL, spa, NULL, done, private, flags)); 730} 731 732void 733zfs_blkptr_verify(spa_t spa, const blkptr_t bp) 734{ 735 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { 736 zfs_panic_recover("blkptr at %p has invalid TYPE %llu", 737 bp, (longlong_t)BP_GET_TYPE(bp)); 738 } 739 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS \|\| 740 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { 741 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", 742 bp, (longlong_t)BP_GET_CHECKSUM(bp)); 743 } 744 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS \|\| 745 BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { 746 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", 747 bp, (longlong_t)BP_GET_COMPRESS(bp)); 748 } 749 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { 750 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", 751 bp, (longlong_t)BP_GET_LSIZE(bp)); 752 } 753 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { 754 zfs_panic_recover("blkptr at 
%p has invalid PSIZE %llu", 755 bp, (longlong_t)BP_GET_PSIZE(bp)); 756 } 757 758 if (BP_IS_EMBEDDED(bp)) { 759 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { 760 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", 761 bp, (longlong_t)BPE_GET_ETYPE(bp)); 762 } 763 } 764 765 /* 766 * Do not verify individual DVAs if the config is not trusted. This 767 * will be done once the zio is executed in vdev_mirror_map_alloc. 768 / 769* if (!spa->spa_trust_config) 770 return; 771 772 /* 773 * Pool-specific checks. 774 * 775 * Note: it would be nice to verify that the blk_birth and 776 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() 777 * allows the birth time of log blocks (and dmu_sync()-ed blocks 778 * that are in the log) to be arbitrarily large. 779 / 780* for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 781 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); 782 if (vdevid >= spa->spa_root_vdev->vdev_children) { 783 zfs_panic_recover("blkptr at %p DVA %u has invalid " 784 "VDEV %llu", 785 bp, i, (longlong_t)vdevid); 786 continue; 787 } 788 vdev_t vd = spa->spa_root_vdev->vdev_child[vdevid]; 789* if (vd == NULL) { 790 zfs_panic_recover("blkptr at %p DVA %u has invalid " 791 "VDEV %llu", 792 bp, i, (longlong_t)vdevid); 793 continue; 794 } 795 if (vd->vdev_ops == &vdev_hole_ops) { 796 zfs_panic_recover("blkptr at %p DVA %u has hole " 797 "VDEV %llu", 798 bp, i, (longlong_t)vdevid); 799 continue; 800 } 801 if (vd->vdev_ops == &vdev_missing_ops) { 802 /* 803 * "missing" vdevs are valid during import, but we 804 * don't have their detailed info (e.g. asize), so 805 * we can't perform any more checks on them. 
806 / 807* continue; 808 } 809 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 810 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); 811 if (BP_IS_GANG(bp)) 812 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 813 if (offset + asize > vd->vdev_asize) { 814 zfs_panic_recover("blkptr at %p DVA %u has invalid " 815 "OFFSET %llu", 816 bp, i, (longlong_t)offset); 817 } 818 } 819} 820 821boolean_t 822zfs_dva_valid(spa_t spa, const dva_t dva, const blkptr_t bp) 823{ 824* uint64_t vdevid = DVA_GET_VDEV(dva); 825 826 if (vdevid >= spa->spa_root_vdev->vdev_children) 827 return (B_FALSE); 828 829 vdev_t vd = spa->spa_root_vdev->vdev_child[vdevid]; 830* if (vd == NULL) 831 return (B_FALSE); 832 833 if (vd->vdev_ops == &vdev_hole_ops) 834 return (B_FALSE); 835 836 if (vd->vdev_ops == &vdev_missing_ops) { 837 return (B_FALSE); 838 } 839 840 uint64_t offset = DVA_GET_OFFSET(dva); 841 uint64_t asize = DVA_GET_ASIZE(dva); 842 843 if (BP_IS_GANG(bp)) 844 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 845 if (offset + asize > vd->vdev_asize) 846 return (B_FALSE); 847 848 return (B_TRUE); 849} 850 851zio_t * 852zio_read(zio_t pio, spa_t spa, const blkptr_t bp, 853* abd_t data, uint64_t size, zio_done_func_t done, void private, 854* zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t zb) 855{ 856* zio_t zio; 857* 858 zfs_blkptr_verify(spa, bp); 859 860 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 861 data, size, size, done, private, 862 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 863 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
864 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 865 866 return (zio); 867} 868 869zio_t * 870zio_write(zio_t pio, spa_t spa, uint64_t txg, blkptr_t bp, 871* abd_t data, uint64_t lsize, uint64_t psize, const zio_prop_t zp, 872 zio_done_func_t ready, zio_done_func_t children_ready, 873 zio_done_func_t physdone, zio_done_func_t done, 874 void private, zio_priority_t priority, enum zio_flag flags, 875* const zbookmark_phys_t zb) 876{ 877* zio_t zio; 878* 879 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 880 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 881 zp->zp_compress >= ZIO_COMPRESS_OFF && 882 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 883 DMU_OT_IS_VALID(zp->zp_type) && 884 zp->zp_level < 32 && 885 zp->zp_copies > 0 && 886 zp->zp_copies <= spa_max_replication(spa)); 887 888 zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, 889 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 890 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 891 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 892 893 zio->io_ready = ready; 894 zio->io_children_ready = children_ready; 895 zio->io_physdone = physdone; 896 zio->io_prop = zp; 897* 898 /* 899 * Data can be NULL if we are going to call zio_write_override() to 900 * provide the already-allocated BP. But we may need the data to 901 * verify a dedup hit (if requested). In this case, don't try to 902 * dedup (just take the already-allocated BP verbatim). 
903 / 904* if (data == NULL && zio->io_prop.zp_dedup_verify) { 905 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; 906 } 907 908 return (zio); 909} 910 911zio_t * 912zio_rewrite(zio_t pio, spa_t spa, uint64_t txg, blkptr_t bp, abd_t data, 913 uint64_t size, zio_done_func_t done, void private, 914 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t zb) 915{ 916* zio_t zio; 917* 918 zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, 919 ZIO_TYPE_WRITE, priority, flags \| ZIO_FLAG_IO_REWRITE, NULL, 0, zb, 920 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 921 922 return (zio); 923} 924 925void 926zio_write_override(zio_t zio, blkptr_t bp, int copies, boolean_t nopwrite) 927{ 928 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 929 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 930 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 931 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 932 933 /* 934 * We must reset the io_prop to match the values that existed 935 * when the bp was first written by dmu_sync() keeping in mind 936 * that nopwrite and dedup are mutually exclusive. 937 / 938* zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 939 zio->io_prop.zp_nopwrite = nopwrite; 940 zio->io_prop.zp_copies = copies; 941 zio->io_bp_override = bp; 942} 943 944void 945zio_free(spa_t spa, uint64_t txg, const blkptr_t bp) 946{ 947 948 zfs_blkptr_verify(spa, bp); 949 950 /* 951 * The check for EMBEDDED is a performance optimization. We 952 * process the free here (by ignoring it) rather than 953 * putting it on the list and then processing it in zio_free_sync(). 954 / 955* if (BP_IS_EMBEDDED(bp)) 956 return; 957 metaslab_check_free(spa, bp); 958 959 /* 960 * Frees that are for the currently-syncing txg, are not going to be 961 * deferred, and which will not need to do a read (i.e. not GANG or 962 * DEDUP), can be processed immediately. Otherwise, put them on the 963 * in-memory list for later processing. 
964 / 965* if (zfs_trim_enabled \|\| BP_IS_GANG(bp) \|\| BP_GET_DEDUP(bp) \|\| 966 txg != spa->spa_syncing_txg \|\| 967 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { 968 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 969 } else { 970 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 971 BP_GET_PSIZE(bp), 0))); 972 } 973} 974 975zio_t * 976zio_free_sync(zio_t pio, spa_t spa, uint64_t txg, const blkptr_t bp, 977* uint64_t size, enum zio_flag flags) 978{ 979 zio_t zio; 980* enum zio_stage stage = ZIO_FREE_PIPELINE; 981 982 ASSERT(!BP_IS_HOLE(bp)); 983 ASSERT(spa_syncing_txg(spa) == txg); 984 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); 985 986 if (BP_IS_EMBEDDED(bp)) 987 return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 988 989 metaslab_check_free(spa, bp); 990 arc_freed(spa, bp);	513 kmem_cache_free(zio_link_cache, zl); 514} 515 516static boolean_t 517zio_wait_for_children(zio_t zio, uint8_t childbits, enum zio_wait_type wait) 518{ 519* boolean_t waiting = B_FALSE; 520 521 mutex_enter(&zio->io_lock); 522 ASSERT(zio->io_stall == NULL); 523 for (int c = 0; c < ZIO_CHILD_TYPES; c++) { 524 if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) 525 continue; 526 527 uint64_t countp = &zio->io_children[c][wait]; 528* if (countp != 0) { 529* zio->io_stage >>= 1; 530 ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); 531 zio->io_stall = countp; 532 waiting = B_TRUE; 533 break; 534 } 535 } 536 mutex_exit(&zio->io_lock); 537 return (waiting); 538} 539 540static void 541zio_notify_parent(zio_t pio, zio_t zio, enum zio_wait_type wait) 542{ 543 uint64_t countp = &pio->io_children[zio->io_child_type][wait]; 544* int errorp = &pio->io_child_error[zio->io_child_type]; 545* 546 mutex_enter(&pio->io_lock); 547 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 548 errorp = zio_worst_error(errorp, zio->io_error); 549 pio->io_reexecute \|= zio->io_reexecute; 550 ASSERT3U(countp, >, 0); 551* 552 (countp)--; 553* 554 if (countp == 0 && pio->io_stall == countp) 
{ 555* zio_taskq_type_t type = 556 pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : 557 ZIO_TASKQ_INTERRUPT; 558 pio->io_stall = NULL; 559 mutex_exit(&pio->io_lock); 560 /* 561 * Dispatch the parent zio in its own taskq so that 562 * the child can continue to make progress. This also 563 * prevents overflowing the stack when we have deeply nested 564 * parent-child relationships. 565 / 566* zio_taskq_dispatch(pio, type, B_FALSE); 567 } else { 568 mutex_exit(&pio->io_lock); 569 } 570} 571 572static void 573zio_inherit_child_errors(zio_t zio, enum zio_child c) 574{ 575* if (zio->io_child_error[c] != 0 && zio->io_error == 0) 576 zio->io_error = zio->io_child_error[c]; 577} 578 579int 580zio_bookmark_compare(const void x1, const void x2) 581{ 582 const zio_t z1 = x1; 583* const zio_t z2 = x2; 584* 585 if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) 586 return (-1); 587 if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) 588 return (1); 589 590 if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) 591 return (-1); 592 if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) 593 return (1); 594 595 if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) 596 return (-1); 597 if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) 598 return (1); 599 600 if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) 601 return (-1); 602 if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) 603 return (1); 604 605 if (z1 < z2) 606 return (-1); 607 if (z1 > z2) 608 return (1); 609 610 return (0); 611} 612 613/* 614 * ========================================================================== 615 * Create the various types of I/O (read, write, free, etc) 616 * ========================================================================== 617 / 618static zio_t 619zio_create(zio_t pio, spa_t spa, uint64_t txg, const blkptr_t bp, 620* abd_t data, uint64_t lsize, uint64_t psize, zio_done_func_t done, 621 void private, zio_type_t type, zio_priority_t 
priority, 622* enum zio_flag flags, vdev_t vd, uint64_t offset, 623* const zbookmark_phys_t zb, enum zio_stage stage, enum zio_stage pipeline) 624{ 625* zio_t zio; 626* 627 ASSERT3U(type == ZIO_TYPE_FREE \|\| psize, <=, SPA_MAXBLOCKSIZE); 628 ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); 629 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 630 631 ASSERT(!vd \|\| spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 632 ASSERT(!bp \|\| !(flags & ZIO_FLAG_CONFIG_WRITER)); 633 ASSERT(vd \|\| stage == ZIO_STAGE_OPEN); 634 635 IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); 636 637 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 638 bzero(zio, sizeof (zio_t)); 639 640 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 641 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 642 643 list_create(&zio->io_parent_list, sizeof (zio_link_t), 644 offsetof(zio_link_t, zl_parent_node)); 645 list_create(&zio->io_child_list, sizeof (zio_link_t), 646 offsetof(zio_link_t, zl_child_node)); 647 metaslab_trace_init(&zio->io_alloc_list); 648 649 if (vd != NULL) 650 zio->io_child_type = ZIO_CHILD_VDEV; 651 else if (flags & ZIO_FLAG_GANG_CHILD) 652 zio->io_child_type = ZIO_CHILD_GANG; 653 else if (flags & ZIO_FLAG_DDT_CHILD) 654 zio->io_child_type = ZIO_CHILD_DDT; 655 else 656 zio->io_child_type = ZIO_CHILD_LOGICAL; 657 658 if (bp != NULL) { 659 zio->io_bp = (blkptr_t )bp; 660* zio->io_bp_copy = bp; 661* zio->io_bp_orig = bp; 662* if (type != ZIO_TYPE_WRITE \|\| 663 zio->io_child_type == ZIO_CHILD_DDT) 664 zio->io_bp = &zio->io_bp_copy; /* so caller can free / 665* if (zio->io_child_type == ZIO_CHILD_LOGICAL) 666 zio->io_logical = zio; 667 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 668 pipeline \|= ZIO_GANG_STAGES; 669 } 670 671 zio->io_spa = spa; 672 zio->io_txg = txg; 673 zio->io_done = done; 674 zio->io_private = private; 675 zio->io_type = type; 676 zio->io_priority = priority; 677 zio->io_vd = vd; 678 zio->io_offset = offset; 679 zio->io_orig_abd = zio->io_abd = data; 680 
zio->io_orig_size = zio->io_size = psize; 681 zio->io_lsize = lsize; 682 zio->io_orig_flags = zio->io_flags = flags; 683 zio->io_orig_stage = zio->io_stage = stage; 684 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 685 zio->io_pipeline_trace = ZIO_STAGE_OPEN; 686 687 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 688 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 689 690 if (zb != NULL) 691 zio->io_bookmark = zb; 692* 693 if (pio != NULL) { 694 if (zio->io_logical == NULL) 695 zio->io_logical = pio->io_logical; 696 if (zio->io_child_type == ZIO_CHILD_GANG) 697 zio->io_gang_leader = pio->io_gang_leader; 698 zio_add_child(pio, zio); 699 } 700 701 return (zio); 702} 703 704static void 705zio_destroy(zio_t zio) 706{ 707* metaslab_trace_fini(&zio->io_alloc_list); 708 list_destroy(&zio->io_parent_list); 709 list_destroy(&zio->io_child_list); 710 mutex_destroy(&zio->io_lock); 711 cv_destroy(&zio->io_cv); 712 kmem_cache_free(zio_cache, zio); 713} 714 715zio_t * 716zio_null(zio_t pio, spa_t spa, vdev_t vd, zio_done_func_t done, 717 void private, enum zio_flag flags) 718{ 719* zio_t zio; 720* 721 zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, 722 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 723 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 724 725 return (zio); 726} 727 728zio_t * 729zio_root(spa_t spa, zio_done_func_t done, void private, enum zio_flag flags) 730{ 731* return (zio_null(NULL, spa, NULL, done, private, flags)); 732} 733 734void 735zfs_blkptr_verify(spa_t spa, const blkptr_t bp) 736{ 737 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { 738 zfs_panic_recover("blkptr at %p has invalid TYPE %llu", 739 bp, (longlong_t)BP_GET_TYPE(bp)); 740 } 741 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS \|\| 742 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { 743 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", 744 bp, (longlong_t)BP_GET_CHECKSUM(bp)); 745 } 746 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS \|\| 747 
BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { 748 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", 749 bp, (longlong_t)BP_GET_COMPRESS(bp)); 750 } 751 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { 752 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", 753 bp, (longlong_t)BP_GET_LSIZE(bp)); 754 } 755 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { 756 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", 757 bp, (longlong_t)BP_GET_PSIZE(bp)); 758 } 759 760 if (BP_IS_EMBEDDED(bp)) { 761 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { 762 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", 763 bp, (longlong_t)BPE_GET_ETYPE(bp)); 764 } 765 } 766 767 /* 768 * Do not verify individual DVAs if the config is not trusted. This 769 * will be done once the zio is executed in vdev_mirror_map_alloc. 770 / 771* if (!spa->spa_trust_config) 772 return; 773 774 /* 775 * Pool-specific checks. 776 * 777 * Note: it would be nice to verify that the blk_birth and 778 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() 779 * allows the birth time of log blocks (and dmu_sync()-ed blocks 780 * that are in the log) to be arbitrarily large. 781 / 782* for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 783 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); 784 if (vdevid >= spa->spa_root_vdev->vdev_children) { 785 zfs_panic_recover("blkptr at %p DVA %u has invalid " 786 "VDEV %llu", 787 bp, i, (longlong_t)vdevid); 788 continue; 789 } 790 vdev_t vd = spa->spa_root_vdev->vdev_child[vdevid]; 791* if (vd == NULL) { 792 zfs_panic_recover("blkptr at %p DVA %u has invalid " 793 "VDEV %llu", 794 bp, i, (longlong_t)vdevid); 795 continue; 796 } 797 if (vd->vdev_ops == &vdev_hole_ops) { 798 zfs_panic_recover("blkptr at %p DVA %u has hole " 799 "VDEV %llu", 800 bp, i, (longlong_t)vdevid); 801 continue; 802 } 803 if (vd->vdev_ops == &vdev_missing_ops) { 804 /* 805 * "missing" vdevs are valid during import, but we 806 * don't have their detailed info (e.g. 
asize), so 807 * we can't perform any more checks on them. 808 / 809* continue; 810 } 811 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 812 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); 813 if (BP_IS_GANG(bp)) 814 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 815 if (offset + asize > vd->vdev_asize) { 816 zfs_panic_recover("blkptr at %p DVA %u has invalid " 817 "OFFSET %llu", 818 bp, i, (longlong_t)offset); 819 } 820 } 821} 822 823boolean_t 824zfs_dva_valid(spa_t spa, const dva_t dva, const blkptr_t bp) 825{ 826* uint64_t vdevid = DVA_GET_VDEV(dva); 827 828 if (vdevid >= spa->spa_root_vdev->vdev_children) 829 return (B_FALSE); 830 831 vdev_t vd = spa->spa_root_vdev->vdev_child[vdevid]; 832* if (vd == NULL) 833 return (B_FALSE); 834 835 if (vd->vdev_ops == &vdev_hole_ops) 836 return (B_FALSE); 837 838 if (vd->vdev_ops == &vdev_missing_ops) { 839 return (B_FALSE); 840 } 841 842 uint64_t offset = DVA_GET_OFFSET(dva); 843 uint64_t asize = DVA_GET_ASIZE(dva); 844 845 if (BP_IS_GANG(bp)) 846 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 847 if (offset + asize > vd->vdev_asize) 848 return (B_FALSE); 849 850 return (B_TRUE); 851} 852 853zio_t * 854zio_read(zio_t pio, spa_t spa, const blkptr_t bp, 855* abd_t data, uint64_t size, zio_done_func_t done, void private, 856* zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t zb) 857{ 858* zio_t zio; 859* 860 zfs_blkptr_verify(spa, bp); 861 862 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 863 data, size, size, done, private, 864 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 865 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
866 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 867 868 return (zio); 869} 870 871zio_t * 872zio_write(zio_t pio, spa_t spa, uint64_t txg, blkptr_t bp, 873* abd_t data, uint64_t lsize, uint64_t psize, const zio_prop_t zp, 874 zio_done_func_t ready, zio_done_func_t children_ready, 875 zio_done_func_t physdone, zio_done_func_t done, 876 void private, zio_priority_t priority, enum zio_flag flags, 877* const zbookmark_phys_t zb) 878{ 879* zio_t zio; 880* 881 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 882 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 883 zp->zp_compress >= ZIO_COMPRESS_OFF && 884 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 885 DMU_OT_IS_VALID(zp->zp_type) && 886 zp->zp_level < 32 && 887 zp->zp_copies > 0 && 888 zp->zp_copies <= spa_max_replication(spa)); 889 890 zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, 891 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 892 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 893 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 894 895 zio->io_ready = ready; 896 zio->io_children_ready = children_ready; 897 zio->io_physdone = physdone; 898 zio->io_prop = zp; 899* 900 /* 901 * Data can be NULL if we are going to call zio_write_override() to 902 * provide the already-allocated BP. But we may need the data to 903 * verify a dedup hit (if requested). In this case, don't try to 904 * dedup (just take the already-allocated BP verbatim). 
905 / 906* if (data == NULL && zio->io_prop.zp_dedup_verify) { 907 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; 908 } 909 910 return (zio); 911} 912 913zio_t * 914zio_rewrite(zio_t pio, spa_t spa, uint64_t txg, blkptr_t bp, abd_t data, 915 uint64_t size, zio_done_func_t done, void private, 916 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t zb) 917{ 918* zio_t zio; 919* 920 zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, 921 ZIO_TYPE_WRITE, priority, flags \| ZIO_FLAG_IO_REWRITE, NULL, 0, zb, 922 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 923 924 return (zio); 925} 926 927void 928zio_write_override(zio_t zio, blkptr_t bp, int copies, boolean_t nopwrite) 929{ 930 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 931 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 932 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 933 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 934 935 /* 936 * We must reset the io_prop to match the values that existed 937 * when the bp was first written by dmu_sync() keeping in mind 938 * that nopwrite and dedup are mutually exclusive. 939 / 940* zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 941 zio->io_prop.zp_nopwrite = nopwrite; 942 zio->io_prop.zp_copies = copies; 943 zio->io_bp_override = bp; 944} 945 946void 947zio_free(spa_t spa, uint64_t txg, const blkptr_t bp) 948{ 949 950 zfs_blkptr_verify(spa, bp); 951 952 /* 953 * The check for EMBEDDED is a performance optimization. We 954 * process the free here (by ignoring it) rather than 955 * putting it on the list and then processing it in zio_free_sync(). 956 / 957* if (BP_IS_EMBEDDED(bp)) 958 return; 959 metaslab_check_free(spa, bp); 960 961 /* 962 * Frees that are for the currently-syncing txg, are not going to be 963 * deferred, and which will not need to do a read (i.e. not GANG or 964 * DEDUP), can be processed immediately. Otherwise, put them on the 965 * in-memory list for later processing. 
966 / 967* if (zfs_trim_enabled \|\| BP_IS_GANG(bp) \|\| BP_GET_DEDUP(bp) \|\| 968 txg != spa->spa_syncing_txg \|\| 969 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { 970 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 971 } else { 972 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 973 BP_GET_PSIZE(bp), 0))); 974 } 975} 976 977zio_t * 978zio_free_sync(zio_t pio, spa_t spa, uint64_t txg, const blkptr_t bp, 979* uint64_t size, enum zio_flag flags) 980{ 981 zio_t zio; 982* enum zio_stage stage = ZIO_FREE_PIPELINE; 983 984 ASSERT(!BP_IS_HOLE(bp)); 985 ASSERT(spa_syncing_txg(spa) == txg); 986 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); 987 988 if (BP_IS_EMBEDDED(bp)) 989 return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 990 991 metaslab_check_free(spa, bp); 992 arc_freed(spa, bp);
	993 dsl_scan_freed(spa, bp);
991 992 if (zfs_trim_enabled) 993 stage \|= ZIO_STAGE_ISSUE_ASYNC \| ZIO_STAGE_VDEV_IO_START \| 994 ZIO_STAGE_VDEV_IO_ASSESS; 995 /* 996 * GANG and DEDUP blocks can induce a read (for the gang block header, 997 * or the DDT), so issue them asynchronously so that this thread is 998 * not tied up. 999 / 1000* else if (BP_IS_GANG(bp) \|\| BP_GET_DEDUP(bp)) 1001 stage \|= ZIO_STAGE_ISSUE_ASYNC; 1002 1003 flags \|= ZIO_FLAG_DONT_QUEUE; 1004 1005 zio = zio_create(pio, spa, txg, bp, NULL, size, 1006 size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, 1007 flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); 1008 1009 return (zio); 1010} 1011 1012zio_t * 1013zio_claim(zio_t pio, spa_t spa, uint64_t txg, const blkptr_t bp, 1014* zio_done_func_t done, void private, enum zio_flag flags) 1015{ 1016 zio_t zio; 1017* 1018 zfs_blkptr_verify(spa, bp); 1019 1020 if (BP_IS_EMBEDDED(bp)) 1021 return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 1022 1023 /* 1024 * A claim is an allocation of a specific block. Claims are needed 1025 * to support immediate writes in the intent log. The issue is that 1026 * immediate writes contain committed data, but in a txg that was 1027 * not committed. Upon opening the pool after an unclean shutdown, 1028 * the intent log claims all blocks that contain immediate write data 1029 * so that the SPA knows they're in use. 1030 * 1031 * All claims must be resolved in the first txg -- before the SPA 1032 * starts allocating blocks -- so that nothing is allocated twice. 1033 * If txg == 0 we just verify that the block is claimable. 
1034 / 1035* ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, 1036 spa_min_claim_txg(spa)); 1037 ASSERT(txg == spa_min_claim_txg(spa) \|\| txg == 0); 1038 ASSERT(!BP_GET_DEDUP(bp) \|\| !spa_writeable(spa)); /* zdb(1M) / 1039* 1040 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 1041 BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 1042 flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 1043 ASSERT0(zio->io_queued_timestamp); 1044 1045 return (zio); 1046} 1047 1048zio_t * 1049zio_ioctl(zio_t pio, spa_t spa, vdev_t vd, int cmd, uint64_t offset, 1050* uint64_t size, zio_done_func_t done, void private, 1051 zio_priority_t priority, enum zio_flag flags) 1052{ 1053 zio_t zio; 1054* int c; 1055 1056 if (vd->vdev_children == 0) { 1057 zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, 1058 ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 1059 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 1060 1061 zio->io_cmd = cmd; 1062 } else { 1063 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 1064 1065 for (c = 0; c < vd->vdev_children; c++) 1066 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 1067 offset, size, done, private, priority, flags)); 1068 } 1069 1070 return (zio); 1071} 1072 1073zio_t * 1074zio_read_phys(zio_t pio, vdev_t vd, uint64_t offset, uint64_t size, 1075 abd_t data, int checksum, zio_done_func_t done, void private, 1076* zio_priority_t priority, enum zio_flag flags, boolean_t labels) 1077{ 1078 zio_t zio; 1079* 1080 ASSERT(vd->vdev_children == 0); 1081 ASSERT(!labels \|\| offset + size <= VDEV_LABEL_START_SIZE \|\| 1082 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1083 ASSERT3U(offset + size, <=, vd->vdev_psize); 1084 1085 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, 1086 private, ZIO_TYPE_READ, priority, flags \| ZIO_FLAG_PHYSICAL, vd, 1087 offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 1088 1089 zio->io_prop.zp_checksum = checksum; 1090 1091 return (zio); 1092} 1093 
1094zio_t * 1095zio_write_phys(zio_t pio, vdev_t vd, uint64_t offset, uint64_t size, 1096 abd_t data, int checksum, zio_done_func_t done, void private, 1097* zio_priority_t priority, enum zio_flag flags, boolean_t labels) 1098{ 1099 zio_t zio; 1100* 1101 ASSERT(vd->vdev_children == 0); 1102 ASSERT(!labels \|\| offset + size <= VDEV_LABEL_START_SIZE \|\| 1103 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1104 ASSERT3U(offset + size, <=, vd->vdev_psize); 1105 1106 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, 1107 private, ZIO_TYPE_WRITE, priority, flags \| ZIO_FLAG_PHYSICAL, vd, 1108 offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 1109 1110 zio->io_prop.zp_checksum = checksum; 1111 1112 if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 1113 /* 1114 * zec checksums are necessarily destructive -- they modify 1115 * the end of the write buffer to hold the verifier/checksum. 1116 * Therefore, we must make a local copy in case the data is 1117 * being written to multiple places in parallel. 1118 / 1119* abd_t wbuf = abd_alloc_sametype(data, size); 1120* abd_copy(wbuf, data, size); 1121 1122 zio_push_transform(zio, wbuf, size, size, NULL); 1123 } 1124 1125 return (zio); 1126} 1127 1128/* 1129 * Create a child I/O to do some work for us. 1130 / 1131zio_t 1132zio_vdev_child_io(zio_t pio, blkptr_t bp, vdev_t vd, uint64_t offset, 1133* abd_t data, uint64_t size, int type, zio_priority_t priority, 1134* enum zio_flag flags, zio_done_func_t done, void private) 1135{ 1136 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 1137 zio_t zio; 1138* 1139 /* 1140 * vdev child I/Os do not propagate their error to the parent. 1141 * Therefore, for correct operation the caller must check for 1142 * and handle the error in the child i/o's done callback. 1143 * The only exceptions are i/os that we don't care about 1144 * (OPTIONAL or REPAIR). 
1145 / 1146* ASSERT((flags & ZIO_FLAG_OPTIONAL) \|\| (flags & ZIO_FLAG_IO_REPAIR) \|\| 1147 done != NULL); 1148 1149 /* 1150 * In the common case, where the parent zio was to a normal vdev, 1151 * the child zio must be to a child vdev of that vdev. Otherwise, 1152 * the child zio must be to a top-level vdev. 1153 / 1154* if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) { 1155 ASSERT3P(vd->vdev_parent, ==, pio->io_vd); 1156 } else { 1157 ASSERT3P(vd, ==, vd->vdev_top); 1158 } 1159 1160 if (type == ZIO_TYPE_READ && bp != NULL) { 1161 /* 1162 * If we have the bp, then the child should perform the 1163 * checksum and the parent need not. This pushes error 1164 * detection as close to the leaves as possible and 1165 * eliminates redundant checksums in the interior nodes. 1166 / 1167* pipeline \|= ZIO_STAGE_CHECKSUM_VERIFY; 1168 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 1169 } 1170 1171 /* Not all IO types require vdev io done stage e.g. free / 1172* if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) 1173 pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; 1174 1175 if (vd->vdev_ops->vdev_op_leaf) { 1176 ASSERT0(vd->vdev_children); 1177 offset += VDEV_LABEL_START_SIZE; 1178 } 1179 1180 flags \|= ZIO_VDEV_CHILD_FLAGS(pio); 1181 1182 /* 1183 * If we've decided to do a repair, the write is not speculative -- 1184 * even if the original read was. 1185 / 1186* if (flags & ZIO_FLAG_IO_REPAIR) 1187 flags &= ~ZIO_FLAG_SPECULATIVE; 1188 1189 /* 1190 * If we're creating a child I/O that is not associated with a 1191 * top-level vdev, then the child zio is not an allocating I/O. 1192 * If this is a retried I/O then we ignore it since we will 1193 * have already processed the original allocating I/O. 
1194 / 1195* if (flags & ZIO_FLAG_IO_ALLOCATING && 1196 (vd != vd->vdev_top \|\| (flags & ZIO_FLAG_IO_RETRY))) { 1197 metaslab_class_t mc = spa_normal_class(pio->io_spa); 1198* 1199 ASSERT(mc->mc_alloc_throttle_enabled); 1200 ASSERT(type == ZIO_TYPE_WRITE); 1201 ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); 1202 ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); 1203 ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) \|\| 1204 pio->io_child_type == ZIO_CHILD_GANG); 1205 1206 flags &= ~ZIO_FLAG_IO_ALLOCATING; 1207 } 1208 1209 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, 1210 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 1211 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 1212 ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 1213 1214 zio->io_physdone = pio->io_physdone; 1215 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 1216 zio->io_logical->io_phys_children++; 1217 1218 return (zio); 1219} 1220 1221zio_t * 1222zio_vdev_delegated_io(vdev_t vd, uint64_t offset, abd_t data, uint64_t size, 1223 int type, zio_priority_t priority, enum zio_flag flags, 1224 zio_done_func_t done, void private) 1225{ 1226 zio_t zio; 1227* 1228 ASSERT(vd->vdev_ops->vdev_op_leaf); 1229 1230 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 1231 data, size, size, done, private, type, priority, 1232 flags \| ZIO_FLAG_CANFAIL \| ZIO_FLAG_DONT_RETRY \| ZIO_FLAG_DELEGATED, 1233 vd, offset, NULL, 1234 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 1235 1236 return (zio); 1237} 1238 1239void 1240zio_flush(zio_t zio, vdev_t vd) 1241{ 1242 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 1243 NULL, NULL, ZIO_PRIORITY_NOW, 1244 ZIO_FLAG_CANFAIL \| ZIO_FLAG_DONT_PROPAGATE \| ZIO_FLAG_DONT_RETRY)); 1245} 1246 1247zio_t * 1248zio_trim(zio_t zio, spa_t spa, vdev_t vd, uint64_t offset, uint64_t size) 1249{ 1250* 1251 ASSERT(vd->vdev_ops->vdev_op_leaf); 1252 1253 return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL, 1254 
ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE \| 1255 ZIO_FLAG_CANFAIL \| ZIO_FLAG_DONT_PROPAGATE \| ZIO_FLAG_DONT_RETRY, 1256 vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); 1257} 1258 1259void 1260zio_shrink(zio_t zio, uint64_t size) 1261{ 1262* ASSERT3P(zio->io_executor, ==, NULL); 1263 ASSERT3P(zio->io_orig_size, ==, zio->io_size); 1264 ASSERT3U(size, <=, zio->io_size); 1265 1266 /* 1267 * We don't shrink for raidz because of problems with the 1268 * reconstruction when reading back less than the block size. 1269 * Note, BP_IS_RAIDZ() assumes no compression. 1270 / 1271* ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1272 if (!BP_IS_RAIDZ(zio->io_bp)) { 1273 /* we are not doing a raw write / 1274* ASSERT3U(zio->io_size, ==, zio->io_lsize); 1275 zio->io_orig_size = zio->io_size = zio->io_lsize = size; 1276 } 1277} 1278 1279/* 1280 * ========================================================================== 1281 * Prepare to read and write logical blocks 1282 * ========================================================================== 1283 / 1284* 1285static int 1286zio_read_bp_init(zio_t zio) 1287{ 1288* blkptr_t bp = zio->io_bp; 1289* 1290 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 1291 1292 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1293 zio->io_child_type == ZIO_CHILD_LOGICAL && 1294 !(zio->io_flags & ZIO_FLAG_RAW)) { 1295 uint64_t psize = 1296 BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); 1297 zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), 1298 psize, psize, zio_decompress); 1299 } 1300 1301 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { 1302 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1303 1304 int psize = BPE_GET_PSIZE(bp); 1305 void data = abd_borrow_buf(zio->io_abd, psize); 1306* decode_embedded_bp_compressed(bp, data); 1307 abd_return_buf_copy(zio->io_abd, data, psize); 1308 } else { 1309 ASSERT(!BP_IS_EMBEDDED(bp)); 1310 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 1311 } 1312 1313 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1314 zio->io_flags \|= ZIO_FLAG_DONT_CACHE; 1315 1316 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1317 zio->io_flags \|= ZIO_FLAG_DONT_CACHE; 1318 1319 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1320 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1321 1322 return (ZIO_PIPELINE_CONTINUE); 1323} 1324 1325static int 1326zio_write_bp_init(zio_t zio) 1327{ 1328* if (!IO_IS_ALLOCATING(zio)) 1329 return (ZIO_PIPELINE_CONTINUE); 1330 1331 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1332 1333 if (zio->io_bp_override) { 1334 blkptr_t bp = zio->io_bp; 1335* zio_prop_t zp = &zio->io_prop; 1336* 1337 ASSERT(bp->blk_birth != zio->io_txg); 1338 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1339 1340 bp = zio->io_bp_override; 1341 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1342 1343 if (BP_IS_EMBEDDED(bp)) 1344 return (ZIO_PIPELINE_CONTINUE); 1345 1346 /* 1347 * If we've been overridden and nopwrite is set then 1348 * set the flag accordingly to indicate that a nopwrite 1349 * has already occurred. 
1350 / 1351* if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1352 ASSERT(!zp->zp_dedup); 1353 ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); 1354 zio->io_flags \|= ZIO_FLAG_NOPWRITE; 1355 return (ZIO_PIPELINE_CONTINUE); 1356 } 1357 1358 ASSERT(!zp->zp_nopwrite); 1359 1360 if (BP_IS_HOLE(bp) \|\| !zp->zp_dedup) 1361 return (ZIO_PIPELINE_CONTINUE); 1362 1363 ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & 1364 ZCHECKSUM_FLAG_DEDUP) \|\| zp->zp_dedup_verify); 1365 1366 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1367 BP_SET_DEDUP(bp, 1); 1368 zio->io_pipeline \|= ZIO_STAGE_DDT_WRITE; 1369 return (ZIO_PIPELINE_CONTINUE); 1370 } 1371 1372 /* 1373 * We were unable to handle this as an override bp, treat 1374 * it as a regular write I/O. 1375 / 1376* zio->io_bp_override = NULL; 1377 bp = zio->io_bp_orig; 1378* zio->io_pipeline = zio->io_orig_pipeline; 1379 } 1380 1381 return (ZIO_PIPELINE_CONTINUE); 1382} 1383 1384static int 1385zio_write_compress(zio_t zio) 1386{ 1387* spa_t spa = zio->io_spa; 1388* zio_prop_t zp = &zio->io_prop; 1389* enum zio_compress compress = zp->zp_compress; 1390 blkptr_t bp = zio->io_bp; 1391* uint64_t lsize = zio->io_lsize; 1392 uint64_t psize = zio->io_size; 1393 int pass = 1; 1394 1395 EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); 1396 1397 /* 1398 * If our children haven't all reached the ready stage, 1399 * wait for them and then repeat this pipeline stage. 1400 / 1401* if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT \| 1402 ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { 1403 return (ZIO_PIPELINE_STOP); 1404 } 1405 1406 if (!IO_IS_ALLOCATING(zio)) 1407 return (ZIO_PIPELINE_CONTINUE); 1408 1409 if (zio->io_children_ready != NULL) { 1410 /* 1411 * Now that all our children are ready, run the callback 1412 * associated with this zio in case it wants to modify the 1413 * data to be written. 
1414 / 1415* ASSERT3U(zp->zp_level, >, 0); 1416 zio->io_children_ready(zio); 1417 } 1418 1419 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1420 ASSERT(zio->io_bp_override == NULL); 1421 1422 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1423 /* 1424 * We're rewriting an existing block, which means we're 1425 * working on behalf of spa_sync(). For spa_sync() to 1426 * converge, it must eventually be the case that we don't 1427 * have to allocate new blocks. But compression changes 1428 * the blocksize, which forces a reallocate, and makes 1429 * convergence take longer. Therefore, after the first 1430 * few passes, stop compressing to ensure convergence. 1431 / 1432* pass = spa_sync_pass(spa); 1433 1434 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1435 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1436 ASSERT(!BP_GET_DEDUP(bp)); 1437 1438 if (pass >= zfs_sync_pass_dont_compress) 1439 compress = ZIO_COMPRESS_OFF; 1440 1441 /* Make sure someone doesn't change their mind on overwrites / 1442* ASSERT(BP_IS_EMBEDDED(bp) \|\| MIN(zp->zp_copies + BP_IS_GANG(bp), 1443 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1444 } 1445 1446 /* If it's a compressed write that is not raw, compress the buffer. 
/ 1447* if (compress != ZIO_COMPRESS_OFF && psize == lsize) { 1448 void cbuf = zio_buf_alloc(lsize); 1449* psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); 1450 if (psize == 0 \|\| psize == lsize) { 1451 compress = ZIO_COMPRESS_OFF; 1452 zio_buf_free(cbuf, lsize); 1453 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && 1454 zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && 1455 spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { 1456 encode_embedded_bp_compressed(bp, 1457 cbuf, compress, lsize, psize); 1458 BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); 1459 BP_SET_TYPE(bp, zio->io_prop.zp_type); 1460 BP_SET_LEVEL(bp, zio->io_prop.zp_level); 1461 zio_buf_free(cbuf, lsize); 1462 bp->blk_birth = zio->io_txg; 1463 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1464 ASSERT(spa_feature_is_active(spa, 1465 SPA_FEATURE_EMBEDDED_DATA)); 1466 return (ZIO_PIPELINE_CONTINUE); 1467 } else { 1468 /* 1469 * Round up compressed size up to the ashift 1470 * of the smallest-ashift device, and zero the tail. 1471 * This ensures that the compressed size of the BP 1472 * (and thus compressratio property) are correct, 1473 * in that we charge for the padding used to fill out 1474 * the last sector. 1475 / 1476* ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 1477 size_t rounded = (size_t)P2ROUNDUP(psize, 1478 1ULL << spa->spa_min_ashift); 1479 if (rounded >= lsize) { 1480 compress = ZIO_COMPRESS_OFF; 1481 zio_buf_free(cbuf, lsize); 1482 psize = lsize; 1483 } else { 1484 abd_t cdata = abd_get_from_buf(cbuf, lsize); 1485* abd_take_ownership_of_buf(cdata, B_TRUE); 1486 abd_zero_off(cdata, psize, rounded - psize); 1487 psize = rounded; 1488 zio_push_transform(zio, cdata, 1489 psize, lsize, NULL); 1490 } 1491 } 1492 1493 /* 1494 * We were unable to handle this as an override bp, treat 1495 * it as a regular write I/O. 
1496 / 1497* zio->io_bp_override = NULL; 1498 bp = zio->io_bp_orig; 1499* zio->io_pipeline = zio->io_orig_pipeline; 1500 } else { 1501 ASSERT3U(psize, !=, 0); 1502 } 1503 1504 /* 1505 * The final pass of spa_sync() must be all rewrites, but the first 1506 * few passes offer a trade-off: allocating blocks defers convergence, 1507 * but newly allocated blocks are sequential, so they can be written 1508 * to disk faster. Therefore, we allow the first few passes of 1509 * spa_sync() to allocate new blocks, but force rewrites after that. 1510 * There should only be a handful of blocks after pass 1 in any case. 1511 / 1512* if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 1513 BP_GET_PSIZE(bp) == psize && 1514 pass >= zfs_sync_pass_rewrite) { 1515 ASSERT(psize != 0); 1516 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1517 zio->io_pipeline = ZIO_REWRITE_PIPELINE \| gang_stages; 1518 zio->io_flags \|= ZIO_FLAG_IO_REWRITE; 1519 } else { 1520 BP_ZERO(bp); 1521 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1522 } 1523 1524 if (psize == 0) { 1525 if (zio->io_bp_orig.blk_birth != 0 && 1526 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 1527 BP_SET_LSIZE(bp, lsize); 1528 BP_SET_TYPE(bp, zp->zp_type); 1529 BP_SET_LEVEL(bp, zp->zp_level); 1530 BP_SET_BIRTH(bp, zio->io_txg, 0); 1531 } 1532 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1533 } else { 1534 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1535 BP_SET_LSIZE(bp, lsize); 1536 BP_SET_TYPE(bp, zp->zp_type); 1537 BP_SET_LEVEL(bp, zp->zp_level); 1538 BP_SET_PSIZE(bp, psize); 1539 BP_SET_COMPRESS(bp, compress); 1540 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1541 BP_SET_DEDUP(bp, zp->zp_dedup); 1542 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1543 if (zp->zp_dedup) { 1544 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1545 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1546 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1547 } 1548 if (zp->zp_nopwrite) { 1549 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 
1550 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1551 zio->io_pipeline \|= ZIO_STAGE_NOP_WRITE; 1552 } 1553 } 1554 return (ZIO_PIPELINE_CONTINUE); 1555} 1556 1557static int 1558zio_free_bp_init(zio_t zio) 1559{ 1560* blkptr_t bp = zio->io_bp; 1561* 1562 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1563 if (BP_GET_DEDUP(bp)) 1564 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1565 } 1566 1567 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 1568 1569 return (ZIO_PIPELINE_CONTINUE); 1570} 1571 1572/* 1573 * ========================================================================== 1574 * Execute the I/O pipeline 1575 * ========================================================================== 1576 / 1577* 1578static void 1579zio_taskq_dispatch(zio_t zio, zio_taskq_type_t q, boolean_t cutinline) 1580{ 1581* spa_t spa = zio->io_spa; 1582* zio_type_t t = zio->io_type; 1583 int flags = (cutinline ? TQ_FRONT : 0); 1584 1585 ASSERT(q == ZIO_TASKQ_ISSUE \|\| q == ZIO_TASKQ_INTERRUPT); 1586 1587 /* 1588 * If we're a config writer or a probe, the normal issue and 1589 * interrupt threads may all be blocked waiting for the config lock. 1590 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1591 / 1592* if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER \| ZIO_FLAG_PROBE)) 1593 t = ZIO_TYPE_NULL; 1594 1595 /* 1596 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1597 / 1598* if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1599 t = ZIO_TYPE_NULL; 1600 1601 /* 1602 * If this is a high priority I/O, then use the high priority taskq if 1603 * available. 1604 / 1605* if (zio->io_priority == ZIO_PRIORITY_NOW && 1606 spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1607 q++; 1608 1609 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1610 1611 /* 1612 * NB: We are assuming that the zio can only be dispatched 1613 * to a single taskq at a time. It would be a grievous error 1614 * to dispatch the zio to another taskq at the same time. 
1615 / 1616#if defined(illumos) \|\| !defined(_KERNEL) 1617* ASSERT(zio->io_tqent.tqent_next == NULL); 1618#else 1619 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 1620#endif 1621 spa_taskq_dispatch_ent(spa, t, q, (task_func_t )zio_execute, zio, 1622* flags, &zio->io_tqent); 1623} 1624 1625static boolean_t 1626zio_taskq_member(zio_t zio, zio_taskq_type_t q) 1627{ 1628* kthread_t executor = zio->io_executor; 1629* spa_t spa = zio->io_spa; 1630* 1631 for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1632 spa_taskqs_t tqs = &spa->spa_zio_taskq[t][q]; 1633* uint_t i; 1634 for (i = 0; i < tqs->stqs_count; i++) { 1635 if (taskq_member(tqs->stqs_taskq[i], executor)) 1636 return (B_TRUE); 1637 } 1638 } 1639 1640 return (B_FALSE); 1641} 1642 1643static int 1644zio_issue_async(zio_t zio) 1645{ 1646* zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1647 1648 return (ZIO_PIPELINE_STOP); 1649} 1650 1651void 1652zio_interrupt(zio_t zio) 1653{ 1654* zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1655} 1656 1657void 1658zio_delay_interrupt(zio_t zio) 1659{ 1660* /* 1661 * The timeout_generic() function isn't defined in userspace, so 1662 * rather than trying to implement the function, the zio delay 1663 * functionality has been disabled for userspace builds. 1664 / 1665* 1666#ifdef _KERNEL 1667 /* 1668 * If io_target_timestamp is zero, then no delay has been registered 1669 * for this IO, thus jump to the end of this function and "skip" the 1670 * delay; issuing it directly to the zio layer. 1671 / 1672* if (zio->io_target_timestamp != 0) { 1673 hrtime_t now = gethrtime(); 1674 1675 if (now >= zio->io_target_timestamp) { 1676 /* 1677 * This IO has already taken longer than the target 1678 * delay to complete, so we don't want to delay it 1679 * any longer; we "miss" the delay and issue it 1680 * directly to the zio layer. This is likely due to 1681 * the target latency being set to a value less than 1682 * the underlying hardware can satisfy (e.g. 
delay 1683 * set to 1ms, but the disks take 10ms to complete an 1684 * IO request). 1685 / 1686* 1687 DTRACE_PROBE2(zio__delay__miss, zio_t , zio, 1688* hrtime_t, now); 1689 1690 zio_interrupt(zio); 1691 } else { 1692 hrtime_t diff = zio->io_target_timestamp - now; 1693 1694 DTRACE_PROBE3(zio__delay__hit, zio_t , zio, 1695* hrtime_t, now, hrtime_t, diff); 1696 1697 (void) timeout_generic(CALLOUT_NORMAL, 1698 (void ()(void ))zio_interrupt, zio, diff, 1, 0); 1699 } 1700 1701 return; 1702 } 1703#endif 1704 1705 DTRACE_PROBE1(zio__delay__skip, zio_t , zio); 1706* zio_interrupt(zio); 1707} 1708 1709/* 1710 * Execute the I/O pipeline until one of the following occurs: 1711 * 1712 * (1) the I/O completes 1713 * (2) the pipeline stalls waiting for dependent child I/Os 1714 * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1715 * (4) the I/O is delegated by vdev-level caching or aggregation 1716 * (5) the I/O is deferred due to vdev-level queueing 1717 * (6) the I/O is handed off to another thread. 1718 * 1719 * In all cases, the pipeline stops whenever there's no CPU work; it never 1720 * burns a thread in cv_wait(). 1721 * 1722 * There's no locking on io_stage because there's no legitimate way 1723 * for multiple threads to be attempting to process the same I/O. 
1724 / 1725static zio_pipe_stage_t zio_pipeline[]; 1726 1727void 1728zio_execute(zio_t zio) 1729{ 1730* zio->io_executor = curthread; 1731 1732 ASSERT3U(zio->io_queued_timestamp, >, 0); 1733 1734 while (zio->io_stage < ZIO_STAGE_DONE) { 1735 enum zio_stage pipeline = zio->io_pipeline; 1736 enum zio_stage stage = zio->io_stage; 1737 int rv; 1738 1739 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1740 ASSERT(ISP2(stage)); 1741 ASSERT(zio->io_stall == NULL); 1742 1743 do { 1744 stage <<= 1; 1745 } while ((stage & pipeline) == 0); 1746 1747 ASSERT(stage <= ZIO_STAGE_DONE); 1748 1749 /* 1750 * If we are in interrupt context and this pipeline stage 1751 * will grab a config lock that is held across I/O, 1752 * or may wait for an I/O that needs an interrupt thread 1753 * to complete, issue async to avoid deadlock. 1754 * 1755 * For VDEV_IO_START, we cut in line so that the io will 1756 * be sent to disk promptly. 1757 / 1758* if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1759 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1760 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 
1761 zio_requeue_io_start_cut_in_line : B_FALSE; 1762 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1763 return; 1764 } 1765 1766 zio->io_stage = stage; 1767 zio->io_pipeline_trace \|= zio->io_stage; 1768 rv = zio_pipeline[highbit64(stage) - 1](zio); 1769 1770 if (rv == ZIO_PIPELINE_STOP) 1771 return; 1772 1773 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1774 } 1775} 1776 1777/* 1778 * ========================================================================== 1779 * Initiate I/O, either sync or async 1780 * ========================================================================== 1781 / 1782int 1783zio_wait(zio_t zio) 1784{ 1785 int error; 1786 1787 ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN); 1788 ASSERT3P(zio->io_executor, ==, NULL); 1789 1790 zio->io_waiter = curthread; 1791 ASSERT0(zio->io_queued_timestamp); 1792 zio->io_queued_timestamp = gethrtime(); 1793 1794 zio_execute(zio); 1795 1796 mutex_enter(&zio->io_lock); 1797 while (zio->io_executor != NULL) 1798 cv_wait(&zio->io_cv, &zio->io_lock); 1799 mutex_exit(&zio->io_lock); 1800 1801 error = zio->io_error; 1802 zio_destroy(zio); 1803 1804 return (error); 1805} 1806 1807void 1808zio_nowait(zio_t zio) 1809{ 1810* ASSERT3P(zio->io_executor, ==, NULL); 1811 1812 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1813 zio_unique_parent(zio) == NULL) { 1814 /* 1815 * This is a logical async I/O with no parent to wait for it. 1816 * We add it to the spa_async_root_zio "Godfather" I/O which 1817 * will ensure they complete prior to unloading the pool. 
1818 / 1819* spa_t spa = zio->io_spa; 1820* 1821 zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); 1822 } 1823 1824 ASSERT0(zio->io_queued_timestamp); 1825 zio->io_queued_timestamp = gethrtime(); 1826 zio_execute(zio); 1827} 1828 1829/* 1830 * ========================================================================== 1831 * Reexecute, cancel, or suspend/resume failed I/O 1832 * ========================================================================== 1833 / 1834* 1835static void 1836zio_reexecute(zio_t pio) 1837{ 1838* zio_t cio, cio_next; 1839 1840 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1841 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1842 ASSERT(pio->io_gang_leader == NULL); 1843 ASSERT(pio->io_gang_tree == NULL); 1844 1845 pio->io_flags = pio->io_orig_flags; 1846 pio->io_stage = pio->io_orig_stage; 1847 pio->io_pipeline = pio->io_orig_pipeline; 1848 pio->io_reexecute = 0; 1849 pio->io_flags \|= ZIO_FLAG_REEXECUTED; 1850 pio->io_pipeline_trace = 0; 1851 pio->io_error = 0; 1852 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1853 pio->io_state[w] = 0; 1854 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1855 pio->io_child_error[c] = 0; 1856 1857 if (IO_IS_ALLOCATING(pio)) 1858 BP_ZERO(pio->io_bp); 1859 1860 /* 1861 * As we reexecute pio's children, new children could be created. 1862 * New children go to the head of pio's io_child_list, however, 1863 * so we will (correctly) not reexecute them. The key is that 1864 * the remainder of pio's io_child_list, from 'cio_next' onward, 1865 * cannot be affected by any side effects of reexecuting 'cio'. 1866 / 1867* zio_link_t *zl = NULL;	994 995 if (zfs_trim_enabled) 996 stage \|= ZIO_STAGE_ISSUE_ASYNC \| ZIO_STAGE_VDEV_IO_START \| 997 ZIO_STAGE_VDEV_IO_ASSESS; 998 /* 999 * GANG and DEDUP blocks can induce a read (for the gang block header, 1000 * or the DDT), so issue them asynchronously so that this thread is 1001 * not tied up. 
1002 / 1003* else if (BP_IS_GANG(bp) \|\| BP_GET_DEDUP(bp)) 1004 stage \|= ZIO_STAGE_ISSUE_ASYNC; 1005 1006 flags \|= ZIO_FLAG_DONT_QUEUE; 1007 1008 zio = zio_create(pio, spa, txg, bp, NULL, size, 1009 size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, 1010 flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); 1011 1012 return (zio); 1013} 1014 1015zio_t * 1016zio_claim(zio_t pio, spa_t spa, uint64_t txg, const blkptr_t bp, 1017* zio_done_func_t done, void private, enum zio_flag flags) 1018{ 1019 zio_t zio; 1020* 1021 zfs_blkptr_verify(spa, bp); 1022 1023 if (BP_IS_EMBEDDED(bp)) 1024 return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 1025 1026 /* 1027 * A claim is an allocation of a specific block. Claims are needed 1028 * to support immediate writes in the intent log. The issue is that 1029 * immediate writes contain committed data, but in a txg that was 1030 * not committed. Upon opening the pool after an unclean shutdown, 1031 * the intent log claims all blocks that contain immediate write data 1032 * so that the SPA knows they're in use. 1033 * 1034 * All claims must be resolved in the first txg -- before the SPA 1035 * starts allocating blocks -- so that nothing is allocated twice. 1036 * If txg == 0 we just verify that the block is claimable. 
1037 / 1038* ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, 1039 spa_min_claim_txg(spa)); 1040 ASSERT(txg == spa_min_claim_txg(spa) \|\| txg == 0); 1041 ASSERT(!BP_GET_DEDUP(bp) \|\| !spa_writeable(spa)); /* zdb(1M) / 1042* 1043 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 1044 BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 1045 flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 1046 ASSERT0(zio->io_queued_timestamp); 1047 1048 return (zio); 1049} 1050 1051zio_t * 1052zio_ioctl(zio_t pio, spa_t spa, vdev_t vd, int cmd, uint64_t offset, 1053* uint64_t size, zio_done_func_t done, void private, 1054 zio_priority_t priority, enum zio_flag flags) 1055{ 1056 zio_t zio; 1057* int c; 1058 1059 if (vd->vdev_children == 0) { 1060 zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, 1061 ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 1062 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 1063 1064 zio->io_cmd = cmd; 1065 } else { 1066 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 1067 1068 for (c = 0; c < vd->vdev_children; c++) 1069 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 1070 offset, size, done, private, priority, flags)); 1071 } 1072 1073 return (zio); 1074} 1075 1076zio_t * 1077zio_read_phys(zio_t pio, vdev_t vd, uint64_t offset, uint64_t size, 1078 abd_t data, int checksum, zio_done_func_t done, void private, 1079* zio_priority_t priority, enum zio_flag flags, boolean_t labels) 1080{ 1081 zio_t zio; 1082* 1083 ASSERT(vd->vdev_children == 0); 1084 ASSERT(!labels \|\| offset + size <= VDEV_LABEL_START_SIZE \|\| 1085 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1086 ASSERT3U(offset + size, <=, vd->vdev_psize); 1087 1088 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, 1089 private, ZIO_TYPE_READ, priority, flags \| ZIO_FLAG_PHYSICAL, vd, 1090 offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 1091 1092 zio->io_prop.zp_checksum = checksum; 1093 1094 return (zio); 1095} 1096 
1097zio_t * 1098zio_write_phys(zio_t pio, vdev_t vd, uint64_t offset, uint64_t size, 1099 abd_t data, int checksum, zio_done_func_t done, void private, 1100* zio_priority_t priority, enum zio_flag flags, boolean_t labels) 1101{ 1102 zio_t zio; 1103* 1104 ASSERT(vd->vdev_children == 0); 1105 ASSERT(!labels \|\| offset + size <= VDEV_LABEL_START_SIZE \|\| 1106 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1107 ASSERT3U(offset + size, <=, vd->vdev_psize); 1108 1109 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, 1110 private, ZIO_TYPE_WRITE, priority, flags \| ZIO_FLAG_PHYSICAL, vd, 1111 offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 1112 1113 zio->io_prop.zp_checksum = checksum; 1114 1115 if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 1116 /* 1117 * zec checksums are necessarily destructive -- they modify 1118 * the end of the write buffer to hold the verifier/checksum. 1119 * Therefore, we must make a local copy in case the data is 1120 * being written to multiple places in parallel. 1121 / 1122* abd_t wbuf = abd_alloc_sametype(data, size); 1123* abd_copy(wbuf, data, size); 1124 1125 zio_push_transform(zio, wbuf, size, size, NULL); 1126 } 1127 1128 return (zio); 1129} 1130 1131/* 1132 * Create a child I/O to do some work for us. 1133 / 1134zio_t 1135zio_vdev_child_io(zio_t pio, blkptr_t bp, vdev_t vd, uint64_t offset, 1136* abd_t data, uint64_t size, int type, zio_priority_t priority, 1137* enum zio_flag flags, zio_done_func_t done, void private) 1138{ 1139 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 1140 zio_t zio; 1141* 1142 /* 1143 * vdev child I/Os do not propagate their error to the parent. 1144 * Therefore, for correct operation the caller must check for 1145 * and handle the error in the child i/o's done callback. 1146 * The only exceptions are i/os that we don't care about 1147 * (OPTIONAL or REPAIR). 
1148 / 1149* ASSERT((flags & ZIO_FLAG_OPTIONAL) \|\| (flags & ZIO_FLAG_IO_REPAIR) \|\| 1150 done != NULL); 1151 1152 /* 1153 * In the common case, where the parent zio was to a normal vdev, 1154 * the child zio must be to a child vdev of that vdev. Otherwise, 1155 * the child zio must be to a top-level vdev. 1156 / 1157* if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) { 1158 ASSERT3P(vd->vdev_parent, ==, pio->io_vd); 1159 } else { 1160 ASSERT3P(vd, ==, vd->vdev_top); 1161 } 1162 1163 if (type == ZIO_TYPE_READ && bp != NULL) { 1164 /* 1165 * If we have the bp, then the child should perform the 1166 * checksum and the parent need not. This pushes error 1167 * detection as close to the leaves as possible and 1168 * eliminates redundant checksums in the interior nodes. 1169 / 1170* pipeline \|= ZIO_STAGE_CHECKSUM_VERIFY; 1171 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 1172 } 1173 1174 /* Not all IO types require vdev io done stage e.g. free / 1175* if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) 1176 pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; 1177 1178 if (vd->vdev_ops->vdev_op_leaf) { 1179 ASSERT0(vd->vdev_children); 1180 offset += VDEV_LABEL_START_SIZE; 1181 } 1182 1183 flags \|= ZIO_VDEV_CHILD_FLAGS(pio); 1184 1185 /* 1186 * If we've decided to do a repair, the write is not speculative -- 1187 * even if the original read was. 1188 / 1189* if (flags & ZIO_FLAG_IO_REPAIR) 1190 flags &= ~ZIO_FLAG_SPECULATIVE; 1191 1192 /* 1193 * If we're creating a child I/O that is not associated with a 1194 * top-level vdev, then the child zio is not an allocating I/O. 1195 * If this is a retried I/O then we ignore it since we will 1196 * have already processed the original allocating I/O. 
1197 / 1198* if (flags & ZIO_FLAG_IO_ALLOCATING && 1199 (vd != vd->vdev_top \|\| (flags & ZIO_FLAG_IO_RETRY))) { 1200 metaslab_class_t mc = spa_normal_class(pio->io_spa); 1201* 1202 ASSERT(mc->mc_alloc_throttle_enabled); 1203 ASSERT(type == ZIO_TYPE_WRITE); 1204 ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); 1205 ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); 1206 ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) \|\| 1207 pio->io_child_type == ZIO_CHILD_GANG); 1208 1209 flags &= ~ZIO_FLAG_IO_ALLOCATING; 1210 } 1211 1212 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, 1213 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 1214 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 1215 ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 1216 1217 zio->io_physdone = pio->io_physdone; 1218 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 1219 zio->io_logical->io_phys_children++; 1220 1221 return (zio); 1222} 1223 1224zio_t * 1225zio_vdev_delegated_io(vdev_t vd, uint64_t offset, abd_t data, uint64_t size, 1226 int type, zio_priority_t priority, enum zio_flag flags, 1227 zio_done_func_t done, void private) 1228{ 1229 zio_t zio; 1230* 1231 ASSERT(vd->vdev_ops->vdev_op_leaf); 1232 1233 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 1234 data, size, size, done, private, type, priority, 1235 flags \| ZIO_FLAG_CANFAIL \| ZIO_FLAG_DONT_RETRY \| ZIO_FLAG_DELEGATED, 1236 vd, offset, NULL, 1237 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 1238 1239 return (zio); 1240} 1241 1242void 1243zio_flush(zio_t zio, vdev_t vd) 1244{ 1245 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 1246 NULL, NULL, ZIO_PRIORITY_NOW, 1247 ZIO_FLAG_CANFAIL \| ZIO_FLAG_DONT_PROPAGATE \| ZIO_FLAG_DONT_RETRY)); 1248} 1249 1250zio_t * 1251zio_trim(zio_t zio, spa_t spa, vdev_t vd, uint64_t offset, uint64_t size) 1252{ 1253* 1254 ASSERT(vd->vdev_ops->vdev_op_leaf); 1255 1256 return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL, 1257 
ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE \| 1258 ZIO_FLAG_CANFAIL \| ZIO_FLAG_DONT_PROPAGATE \| ZIO_FLAG_DONT_RETRY, 1259 vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); 1260} 1261 1262void 1263zio_shrink(zio_t zio, uint64_t size) 1264{ 1265* ASSERT3P(zio->io_executor, ==, NULL); 1266 ASSERT3P(zio->io_orig_size, ==, zio->io_size); 1267 ASSERT3U(size, <=, zio->io_size); 1268 1269 /* 1270 * We don't shrink for raidz because of problems with the 1271 * reconstruction when reading back less than the block size. 1272 * Note, BP_IS_RAIDZ() assumes no compression. 1273 / 1274* ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1275 if (!BP_IS_RAIDZ(zio->io_bp)) { 1276 /* we are not doing a raw write / 1277* ASSERT3U(zio->io_size, ==, zio->io_lsize); 1278 zio->io_orig_size = zio->io_size = zio->io_lsize = size; 1279 } 1280} 1281 1282/* 1283 * ========================================================================== 1284 * Prepare to read and write logical blocks 1285 * ========================================================================== 1286 / 1287* 1288static int 1289zio_read_bp_init(zio_t zio) 1290{ 1291* blkptr_t bp = zio->io_bp; 1292* 1293 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 1294 1295 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1296 zio->io_child_type == ZIO_CHILD_LOGICAL && 1297 !(zio->io_flags & ZIO_FLAG_RAW)) { 1298 uint64_t psize = 1299 BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); 1300 zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), 1301 psize, psize, zio_decompress); 1302 } 1303 1304 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { 1305 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1306 1307 int psize = BPE_GET_PSIZE(bp); 1308 void data = abd_borrow_buf(zio->io_abd, psize); 1309* decode_embedded_bp_compressed(bp, data); 1310 abd_return_buf_copy(zio->io_abd, data, psize); 1311 } else { 1312 ASSERT(!BP_IS_EMBEDDED(bp)); 1313 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 1314 } 1315 1316 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1317 zio->io_flags \|= ZIO_FLAG_DONT_CACHE; 1318 1319 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1320 zio->io_flags \|= ZIO_FLAG_DONT_CACHE; 1321 1322 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1323 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1324 1325 return (ZIO_PIPELINE_CONTINUE); 1326} 1327 1328static int 1329zio_write_bp_init(zio_t zio) 1330{ 1331* if (!IO_IS_ALLOCATING(zio)) 1332 return (ZIO_PIPELINE_CONTINUE); 1333 1334 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1335 1336 if (zio->io_bp_override) { 1337 blkptr_t bp = zio->io_bp; 1338* zio_prop_t zp = &zio->io_prop; 1339* 1340 ASSERT(bp->blk_birth != zio->io_txg); 1341 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1342 1343 bp = zio->io_bp_override; 1344 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1345 1346 if (BP_IS_EMBEDDED(bp)) 1347 return (ZIO_PIPELINE_CONTINUE); 1348 1349 /* 1350 * If we've been overridden and nopwrite is set then 1351 * set the flag accordingly to indicate that a nopwrite 1352 * has already occurred. 
1353 / 1354* if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1355 ASSERT(!zp->zp_dedup); 1356 ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); 1357 zio->io_flags \|= ZIO_FLAG_NOPWRITE; 1358 return (ZIO_PIPELINE_CONTINUE); 1359 } 1360 1361 ASSERT(!zp->zp_nopwrite); 1362 1363 if (BP_IS_HOLE(bp) \|\| !zp->zp_dedup) 1364 return (ZIO_PIPELINE_CONTINUE); 1365 1366 ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & 1367 ZCHECKSUM_FLAG_DEDUP) \|\| zp->zp_dedup_verify); 1368 1369 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1370 BP_SET_DEDUP(bp, 1); 1371 zio->io_pipeline \|= ZIO_STAGE_DDT_WRITE; 1372 return (ZIO_PIPELINE_CONTINUE); 1373 } 1374 1375 /* 1376 * We were unable to handle this as an override bp, treat 1377 * it as a regular write I/O. 1378 / 1379* zio->io_bp_override = NULL; 1380 bp = zio->io_bp_orig; 1381* zio->io_pipeline = zio->io_orig_pipeline; 1382 } 1383 1384 return (ZIO_PIPELINE_CONTINUE); 1385} 1386 1387static int 1388zio_write_compress(zio_t zio) 1389{ 1390* spa_t spa = zio->io_spa; 1391* zio_prop_t zp = &zio->io_prop; 1392* enum zio_compress compress = zp->zp_compress; 1393 blkptr_t bp = zio->io_bp; 1394* uint64_t lsize = zio->io_lsize; 1395 uint64_t psize = zio->io_size; 1396 int pass = 1; 1397 1398 EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); 1399 1400 /* 1401 * If our children haven't all reached the ready stage, 1402 * wait for them and then repeat this pipeline stage. 1403 / 1404* if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT \| 1405 ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { 1406 return (ZIO_PIPELINE_STOP); 1407 } 1408 1409 if (!IO_IS_ALLOCATING(zio)) 1410 return (ZIO_PIPELINE_CONTINUE); 1411 1412 if (zio->io_children_ready != NULL) { 1413 /* 1414 * Now that all our children are ready, run the callback 1415 * associated with this zio in case it wants to modify the 1416 * data to be written. 
1417 / 1418* ASSERT3U(zp->zp_level, >, 0); 1419 zio->io_children_ready(zio); 1420 } 1421 1422 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1423 ASSERT(zio->io_bp_override == NULL); 1424 1425 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1426 /* 1427 * We're rewriting an existing block, which means we're 1428 * working on behalf of spa_sync(). For spa_sync() to 1429 * converge, it must eventually be the case that we don't 1430 * have to allocate new blocks. But compression changes 1431 * the blocksize, which forces a reallocate, and makes 1432 * convergence take longer. Therefore, after the first 1433 * few passes, stop compressing to ensure convergence. 1434 / 1435* pass = spa_sync_pass(spa); 1436 1437 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1438 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1439 ASSERT(!BP_GET_DEDUP(bp)); 1440 1441 if (pass >= zfs_sync_pass_dont_compress) 1442 compress = ZIO_COMPRESS_OFF; 1443 1444 /* Make sure someone doesn't change their mind on overwrites / 1445* ASSERT(BP_IS_EMBEDDED(bp) \|\| MIN(zp->zp_copies + BP_IS_GANG(bp), 1446 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1447 } 1448 1449 /* If it's a compressed write that is not raw, compress the buffer. 
/ 1450* if (compress != ZIO_COMPRESS_OFF && psize == lsize) { 1451 void cbuf = zio_buf_alloc(lsize); 1452* psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); 1453 if (psize == 0 \|\| psize == lsize) { 1454 compress = ZIO_COMPRESS_OFF; 1455 zio_buf_free(cbuf, lsize); 1456 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && 1457 zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && 1458 spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { 1459 encode_embedded_bp_compressed(bp, 1460 cbuf, compress, lsize, psize); 1461 BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); 1462 BP_SET_TYPE(bp, zio->io_prop.zp_type); 1463 BP_SET_LEVEL(bp, zio->io_prop.zp_level); 1464 zio_buf_free(cbuf, lsize); 1465 bp->blk_birth = zio->io_txg; 1466 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1467 ASSERT(spa_feature_is_active(spa, 1468 SPA_FEATURE_EMBEDDED_DATA)); 1469 return (ZIO_PIPELINE_CONTINUE); 1470 } else { 1471 /* 1472 * Round up compressed size up to the ashift 1473 * of the smallest-ashift device, and zero the tail. 1474 * This ensures that the compressed size of the BP 1475 * (and thus compressratio property) are correct, 1476 * in that we charge for the padding used to fill out 1477 * the last sector. 1478 / 1479* ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 1480 size_t rounded = (size_t)P2ROUNDUP(psize, 1481 1ULL << spa->spa_min_ashift); 1482 if (rounded >= lsize) { 1483 compress = ZIO_COMPRESS_OFF; 1484 zio_buf_free(cbuf, lsize); 1485 psize = lsize; 1486 } else { 1487 abd_t cdata = abd_get_from_buf(cbuf, lsize); 1488* abd_take_ownership_of_buf(cdata, B_TRUE); 1489 abd_zero_off(cdata, psize, rounded - psize); 1490 psize = rounded; 1491 zio_push_transform(zio, cdata, 1492 psize, lsize, NULL); 1493 } 1494 } 1495 1496 /* 1497 * We were unable to handle this as an override bp, treat 1498 * it as a regular write I/O. 
1499 / 1500* zio->io_bp_override = NULL; 1501 bp = zio->io_bp_orig; 1502* zio->io_pipeline = zio->io_orig_pipeline; 1503 } else { 1504 ASSERT3U(psize, !=, 0); 1505 } 1506 1507 /* 1508 * The final pass of spa_sync() must be all rewrites, but the first 1509 * few passes offer a trade-off: allocating blocks defers convergence, 1510 * but newly allocated blocks are sequential, so they can be written 1511 * to disk faster. Therefore, we allow the first few passes of 1512 * spa_sync() to allocate new blocks, but force rewrites after that. 1513 * There should only be a handful of blocks after pass 1 in any case. 1514 / 1515* if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 1516 BP_GET_PSIZE(bp) == psize && 1517 pass >= zfs_sync_pass_rewrite) { 1518 ASSERT(psize != 0); 1519 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1520 zio->io_pipeline = ZIO_REWRITE_PIPELINE \| gang_stages; 1521 zio->io_flags \|= ZIO_FLAG_IO_REWRITE; 1522 } else { 1523 BP_ZERO(bp); 1524 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1525 } 1526 1527 if (psize == 0) { 1528 if (zio->io_bp_orig.blk_birth != 0 && 1529 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 1530 BP_SET_LSIZE(bp, lsize); 1531 BP_SET_TYPE(bp, zp->zp_type); 1532 BP_SET_LEVEL(bp, zp->zp_level); 1533 BP_SET_BIRTH(bp, zio->io_txg, 0); 1534 } 1535 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1536 } else { 1537 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1538 BP_SET_LSIZE(bp, lsize); 1539 BP_SET_TYPE(bp, zp->zp_type); 1540 BP_SET_LEVEL(bp, zp->zp_level); 1541 BP_SET_PSIZE(bp, psize); 1542 BP_SET_COMPRESS(bp, compress); 1543 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1544 BP_SET_DEDUP(bp, zp->zp_dedup); 1545 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1546 if (zp->zp_dedup) { 1547 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1548 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1549 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1550 } 1551 if (zp->zp_nopwrite) { 1552 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 
1553 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1554 zio->io_pipeline \|= ZIO_STAGE_NOP_WRITE; 1555 } 1556 } 1557 return (ZIO_PIPELINE_CONTINUE); 1558} 1559 1560static int 1561zio_free_bp_init(zio_t zio) 1562{ 1563* blkptr_t bp = zio->io_bp; 1564* 1565 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1566 if (BP_GET_DEDUP(bp)) 1567 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1568 } 1569 1570 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 1571 1572 return (ZIO_PIPELINE_CONTINUE); 1573} 1574 1575/* 1576 * ========================================================================== 1577 * Execute the I/O pipeline 1578 * ========================================================================== 1579 / 1580* 1581static void 1582zio_taskq_dispatch(zio_t zio, zio_taskq_type_t q, boolean_t cutinline) 1583{ 1584* spa_t spa = zio->io_spa; 1585* zio_type_t t = zio->io_type; 1586 int flags = (cutinline ? TQ_FRONT : 0); 1587 1588 ASSERT(q == ZIO_TASKQ_ISSUE \|\| q == ZIO_TASKQ_INTERRUPT); 1589 1590 /* 1591 * If we're a config writer or a probe, the normal issue and 1592 * interrupt threads may all be blocked waiting for the config lock. 1593 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1594 / 1595* if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER \| ZIO_FLAG_PROBE)) 1596 t = ZIO_TYPE_NULL; 1597 1598 /* 1599 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1600 / 1601* if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1602 t = ZIO_TYPE_NULL; 1603 1604 /* 1605 * If this is a high priority I/O, then use the high priority taskq if 1606 * available. 1607 / 1608* if (zio->io_priority == ZIO_PRIORITY_NOW && 1609 spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1610 q++; 1611 1612 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1613 1614 /* 1615 * NB: We are assuming that the zio can only be dispatched 1616 * to a single taskq at a time. It would be a grievous error 1617 * to dispatch the zio to another taskq at the same time. 
1618 / 1619#if defined(illumos) \|\| !defined(_KERNEL) 1620* ASSERT(zio->io_tqent.tqent_next == NULL); 1621#else 1622 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 1623#endif 1624 spa_taskq_dispatch_ent(spa, t, q, (task_func_t )zio_execute, zio, 1625* flags, &zio->io_tqent); 1626} 1627 1628static boolean_t 1629zio_taskq_member(zio_t zio, zio_taskq_type_t q) 1630{ 1631* kthread_t executor = zio->io_executor; 1632* spa_t spa = zio->io_spa; 1633* 1634 for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1635 spa_taskqs_t tqs = &spa->spa_zio_taskq[t][q]; 1636* uint_t i; 1637 for (i = 0; i < tqs->stqs_count; i++) { 1638 if (taskq_member(tqs->stqs_taskq[i], executor)) 1639 return (B_TRUE); 1640 } 1641 } 1642 1643 return (B_FALSE); 1644} 1645 1646static int 1647zio_issue_async(zio_t zio) 1648{ 1649* zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1650 1651 return (ZIO_PIPELINE_STOP); 1652} 1653 1654void 1655zio_interrupt(zio_t zio) 1656{ 1657* zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1658} 1659 1660void 1661zio_delay_interrupt(zio_t zio) 1662{ 1663* /* 1664 * The timeout_generic() function isn't defined in userspace, so 1665 * rather than trying to implement the function, the zio delay 1666 * functionality has been disabled for userspace builds. 1667 / 1668* 1669#ifdef _KERNEL 1670 /* 1671 * If io_target_timestamp is zero, then no delay has been registered 1672 * for this IO, thus jump to the end of this function and "skip" the 1673 * delay; issuing it directly to the zio layer. 1674 / 1675* if (zio->io_target_timestamp != 0) { 1676 hrtime_t now = gethrtime(); 1677 1678 if (now >= zio->io_target_timestamp) { 1679 /* 1680 * This IO has already taken longer than the target 1681 * delay to complete, so we don't want to delay it 1682 * any longer; we "miss" the delay and issue it 1683 * directly to the zio layer. This is likely due to 1684 * the target latency being set to a value less than 1685 * the underlying hardware can satisfy (e.g. 
delay 1686 * set to 1ms, but the disks take 10ms to complete an 1687 * IO request). 1688 / 1689* 1690 DTRACE_PROBE2(zio__delay__miss, zio_t , zio, 1691* hrtime_t, now); 1692 1693 zio_interrupt(zio); 1694 } else { 1695 hrtime_t diff = zio->io_target_timestamp - now; 1696 1697 DTRACE_PROBE3(zio__delay__hit, zio_t , zio, 1698* hrtime_t, now, hrtime_t, diff); 1699 1700 (void) timeout_generic(CALLOUT_NORMAL, 1701 (void ()(void ))zio_interrupt, zio, diff, 1, 0); 1702 } 1703 1704 return; 1705 } 1706#endif 1707 1708 DTRACE_PROBE1(zio__delay__skip, zio_t , zio); 1709* zio_interrupt(zio); 1710} 1711 1712/* 1713 * Execute the I/O pipeline until one of the following occurs: 1714 * 1715 * (1) the I/O completes 1716 * (2) the pipeline stalls waiting for dependent child I/Os 1717 * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1718 * (4) the I/O is delegated by vdev-level caching or aggregation 1719 * (5) the I/O is deferred due to vdev-level queueing 1720 * (6) the I/O is handed off to another thread. 1721 * 1722 * In all cases, the pipeline stops whenever there's no CPU work; it never 1723 * burns a thread in cv_wait(). 1724 * 1725 * There's no locking on io_stage because there's no legitimate way 1726 * for multiple threads to be attempting to process the same I/O. 
1727 / 1728static zio_pipe_stage_t zio_pipeline[]; 1729 1730void 1731zio_execute(zio_t zio) 1732{ 1733* zio->io_executor = curthread; 1734 1735 ASSERT3U(zio->io_queued_timestamp, >, 0); 1736 1737 while (zio->io_stage < ZIO_STAGE_DONE) { 1738 enum zio_stage pipeline = zio->io_pipeline; 1739 enum zio_stage stage = zio->io_stage; 1740 int rv; 1741 1742 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1743 ASSERT(ISP2(stage)); 1744 ASSERT(zio->io_stall == NULL); 1745 1746 do { 1747 stage <<= 1; 1748 } while ((stage & pipeline) == 0); 1749 1750 ASSERT(stage <= ZIO_STAGE_DONE); 1751 1752 /* 1753 * If we are in interrupt context and this pipeline stage 1754 * will grab a config lock that is held across I/O, 1755 * or may wait for an I/O that needs an interrupt thread 1756 * to complete, issue async to avoid deadlock. 1757 * 1758 * For VDEV_IO_START, we cut in line so that the io will 1759 * be sent to disk promptly. 1760 / 1761* if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1762 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1763 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 
1764 zio_requeue_io_start_cut_in_line : B_FALSE; 1765 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1766 return; 1767 } 1768 1769 zio->io_stage = stage; 1770 zio->io_pipeline_trace \|= zio->io_stage; 1771 rv = zio_pipeline[highbit64(stage) - 1](zio); 1772 1773 if (rv == ZIO_PIPELINE_STOP) 1774 return; 1775 1776 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1777 } 1778} 1779 1780/* 1781 * ========================================================================== 1782 * Initiate I/O, either sync or async 1783 * ========================================================================== 1784 / 1785int 1786zio_wait(zio_t zio) 1787{ 1788 int error; 1789 1790 ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN); 1791 ASSERT3P(zio->io_executor, ==, NULL); 1792 1793 zio->io_waiter = curthread; 1794 ASSERT0(zio->io_queued_timestamp); 1795 zio->io_queued_timestamp = gethrtime(); 1796 1797 zio_execute(zio); 1798 1799 mutex_enter(&zio->io_lock); 1800 while (zio->io_executor != NULL) 1801 cv_wait(&zio->io_cv, &zio->io_lock); 1802 mutex_exit(&zio->io_lock); 1803 1804 error = zio->io_error; 1805 zio_destroy(zio); 1806 1807 return (error); 1808} 1809 1810void 1811zio_nowait(zio_t zio) 1812{ 1813* ASSERT3P(zio->io_executor, ==, NULL); 1814 1815 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1816 zio_unique_parent(zio) == NULL) { 1817 /* 1818 * This is a logical async I/O with no parent to wait for it. 1819 * We add it to the spa_async_root_zio "Godfather" I/O which 1820 * will ensure they complete prior to unloading the pool. 
1821 / 1822* spa_t spa = zio->io_spa; 1823* 1824 zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); 1825 } 1826 1827 ASSERT0(zio->io_queued_timestamp); 1828 zio->io_queued_timestamp = gethrtime(); 1829 zio_execute(zio); 1830} 1831 1832/* 1833 * ========================================================================== 1834 * Reexecute, cancel, or suspend/resume failed I/O 1835 * ========================================================================== 1836 / 1837* 1838static void 1839zio_reexecute(zio_t pio) 1840{ 1841* zio_t cio, cio_next; 1842 1843 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1844 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1845 ASSERT(pio->io_gang_leader == NULL); 1846 ASSERT(pio->io_gang_tree == NULL); 1847 1848 pio->io_flags = pio->io_orig_flags; 1849 pio->io_stage = pio->io_orig_stage; 1850 pio->io_pipeline = pio->io_orig_pipeline; 1851 pio->io_reexecute = 0; 1852 pio->io_flags \|= ZIO_FLAG_REEXECUTED; 1853 pio->io_pipeline_trace = 0; 1854 pio->io_error = 0; 1855 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1856 pio->io_state[w] = 0; 1857 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1858 pio->io_child_error[c] = 0; 1859 1860 if (IO_IS_ALLOCATING(pio)) 1861 BP_ZERO(pio->io_bp); 1862 1863 /* 1864 * As we reexecute pio's children, new children could be created. 1865 * New children go to the head of pio's io_child_list, however, 1866 * so we will (correctly) not reexecute them. The key is that 1867 * the remainder of pio's io_child_list, from 'cio_next' onward, 1868 * cannot be affected by any side effects of reexecuting 'cio'. 1869 / 1870* zio_link_t *zl = NULL;
	1871 mutex_enter(&pio->io_lock);
1868 for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { 1869 cio_next = zio_walk_children(pio, &zl);	1872 for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { 1873 cio_next = zio_walk_children(pio, &zl);
1870 mutex_enter(&pio->io_lock);
1871 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1872 pio->io_children[cio->io_child_type][w]++; 1873 mutex_exit(&pio->io_lock); 1874 zio_reexecute(cio);	1874 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1875 pio->io_children[cio->io_child_type][w]++; 1876 mutex_exit(&pio->io_lock); 1877 zio_reexecute(cio);
	1878 mutex_enter(&pio->io_lock);
1875 }	1879 }
	1880 mutex_exit(&pio->io_lock);
1876 1877 /* 1878 * Now that all children have been reexecuted, execute the parent. 1879 * We don't reexecute "The Godfather" I/O here as it's the 1880 * responsibility of the caller to wait on it. 1881 / 1882* if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { 1883 pio->io_queued_timestamp = gethrtime(); 1884 zio_execute(pio); 1885 } 1886} 1887 1888void 1889zio_suspend(spa_t spa, zio_t zio) 1890{ 1891 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1892 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1893 "failure and the failure mode property for this pool " 1894 "is set to panic.", spa_name(spa)); 1895 1896 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1897 1898 mutex_enter(&spa->spa_suspend_lock); 1899 1900 if (spa->spa_suspend_zio_root == NULL) 1901 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1902 ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE \| 1903 ZIO_FLAG_GODFATHER); 1904 1905 spa->spa_suspended = B_TRUE; 1906 1907 if (zio != NULL) { 1908 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1909 ASSERT(zio != spa->spa_suspend_zio_root); 1910 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1911 ASSERT(zio_unique_parent(zio) == NULL); 1912 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1913 zio_add_child(spa->spa_suspend_zio_root, zio); 1914 } 1915 1916 mutex_exit(&spa->spa_suspend_lock); 1917} 1918 1919int 1920zio_resume(spa_t spa) 1921{ 1922* zio_t pio; 1923* 1924 /* 1925 * Reexecute all previously suspended i/o. 
1926 / 1927* mutex_enter(&spa->spa_suspend_lock); 1928 spa->spa_suspended = B_FALSE; 1929 cv_broadcast(&spa->spa_suspend_cv); 1930 pio = spa->spa_suspend_zio_root; 1931 spa->spa_suspend_zio_root = NULL; 1932 mutex_exit(&spa->spa_suspend_lock); 1933 1934 if (pio == NULL) 1935 return (0); 1936 1937 zio_reexecute(pio); 1938 return (zio_wait(pio)); 1939} 1940 1941void 1942zio_resume_wait(spa_t spa) 1943{ 1944* mutex_enter(&spa->spa_suspend_lock); 1945 while (spa_suspended(spa)) 1946 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1947 mutex_exit(&spa->spa_suspend_lock); 1948} 1949 1950/* 1951 * ========================================================================== 1952 * Gang blocks. 1953 * 1954 * A gang block is a collection of small blocks that looks to the DMU 1955 * like one large block. When zio_dva_allocate() cannot find a block 1956 * of the requested size, due to either severe fragmentation or the pool 1957 * being nearly full, it calls zio_write_gang_block() to construct the 1958 * block from smaller fragments. 1959 * 1960 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1961 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1962 * an indirect block: it's an array of block pointers. It consumes 1963 * only one sector and hence is allocatable regardless of fragmentation. 1964 * The gang header's bps point to its gang members, which hold the data. 1965 * 1966 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1967 * as the verifier to ensure uniqueness of the SHA256 checksum. 1968 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1969 * not the gang header. This ensures that data block signatures (needed for 1970 * deduplication) are independent of how the block is physically stored. 1971 * 1972 * Gang blocks can be nested: a gang member may itself be a gang block. 
1973 * Thus every gang block is a tree in which root and all interior nodes are 1974 * gang headers, and the leaves are normal blocks that contain user data. 1975 * The root of the gang tree is called the gang leader. 1976 * 1977 * To perform any operation (read, rewrite, free, claim) on a gang block, 1978 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1979 * in the io_gang_tree field of the original logical i/o by recursively 1980 * reading the gang leader and all gang headers below it. This yields 1981 * an in-core tree containing the contents of every gang header and the 1982 * bps for every constituent of the gang block. 1983 * 1984 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1985 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1986 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1987 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1988 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1989 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1990 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1991 * of the gang header plus zio_checksum_compute() of the data to update the 1992 * gang header's blk_cksum as described above. 1993 * 1994 * The two-phase assemble/issue model solves the problem of partial failure -- 1995 * what if you'd freed part of a gang block but then couldn't read the 1996 * gang header for another part? Assembling the entire gang tree first 1997 * ensures that all the necessary gang header I/O has succeeded before 1998 * starting the actual work of free, claim, or write. Once the gang tree 1999 * is assembled, free and claim are in-memory operations that cannot fail. 2000 * 2001 * In the event that a gang write fails, zio_dva_unallocate() walks the 2002 * gang tree to immediately free (i.e. 
insert back into the space map) 2003 * everything we've allocated. This ensures that we don't get ENOSPC 2004 * errors during repeated suspend/resume cycles due to a flaky device. 2005 * 2006 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 2007 * the gang tree, we won't modify the block, so we can safely defer the free 2008 * (knowing that the block is still intact). If we can assemble the gang 2009 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 2010 * each constituent bp and we can allocate a new block on the next sync pass. 2011 * 2012 * In all cases, the gang tree allows complete recovery from partial failure. 2013 * ========================================================================== 2014 / 2015* 2016static void 2017zio_gang_issue_func_done(zio_t zio) 2018{ 2019* abd_put(zio->io_abd); 2020} 2021 2022static zio_t * 2023zio_read_gang(zio_t pio, blkptr_t bp, zio_gang_node_t gn, abd_t data, 2024 uint64_t offset) 2025{ 2026 if (gn != NULL) 2027 return (pio); 2028 2029 return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), 2030 BP_GET_PSIZE(bp), zio_gang_issue_func_done, 2031 NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2032 &pio->io_bookmark)); 2033} 2034 2035static zio_t * 2036zio_rewrite_gang(zio_t pio, blkptr_t bp, zio_gang_node_t gn, abd_t data, 2037 uint64_t offset) 2038{ 2039 zio_t zio; 2040* 2041 if (gn != NULL) { 2042 abd_t gbh_abd = 2043* abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2044 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 2045 gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, 2046 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2047 &pio->io_bookmark); 2048 /* 2049 * As we rewrite each gang header, the pipeline will compute 2050 * a new gang block header checksum for it; but no one will 2051 * compute a new data checksum, so we do that here. 
The one 2052 * exception is the gang leader: the pipeline already computed 2053 * its data checksum because that stage precedes gang assembly. 2054 * (Presently, nothing actually uses interior data checksums; 2055 * this is just good hygiene.) 2056 / 2057* if (gn != pio->io_gang_leader->io_gang_tree) { 2058 abd_t buf = abd_get_offset(data, offset); 2059* 2060 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 2061 buf, BP_GET_PSIZE(bp)); 2062 2063 abd_put(buf); 2064 } 2065 /* 2066 * If we are here to damage data for testing purposes, 2067 * leave the GBH alone so that we can detect the damage. 2068 / 2069* if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 2070 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2071 } else { 2072 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 2073 abd_get_offset(data, offset), BP_GET_PSIZE(bp), 2074 zio_gang_issue_func_done, NULL, pio->io_priority, 2075 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2076 } 2077 2078 return (zio); 2079} 2080 2081/* ARGSUSED / 2082static zio_t 2083zio_free_gang(zio_t pio, blkptr_t bp, zio_gang_node_t gn, abd_t data, 2084 uint64_t offset) 2085{ 2086 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 2087 BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 2088 ZIO_GANG_CHILD_FLAGS(pio))); 2089} 2090 2091/* ARGSUSED / 2092static zio_t 2093zio_claim_gang(zio_t pio, blkptr_t bp, zio_gang_node_t gn, abd_t data, 2094 uint64_t offset) 2095{ 2096 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 2097 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 2098} 2099 2100static zio_gang_issue_func_t zio_gang_issue_func[ZIO_TYPES] = { 2101* NULL, 2102 zio_read_gang, 2103 zio_rewrite_gang, 2104 zio_free_gang, 2105 zio_claim_gang, 2106 NULL 2107}; 2108 2109static void zio_gang_tree_assemble_done(zio_t zio); 2110* 2111static zio_gang_node_t * 2112zio_gang_node_alloc(zio_gang_node_t *gnpp) 2113{ 2114* zio_gang_node_t gn; 2115* 2116 ASSERT(gnpp == NULL); 2117* 2118 gn = kmem_zalloc(sizeof (gn), KM_SLEEP); 2119* gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 2120 gnpp = gn; 2121* 2122 return (gn); 2123} 2124 2125static void 2126zio_gang_node_free(zio_gang_node_t *gnpp) 2127{ 2128* zio_gang_node_t gn = gnpp; 2129 2130 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2131 ASSERT(gn->gn_child[g] == NULL); 2132 2133 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2134 kmem_free(gn, sizeof (gn)); 2135* gnpp = NULL; 2136} 2137* 2138static void 2139zio_gang_tree_free(zio_gang_node_t *gnpp) 2140{ 2141* zio_gang_node_t gn = gnpp; 2142 2143 if (gn == NULL) 2144 return; 2145 2146 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2147 zio_gang_tree_free(&gn->gn_child[g]); 2148 2149 zio_gang_node_free(gnpp); 2150} 2151 2152static void 2153zio_gang_tree_assemble(zio_t gio, blkptr_t bp, zio_gang_node_t *gnpp) 2154{ 2155* zio_gang_node_t gn = zio_gang_node_alloc(gnpp); 2156* abd_t gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2157* 2158 ASSERT(gio->io_gang_leader == gio); 2159 ASSERT(BP_IS_GANG(bp)); 2160 2161 zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, 2162 zio_gang_tree_assemble_done, gn, gio->io_priority, 2163 ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 2164} 2165 2166static void 
2167zio_gang_tree_assemble_done(zio_t zio) 2168{ 2169* zio_t gio = zio->io_gang_leader; 2170* zio_gang_node_t gn = zio->io_private; 2171* blkptr_t bp = zio->io_bp; 2172* 2173 ASSERT(gio == zio_unique_parent(zio)); 2174 ASSERT(zio->io_child_count == 0); 2175 2176 if (zio->io_error) 2177 return; 2178 2179 /* this ABD was created from a linear buf in zio_gang_tree_assemble / 2180* if (BP_SHOULD_BYTESWAP(bp)) 2181 byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); 2182 2183 ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); 2184 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 2185 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2186 2187 abd_put(zio->io_abd); 2188 2189 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2190 blkptr_t gbp = &gn->gn_gbh->zg_blkptr[g]; 2191* if (!BP_IS_GANG(gbp)) 2192 continue; 2193 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 2194 } 2195} 2196 2197static void 2198zio_gang_tree_issue(zio_t pio, zio_gang_node_t gn, blkptr_t bp, abd_t data, 2199 uint64_t offset) 2200{ 2201 zio_t gio = pio->io_gang_leader; 2202* zio_t zio; 2203* 2204 ASSERT(BP_IS_GANG(bp) == !!gn); 2205 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 2206 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) \|\| gn == gio->io_gang_tree); 2207 2208 /* 2209 * If you're a gang header, your data is in gn->gn_gbh. 2210 * If you're a gang member, your data is in 'data' and gn == NULL. 
2211 / 2212* zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); 2213 2214 if (gn != NULL) { 2215 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2216 2217 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2218 blkptr_t gbp = &gn->gn_gbh->zg_blkptr[g]; 2219* if (BP_IS_HOLE(gbp)) 2220 continue; 2221 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, 2222 offset); 2223 offset += BP_GET_PSIZE(gbp); 2224 } 2225 } 2226 2227 if (gn == gio->io_gang_tree && gio->io_abd != NULL) 2228 ASSERT3U(gio->io_size, ==, offset); 2229 2230 if (zio != pio) 2231 zio_nowait(zio); 2232} 2233 2234static int 2235zio_gang_assemble(zio_t zio) 2236{ 2237* blkptr_t bp = zio->io_bp; 2238* 2239 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 2240 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2241 2242 zio->io_gang_leader = zio; 2243 2244 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 2245 2246 return (ZIO_PIPELINE_CONTINUE); 2247} 2248 2249static int 2250zio_gang_issue(zio_t zio) 2251{ 2252* blkptr_t bp = zio->io_bp; 2253* 2254 if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { 2255 return (ZIO_PIPELINE_STOP); 2256 } 2257 2258 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 2259 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2260 2261 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 2262 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 2263 0); 2264 else 2265 zio_gang_tree_free(&zio->io_gang_tree); 2266 2267 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2268 2269 return (ZIO_PIPELINE_CONTINUE); 2270} 2271 2272static void 2273zio_write_gang_member_ready(zio_t zio) 2274{ 2275* zio_t pio = zio_unique_parent(zio); 2276* zio_t gio = zio->io_gang_leader; 2277* dva_t cdva = zio->io_bp->blk_dva; 2278* dva_t pdva = pio->io_bp->blk_dva; 2279* uint64_t asize; 2280 2281 if (BP_IS_HOLE(zio->io_bp)) 2282 return; 2283 2284 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 2285 2286 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 2287 ASSERT3U(zio->io_prop.zp_copies, 
==, gio->io_prop.zp_copies); 2288 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 2289 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 2290 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 2291 2292 mutex_enter(&pio->io_lock); 2293 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 2294 ASSERT(DVA_GET_GANG(&pdva[d])); 2295 asize = DVA_GET_ASIZE(&pdva[d]); 2296 asize += DVA_GET_ASIZE(&cdva[d]); 2297 DVA_SET_ASIZE(&pdva[d], asize); 2298 } 2299 mutex_exit(&pio->io_lock); 2300} 2301 2302static void 2303zio_write_gang_done(zio_t zio) 2304{ 2305* abd_put(zio->io_abd); 2306} 2307 2308static int 2309zio_write_gang_block(zio_t pio) 2310{ 2311* spa_t spa = pio->io_spa; 2312* metaslab_class_t mc = spa_normal_class(spa); 2313* blkptr_t bp = pio->io_bp; 2314* zio_t gio = pio->io_gang_leader; 2315* zio_t zio; 2316* zio_gang_node_t gn, gnpp; 2317* zio_gbh_phys_t gbh; 2318* abd_t gbh_abd; 2319* uint64_t txg = pio->io_txg; 2320 uint64_t resid = pio->io_size; 2321 uint64_t lsize; 2322 int copies = gio->io_prop.zp_copies; 2323 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2324 zio_prop_t zp; 2325 int error; 2326 2327 int flags = METASLAB_HINTBP_FAVOR \| METASLAB_GANG_HEADER; 2328 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2329 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2330 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2331 2332 flags \|= METASLAB_ASYNC_ALLOC; 2333 VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); 2334 2335 /* 2336 * The logical zio has already placed a reservation for 2337 * 'copies' allocation slots but gang blocks may require 2338 * additional copies. These additional copies 2339 * (i.e. gbh_copies - copies) are guaranteed to succeed 2340 * since metaslab_class_throttle_reserve() always allows 2341 * additional reservations for gang blocks. 
2342 / 2343* VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, 2344 pio, flags)); 2345 } 2346 2347 error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, 2348 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, 2349 &pio->io_alloc_list, pio); 2350 if (error) { 2351 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2352 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2353 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2354 2355 /* 2356 * If we failed to allocate the gang block header then 2357 * we remove any additional allocation reservations that 2358 * we placed here. The original reservation will 2359 * be removed when the logical I/O goes to the ready 2360 * stage. 2361 / 2362* metaslab_class_throttle_unreserve(mc, 2363 gbh_copies - copies, pio); 2364 } 2365 pio->io_error = error; 2366 return (ZIO_PIPELINE_CONTINUE); 2367 } 2368 2369 if (pio == gio) { 2370 gnpp = &gio->io_gang_tree; 2371 } else { 2372 gnpp = pio->io_private; 2373 ASSERT(pio->io_ready == zio_write_gang_member_ready); 2374 } 2375 2376 gn = zio_gang_node_alloc(gnpp); 2377 gbh = gn->gn_gbh; 2378 bzero(gbh, SPA_GANGBLOCKSIZE); 2379 gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); 2380 2381 /* 2382 * Create the gang header. 2383 / 2384* zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, 2385 zio_write_gang_done, NULL, pio->io_priority, 2386 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2387 2388 /* 2389 * Create and nowait the gang children. 
2390 / 2391* for (int g = 0; resid != 0; resid -= lsize, g++) { 2392 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2393 SPA_MINBLOCKSIZE); 2394 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2395 2396 zp.zp_checksum = gio->io_prop.zp_checksum; 2397 zp.zp_compress = ZIO_COMPRESS_OFF; 2398 zp.zp_type = DMU_OT_NONE; 2399 zp.zp_level = 0; 2400 zp.zp_copies = gio->io_prop.zp_copies; 2401 zp.zp_dedup = B_FALSE; 2402 zp.zp_dedup_verify = B_FALSE; 2403 zp.zp_nopwrite = B_FALSE; 2404 2405 zio_t cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 2406* abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, 2407 lsize, &zp, zio_write_gang_member_ready, NULL, NULL, 2408 zio_write_gang_done, &gn->gn_child[g], pio->io_priority, 2409 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2410 2411 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2412 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2413 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2414 2415 /* 2416 * Gang children won't throttle but we should 2417 * account for their work, so reserve an allocation 2418 * slot for them here. 2419 / 2420* VERIFY(metaslab_class_throttle_reserve(mc, 2421 zp.zp_copies, cio, flags)); 2422 } 2423 zio_nowait(cio); 2424 } 2425 2426 /* 2427 * Set pio's pipeline to just wait for zio to finish. 2428 / 2429* pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2430 2431 zio_nowait(zio); 2432 2433 return (ZIO_PIPELINE_CONTINUE); 2434} 2435 2436/* 2437 * The zio_nop_write stage in the pipeline determines if allocating a 2438 * new bp is necessary. The nopwrite feature can handle writes in 2439 * either syncing or open context (i.e. zil writes) and as a result is 2440 * mutually exclusive with dedup. 2441 * 2442 * By leveraging a cryptographically secure checksum, such as SHA256, we 2443 * can compare the checksums of the new data and the old to determine if 2444 * allocating a new block is required. 
Note that our requirements for 2445 * cryptographic strength are fairly weak: there can't be any accidental 2446 * hash collisions, but we don't need to be secure against intentional 2447 * (malicious) collisions. To trigger a nopwrite, you have to be able 2448 * to write the file to begin with, and triggering an incorrect (hash 2449 * collision) nopwrite is no worse than simply writing to the file. 2450 * That said, there are no known attacks against the checksum algorithms 2451 * used for nopwrite, assuming that the salt and the checksums 2452 * themselves remain secret. 2453 / 2454static int 2455zio_nop_write(zio_t zio) 2456{ 2457 blkptr_t bp = zio->io_bp; 2458* blkptr_t bp_orig = &zio->io_bp_orig; 2459* zio_prop_t zp = &zio->io_prop; 2460* 2461 ASSERT(BP_GET_LEVEL(bp) == 0); 2462 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2463 ASSERT(zp->zp_nopwrite); 2464 ASSERT(!zp->zp_dedup); 2465 ASSERT(zio->io_bp_override == NULL); 2466 ASSERT(IO_IS_ALLOCATING(zio)); 2467 2468 /* 2469 * Check to see if the original bp and the new bp have matching 2470 * characteristics (i.e. same checksum, compression algorithms, etc). 2471 * If they don't then just continue with the pipeline which will 2472 * allocate a new bp. 2473 / 2474* if (BP_IS_HOLE(bp_orig) \|\| 2475 !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & 2476 ZCHECKSUM_FLAG_NOPWRITE) \|\| 2477 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) \|\| 2478 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) \|\| 2479 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) \|\| 2480 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2481 return (ZIO_PIPELINE_CONTINUE); 2482 2483 /* 2484 * If the checksums match then reset the pipeline so that we 2485 * avoid allocating a new bp and issuing any I/O. 
2486 / 2487* if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2488 ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & 2489 ZCHECKSUM_FLAG_NOPWRITE); 2490 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2491 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2492 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2493 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2494 sizeof (uint64_t)) == 0); 2495 2496 bp = bp_orig; 2497 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2498 zio->io_flags \|= ZIO_FLAG_NOPWRITE; 2499 } 2500 2501 return (ZIO_PIPELINE_CONTINUE); 2502} 2503 2504/* 2505 * ========================================================================== 2506 * Dedup 2507 * ========================================================================== 2508 / 2509static void 2510zio_ddt_child_read_done(zio_t zio) 2511{ 2512 blkptr_t bp = zio->io_bp; 2513* ddt_entry_t dde = zio->io_private; 2514* ddt_phys_t ddp; 2515* zio_t pio = zio_unique_parent(zio); 2516* 2517 mutex_enter(&pio->io_lock); 2518 ddp = ddt_phys_select(dde, bp); 2519 if (zio->io_error == 0) 2520 ddt_phys_clear(ddp); /* this ddp doesn't need repair / 2521* 2522 if (zio->io_error == 0 && dde->dde_repair_abd == NULL) 2523 dde->dde_repair_abd = zio->io_abd; 2524 else 2525 abd_free(zio->io_abd); 2526 mutex_exit(&pio->io_lock); 2527} 2528 2529static int 2530zio_ddt_read_start(zio_t zio) 2531{ 2532* blkptr_t bp = zio->io_bp; 2533* 2534 ASSERT(BP_GET_DEDUP(bp)); 2535 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2536 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2537 2538 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2539 ddt_t ddt = ddt_select(zio->io_spa, bp); 2540* ddt_entry_t dde = ddt_repair_start(ddt, bp); 2541* ddt_phys_t ddp = dde->dde_phys; 2542* ddt_phys_t ddp_self = ddt_phys_select(dde, bp); 2543* blkptr_t blk; 2544 2545 ASSERT(zio->io_vsd == NULL); 2546 zio->io_vsd = dde; 2547 2548 if (ddp_self == NULL) 2549 return (ZIO_PIPELINE_CONTINUE); 2550 2551 for (int p = 0; p < DDT_PHYS_TYPES; p++, 
ddp++) { 2552 if (ddp->ddp_phys_birth == 0 \|\| ddp == ddp_self) 2553 continue; 2554 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2555 &blk); 2556 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2557 abd_alloc_for_io(zio->io_size, B_TRUE), 2558 zio->io_size, zio_ddt_child_read_done, dde, 2559 zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) \| 2560 ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); 2561 } 2562 return (ZIO_PIPELINE_CONTINUE); 2563 } 2564 2565 zio_nowait(zio_read(zio, zio->io_spa, bp, 2566 zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, 2567 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2568 2569 return (ZIO_PIPELINE_CONTINUE); 2570} 2571 2572static int 2573zio_ddt_read_done(zio_t zio) 2574{ 2575* blkptr_t bp = zio->io_bp; 2576* 2577 if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { 2578 return (ZIO_PIPELINE_STOP); 2579 } 2580 2581 ASSERT(BP_GET_DEDUP(bp)); 2582 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2583 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2584 2585 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2586 ddt_t ddt = ddt_select(zio->io_spa, bp); 2587* ddt_entry_t dde = zio->io_vsd; 2588* if (ddt == NULL) { 2589 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2590 return (ZIO_PIPELINE_CONTINUE); 2591 } 2592 if (dde == NULL) { 2593 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2594 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2595 return (ZIO_PIPELINE_STOP); 2596 } 2597 if (dde->dde_repair_abd != NULL) { 2598 abd_copy(zio->io_abd, dde->dde_repair_abd, 2599 zio->io_size); 2600 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2601 } 2602 ddt_repair_done(ddt, dde); 2603 zio->io_vsd = NULL; 2604 } 2605 2606 ASSERT(zio->io_vsd == NULL); 2607 2608 return (ZIO_PIPELINE_CONTINUE); 2609} 2610 2611static boolean_t 2612zio_ddt_collision(zio_t zio, ddt_t ddt, ddt_entry_t dde) 2613{ 2614* spa_t spa = zio->io_spa; 2615* boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); 2616 2617 /* We should never get a raw, override zio / 2618* 
ASSERT(!(zio->io_bp_override && do_raw)); 2619 2620 /* 2621 * Note: we compare the original data, not the transformed data, 2622 * because when zio->io_bp is an override bp, we will not have 2623 * pushed the I/O transforms. That's an important optimization 2624 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2625 / 2626* for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2627 zio_t lio = dde->dde_lead_zio[p]; 2628* 2629 if (lio != NULL) { 2630 return (lio->io_orig_size != zio->io_orig_size \|\| 2631 abd_cmp(zio->io_orig_abd, lio->io_orig_abd, 2632 zio->io_orig_size) != 0); 2633 } 2634 } 2635 2636 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2637 ddt_phys_t ddp = &dde->dde_phys[p]; 2638* 2639 if (ddp->ddp_phys_birth != 0) { 2640 arc_buf_t abuf = NULL; 2641* arc_flags_t aflags = ARC_FLAG_WAIT; 2642 int zio_flags = ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE; 2643 blkptr_t blk = zio->io_bp; 2644* int error; 2645 2646 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2647 2648 ddt_exit(ddt); 2649 2650 /* 2651 * Intuitively, it would make more sense to compare 2652 * io_abd than io_orig_abd in the raw case since you 2653 * don't want to look at any transformations that have 2654 * happened to the data. However, for raw I/Os the 2655 * data will actually be the same in io_abd and 2656 * io_orig_abd, so all we have to do is issue this as 2657 * a raw ARC read. 
2658 / 2659* if (do_raw) { 2660 zio_flags \|= ZIO_FLAG_RAW; 2661 ASSERT3U(zio->io_size, ==, zio->io_orig_size); 2662 ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, 2663 zio->io_size)); 2664 ASSERT3P(zio->io_transform_stack, ==, NULL); 2665 } 2666 2667 error = arc_read(NULL, spa, &blk, 2668 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2669 zio_flags, &aflags, &zio->io_bookmark); 2670 2671 if (error == 0) { 2672 if (arc_buf_size(abuf) != zio->io_orig_size \|\| 2673 abd_cmp_buf(zio->io_orig_abd, abuf->b_data, 2674 zio->io_orig_size) != 0) 2675 error = SET_ERROR(EEXIST); 2676 arc_buf_destroy(abuf, &abuf); 2677 } 2678 2679 ddt_enter(ddt); 2680 return (error != 0); 2681 } 2682 } 2683 2684 return (B_FALSE); 2685} 2686 2687static void 2688zio_ddt_child_write_ready(zio_t zio) 2689{ 2690* int p = zio->io_prop.zp_copies; 2691 ddt_t ddt = ddt_select(zio->io_spa, zio->io_bp); 2692* ddt_entry_t dde = zio->io_private; 2693* ddt_phys_t ddp = &dde->dde_phys[p]; 2694* zio_t pio; 2695* 2696 if (zio->io_error) 2697 return; 2698 2699 ddt_enter(ddt); 2700 2701 ASSERT(dde->dde_lead_zio[p] == zio); 2702 2703 ddt_phys_fill(ddp, zio->io_bp); 2704 2705 zio_link_t zl = NULL; 2706* while ((pio = zio_walk_parents(zio, &zl)) != NULL) 2707 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2708 2709 ddt_exit(ddt); 2710} 2711 2712static void 2713zio_ddt_child_write_done(zio_t zio) 2714{ 2715* int p = zio->io_prop.zp_copies; 2716 ddt_t ddt = ddt_select(zio->io_spa, zio->io_bp); 2717* ddt_entry_t dde = zio->io_private; 2718* ddt_phys_t ddp = &dde->dde_phys[p]; 2719* 2720 ddt_enter(ddt); 2721 2722 ASSERT(ddp->ddp_refcnt == 0); 2723 ASSERT(dde->dde_lead_zio[p] == zio); 2724 dde->dde_lead_zio[p] = NULL; 2725 2726 if (zio->io_error == 0) { 2727 zio_link_t zl = NULL; 2728* while (zio_walk_parents(zio, &zl) != NULL) 2729 ddt_phys_addref(ddp); 2730 } else { 2731 ddt_phys_clear(ddp); 2732 } 2733 2734 ddt_exit(ddt); 2735} 2736 2737static void 2738zio_ddt_ditto_write_done(zio_t zio) 2739{ 2740* int p = 
DDT_PHYS_DITTO; 2741 zio_prop_t zp = &zio->io_prop; 2742* blkptr_t bp = zio->io_bp; 2743* ddt_t ddt = ddt_select(zio->io_spa, bp); 2744* ddt_entry_t dde = zio->io_private; 2745* ddt_phys_t ddp = &dde->dde_phys[p]; 2746* ddt_key_t ddk = &dde->dde_key; 2747* 2748 ddt_enter(ddt); 2749 2750 ASSERT(ddp->ddp_refcnt == 0); 2751 ASSERT(dde->dde_lead_zio[p] == zio); 2752 dde->dde_lead_zio[p] = NULL; 2753 2754 if (zio->io_error == 0) { 2755 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2756 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2757 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2758 if (ddp->ddp_phys_birth != 0) 2759 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2760 ddt_phys_fill(ddp, bp); 2761 } 2762 2763 ddt_exit(ddt); 2764} 2765 2766static int 2767zio_ddt_write(zio_t zio) 2768{ 2769* spa_t spa = zio->io_spa; 2770* blkptr_t bp = zio->io_bp; 2771* uint64_t txg = zio->io_txg; 2772 zio_prop_t zp = &zio->io_prop; 2773* int p = zp->zp_copies; 2774 int ditto_copies; 2775 zio_t cio = NULL; 2776* zio_t dio = NULL; 2777* ddt_t ddt = ddt_select(spa, bp); 2778* ddt_entry_t dde; 2779* ddt_phys_t ddp; 2780* 2781 ASSERT(BP_GET_DEDUP(bp)); 2782 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2783 ASSERT(BP_IS_HOLE(bp) \|\| zio->io_bp_override); 2784 ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); 2785 2786 ddt_enter(ddt); 2787 dde = ddt_lookup(ddt, bp, B_TRUE); 2788 ddp = &dde->dde_phys[p]; 2789 2790 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2791 /* 2792 * If we're using a weak checksum, upgrade to a strong checksum 2793 * and try again. If we're already using a strong checksum, 2794 * we can't resolve it, so just convert to an ordinary write. 2795 * (And automatically e-mail a paper to Nature?) 
2796 / 2797* if (!(zio_checksum_table[zp->zp_checksum].ci_flags & 2798 ZCHECKSUM_FLAG_DEDUP)) { 2799 zp->zp_checksum = spa_dedup_checksum(spa); 2800 zio_pop_transforms(zio); 2801 zio->io_stage = ZIO_STAGE_OPEN; 2802 BP_ZERO(bp); 2803 } else { 2804 zp->zp_dedup = B_FALSE; 2805 BP_SET_DEDUP(bp, B_FALSE); 2806 } 2807 ASSERT(!BP_GET_DEDUP(bp)); 2808 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2809 ddt_exit(ddt); 2810 return (ZIO_PIPELINE_CONTINUE); 2811 } 2812 2813 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2814 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2815 2816 if (ditto_copies > ddt_ditto_copies_present(dde) && 2817 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2818 zio_prop_t czp = zp; 2819* 2820 czp.zp_copies = ditto_copies; 2821 2822 /* 2823 * If we arrived here with an override bp, we won't have run 2824 * the transform stack, so we won't have the data we need to 2825 * generate a child i/o. So, toss the override bp and restart. 2826 * This is safe, because using the override bp is just an 2827 * optimization; and it's rare, so the cost doesn't matter. 
2828 / 2829* if (zio->io_bp_override) { 2830 zio_pop_transforms(zio); 2831 zio->io_stage = ZIO_STAGE_OPEN; 2832 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2833 zio->io_bp_override = NULL; 2834 BP_ZERO(bp); 2835 ddt_exit(ddt); 2836 return (ZIO_PIPELINE_CONTINUE); 2837 } 2838 2839 dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, 2840 zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, 2841 NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, 2842 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2843 2844 zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); 2845 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2846 } 2847 2848 if (ddp->ddp_phys_birth != 0 \|\| dde->dde_lead_zio[p] != NULL) { 2849 if (ddp->ddp_phys_birth != 0) 2850 ddt_bp_fill(ddp, bp, txg); 2851 if (dde->dde_lead_zio[p] != NULL) 2852 zio_add_child(zio, dde->dde_lead_zio[p]); 2853 else 2854 ddt_phys_addref(ddp); 2855 } else if (zio->io_bp_override) { 2856 ASSERT(bp->blk_birth == txg); 2857 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2858 ddt_phys_fill(ddp, bp); 2859 ddt_phys_addref(ddp); 2860 } else { 2861 cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, 2862 zio->io_orig_size, zio->io_orig_size, zp, 2863 zio_ddt_child_write_ready, NULL, NULL, 2864 zio_ddt_child_write_done, dde, zio->io_priority, 2865 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2866 2867 zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); 2868 dde->dde_lead_zio[p] = cio; 2869 } 2870 2871 ddt_exit(ddt); 2872 2873 if (cio) 2874 zio_nowait(cio); 2875 if (dio) 2876 zio_nowait(dio); 2877 2878 return (ZIO_PIPELINE_CONTINUE); 2879} 2880 2881ddt_entry_t freedde; / for debugging / 2882* 2883static int 2884zio_ddt_free(zio_t zio) 2885{ 2886* spa_t spa = zio->io_spa; 2887* blkptr_t bp = zio->io_bp; 2888* ddt_t ddt = ddt_select(spa, bp); 2889* ddt_entry_t dde; 2890* ddt_phys_t ddp; 2891* 2892 ASSERT(BP_GET_DEDUP(bp)); 2893 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2894 2895 ddt_enter(ddt); 2896 freedde = dde = 
ddt_lookup(ddt, bp, B_TRUE); 2897 ddp = ddt_phys_select(dde, bp); 2898 ddt_phys_decref(ddp); 2899 ddt_exit(ddt); 2900 2901 return (ZIO_PIPELINE_CONTINUE); 2902} 2903 2904/* 2905 * ========================================================================== 2906 * Allocate and free blocks 2907 * ========================================================================== 2908 / 2909* 2910static zio_t * 2911zio_io_to_allocate(spa_t spa) 2912{ 2913* zio_t zio; 2914* 2915 ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); 2916 2917 zio = avl_first(&spa->spa_alloc_tree); 2918 if (zio == NULL) 2919 return (NULL); 2920 2921 ASSERT(IO_IS_ALLOCATING(zio)); 2922 2923 /* 2924 * Try to place a reservation for this zio. If we're unable to 2925 * reserve then we throttle. 2926 / 2927* if (!metaslab_class_throttle_reserve(spa_normal_class(spa), 2928 zio->io_prop.zp_copies, zio, 0)) { 2929 return (NULL); 2930 } 2931 2932 avl_remove(&spa->spa_alloc_tree, zio); 2933 ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); 2934 2935 return (zio); 2936} 2937 2938static int 2939zio_dva_throttle(zio_t zio) 2940{ 2941* spa_t spa = zio->io_spa; 2942* zio_t nio; 2943* 2944 if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE \|\| 2945 !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled \|\| 2946 zio->io_child_type == ZIO_CHILD_GANG \|\| 2947 zio->io_flags & ZIO_FLAG_NODATA) { 2948 return (ZIO_PIPELINE_CONTINUE); 2949 } 2950 2951 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2952 2953 ASSERT3U(zio->io_queued_timestamp, >, 0); 2954 ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); 2955 2956 mutex_enter(&spa->spa_alloc_lock); 2957 2958 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2959 avl_add(&spa->spa_alloc_tree, zio); 2960 2961 nio = zio_io_to_allocate(zio->io_spa); 2962 mutex_exit(&spa->spa_alloc_lock); 2963 2964 if (nio == zio) 2965 return (ZIO_PIPELINE_CONTINUE); 2966 2967 if (nio != NULL) { 2968 ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); 2969 /* 2970 * We are passing control to a new zio so make sure 
that 2971 * it is processed by a different thread. We do this to 2972 * avoid stack overflows that can occur when parents are 2973 * throttled and children are making progress. We allow 2974 * it to go to the head of the taskq since it's already 2975 * been waiting. 2976 / 2977* zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); 2978 } 2979 return (ZIO_PIPELINE_STOP); 2980} 2981 2982void 2983zio_allocate_dispatch(spa_t spa) 2984{ 2985* zio_t zio; 2986* 2987 mutex_enter(&spa->spa_alloc_lock); 2988 zio = zio_io_to_allocate(spa); 2989 mutex_exit(&spa->spa_alloc_lock); 2990 if (zio == NULL) 2991 return; 2992 2993 ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); 2994 ASSERT0(zio->io_error); 2995 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); 2996} 2997 2998static int 2999zio_dva_allocate(zio_t zio) 3000{ 3001* spa_t spa = zio->io_spa; 3002* metaslab_class_t mc = spa_normal_class(spa); 3003* blkptr_t bp = zio->io_bp; 3004* int error; 3005 int flags = 0; 3006 3007 if (zio->io_gang_leader == NULL) { 3008 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 3009 zio->io_gang_leader = zio; 3010 } 3011 3012 ASSERT(BP_IS_HOLE(bp)); 3013 ASSERT0(BP_GET_NDVAS(bp)); 3014 ASSERT3U(zio->io_prop.zp_copies, >, 0); 3015 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 3016 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 3017 3018 if (zio->io_flags & ZIO_FLAG_NODATA) { 3019 flags \|= METASLAB_DONT_THROTTLE; 3020 } 3021 if (zio->io_flags & ZIO_FLAG_GANG_CHILD) { 3022 flags \|= METASLAB_GANG_CHILD; 3023 } 3024 if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) { 3025 flags \|= METASLAB_ASYNC_ALLOC; 3026 } 3027 3028 error = metaslab_alloc(spa, mc, zio->io_size, bp, 3029 zio->io_prop.zp_copies, zio->io_txg, NULL, flags, 3030 &zio->io_alloc_list, zio); 3031 3032 if (error != 0) { 3033 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 3034 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 3035 error); 3036 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 
3037 return (zio_write_gang_block(zio)); 3038 zio->io_error = error; 3039 } 3040 3041 return (ZIO_PIPELINE_CONTINUE); 3042} 3043 3044static int 3045zio_dva_free(zio_t zio) 3046{ 3047* metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 3048 3049 return (ZIO_PIPELINE_CONTINUE); 3050} 3051 3052static int 3053zio_dva_claim(zio_t zio) 3054{ 3055* int error; 3056 3057 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 3058 if (error) 3059 zio->io_error = error; 3060 3061 return (ZIO_PIPELINE_CONTINUE); 3062} 3063 3064/* 3065 * Undo an allocation. This is used by zio_done() when an I/O fails 3066 * and we want to give back the block we just allocated. 3067 * This handles both normal blocks and gang blocks. 3068 / 3069static void 3070zio_dva_unallocate(zio_t zio, zio_gang_node_t gn, blkptr_t bp) 3071{ 3072 ASSERT(bp->blk_birth == zio->io_txg \|\| BP_IS_HOLE(bp)); 3073 ASSERT(zio->io_bp_override == NULL); 3074 3075 if (!BP_IS_HOLE(bp)) 3076 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 3077 3078 if (gn != NULL) { 3079 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 3080 zio_dva_unallocate(zio, gn->gn_child[g], 3081 &gn->gn_gbh->zg_blkptr[g]); 3082 } 3083 } 3084} 3085 3086/* 3087 * Try to allocate an intent log block. Return 0 on success, errno on failure. 
3088 / 3089int 3090zio_alloc_zil(spa_t spa, uint64_t txg, blkptr_t new_bp, blkptr_t old_bp, 3091 uint64_t size, boolean_t slog) 3092{ 3093* int error = 1; 3094 zio_alloc_list_t io_alloc_list; 3095 3096 ASSERT(txg > spa_syncing_txg(spa)); 3097 3098 metaslab_trace_init(&io_alloc_list); 3099 error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, 3100 txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL); 3101 if (error == 0) { 3102 slog = TRUE; 3103* } else { 3104 error = metaslab_alloc(spa, spa_normal_class(spa), size, 3105 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, 3106 &io_alloc_list, NULL); 3107 if (error == 0) 3108 slog = FALSE; 3109* } 3110 metaslab_trace_fini(&io_alloc_list); 3111 3112 if (error == 0) { 3113 BP_SET_LSIZE(new_bp, size); 3114 BP_SET_PSIZE(new_bp, size); 3115 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 3116 BP_SET_CHECKSUM(new_bp, 3117 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 3118 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 3119 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 3120 BP_SET_LEVEL(new_bp, 0); 3121 BP_SET_DEDUP(new_bp, 0); 3122 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 3123 } else { 3124 zfs_dbgmsg("%s: zil block allocation failure: " 3125 "size %llu, error %d", spa_name(spa), size, error); 3126 } 3127 3128 return (error); 3129} 3130 3131/* 3132 * ========================================================================== 3133 * Read, write and delete to physical devices 3134 * ========================================================================== 3135 / 3136* 3137 3138/* 3139 * Issue an I/O to the underlying vdev. Typically the issue pipeline 3140 * stops after this stage and will resume upon I/O completion. 3141 * However, there are instances where the vdev layer may need to 3142 * continue the pipeline when an I/O was not issued. 
Since the I/O 3143 * that was sent to the vdev layer might be different than the one 3144 * currently active in the pipeline (see vdev_queue_io()), we explicitly 3145 * force the underlying vdev layers to call either zio_execute() or 3146 * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 3147 / 3148static int 3149zio_vdev_io_start(zio_t zio) 3150{ 3151 vdev_t vd = zio->io_vd; 3152* uint64_t align; 3153 spa_t spa = zio->io_spa; 3154* int ret; 3155 3156 ASSERT(zio->io_error == 0); 3157 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 3158 3159 if (vd == NULL) { 3160 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3161 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 3162 3163 /* 3164 * The mirror_ops handle multiple DVAs in a single BP. 3165 / 3166* vdev_mirror_ops.vdev_op_io_start(zio); 3167 return (ZIO_PIPELINE_STOP); 3168 } 3169 3170 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 3171 zio->io_priority == ZIO_PRIORITY_NOW) { 3172 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 3173 return (ZIO_PIPELINE_CONTINUE); 3174 } 3175 3176 ASSERT3P(zio->io_logical, !=, zio); 3177 if (zio->io_type == ZIO_TYPE_WRITE) { 3178 ASSERT(spa->spa_trust_config); 3179 3180 if (zio->io_vd->vdev_removing) { 3181 ASSERT(zio->io_flags & 3182 (ZIO_FLAG_PHYSICAL \| ZIO_FLAG_SELF_HEAL \| 3183 ZIO_FLAG_INDUCE_DAMAGE)); 3184 } 3185 } 3186	1881 1882 /* 1883 * Now that all children have been reexecuted, execute the parent. 1884 * We don't reexecute "The Godfather" I/O here as it's the 1885 * responsibility of the caller to wait on it. 
1886 / 1887* if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { 1888 pio->io_queued_timestamp = gethrtime(); 1889 zio_execute(pio); 1890 } 1891} 1892 1893void 1894zio_suspend(spa_t spa, zio_t zio) 1895{ 1896 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1897 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1898 "failure and the failure mode property for this pool " 1899 "is set to panic.", spa_name(spa)); 1900 1901 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1902 1903 mutex_enter(&spa->spa_suspend_lock); 1904 1905 if (spa->spa_suspend_zio_root == NULL) 1906 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1907 ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE \| 1908 ZIO_FLAG_GODFATHER); 1909 1910 spa->spa_suspended = B_TRUE; 1911 1912 if (zio != NULL) { 1913 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1914 ASSERT(zio != spa->spa_suspend_zio_root); 1915 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1916 ASSERT(zio_unique_parent(zio) == NULL); 1917 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1918 zio_add_child(spa->spa_suspend_zio_root, zio); 1919 } 1920 1921 mutex_exit(&spa->spa_suspend_lock); 1922} 1923 1924int 1925zio_resume(spa_t spa) 1926{ 1927* zio_t pio; 1928* 1929 /* 1930 * Reexecute all previously suspended i/o. 1931 / 1932* mutex_enter(&spa->spa_suspend_lock); 1933 spa->spa_suspended = B_FALSE; 1934 cv_broadcast(&spa->spa_suspend_cv); 1935 pio = spa->spa_suspend_zio_root; 1936 spa->spa_suspend_zio_root = NULL; 1937 mutex_exit(&spa->spa_suspend_lock); 1938 1939 if (pio == NULL) 1940 return (0); 1941 1942 zio_reexecute(pio); 1943 return (zio_wait(pio)); 1944} 1945 1946void 1947zio_resume_wait(spa_t spa) 1948{ 1949* mutex_enter(&spa->spa_suspend_lock); 1950 while (spa_suspended(spa)) 1951 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1952 mutex_exit(&spa->spa_suspend_lock); 1953} 1954 1955/* 1956 * ========================================================================== 1957 * Gang blocks. 
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block. When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
 * an indirect block: it's an array of block pointers. It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header. This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it. This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part? Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write. Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated. This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence. If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact). If we can assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
2018 * ========================================================================== 2019 / 2020* 2021static void 2022zio_gang_issue_func_done(zio_t zio) 2023{ 2024* abd_put(zio->io_abd); 2025} 2026 2027static zio_t * 2028zio_read_gang(zio_t pio, blkptr_t bp, zio_gang_node_t gn, abd_t data, 2029 uint64_t offset) 2030{ 2031 if (gn != NULL) 2032 return (pio); 2033 2034 return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), 2035 BP_GET_PSIZE(bp), zio_gang_issue_func_done, 2036 NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2037 &pio->io_bookmark)); 2038} 2039 2040static zio_t * 2041zio_rewrite_gang(zio_t pio, blkptr_t bp, zio_gang_node_t gn, abd_t data, 2042 uint64_t offset) 2043{ 2044 zio_t zio; 2045* 2046 if (gn != NULL) { 2047 abd_t gbh_abd = 2048* abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2049 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 2050 gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, 2051 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2052 &pio->io_bookmark); 2053 /* 2054 * As we rewrite each gang header, the pipeline will compute 2055 * a new gang block header checksum for it; but no one will 2056 * compute a new data checksum, so we do that here. The one 2057 * exception is the gang leader: the pipeline already computed 2058 * its data checksum because that stage precedes gang assembly. 2059 * (Presently, nothing actually uses interior data checksums; 2060 * this is just good hygiene.) 2061 / 2062* if (gn != pio->io_gang_leader->io_gang_tree) { 2063 abd_t buf = abd_get_offset(data, offset); 2064* 2065 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 2066 buf, BP_GET_PSIZE(bp)); 2067 2068 abd_put(buf); 2069 } 2070 /* 2071 * If we are here to damage data for testing purposes, 2072 * leave the GBH alone so that we can detect the damage. 
2073 / 2074* if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 2075 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2076 } else { 2077 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 2078 abd_get_offset(data, offset), BP_GET_PSIZE(bp), 2079 zio_gang_issue_func_done, NULL, pio->io_priority, 2080 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2081 } 2082 2083 return (zio); 2084} 2085 2086/* ARGSUSED / 2087static zio_t 2088zio_free_gang(zio_t pio, blkptr_t bp, zio_gang_node_t gn, abd_t data, 2089 uint64_t offset) 2090{ 2091 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 2092 BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 2093 ZIO_GANG_CHILD_FLAGS(pio))); 2094} 2095 2096/* ARGSUSED / 2097static zio_t 2098zio_claim_gang(zio_t pio, blkptr_t bp, zio_gang_node_t gn, abd_t data, 2099 uint64_t offset) 2100{ 2101 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 2102 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 2103} 2104 2105static zio_gang_issue_func_t zio_gang_issue_func[ZIO_TYPES] = { 2106* NULL, 2107 zio_read_gang, 2108 zio_rewrite_gang, 2109 zio_free_gang, 2110 zio_claim_gang, 2111 NULL 2112}; 2113 2114static void zio_gang_tree_assemble_done(zio_t zio); 2115* 2116static zio_gang_node_t * 2117zio_gang_node_alloc(zio_gang_node_t *gnpp) 2118{ 2119* zio_gang_node_t gn; 2120* 2121 ASSERT(gnpp == NULL); 2122* 2123 gn = kmem_zalloc(sizeof (gn), KM_SLEEP); 2124* gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 2125 gnpp = gn; 2126* 2127 return (gn); 2128} 2129 2130static void 2131zio_gang_node_free(zio_gang_node_t *gnpp) 2132{ 2133* zio_gang_node_t gn = gnpp; 2134 2135 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2136 ASSERT(gn->gn_child[g] == NULL); 2137 2138 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2139 kmem_free(gn, sizeof (gn)); 2140* gnpp = NULL; 2141} 2142* 2143static void 2144zio_gang_tree_free(zio_gang_node_t *gnpp) 2145{ 2146* zio_gang_node_t gn = gnpp; 2147 2148 if (gn == NULL) 2149 return; 2150 2151 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 
2152 zio_gang_tree_free(&gn->gn_child[g]); 2153 2154 zio_gang_node_free(gnpp); 2155} 2156 2157static void 2158zio_gang_tree_assemble(zio_t gio, blkptr_t bp, zio_gang_node_t *gnpp) 2159{ 2160* zio_gang_node_t gn = zio_gang_node_alloc(gnpp); 2161* abd_t gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2162* 2163 ASSERT(gio->io_gang_leader == gio); 2164 ASSERT(BP_IS_GANG(bp)); 2165 2166 zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, 2167 zio_gang_tree_assemble_done, gn, gio->io_priority, 2168 ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 2169} 2170 2171static void 2172zio_gang_tree_assemble_done(zio_t zio) 2173{ 2174* zio_t gio = zio->io_gang_leader; 2175* zio_gang_node_t gn = zio->io_private; 2176* blkptr_t bp = zio->io_bp; 2177* 2178 ASSERT(gio == zio_unique_parent(zio)); 2179 ASSERT(zio->io_child_count == 0); 2180 2181 if (zio->io_error) 2182 return; 2183 2184 /* this ABD was created from a linear buf in zio_gang_tree_assemble / 2185* if (BP_SHOULD_BYTESWAP(bp)) 2186 byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); 2187 2188 ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); 2189 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 2190 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2191 2192 abd_put(zio->io_abd); 2193 2194 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2195 blkptr_t gbp = &gn->gn_gbh->zg_blkptr[g]; 2196* if (!BP_IS_GANG(gbp)) 2197 continue; 2198 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 2199 } 2200} 2201 2202static void 2203zio_gang_tree_issue(zio_t pio, zio_gang_node_t gn, blkptr_t bp, abd_t data, 2204 uint64_t offset) 2205{ 2206 zio_t gio = pio->io_gang_leader; 2207* zio_t zio; 2208* 2209 ASSERT(BP_IS_GANG(bp) == !!gn); 2210 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 2211 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) \|\| gn == gio->io_gang_tree); 2212 2213 /* 2214 * If you're a gang header, your data is in gn->gn_gbh. 
2215 * If you're a gang member, your data is in 'data' and gn == NULL. 2216 / 2217* zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); 2218 2219 if (gn != NULL) { 2220 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2221 2222 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2223 blkptr_t gbp = &gn->gn_gbh->zg_blkptr[g]; 2224* if (BP_IS_HOLE(gbp)) 2225 continue; 2226 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, 2227 offset); 2228 offset += BP_GET_PSIZE(gbp); 2229 } 2230 } 2231 2232 if (gn == gio->io_gang_tree && gio->io_abd != NULL) 2233 ASSERT3U(gio->io_size, ==, offset); 2234 2235 if (zio != pio) 2236 zio_nowait(zio); 2237} 2238 2239static int 2240zio_gang_assemble(zio_t zio) 2241{ 2242* blkptr_t bp = zio->io_bp; 2243* 2244 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 2245 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2246 2247 zio->io_gang_leader = zio; 2248 2249 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 2250 2251 return (ZIO_PIPELINE_CONTINUE); 2252} 2253 2254static int 2255zio_gang_issue(zio_t zio) 2256{ 2257* blkptr_t bp = zio->io_bp; 2258* 2259 if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { 2260 return (ZIO_PIPELINE_STOP); 2261 } 2262 2263 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 2264 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2265 2266 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 2267 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 2268 0); 2269 else 2270 zio_gang_tree_free(&zio->io_gang_tree); 2271 2272 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2273 2274 return (ZIO_PIPELINE_CONTINUE); 2275} 2276 2277static void 2278zio_write_gang_member_ready(zio_t zio) 2279{ 2280* zio_t pio = zio_unique_parent(zio); 2281* zio_t gio = zio->io_gang_leader; 2282* dva_t cdva = zio->io_bp->blk_dva; 2283* dva_t pdva = pio->io_bp->blk_dva; 2284* uint64_t asize; 2285 2286 if (BP_IS_HOLE(zio->io_bp)) 2287 return; 2288 2289 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 2290 2291 
ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 2292 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 2293 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 2294 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 2295 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 2296 2297 mutex_enter(&pio->io_lock); 2298 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 2299 ASSERT(DVA_GET_GANG(&pdva[d])); 2300 asize = DVA_GET_ASIZE(&pdva[d]); 2301 asize += DVA_GET_ASIZE(&cdva[d]); 2302 DVA_SET_ASIZE(&pdva[d], asize); 2303 } 2304 mutex_exit(&pio->io_lock); 2305} 2306 2307static void 2308zio_write_gang_done(zio_t zio) 2309{ 2310* abd_put(zio->io_abd); 2311} 2312 2313static int 2314zio_write_gang_block(zio_t pio) 2315{ 2316* spa_t spa = pio->io_spa; 2317* metaslab_class_t mc = spa_normal_class(spa); 2318* blkptr_t bp = pio->io_bp; 2319* zio_t gio = pio->io_gang_leader; 2320* zio_t zio; 2321* zio_gang_node_t gn, gnpp; 2322* zio_gbh_phys_t gbh; 2323* abd_t gbh_abd; 2324* uint64_t txg = pio->io_txg; 2325 uint64_t resid = pio->io_size; 2326 uint64_t lsize; 2327 int copies = gio->io_prop.zp_copies; 2328 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2329 zio_prop_t zp; 2330 int error; 2331 2332 int flags = METASLAB_HINTBP_FAVOR \| METASLAB_GANG_HEADER; 2333 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2334 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2335 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2336 2337 flags \|= METASLAB_ASYNC_ALLOC; 2338 VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); 2339 2340 /* 2341 * The logical zio has already placed a reservation for 2342 * 'copies' allocation slots but gang blocks may require 2343 * additional copies. These additional copies 2344 * (i.e. gbh_copies - copies) are guaranteed to succeed 2345 * since metaslab_class_throttle_reserve() always allows 2346 * additional reservations for gang blocks. 
2347 / 2348* VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, 2349 pio, flags)); 2350 } 2351 2352 error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, 2353 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, 2354 &pio->io_alloc_list, pio); 2355 if (error) { 2356 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2357 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2358 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2359 2360 /* 2361 * If we failed to allocate the gang block header then 2362 * we remove any additional allocation reservations that 2363 * we placed here. The original reservation will 2364 * be removed when the logical I/O goes to the ready 2365 * stage. 2366 / 2367* metaslab_class_throttle_unreserve(mc, 2368 gbh_copies - copies, pio); 2369 } 2370 pio->io_error = error; 2371 return (ZIO_PIPELINE_CONTINUE); 2372 } 2373 2374 if (pio == gio) { 2375 gnpp = &gio->io_gang_tree; 2376 } else { 2377 gnpp = pio->io_private; 2378 ASSERT(pio->io_ready == zio_write_gang_member_ready); 2379 } 2380 2381 gn = zio_gang_node_alloc(gnpp); 2382 gbh = gn->gn_gbh; 2383 bzero(gbh, SPA_GANGBLOCKSIZE); 2384 gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); 2385 2386 /* 2387 * Create the gang header. 2388 / 2389* zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, 2390 zio_write_gang_done, NULL, pio->io_priority, 2391 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2392 2393 /* 2394 * Create and nowait the gang children. 
2395 / 2396* for (int g = 0; resid != 0; resid -= lsize, g++) { 2397 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2398 SPA_MINBLOCKSIZE); 2399 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2400 2401 zp.zp_checksum = gio->io_prop.zp_checksum; 2402 zp.zp_compress = ZIO_COMPRESS_OFF; 2403 zp.zp_type = DMU_OT_NONE; 2404 zp.zp_level = 0; 2405 zp.zp_copies = gio->io_prop.zp_copies; 2406 zp.zp_dedup = B_FALSE; 2407 zp.zp_dedup_verify = B_FALSE; 2408 zp.zp_nopwrite = B_FALSE; 2409 2410 zio_t cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 2411* abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, 2412 lsize, &zp, zio_write_gang_member_ready, NULL, NULL, 2413 zio_write_gang_done, &gn->gn_child[g], pio->io_priority, 2414 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2415 2416 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2417 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2418 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2419 2420 /* 2421 * Gang children won't throttle but we should 2422 * account for their work, so reserve an allocation 2423 * slot for them here. 2424 / 2425* VERIFY(metaslab_class_throttle_reserve(mc, 2426 zp.zp_copies, cio, flags)); 2427 } 2428 zio_nowait(cio); 2429 } 2430 2431 /* 2432 * Set pio's pipeline to just wait for zio to finish. 2433 / 2434* pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2435 2436 zio_nowait(zio); 2437 2438 return (ZIO_PIPELINE_CONTINUE); 2439} 2440 2441/* 2442 * The zio_nop_write stage in the pipeline determines if allocating a 2443 * new bp is necessary. The nopwrite feature can handle writes in 2444 * either syncing or open context (i.e. zil writes) and as a result is 2445 * mutually exclusive with dedup. 2446 * 2447 * By leveraging a cryptographically secure checksum, such as SHA256, we 2448 * can compare the checksums of the new data and the old to determine if 2449 * allocating a new block is required. 
Note that our requirements for 2450 * cryptographic strength are fairly weak: there can't be any accidental 2451 * hash collisions, but we don't need to be secure against intentional 2452 * (malicious) collisions. To trigger a nopwrite, you have to be able 2453 * to write the file to begin with, and triggering an incorrect (hash 2454 * collision) nopwrite is no worse than simply writing to the file. 2455 * That said, there are no known attacks against the checksum algorithms 2456 * used for nopwrite, assuming that the salt and the checksums 2457 * themselves remain secret. 2458 / 2459static int 2460zio_nop_write(zio_t zio) 2461{ 2462 blkptr_t bp = zio->io_bp; 2463* blkptr_t bp_orig = &zio->io_bp_orig; 2464* zio_prop_t zp = &zio->io_prop; 2465* 2466 ASSERT(BP_GET_LEVEL(bp) == 0); 2467 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2468 ASSERT(zp->zp_nopwrite); 2469 ASSERT(!zp->zp_dedup); 2470 ASSERT(zio->io_bp_override == NULL); 2471 ASSERT(IO_IS_ALLOCATING(zio)); 2472 2473 /* 2474 * Check to see if the original bp and the new bp have matching 2475 * characteristics (i.e. same checksum, compression algorithms, etc). 2476 * If they don't then just continue with the pipeline which will 2477 * allocate a new bp. 2478 / 2479* if (BP_IS_HOLE(bp_orig) \|\| 2480 !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & 2481 ZCHECKSUM_FLAG_NOPWRITE) \|\| 2482 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) \|\| 2483 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) \|\| 2484 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) \|\| 2485 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2486 return (ZIO_PIPELINE_CONTINUE); 2487 2488 /* 2489 * If the checksums match then reset the pipeline so that we 2490 * avoid allocating a new bp and issuing any I/O. 
2491 / 2492* if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2493 ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & 2494 ZCHECKSUM_FLAG_NOPWRITE); 2495 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2496 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2497 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2498 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2499 sizeof (uint64_t)) == 0); 2500 2501 bp = bp_orig; 2502 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2503 zio->io_flags \|= ZIO_FLAG_NOPWRITE; 2504 } 2505 2506 return (ZIO_PIPELINE_CONTINUE); 2507} 2508 2509/* 2510 * ========================================================================== 2511 * Dedup 2512 * ========================================================================== 2513 / 2514static void 2515zio_ddt_child_read_done(zio_t zio) 2516{ 2517 blkptr_t bp = zio->io_bp; 2518* ddt_entry_t dde = zio->io_private; 2519* ddt_phys_t ddp; 2520* zio_t pio = zio_unique_parent(zio); 2521* 2522 mutex_enter(&pio->io_lock); 2523 ddp = ddt_phys_select(dde, bp); 2524 if (zio->io_error == 0) 2525 ddt_phys_clear(ddp); /* this ddp doesn't need repair / 2526* 2527 if (zio->io_error == 0 && dde->dde_repair_abd == NULL) 2528 dde->dde_repair_abd = zio->io_abd; 2529 else 2530 abd_free(zio->io_abd); 2531 mutex_exit(&pio->io_lock); 2532} 2533 2534static int 2535zio_ddt_read_start(zio_t zio) 2536{ 2537* blkptr_t bp = zio->io_bp; 2538* 2539 ASSERT(BP_GET_DEDUP(bp)); 2540 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2541 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2542 2543 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2544 ddt_t ddt = ddt_select(zio->io_spa, bp); 2545* ddt_entry_t dde = ddt_repair_start(ddt, bp); 2546* ddt_phys_t ddp = dde->dde_phys; 2547* ddt_phys_t ddp_self = ddt_phys_select(dde, bp); 2548* blkptr_t blk; 2549 2550 ASSERT(zio->io_vsd == NULL); 2551 zio->io_vsd = dde; 2552 2553 if (ddp_self == NULL) 2554 return (ZIO_PIPELINE_CONTINUE); 2555 2556 for (int p = 0; p < DDT_PHYS_TYPES; p++, 
ddp++) { 2557 if (ddp->ddp_phys_birth == 0 \|\| ddp == ddp_self) 2558 continue; 2559 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2560 &blk); 2561 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2562 abd_alloc_for_io(zio->io_size, B_TRUE), 2563 zio->io_size, zio_ddt_child_read_done, dde, 2564 zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) \| 2565 ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); 2566 } 2567 return (ZIO_PIPELINE_CONTINUE); 2568 } 2569 2570 zio_nowait(zio_read(zio, zio->io_spa, bp, 2571 zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, 2572 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2573 2574 return (ZIO_PIPELINE_CONTINUE); 2575} 2576 2577static int 2578zio_ddt_read_done(zio_t zio) 2579{ 2580* blkptr_t bp = zio->io_bp; 2581* 2582 if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { 2583 return (ZIO_PIPELINE_STOP); 2584 } 2585 2586 ASSERT(BP_GET_DEDUP(bp)); 2587 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2588 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2589 2590 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2591 ddt_t ddt = ddt_select(zio->io_spa, bp); 2592* ddt_entry_t dde = zio->io_vsd; 2593* if (ddt == NULL) { 2594 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2595 return (ZIO_PIPELINE_CONTINUE); 2596 } 2597 if (dde == NULL) { 2598 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2599 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2600 return (ZIO_PIPELINE_STOP); 2601 } 2602 if (dde->dde_repair_abd != NULL) { 2603 abd_copy(zio->io_abd, dde->dde_repair_abd, 2604 zio->io_size); 2605 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2606 } 2607 ddt_repair_done(ddt, dde); 2608 zio->io_vsd = NULL; 2609 } 2610 2611 ASSERT(zio->io_vsd == NULL); 2612 2613 return (ZIO_PIPELINE_CONTINUE); 2614} 2615 2616static boolean_t 2617zio_ddt_collision(zio_t zio, ddt_t ddt, ddt_entry_t dde) 2618{ 2619* spa_t spa = zio->io_spa; 2620* boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); 2621 2622 /* We should never get a raw, override zio / 2623* 
ASSERT(!(zio->io_bp_override && do_raw)); 2624 2625 /* 2626 * Note: we compare the original data, not the transformed data, 2627 * because when zio->io_bp is an override bp, we will not have 2628 * pushed the I/O transforms. That's an important optimization 2629 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2630 / 2631* for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2632 zio_t lio = dde->dde_lead_zio[p]; 2633* 2634 if (lio != NULL) { 2635 return (lio->io_orig_size != zio->io_orig_size \|\| 2636 abd_cmp(zio->io_orig_abd, lio->io_orig_abd, 2637 zio->io_orig_size) != 0); 2638 } 2639 } 2640 2641 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2642 ddt_phys_t ddp = &dde->dde_phys[p]; 2643* 2644 if (ddp->ddp_phys_birth != 0) { 2645 arc_buf_t abuf = NULL; 2646* arc_flags_t aflags = ARC_FLAG_WAIT; 2647 int zio_flags = ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE; 2648 blkptr_t blk = zio->io_bp; 2649* int error; 2650 2651 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2652 2653 ddt_exit(ddt); 2654 2655 /* 2656 * Intuitively, it would make more sense to compare 2657 * io_abd than io_orig_abd in the raw case since you 2658 * don't want to look at any transformations that have 2659 * happened to the data. However, for raw I/Os the 2660 * data will actually be the same in io_abd and 2661 * io_orig_abd, so all we have to do is issue this as 2662 * a raw ARC read. 
2663 / 2664* if (do_raw) { 2665 zio_flags \|= ZIO_FLAG_RAW; 2666 ASSERT3U(zio->io_size, ==, zio->io_orig_size); 2667 ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, 2668 zio->io_size)); 2669 ASSERT3P(zio->io_transform_stack, ==, NULL); 2670 } 2671 2672 error = arc_read(NULL, spa, &blk, 2673 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2674 zio_flags, &aflags, &zio->io_bookmark); 2675 2676 if (error == 0) { 2677 if (arc_buf_size(abuf) != zio->io_orig_size \|\| 2678 abd_cmp_buf(zio->io_orig_abd, abuf->b_data, 2679 zio->io_orig_size) != 0) 2680 error = SET_ERROR(EEXIST); 2681 arc_buf_destroy(abuf, &abuf); 2682 } 2683 2684 ddt_enter(ddt); 2685 return (error != 0); 2686 } 2687 } 2688 2689 return (B_FALSE); 2690} 2691 2692static void 2693zio_ddt_child_write_ready(zio_t zio) 2694{ 2695* int p = zio->io_prop.zp_copies; 2696 ddt_t ddt = ddt_select(zio->io_spa, zio->io_bp); 2697* ddt_entry_t dde = zio->io_private; 2698* ddt_phys_t ddp = &dde->dde_phys[p]; 2699* zio_t pio; 2700* 2701 if (zio->io_error) 2702 return; 2703 2704 ddt_enter(ddt); 2705 2706 ASSERT(dde->dde_lead_zio[p] == zio); 2707 2708 ddt_phys_fill(ddp, zio->io_bp); 2709 2710 zio_link_t zl = NULL; 2711* while ((pio = zio_walk_parents(zio, &zl)) != NULL) 2712 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2713 2714 ddt_exit(ddt); 2715} 2716 2717static void 2718zio_ddt_child_write_done(zio_t zio) 2719{ 2720* int p = zio->io_prop.zp_copies; 2721 ddt_t ddt = ddt_select(zio->io_spa, zio->io_bp); 2722* ddt_entry_t dde = zio->io_private; 2723* ddt_phys_t ddp = &dde->dde_phys[p]; 2724* 2725 ddt_enter(ddt); 2726 2727 ASSERT(ddp->ddp_refcnt == 0); 2728 ASSERT(dde->dde_lead_zio[p] == zio); 2729 dde->dde_lead_zio[p] = NULL; 2730 2731 if (zio->io_error == 0) { 2732 zio_link_t zl = NULL; 2733* while (zio_walk_parents(zio, &zl) != NULL) 2734 ddt_phys_addref(ddp); 2735 } else { 2736 ddt_phys_clear(ddp); 2737 } 2738 2739 ddt_exit(ddt); 2740} 2741 2742static void 2743zio_ddt_ditto_write_done(zio_t zio) 2744{ 2745* int p = 
DDT_PHYS_DITTO; 2746 zio_prop_t zp = &zio->io_prop; 2747* blkptr_t bp = zio->io_bp; 2748* ddt_t ddt = ddt_select(zio->io_spa, bp); 2749* ddt_entry_t dde = zio->io_private; 2750* ddt_phys_t ddp = &dde->dde_phys[p]; 2751* ddt_key_t ddk = &dde->dde_key; 2752* 2753 ddt_enter(ddt); 2754 2755 ASSERT(ddp->ddp_refcnt == 0); 2756 ASSERT(dde->dde_lead_zio[p] == zio); 2757 dde->dde_lead_zio[p] = NULL; 2758 2759 if (zio->io_error == 0) { 2760 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2761 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2762 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2763 if (ddp->ddp_phys_birth != 0) 2764 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2765 ddt_phys_fill(ddp, bp); 2766 } 2767 2768 ddt_exit(ddt); 2769} 2770 2771static int 2772zio_ddt_write(zio_t zio) 2773{ 2774* spa_t spa = zio->io_spa; 2775* blkptr_t bp = zio->io_bp; 2776* uint64_t txg = zio->io_txg; 2777 zio_prop_t zp = &zio->io_prop; 2778* int p = zp->zp_copies; 2779 int ditto_copies; 2780 zio_t cio = NULL; 2781* zio_t dio = NULL; 2782* ddt_t ddt = ddt_select(spa, bp); 2783* ddt_entry_t dde; 2784* ddt_phys_t ddp; 2785* 2786 ASSERT(BP_GET_DEDUP(bp)); 2787 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2788 ASSERT(BP_IS_HOLE(bp) \|\| zio->io_bp_override); 2789 ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); 2790 2791 ddt_enter(ddt); 2792 dde = ddt_lookup(ddt, bp, B_TRUE); 2793 ddp = &dde->dde_phys[p]; 2794 2795 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2796 /* 2797 * If we're using a weak checksum, upgrade to a strong checksum 2798 * and try again. If we're already using a strong checksum, 2799 * we can't resolve it, so just convert to an ordinary write. 2800 * (And automatically e-mail a paper to Nature?) 
2801 / 2802* if (!(zio_checksum_table[zp->zp_checksum].ci_flags & 2803 ZCHECKSUM_FLAG_DEDUP)) { 2804 zp->zp_checksum = spa_dedup_checksum(spa); 2805 zio_pop_transforms(zio); 2806 zio->io_stage = ZIO_STAGE_OPEN; 2807 BP_ZERO(bp); 2808 } else { 2809 zp->zp_dedup = B_FALSE; 2810 BP_SET_DEDUP(bp, B_FALSE); 2811 } 2812 ASSERT(!BP_GET_DEDUP(bp)); 2813 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2814 ddt_exit(ddt); 2815 return (ZIO_PIPELINE_CONTINUE); 2816 } 2817 2818 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2819 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2820 2821 if (ditto_copies > ddt_ditto_copies_present(dde) && 2822 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2823 zio_prop_t czp = zp; 2824* 2825 czp.zp_copies = ditto_copies; 2826 2827 /* 2828 * If we arrived here with an override bp, we won't have run 2829 * the transform stack, so we won't have the data we need to 2830 * generate a child i/o. So, toss the override bp and restart. 2831 * This is safe, because using the override bp is just an 2832 * optimization; and it's rare, so the cost doesn't matter. 
2833 / 2834* if (zio->io_bp_override) { 2835 zio_pop_transforms(zio); 2836 zio->io_stage = ZIO_STAGE_OPEN; 2837 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2838 zio->io_bp_override = NULL; 2839 BP_ZERO(bp); 2840 ddt_exit(ddt); 2841 return (ZIO_PIPELINE_CONTINUE); 2842 } 2843 2844 dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, 2845 zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, 2846 NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, 2847 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2848 2849 zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); 2850 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2851 } 2852 2853 if (ddp->ddp_phys_birth != 0 \|\| dde->dde_lead_zio[p] != NULL) { 2854 if (ddp->ddp_phys_birth != 0) 2855 ddt_bp_fill(ddp, bp, txg); 2856 if (dde->dde_lead_zio[p] != NULL) 2857 zio_add_child(zio, dde->dde_lead_zio[p]); 2858 else 2859 ddt_phys_addref(ddp); 2860 } else if (zio->io_bp_override) { 2861 ASSERT(bp->blk_birth == txg); 2862 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2863 ddt_phys_fill(ddp, bp); 2864 ddt_phys_addref(ddp); 2865 } else { 2866 cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, 2867 zio->io_orig_size, zio->io_orig_size, zp, 2868 zio_ddt_child_write_ready, NULL, NULL, 2869 zio_ddt_child_write_done, dde, zio->io_priority, 2870 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2871 2872 zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); 2873 dde->dde_lead_zio[p] = cio; 2874 } 2875 2876 ddt_exit(ddt); 2877 2878 if (cio) 2879 zio_nowait(cio); 2880 if (dio) 2881 zio_nowait(dio); 2882 2883 return (ZIO_PIPELINE_CONTINUE); 2884} 2885 2886ddt_entry_t freedde; / for debugging / 2887* 2888static int 2889zio_ddt_free(zio_t zio) 2890{ 2891* spa_t spa = zio->io_spa; 2892* blkptr_t bp = zio->io_bp; 2893* ddt_t ddt = ddt_select(spa, bp); 2894* ddt_entry_t dde; 2895* ddt_phys_t ddp; 2896* 2897 ASSERT(BP_GET_DEDUP(bp)); 2898 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2899 2900 ddt_enter(ddt); 2901 freedde = dde = 
ddt_lookup(ddt, bp, B_TRUE); 2902 ddp = ddt_phys_select(dde, bp); 2903 ddt_phys_decref(ddp); 2904 ddt_exit(ddt); 2905 2906 return (ZIO_PIPELINE_CONTINUE); 2907} 2908 2909/* 2910 * ========================================================================== 2911 * Allocate and free blocks 2912 * ========================================================================== 2913 / 2914* 2915static zio_t * 2916zio_io_to_allocate(spa_t spa) 2917{ 2918* zio_t zio; 2919* 2920 ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); 2921 2922 zio = avl_first(&spa->spa_alloc_tree); 2923 if (zio == NULL) 2924 return (NULL); 2925 2926 ASSERT(IO_IS_ALLOCATING(zio)); 2927 2928 /* 2929 * Try to place a reservation for this zio. If we're unable to 2930 * reserve then we throttle. 2931 / 2932* if (!metaslab_class_throttle_reserve(spa_normal_class(spa), 2933 zio->io_prop.zp_copies, zio, 0)) { 2934 return (NULL); 2935 } 2936 2937 avl_remove(&spa->spa_alloc_tree, zio); 2938 ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); 2939 2940 return (zio); 2941} 2942 2943static int 2944zio_dva_throttle(zio_t zio) 2945{ 2946* spa_t spa = zio->io_spa; 2947* zio_t nio; 2948* 2949 if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE \|\| 2950 !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled \|\| 2951 zio->io_child_type == ZIO_CHILD_GANG \|\| 2952 zio->io_flags & ZIO_FLAG_NODATA) { 2953 return (ZIO_PIPELINE_CONTINUE); 2954 } 2955 2956 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2957 2958 ASSERT3U(zio->io_queued_timestamp, >, 0); 2959 ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); 2960 2961 mutex_enter(&spa->spa_alloc_lock); 2962 2963 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2964 avl_add(&spa->spa_alloc_tree, zio); 2965 2966 nio = zio_io_to_allocate(zio->io_spa); 2967 mutex_exit(&spa->spa_alloc_lock); 2968 2969 if (nio == zio) 2970 return (ZIO_PIPELINE_CONTINUE); 2971 2972 if (nio != NULL) { 2973 ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); 2974 /* 2975 * We are passing control to a new zio so make sure 
that 2976 * it is processed by a different thread. We do this to 2977 * avoid stack overflows that can occur when parents are 2978 * throttled and children are making progress. We allow 2979 * it to go to the head of the taskq since it's already 2980 * been waiting. 2981 / 2982* zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); 2983 } 2984 return (ZIO_PIPELINE_STOP); 2985} 2986 2987void 2988zio_allocate_dispatch(spa_t spa) 2989{ 2990* zio_t zio; 2991* 2992 mutex_enter(&spa->spa_alloc_lock); 2993 zio = zio_io_to_allocate(spa); 2994 mutex_exit(&spa->spa_alloc_lock); 2995 if (zio == NULL) 2996 return; 2997 2998 ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); 2999 ASSERT0(zio->io_error); 3000 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); 3001} 3002 3003static int 3004zio_dva_allocate(zio_t zio) 3005{ 3006* spa_t spa = zio->io_spa; 3007* metaslab_class_t mc = spa_normal_class(spa); 3008* blkptr_t bp = zio->io_bp; 3009* int error; 3010 int flags = 0; 3011 3012 if (zio->io_gang_leader == NULL) { 3013 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 3014 zio->io_gang_leader = zio; 3015 } 3016 3017 ASSERT(BP_IS_HOLE(bp)); 3018 ASSERT0(BP_GET_NDVAS(bp)); 3019 ASSERT3U(zio->io_prop.zp_copies, >, 0); 3020 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 3021 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 3022 3023 if (zio->io_flags & ZIO_FLAG_NODATA) { 3024 flags \|= METASLAB_DONT_THROTTLE; 3025 } 3026 if (zio->io_flags & ZIO_FLAG_GANG_CHILD) { 3027 flags \|= METASLAB_GANG_CHILD; 3028 } 3029 if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) { 3030 flags \|= METASLAB_ASYNC_ALLOC; 3031 } 3032 3033 error = metaslab_alloc(spa, mc, zio->io_size, bp, 3034 zio->io_prop.zp_copies, zio->io_txg, NULL, flags, 3035 &zio->io_alloc_list, zio); 3036 3037 if (error != 0) { 3038 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 3039 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 3040 error); 3041 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 
3042 return (zio_write_gang_block(zio)); 3043 zio->io_error = error; 3044 } 3045 3046 return (ZIO_PIPELINE_CONTINUE); 3047} 3048 3049static int 3050zio_dva_free(zio_t zio) 3051{ 3052* metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 3053 3054 return (ZIO_PIPELINE_CONTINUE); 3055} 3056 3057static int 3058zio_dva_claim(zio_t zio) 3059{ 3060* int error; 3061 3062 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 3063 if (error) 3064 zio->io_error = error; 3065 3066 return (ZIO_PIPELINE_CONTINUE); 3067} 3068 3069/* 3070 * Undo an allocation. This is used by zio_done() when an I/O fails 3071 * and we want to give back the block we just allocated. 3072 * This handles both normal blocks and gang blocks. 3073 / 3074static void 3075zio_dva_unallocate(zio_t zio, zio_gang_node_t gn, blkptr_t bp) 3076{ 3077 ASSERT(bp->blk_birth == zio->io_txg \|\| BP_IS_HOLE(bp)); 3078 ASSERT(zio->io_bp_override == NULL); 3079 3080 if (!BP_IS_HOLE(bp)) 3081 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 3082 3083 if (gn != NULL) { 3084 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 3085 zio_dva_unallocate(zio, gn->gn_child[g], 3086 &gn->gn_gbh->zg_blkptr[g]); 3087 } 3088 } 3089} 3090 3091/* 3092 * Try to allocate an intent log block. Return 0 on success, errno on failure. 
3093 / 3094int 3095zio_alloc_zil(spa_t spa, uint64_t txg, blkptr_t new_bp, blkptr_t old_bp, 3096 uint64_t size, boolean_t slog) 3097{ 3098* int error = 1; 3099 zio_alloc_list_t io_alloc_list; 3100 3101 ASSERT(txg > spa_syncing_txg(spa)); 3102 3103 metaslab_trace_init(&io_alloc_list); 3104 error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, 3105 txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL); 3106 if (error == 0) { 3107 slog = TRUE; 3108* } else { 3109 error = metaslab_alloc(spa, spa_normal_class(spa), size, 3110 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, 3111 &io_alloc_list, NULL); 3112 if (error == 0) 3113 slog = FALSE; 3114* } 3115 metaslab_trace_fini(&io_alloc_list); 3116 3117 if (error == 0) { 3118 BP_SET_LSIZE(new_bp, size); 3119 BP_SET_PSIZE(new_bp, size); 3120 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 3121 BP_SET_CHECKSUM(new_bp, 3122 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 3123 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 3124 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 3125 BP_SET_LEVEL(new_bp, 0); 3126 BP_SET_DEDUP(new_bp, 0); 3127 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 3128 } else { 3129 zfs_dbgmsg("%s: zil block allocation failure: " 3130 "size %llu, error %d", spa_name(spa), size, error); 3131 } 3132 3133 return (error); 3134} 3135 3136/* 3137 * ========================================================================== 3138 * Read, write and delete to physical devices 3139 * ========================================================================== 3140 / 3141* 3142 3143/* 3144 * Issue an I/O to the underlying vdev. Typically the issue pipeline 3145 * stops after this stage and will resume upon I/O completion. 3146 * However, there are instances where the vdev layer may need to 3147 * continue the pipeline when an I/O was not issued. 
Since the I/O 3148 * that was sent to the vdev layer might be different than the one 3149 * currently active in the pipeline (see vdev_queue_io()), we explicitly 3150 * force the underlying vdev layers to call either zio_execute() or 3151 * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 3152 / 3153static int 3154zio_vdev_io_start(zio_t zio) 3155{ 3156 vdev_t vd = zio->io_vd; 3157* uint64_t align; 3158 spa_t spa = zio->io_spa; 3159* int ret; 3160 3161 ASSERT(zio->io_error == 0); 3162 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 3163 3164 if (vd == NULL) { 3165 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3166 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 3167 3168 /* 3169 * The mirror_ops handle multiple DVAs in a single BP. 3170 / 3171* vdev_mirror_ops.vdev_op_io_start(zio); 3172 return (ZIO_PIPELINE_STOP); 3173 } 3174 3175 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 3176 zio->io_priority == ZIO_PRIORITY_NOW) { 3177 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 3178 return (ZIO_PIPELINE_CONTINUE); 3179 } 3180 3181 ASSERT3P(zio->io_logical, !=, zio); 3182 if (zio->io_type == ZIO_TYPE_WRITE) { 3183 ASSERT(spa->spa_trust_config); 3184 3185 if (zio->io_vd->vdev_removing) { 3186 ASSERT(zio->io_flags & 3187 (ZIO_FLAG_PHYSICAL \| ZIO_FLAG_SELF_HEAL \| 3188 ZIO_FLAG_INDUCE_DAMAGE)); 3189 } 3190 } 3191
3187 /* 3188 * We keep track of time-sensitive I/Os so that the scan thread 3189 * can quickly react to certain workloads. In particular, we care 3190 * about non-scrubbing, top-level reads and writes with the following 3191 * characteristics: 3192 * - synchronous writes of user data to non-slog devices 3193 * - any reads of user data 3194 * When these conditions are met, adjust the timestamp of spa_last_io 3195 * which allows the scan thread to adjust its workload accordingly. 3196 / 3197* if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 3198 vd == vd->vdev_top && !vd->vdev_islog && 3199 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 3200 zio->io_txg != spa_syncing_txg(spa)) { 3201 uint64_t old = spa->spa_last_io; 3202 uint64_t new = ddi_get_lbolt64(); 3203 if (old != new) 3204 (void) atomic_cas_64(&spa->spa_last_io, old, new); 3205 } 3206	3192 /* 3193 * We keep track of time-sensitive I/Os so that the scan thread 3194 * can quickly react to certain workloads. In particular, we care 3195 * about non-scrubbing, top-level reads and writes with the following 3196 * characteristics: 3197 * - synchronous writes of user data to non-slog devices 3198 * - any reads of user data 3199 * When these conditions are met, adjust the timestamp of spa_last_io 3200 * which allows the scan thread to adjust its workload accordingly. 3201 / 3202* if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 3203 vd == vd->vdev_top && !vd->vdev_islog && 3204 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 3205 zio->io_txg != spa_syncing_txg(spa)) { 3206 uint64_t old = spa->spa_last_io; 3207 uint64_t new = ddi_get_lbolt64(); 3208 if (old != new) 3209 (void) atomic_cas_64(&spa->spa_last_io, old, new); 3210 }
3207 align = 1ULL << vd->vdev_top->vdev_ashift; 3208 3209 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 3210 P2PHASE(zio->io_size, align) != 0) { 3211 /* Transform logical writes to be a full physical block size. / 3212* uint64_t asize = P2ROUNDUP(zio->io_size, align); 3213 abd_t abuf = NULL; 3214* if (zio->io_type == ZIO_TYPE_READ \|\| 3215 zio->io_type == ZIO_TYPE_WRITE) 3216 abuf = abd_alloc_sametype(zio->io_abd, asize); 3217 ASSERT(vd == vd->vdev_top); 3218 if (zio->io_type == ZIO_TYPE_WRITE) { 3219 abd_copy(abuf, zio->io_abd, zio->io_size); 3220 abd_zero_off(abuf, zio->io_size, asize - zio->io_size); 3221 } 3222 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 3223 zio_subblock); 3224 } 3225 3226 /* 3227 * If this is not a physical io, make sure that it is properly aligned 3228 * before proceeding. 3229 / 3230* if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 3231 ASSERT0(P2PHASE(zio->io_offset, align)); 3232 ASSERT0(P2PHASE(zio->io_size, align)); 3233 } else { 3234 /* 3235 * For the physical io we allow alignment 3236 * to a logical block size. 3237 / 3238* uint64_t log_align = 3239 1ULL << vd->vdev_top->vdev_logical_ashift; 3240 ASSERT0(P2PHASE(zio->io_offset, log_align)); 3241 ASSERT0(P2PHASE(zio->io_size, log_align)); 3242 } 3243 3244 VERIFY(zio->io_type == ZIO_TYPE_READ \|\| spa_writeable(spa)); 3245 3246 /* 3247 * If this is a repair I/O, and there's no self-healing involved -- 3248 * that is, we're just resilvering what we expect to resilver -- 3249 * then don't do the I/O unless zio's txg is actually in vd's DTL. 3250 * This prevents spurious resilvering with nested replication. 3251 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 3252 * A is out of date, we'll read from C+D, then use the data to 3253 * resilver A+B -- but we don't actually want to resilver B, just A. 3254 * The top-level mirror has no way to know this, so instead we just 3255 * discard unnecessary repairs as we work our way down the vdev tree. 
3256 * The same logic applies to any form of nested replication: 3257 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 3258 / 3259* if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 3260 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 3261 zio->io_txg != 0 && /* not a delegated i/o / 3262* !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 3263 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3264 zio_vdev_io_bypass(zio); 3265 return (ZIO_PIPELINE_CONTINUE); 3266 } 3267 3268 if (vd->vdev_ops->vdev_op_leaf) { 3269 switch (zio->io_type) { 3270 case ZIO_TYPE_READ: 3271 if (vdev_cache_read(zio)) 3272 return (ZIO_PIPELINE_CONTINUE); 3273 /* FALLTHROUGH / 3274* case ZIO_TYPE_WRITE: 3275 case ZIO_TYPE_FREE: 3276 if ((zio = vdev_queue_io(zio)) == NULL) 3277 return (ZIO_PIPELINE_STOP); 3278 3279 if (!vdev_accessible(vd, zio)) { 3280 zio->io_error = SET_ERROR(ENXIO); 3281 zio_interrupt(zio); 3282 return (ZIO_PIPELINE_STOP); 3283 } 3284 break; 3285 } 3286 /* 3287 * Note that we ignore repair writes for TRIM because they can 3288 * conflict with normal writes. This isn't an issue because, by 3289 * definition, we only repair blocks that aren't freed. 3290 / 3291* if (zio->io_type == ZIO_TYPE_WRITE && 3292 !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 3293 !trim_map_write_start(zio)) 3294 return (ZIO_PIPELINE_STOP); 3295 } 3296 3297 vd->vdev_ops->vdev_op_io_start(zio); 3298 return (ZIO_PIPELINE_STOP); 3299} 3300 3301static int 3302zio_vdev_io_done(zio_t zio) 3303{ 3304* vdev_t vd = zio->io_vd; 3305* vdev_ops_t ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 3306* boolean_t unexpected_error = B_FALSE; 3307 3308 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { 3309 return (ZIO_PIPELINE_STOP); 3310 } 3311 3312 ASSERT(zio->io_type == ZIO_TYPE_READ \|\| 3313 zio->io_type == ZIO_TYPE_WRITE \|\| zio->io_type == ZIO_TYPE_FREE); 3314 3315 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3316 (zio->io_type == ZIO_TYPE_READ \|\| zio->io_type == ZIO_TYPE_WRITE \|\| 3317 zio->io_type == ZIO_TYPE_FREE)) { 3318 3319 if (zio->io_type == ZIO_TYPE_WRITE && 3320 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 3321 trim_map_write_done(zio); 3322 3323 vdev_queue_io_done(zio); 3324 3325 if (zio->io_type == ZIO_TYPE_WRITE) 3326 vdev_cache_write(zio); 3327 3328 if (zio_injection_enabled && zio->io_error == 0) 3329 zio->io_error = zio_handle_device_injection(vd, 3330 zio, EIO); 3331 3332 if (zio_injection_enabled && zio->io_error == 0) 3333 zio->io_error = zio_handle_label_injection(zio, EIO); 3334 3335 if (zio->io_error) { 3336 if (zio->io_error == ENOTSUP && 3337 zio->io_type == ZIO_TYPE_FREE) { 3338 /* Not all devices support TRIM. / 3339* } else if (!vdev_accessible(vd, zio)) { 3340 zio->io_error = SET_ERROR(ENXIO); 3341 } else { 3342 unexpected_error = B_TRUE; 3343 } 3344 } 3345 } 3346 3347 ops->vdev_op_io_done(zio); 3348 3349 if (unexpected_error) 3350 VERIFY(vdev_probe(vd, zio) == NULL); 3351 3352 return (ZIO_PIPELINE_CONTINUE); 3353} 3354 3355/*	3211 align = 1ULL << vd->vdev_top->vdev_ashift; 3212 3213 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 3214 P2PHASE(zio->io_size, align) != 0) { 3215 /* Transform logical writes to be a full physical block size. 
/ 3216* uint64_t asize = P2ROUNDUP(zio->io_size, align); 3217 abd_t abuf = NULL; 3218* if (zio->io_type == ZIO_TYPE_READ \|\| 3219 zio->io_type == ZIO_TYPE_WRITE) 3220 abuf = abd_alloc_sametype(zio->io_abd, asize); 3221 ASSERT(vd == vd->vdev_top); 3222 if (zio->io_type == ZIO_TYPE_WRITE) { 3223 abd_copy(abuf, zio->io_abd, zio->io_size); 3224 abd_zero_off(abuf, zio->io_size, asize - zio->io_size); 3225 } 3226 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 3227 zio_subblock); 3228 } 3229 3230 /* 3231 * If this is not a physical io, make sure that it is properly aligned 3232 * before proceeding. 3233 / 3234* if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 3235 ASSERT0(P2PHASE(zio->io_offset, align)); 3236 ASSERT0(P2PHASE(zio->io_size, align)); 3237 } else { 3238 /* 3239 * For the physical io we allow alignment 3240 * to a logical block size. 3241 / 3242* uint64_t log_align = 3243 1ULL << vd->vdev_top->vdev_logical_ashift; 3244 ASSERT0(P2PHASE(zio->io_offset, log_align)); 3245 ASSERT0(P2PHASE(zio->io_size, log_align)); 3246 } 3247 3248 VERIFY(zio->io_type == ZIO_TYPE_READ \|\| spa_writeable(spa)); 3249 3250 /* 3251 * If this is a repair I/O, and there's no self-healing involved -- 3252 * that is, we're just resilvering what we expect to resilver -- 3253 * then don't do the I/O unless zio's txg is actually in vd's DTL. 3254 * This prevents spurious resilvering with nested replication. 3255 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 3256 * A is out of date, we'll read from C+D, then use the data to 3257 * resilver A+B -- but we don't actually want to resilver B, just A. 3258 * The top-level mirror has no way to know this, so instead we just 3259 * discard unnecessary repairs as we work our way down the vdev tree. 3260 * The same logic applies to any form of nested replication: 3261 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
3262 / 3263* if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 3264 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 3265 zio->io_txg != 0 && /* not a delegated i/o / 3266* !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 3267 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3268 zio_vdev_io_bypass(zio); 3269 return (ZIO_PIPELINE_CONTINUE); 3270 } 3271 3272 if (vd->vdev_ops->vdev_op_leaf) { 3273 switch (zio->io_type) { 3274 case ZIO_TYPE_READ: 3275 if (vdev_cache_read(zio)) 3276 return (ZIO_PIPELINE_CONTINUE); 3277 /* FALLTHROUGH / 3278* case ZIO_TYPE_WRITE: 3279 case ZIO_TYPE_FREE: 3280 if ((zio = vdev_queue_io(zio)) == NULL) 3281 return (ZIO_PIPELINE_STOP); 3282 3283 if (!vdev_accessible(vd, zio)) { 3284 zio->io_error = SET_ERROR(ENXIO); 3285 zio_interrupt(zio); 3286 return (ZIO_PIPELINE_STOP); 3287 } 3288 break; 3289 } 3290 /* 3291 * Note that we ignore repair writes for TRIM because they can 3292 * conflict with normal writes. This isn't an issue because, by 3293 * definition, we only repair blocks that aren't freed. 3294 / 3295* if (zio->io_type == ZIO_TYPE_WRITE && 3296 !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 3297 !trim_map_write_start(zio)) 3298 return (ZIO_PIPELINE_STOP); 3299 } 3300 3301 vd->vdev_ops->vdev_op_io_start(zio); 3302 return (ZIO_PIPELINE_STOP); 3303} 3304 3305static int 3306zio_vdev_io_done(zio_t zio) 3307{ 3308* vdev_t vd = zio->io_vd; 3309* vdev_ops_t ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 3310* boolean_t unexpected_error = B_FALSE; 3311 3312 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { 3313 return (ZIO_PIPELINE_STOP); 3314 } 3315 3316 ASSERT(zio->io_type == ZIO_TYPE_READ \|\| 3317 zio->io_type == ZIO_TYPE_WRITE \|\| zio->io_type == ZIO_TYPE_FREE); 3318 3319 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3320 (zio->io_type == ZIO_TYPE_READ \|\| zio->io_type == ZIO_TYPE_WRITE \|\| 3321 zio->io_type == ZIO_TYPE_FREE)) { 3322 3323 if (zio->io_type == ZIO_TYPE_WRITE && 3324 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 3325 trim_map_write_done(zio); 3326 3327 vdev_queue_io_done(zio); 3328 3329 if (zio->io_type == ZIO_TYPE_WRITE) 3330 vdev_cache_write(zio); 3331 3332 if (zio_injection_enabled && zio->io_error == 0) 3333 zio->io_error = zio_handle_device_injection(vd, 3334 zio, EIO); 3335 3336 if (zio_injection_enabled && zio->io_error == 0) 3337 zio->io_error = zio_handle_label_injection(zio, EIO); 3338 3339 if (zio->io_error) { 3340 if (zio->io_error == ENOTSUP && 3341 zio->io_type == ZIO_TYPE_FREE) { 3342 /* Not all devices support TRIM. / 3343* } else if (!vdev_accessible(vd, zio)) { 3344 zio->io_error = SET_ERROR(ENXIO); 3345 } else { 3346 unexpected_error = B_TRUE; 3347 } 3348 } 3349 } 3350 3351 ops->vdev_op_io_done(zio); 3352 3353 if (unexpected_error) 3354 VERIFY(vdev_probe(vd, zio) == NULL); 3355 3356 return (ZIO_PIPELINE_CONTINUE); 3357} 3358 3359/*
	3360 * This function is used to change the priority of an existing zio that is 3361 * currently in-flight. This is used by the arc to upgrade priority in the 3362 * event that a demand read is made for a block that is currently queued 3363 * as a scrub or async read IO. Otherwise, the high priority read request 3364 * would end up having to wait for the lower priority IO. 3365 / 3366void 3367zio_change_priority(zio_t pio, zio_priority_t priority) 3368{ 3369 zio_t cio, cio_next; 3370 zio_link_t zl = NULL; 3371* 3372 ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); 3373 3374 if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { 3375 vdev_queue_change_io_priority(pio, priority); 3376 } else { 3377 pio->io_priority = priority; 3378 } 3379 3380 mutex_enter(&pio->io_lock); 3381 for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { 3382 cio_next = zio_walk_children(pio, &zl); 3383 zio_change_priority(cio, priority); 3384 } 3385 mutex_exit(&pio->io_lock); 3386} 3387 3388/*
3356 * For non-raidz ZIOs, we can just copy aside the bad data read from the 3357 * disk, and use that to finish the checksum ereport later. 3358 / 3359static void 3360zio_vsd_default_cksum_finish(zio_cksum_report_t zcr, 3361 const void good_buf) 3362{ 3363* /* no processing needed / 3364* zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 3365} 3366 3367/ARGSUSED/ 3368void 3369zio_vsd_default_cksum_report(zio_t zio, zio_cksum_report_t zcr, void ignored) 3370{ 3371* void buf = zio_buf_alloc(zio->io_size); 3372* 3373 abd_copy_to_buf(buf, zio->io_abd, zio->io_size); 3374 3375 zcr->zcr_cbinfo = zio->io_size; 3376 zcr->zcr_cbdata = buf; 3377 zcr->zcr_finish = zio_vsd_default_cksum_finish; 3378 zcr->zcr_free = zio_buf_free; 3379} 3380 3381static int 3382zio_vdev_io_assess(zio_t zio) 3383{ 3384* vdev_t vd = zio->io_vd; 3385* 3386 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { 3387 return (ZIO_PIPELINE_STOP); 3388 } 3389 3390 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3391 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 3392 3393 if (zio->io_vsd != NULL) { 3394 zio->io_vsd_ops->vsd_free(zio); 3395 zio->io_vsd = NULL; 3396 } 3397 3398 if (zio_injection_enabled && zio->io_error == 0) 3399 zio->io_error = zio_handle_fault_injection(zio, EIO); 3400 3401 if (zio->io_type == ZIO_TYPE_FREE && 3402 zio->io_priority != ZIO_PRIORITY_NOW) { 3403 switch (zio->io_error) { 3404 case 0: 3405 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 3406 ZIO_TRIM_STAT_BUMP(success); 3407 break; 3408 case EOPNOTSUPP: 3409 ZIO_TRIM_STAT_BUMP(unsupported); 3410 break; 3411 default: 3412 ZIO_TRIM_STAT_BUMP(failed); 3413 break; 3414 } 3415 } 3416 3417 /* 3418 * If the I/O failed, determine whether we should attempt to retry it. 3419 * 3420 * On retry, we cut in line in the issue queue, since we don't want 3421 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 
3422 / 3423* if (zio->io_error && vd == NULL && 3424 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY \| ZIO_FLAG_IO_RETRY))) { 3425 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf / 3426* ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf / 3427* zio->io_error = 0; 3428 zio->io_flags \|= ZIO_FLAG_IO_RETRY \| 3429 ZIO_FLAG_DONT_CACHE \| ZIO_FLAG_DONT_AGGREGATE; 3430 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 3431 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 3432 zio_requeue_io_start_cut_in_line); 3433 return (ZIO_PIPELINE_STOP); 3434 } 3435 3436 /* 3437 * If we got an error on a leaf device, convert it to ENXIO 3438 * if the device is not accessible at all. 3439 / 3440* if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 3441 !vdev_accessible(vd, zio)) 3442 zio->io_error = SET_ERROR(ENXIO); 3443 3444 /* 3445 * If we can't write to an interior vdev (mirror or RAID-Z), 3446 * set vdev_cant_write so that we stop trying to allocate from it. 3447 / 3448* if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 3449 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 3450 vd->vdev_cant_write = B_TRUE; 3451 } 3452 3453 /* 3454 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future 3455 * attempts will ever succeed. In this case we set a persistent bit so 3456 * that we don't bother with it in the future. 
3457 / 3458* if ((zio->io_error == ENOTSUP \|\| zio->io_error == ENOTTY) && 3459 zio->io_type == ZIO_TYPE_IOCTL && 3460 zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) 3461 vd->vdev_nowritecache = B_TRUE; 3462 3463 if (zio->io_error) 3464 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3465 3466 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3467 zio->io_physdone != NULL) { 3468 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 3469 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 3470 zio->io_physdone(zio->io_logical); 3471 } 3472 3473 return (ZIO_PIPELINE_CONTINUE); 3474} 3475 3476void 3477zio_vdev_io_reissue(zio_t zio) 3478{ 3479* ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3480 ASSERT(zio->io_error == 0); 3481 3482 zio->io_stage >>= 1; 3483} 3484 3485void 3486zio_vdev_io_redone(zio_t zio) 3487{ 3488* ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 3489 3490 zio->io_stage >>= 1; 3491} 3492 3493void 3494zio_vdev_io_bypass(zio_t zio) 3495{ 3496* ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3497 ASSERT(zio->io_error == 0); 3498 3499 zio->io_flags \|= ZIO_FLAG_IO_BYPASS; 3500 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 3501} 3502 3503/* 3504 * ========================================================================== 3505 * Generate and verify checksums 3506 * ========================================================================== 3507 / 3508static int 3509zio_checksum_generate(zio_t zio) 3510{ 3511 blkptr_t bp = zio->io_bp; 3512* enum zio_checksum checksum; 3513 3514 if (bp == NULL) { 3515 /* 3516 * This is zio_write_phys(). 3517 * We're either generating a label checksum, or none at all. 
3518 / 3519* checksum = zio->io_prop.zp_checksum; 3520 3521 if (checksum == ZIO_CHECKSUM_OFF) 3522 return (ZIO_PIPELINE_CONTINUE); 3523 3524 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3525 } else { 3526 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3527 ASSERT(!IO_IS_ALLOCATING(zio)); 3528 checksum = ZIO_CHECKSUM_GANG_HEADER; 3529 } else { 3530 checksum = BP_GET_CHECKSUM(bp); 3531 } 3532 } 3533 3534 zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); 3535 3536 return (ZIO_PIPELINE_CONTINUE); 3537} 3538 3539static int 3540zio_checksum_verify(zio_t zio) 3541{ 3542* zio_bad_cksum_t info; 3543 blkptr_t bp = zio->io_bp; 3544* int error; 3545 3546 ASSERT(zio->io_vd != NULL); 3547 3548 if (bp == NULL) { 3549 /* 3550 * This is zio_read_phys(). 3551 * We're either verifying a label checksum, or nothing at all. 3552 / 3553* if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3554 return (ZIO_PIPELINE_CONTINUE); 3555 3556 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3557 } 3558 3559 if ((error = zio_checksum_error(zio, &info)) != 0) { 3560 zio->io_error = error; 3561 if (error == ECKSUM && 3562 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3563 zfs_ereport_start_checksum(zio->io_spa, 3564 zio->io_vd, zio, zio->io_offset, 3565 zio->io_size, NULL, &info); 3566 } 3567 } 3568 3569 return (ZIO_PIPELINE_CONTINUE); 3570} 3571 3572/* 3573 * Called by RAID-Z to ensure we don't compute the checksum twice. 3574 / 3575void 3576zio_checksum_verified(zio_t zio) 3577{ 3578 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3579} 3580 3581/* 3582 * ========================================================================== 3583 * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3584 * An error of 0 indicates success. ENXIO indicates whole-device failure, 3585 * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 3586 * indicate errors that are specific to one I/O, and most likely permanent. 
3587 * Any other error is presumed to be worse because we weren't expecting it. 3588 * ========================================================================== 3589 / 3590int 3591zio_worst_error(int e1, int e2) 3592{ 3593* static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3594 int r1, r2; 3595 3596 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3597 if (e1 == zio_error_rank[r1]) 3598 break; 3599 3600 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3601 if (e2 == zio_error_rank[r2]) 3602 break; 3603 3604 return (r1 > r2 ? e1 : e2); 3605} 3606 3607/* 3608 * ========================================================================== 3609 * I/O completion 3610 * ========================================================================== 3611 / 3612static int 3613zio_ready(zio_t zio) 3614{ 3615 blkptr_t bp = zio->io_bp; 3616* zio_t pio, pio_next; 3617 zio_link_t zl = NULL; 3618* 3619 if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT \| ZIO_CHILD_DDT_BIT, 3620 ZIO_WAIT_READY)) { 3621 return (ZIO_PIPELINE_STOP); 3622 } 3623 3624 if (zio->io_ready) { 3625 ASSERT(IO_IS_ALLOCATING(zio)); 3626 ASSERT(bp->blk_birth == zio->io_txg \|\| BP_IS_HOLE(bp) \|\| 3627 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3628 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3629 3630 zio->io_ready(zio); 3631 } 3632 3633 if (bp != NULL && bp != &zio->io_bp_copy) 3634 zio->io_bp_copy = bp; 3635* 3636 if (zio->io_error != 0) { 3637 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3638 3639 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 3640 ASSERT(IO_IS_ALLOCATING(zio)); 3641 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 3642 /* 3643 * We were unable to allocate anything, unreserve and 3644 * issue the next I/O to allocate. 
3645 / 3646* metaslab_class_throttle_unreserve( 3647 spa_normal_class(zio->io_spa), 3648 zio->io_prop.zp_copies, zio); 3649 zio_allocate_dispatch(zio->io_spa); 3650 } 3651 } 3652 3653 mutex_enter(&zio->io_lock); 3654 zio->io_state[ZIO_WAIT_READY] = 1; 3655 pio = zio_walk_parents(zio, &zl); 3656 mutex_exit(&zio->io_lock); 3657 3658 /* 3659 * As we notify zio's parents, new parents could be added. 3660 * New parents go to the head of zio's io_parent_list, however, 3661 * so we will (correctly) not notify them. The remainder of zio's 3662 * io_parent_list, from 'pio_next' onward, cannot change because 3663 * all parents must wait for us to be done before they can be done. 3664 / 3665* for (; pio != NULL; pio = pio_next) { 3666 pio_next = zio_walk_parents(zio, &zl); 3667 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3668 } 3669 3670 if (zio->io_flags & ZIO_FLAG_NODATA) { 3671 if (BP_IS_GANG(bp)) { 3672 zio->io_flags &= ~ZIO_FLAG_NODATA; 3673 } else { 3674 ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); 3675 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3676 } 3677 } 3678 3679 if (zio_injection_enabled && 3680 zio->io_spa->spa_syncing_txg == zio->io_txg) 3681 zio_handle_ignored_writes(zio); 3682 3683 return (ZIO_PIPELINE_CONTINUE); 3684} 3685 3686/* 3687 * Update the allocation throttle accounting. 
3688 / 3689static void 3690zio_dva_throttle_done(zio_t zio) 3691{ 3692 zio_t lio = zio->io_logical; 3693* zio_t pio = zio_unique_parent(zio); 3694* vdev_t vd = zio->io_vd; 3695* int flags = METASLAB_ASYNC_ALLOC; 3696 3697 ASSERT3P(zio->io_bp, !=, NULL); 3698 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3699 ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); 3700 ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 3701 ASSERT(vd != NULL); 3702 ASSERT3P(vd, ==, vd->vdev_top); 3703 ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR \| ZIO_FLAG_IO_RETRY))); 3704 ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); 3705 ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); 3706 ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); 3707 3708 /* 3709 * Parents of gang children can have two flavors -- ones that 3710 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) 3711 * and ones that allocated the constituent blocks. The allocation 3712 * throttle needs to know the allocating parent zio so we must find 3713 * it here. 3714 / 3715* if (pio->io_child_type == ZIO_CHILD_GANG) { 3716 /* 3717 * If our parent is a rewrite gang child then our grandparent 3718 * would have been the one that performed the allocation. 3719 / 3720* if (pio->io_flags & ZIO_FLAG_IO_REWRITE) 3721 pio = zio_unique_parent(pio); 3722 flags \|= METASLAB_GANG_CHILD; 3723 } 3724 3725 ASSERT(IO_IS_ALLOCATING(pio)); 3726 ASSERT3P(zio, !=, zio->io_logical); 3727 ASSERT(zio->io_logical != NULL); 3728 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); 3729 ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); 3730 3731 mutex_enter(&pio->io_lock); 3732 metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); 3733 mutex_exit(&pio->io_lock); 3734 3735 metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), 3736 1, pio); 3737 3738 /* 3739 * Call into the pipeline to see if there is more work that 3740 * needs to be done. If there is work to be done it will be 3741 * dispatched to another taskq thread. 
3742 / 3743* zio_allocate_dispatch(zio->io_spa); 3744} 3745 3746static int 3747zio_done(zio_t zio) 3748{ 3749* spa_t spa = zio->io_spa; 3750* zio_t lio = zio->io_logical; 3751* blkptr_t bp = zio->io_bp; 3752* vdev_t vd = zio->io_vd; 3753* uint64_t psize = zio->io_size; 3754 zio_t pio, pio_next; 3755 metaslab_class_t mc = spa_normal_class(spa); 3756* zio_link_t zl = NULL; 3757* 3758 /* 3759 * If our children haven't all completed, 3760 * wait for them and then repeat this pipeline stage. 3761 / 3762* if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { 3763 return (ZIO_PIPELINE_STOP); 3764 } 3765 3766 /* 3767 * If the allocation throttle is enabled, then update the accounting. 3768 * We only track child I/Os that are part of an allocating async 3769 * write. We must do this since the allocation is performed 3770 * by the logical I/O but the actual write is done by child I/Os. 3771 / 3772* if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && 3773 zio->io_child_type == ZIO_CHILD_VDEV) { 3774 ASSERT(mc->mc_alloc_throttle_enabled); 3775 zio_dva_throttle_done(zio); 3776 } 3777 3778 /* 3779 * If the allocation throttle is enabled, verify that 3780 * we have decremented the refcounts for every I/O that was throttled. 
3781 / 3782* if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 3783 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3784 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 3785 ASSERT(bp != NULL); 3786 metaslab_group_alloc_verify(spa, zio->io_bp, zio); 3787 VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); 3788 } 3789 3790 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3791 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3792 ASSERT(zio->io_children[c][w] == 0); 3793 3794 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3795 ASSERT(bp->blk_pad[0] == 0); 3796 ASSERT(bp->blk_pad[1] == 0); 3797 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 \|\| 3798 (bp == zio_unique_parent(zio)->io_bp)); 3799 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3800 zio->io_bp_override == NULL && 3801 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3802 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3803 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3804 ASSERT(BP_COUNT_GANG(bp) == 0 \|\| 3805 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3806 } 3807 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3808 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3809 } 3810 3811 /* 3812 * If there were child vdev/gang/ddt errors, they apply to us now. 3813 / 3814* zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3815 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3816 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3817 3818 /* 3819 * If the I/O on the transformed data was successful, generate any 3820 * checksum reports now while we still have the transformed data. 
3821 / 3822* if (zio->io_error == 0) { 3823 while (zio->io_cksum_report != NULL) { 3824 zio_cksum_report_t zcr = zio->io_cksum_report; 3825* uint64_t align = zcr->zcr_align; 3826 uint64_t asize = P2ROUNDUP(psize, align); 3827 char abuf = NULL; 3828* abd_t adata = zio->io_abd; 3829* 3830 if (asize != psize) { 3831 adata = abd_alloc_linear(asize, B_TRUE); 3832 abd_copy(adata, zio->io_abd, psize); 3833 abd_zero_off(adata, psize, asize - psize); 3834 } 3835 3836 if (adata != NULL) 3837 abuf = abd_borrow_buf_copy(adata, asize); 3838 3839 zio->io_cksum_report = zcr->zcr_next; 3840 zcr->zcr_next = NULL; 3841 zcr->zcr_finish(zcr, abuf); 3842 zfs_ereport_free_checksum(zcr); 3843 3844 if (adata != NULL) 3845 abd_return_buf(adata, abuf, asize); 3846 3847 if (asize != psize) 3848 abd_free(adata); 3849 } 3850 } 3851 3852 zio_pop_transforms(zio); /* note: may set zio->io_error / 3853* 3854 vdev_stat_update(zio, psize); 3855 3856 if (zio->io_error) { 3857 /* 3858 * If this I/O is attached to a particular vdev, 3859 * generate an error message describing the I/O failure 3860 * at the block level. We ignore these errors if the 3861 * device is currently unavailable. 3862 / 3863* if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3864 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3865 3866 if ((zio->io_error == EIO \|\| !(zio->io_flags & 3867 (ZIO_FLAG_SPECULATIVE \| ZIO_FLAG_DONT_PROPAGATE))) && 3868 zio == lio) { 3869 /* 3870 * For logical I/O requests, tell the SPA to log the 3871 * error and generate a logical data ereport. 3872 / 3873* spa_log_error(spa, zio); 3874 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3875 0, 0); 3876 } 3877 } 3878 3879 if (zio->io_error && zio == lio) { 3880 /* 3881 * Determine whether zio should be reexecuted. This will 3882 * propagate all the way to the root via zio_notify_parent(). 
3883 / 3884* ASSERT(vd == NULL && bp != NULL); 3885 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3886 3887 if (IO_IS_ALLOCATING(zio) && 3888 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3889 if (zio->io_error != ENOSPC) 3890 zio->io_reexecute \|= ZIO_REEXECUTE_NOW; 3891 else 3892 zio->io_reexecute \|= ZIO_REEXECUTE_SUSPEND; 3893 } 3894 3895 if ((zio->io_type == ZIO_TYPE_READ \|\| 3896 zio->io_type == ZIO_TYPE_FREE) && 3897 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3898 zio->io_error == ENXIO && 3899 spa_load_state(spa) == SPA_LOAD_NONE && 3900 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3901 zio->io_reexecute \|= ZIO_REEXECUTE_SUSPEND; 3902 3903 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3904 zio->io_reexecute \|= ZIO_REEXECUTE_SUSPEND; 3905 3906 /* 3907 * Here is a possibly good place to attempt to do 3908 * either combinatorial reconstruction or error correction 3909 * based on checksums. It also might be a good place 3910 * to send out preliminary ereports before we suspend 3911 * processing. 3912 / 3913* } 3914 3915 /* 3916 * If there were logical child errors, they apply to us now. 3917 * We defer this until now to avoid conflating logical child 3918 * errors with errors that happened to the zio itself when 3919 * updating vdev stats and reporting FMA events above. 3920 / 3921* zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3922 3923 if ((zio->io_error \|\| zio->io_reexecute) && 3924 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3925 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE \| ZIO_FLAG_NOPWRITE))) 3926 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3927 3928 zio_gang_tree_free(&zio->io_gang_tree); 3929 3930 /* 3931 * Godfather I/Os should never suspend. 3932 / 3933* if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3934 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3935 zio->io_reexecute = 0; 3936 3937 if (zio->io_reexecute) { 3938 /* 3939 * This is a logical I/O that wants to reexecute. 3940 * 3941 * Reexecute is top-down. 
When an i/o fails, if it's not 3942 * the root, it simply notifies its parent and sticks around. 3943 * The parent, seeing that it still has children in zio_done(), 3944 * does the same. This percolates all the way up to the root. 3945 * The root i/o will reexecute or suspend the entire tree. 3946 * 3947 * This approach ensures that zio_reexecute() honors 3948 * all the original i/o dependency relationships, e.g. 3949 * parents not executing until children are ready. 3950 / 3951* ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3952 3953 zio->io_gang_leader = NULL; 3954 3955 mutex_enter(&zio->io_lock); 3956 zio->io_state[ZIO_WAIT_DONE] = 1; 3957 mutex_exit(&zio->io_lock); 3958 3959 /* 3960 * "The Godfather" I/O monitors its children but is 3961 * not a true parent to them. It will track them through 3962 * the pipeline but severs its ties whenever they get into 3963 * trouble (e.g. suspended). This allows "The Godfather" 3964 * I/O to return status without blocking. 3965 / 3966* zl = NULL; 3967 for (pio = zio_walk_parents(zio, &zl); pio != NULL; 3968 pio = pio_next) { 3969 zio_link_t remove_zl = zl; 3970* pio_next = zio_walk_parents(zio, &zl); 3971 3972 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3973 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3974 zio_remove_child(pio, zio, remove_zl); 3975 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3976 } 3977 } 3978 3979 if ((pio = zio_unique_parent(zio)) != NULL) { 3980 /* 3981 * We're not a root i/o, so there's nothing to do 3982 * but notify our parent. Don't propagate errors 3983 * upward since we haven't permanently failed yet. 3984 / 3985* ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3986 zio->io_flags \|= ZIO_FLAG_DONT_PROPAGATE; 3987 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3988 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3989 /* 3990 * We'd fail again if we reexecuted now, so suspend 3991 * until conditions improve (e.g. device comes online). 
3992 / 3993* zio_suspend(spa, zio); 3994 } else { 3995 /* 3996 * Reexecution is potentially a huge amount of work. 3997 * Hand it off to the otherwise-unused claim taskq. 3998 / 3999#if defined(illumos) \|\| !defined(_KERNEL) 4000* ASSERT(zio->io_tqent.tqent_next == NULL); 4001#else 4002 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 4003#endif 4004 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 4005 ZIO_TASKQ_ISSUE, (task_func_t )zio_reexecute, zio, 4006* 0, &zio->io_tqent); 4007 } 4008 return (ZIO_PIPELINE_STOP); 4009 } 4010 4011 ASSERT(zio->io_child_count == 0); 4012 ASSERT(zio->io_reexecute == 0); 4013 ASSERT(zio->io_error == 0 \|\| (zio->io_flags & ZIO_FLAG_CANFAIL)); 4014 4015 /* 4016 * Report any checksum errors, since the I/O is complete. 4017 / 4018* while (zio->io_cksum_report != NULL) { 4019 zio_cksum_report_t zcr = zio->io_cksum_report; 4020* zio->io_cksum_report = zcr->zcr_next; 4021 zcr->zcr_next = NULL; 4022 zcr->zcr_finish(zcr, NULL); 4023 zfs_ereport_free_checksum(zcr); 4024 } 4025 4026 /* 4027 * It is the responsibility of the done callback to ensure that this 4028 * particular zio is no longer discoverable for adoption, and as 4029 * such, cannot acquire any new parents. 
4030 / 4031* if (zio->io_done) 4032 zio->io_done(zio); 4033 4034 mutex_enter(&zio->io_lock); 4035 zio->io_state[ZIO_WAIT_DONE] = 1; 4036 mutex_exit(&zio->io_lock); 4037 4038 zl = NULL; 4039 for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { 4040 zio_link_t remove_zl = zl; 4041* pio_next = zio_walk_parents(zio, &zl); 4042 zio_remove_child(pio, zio, remove_zl); 4043 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 4044 } 4045 4046 if (zio->io_waiter != NULL) { 4047 mutex_enter(&zio->io_lock); 4048 zio->io_executor = NULL; 4049 cv_broadcast(&zio->io_cv); 4050 mutex_exit(&zio->io_lock); 4051 } else { 4052 zio_destroy(zio); 4053 } 4054 4055 return (ZIO_PIPELINE_STOP); 4056} 4057 4058/* 4059 * ========================================================================== 4060 * I/O pipeline definition 4061 * ========================================================================== 4062 / 4063static zio_pipe_stage_t zio_pipeline[] = { 4064 NULL, 4065 zio_read_bp_init, 4066 zio_write_bp_init, 4067 zio_free_bp_init, 4068 zio_issue_async, 4069 zio_write_compress, 4070 zio_checksum_generate, 4071 zio_nop_write, 4072 zio_ddt_read_start, 4073 zio_ddt_read_done, 4074 zio_ddt_write, 4075 zio_ddt_free, 4076 zio_gang_assemble, 4077 zio_gang_issue, 4078 zio_dva_throttle, 4079 zio_dva_allocate, 4080 zio_dva_free, 4081 zio_dva_claim, 4082 zio_ready, 4083 zio_vdev_io_start, 4084 zio_vdev_io_done, 4085 zio_vdev_io_assess, 4086 zio_checksum_verify, 4087 zio_done 4088}; 4089 4090 4091 4092 4093/* 4094 * Compare two zbookmark_phys_t's to see which we would reach first in a 4095 * pre-order traversal of the object tree. 4096 * 4097 * This is simple in every case aside from the meta-dnode object. For all other 4098 * objects, we traverse them in order (object 1 before object 2, and so on). 4099 * However, all of these objects are traversed while traversing object 0, since 4100 * the data it points to is the list of objects. 
Thus, we need to convert to a 4101 * canonical representation so we can compare meta-dnode bookmarks to 4102 * non-meta-dnode bookmarks. 4103 * 4104 * We do this by calculating "equivalents" for each field of the zbookmark. 4105 * zbookmarks outside of the meta-dnode use their own object and level, and 4106 * calculate the level 0 equivalent (the first L0 blkid that is contained in the 4107 * blocks this bookmark refers to) by multiplying their blkid by their span 4108 * (the number of L0 blocks contained within one block at their level). 4109 * zbookmarks inside the meta-dnode calculate their object equivalent 4110 * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use 4111 * level + 1<<31 (any value larger than a level could ever be) for their level. 4112 * This causes them to always compare before a bookmark in their object 4113 * equivalent, compare appropriately to bookmarks in other objects, and to 4114 * compare appropriately to other bookmarks in the meta-dnode. 4115 / 4116int 4117zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, 4118* const zbookmark_phys_t zb1, const zbookmark_phys_t zb2) 4119{ 4120 /* 4121 * These variables represent the "equivalent" values for the zbookmark, 4122 * after converting zbookmarks inside the meta dnode to their 4123 * normal-object equivalents. 4124 / 4125* uint64_t zb1obj, zb2obj; 4126 uint64_t zb1L0, zb2L0; 4127 uint64_t zb1level, zb2level; 4128 4129 if (zb1->zb_object == zb2->zb_object && 4130 zb1->zb_level == zb2->zb_level && 4131 zb1->zb_blkid == zb2->zb_blkid) 4132 return (0); 4133 4134 /* 4135 * BP_SPANB calculates the span in blocks. 
4136 / 4137* zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); 4138 zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); 4139 4140 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 4141 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 4142 zb1L0 = 0; 4143 zb1level = zb1->zb_level + COMPARE_META_LEVEL; 4144 } else { 4145 zb1obj = zb1->zb_object; 4146 zb1level = zb1->zb_level; 4147 } 4148 4149 if (zb2->zb_object == DMU_META_DNODE_OBJECT) { 4150 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 4151 zb2L0 = 0; 4152 zb2level = zb2->zb_level + COMPARE_META_LEVEL; 4153 } else { 4154 zb2obj = zb2->zb_object; 4155 zb2level = zb2->zb_level; 4156 } 4157 4158 /* Now that we have a canonical representation, do the comparison. / 4159* if (zb1obj != zb2obj) 4160 return (zb1obj < zb2obj ? -1 : 1); 4161 else if (zb1L0 != zb2L0) 4162 return (zb1L0 < zb2L0 ? -1 : 1); 4163 else if (zb1level != zb2level) 4164 return (zb1level > zb2level ? -1 : 1); 4165 /* 4166 * This can (theoretically) happen if the bookmarks have the same object 4167 * and level, but different blkids, if the block sizes are not the same. 4168 * There is presently no way to change the indirect block sizes 4169 / 4170* return (0); 4171} 4172 4173/* 4174 * This function checks the following: given that last_block is the place that 4175 * our traversal stopped last time, does that guarantee that we've visited 4176 * every node under subtree_root? Therefore, we can't just use the raw output 4177 * of zbookmark_compare. We have to pass in a modified version of 4178 * subtree_root; by incrementing the block id, and then checking whether 4179 * last_block is before or equal to that, we can tell whether or not having 4180 * visited last_block implies that all of subtree_root's children have been 4181 * visited. 
4182 / 4183boolean_t 4184zbookmark_subtree_completed(const dnode_phys_t dnp, 4185 const zbookmark_phys_t subtree_root, const zbookmark_phys_t last_block) 4186{ 4187 zbookmark_phys_t mod_zb = subtree_root; 4188* mod_zb.zb_blkid++; 4189 ASSERT(last_block->zb_level == 0); 4190 4191 /* The objset_phys_t isn't before anything. / 4192* if (dnp == NULL) 4193 return (B_FALSE); 4194 4195 /* 4196 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the 4197 * data block size in sectors, because that variable is only used if 4198 * the bookmark refers to a block in the meta-dnode. Since we don't 4199 * know without examining it what object it refers to, and there's no 4200 * harm in passing in this value in other cases, we always pass it in. 4201 * 4202 * We pass in 0 for the indirect block size shift because zb2 must be 4203 * level 0. The indirect block size is only used to calculate the span 4204 * of the bookmark, but since the bookmark must be level 0, the span is 4205 * always 1, so the math works out. 4206 * 4207 * If you make changes to how the zbookmark_compare code works, be sure 4208 * to make sure that this code still works afterwards. 4209 / 4210* return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 4211 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, 4212 last_block) <= 0); 4213}	3389 * For non-raidz ZIOs, we can just copy aside the bad data read from the 3390 * disk, and use that to finish the checksum ereport later. 
3391 / 3392static void 3393zio_vsd_default_cksum_finish(zio_cksum_report_t zcr, 3394 const void good_buf) 3395{ 3396* /* no processing needed / 3397* zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 3398} 3399 3400/ARGSUSED/ 3401void 3402zio_vsd_default_cksum_report(zio_t zio, zio_cksum_report_t zcr, void ignored) 3403{ 3404* void buf = zio_buf_alloc(zio->io_size); 3405* 3406 abd_copy_to_buf(buf, zio->io_abd, zio->io_size); 3407 3408 zcr->zcr_cbinfo = zio->io_size; 3409 zcr->zcr_cbdata = buf; 3410 zcr->zcr_finish = zio_vsd_default_cksum_finish; 3411 zcr->zcr_free = zio_buf_free; 3412} 3413 3414static int 3415zio_vdev_io_assess(zio_t zio) 3416{ 3417* vdev_t vd = zio->io_vd; 3418* 3419 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { 3420 return (ZIO_PIPELINE_STOP); 3421 } 3422 3423 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3424 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 3425 3426 if (zio->io_vsd != NULL) { 3427 zio->io_vsd_ops->vsd_free(zio); 3428 zio->io_vsd = NULL; 3429 } 3430 3431 if (zio_injection_enabled && zio->io_error == 0) 3432 zio->io_error = zio_handle_fault_injection(zio, EIO); 3433 3434 if (zio->io_type == ZIO_TYPE_FREE && 3435 zio->io_priority != ZIO_PRIORITY_NOW) { 3436 switch (zio->io_error) { 3437 case 0: 3438 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 3439 ZIO_TRIM_STAT_BUMP(success); 3440 break; 3441 case EOPNOTSUPP: 3442 ZIO_TRIM_STAT_BUMP(unsupported); 3443 break; 3444 default: 3445 ZIO_TRIM_STAT_BUMP(failed); 3446 break; 3447 } 3448 } 3449 3450 /* 3451 * If the I/O failed, determine whether we should attempt to retry it. 3452 * 3453 * On retry, we cut in line in the issue queue, since we don't want 3454 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 
3455 / 3456* if (zio->io_error && vd == NULL && 3457 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY \| ZIO_FLAG_IO_RETRY))) { 3458 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf / 3459* ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf / 3460* zio->io_error = 0; 3461 zio->io_flags \|= ZIO_FLAG_IO_RETRY \| 3462 ZIO_FLAG_DONT_CACHE \| ZIO_FLAG_DONT_AGGREGATE; 3463 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 3464 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 3465 zio_requeue_io_start_cut_in_line); 3466 return (ZIO_PIPELINE_STOP); 3467 } 3468 3469 /* 3470 * If we got an error on a leaf device, convert it to ENXIO 3471 * if the device is not accessible at all. 3472 / 3473* if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 3474 !vdev_accessible(vd, zio)) 3475 zio->io_error = SET_ERROR(ENXIO); 3476 3477 /* 3478 * If we can't write to an interior vdev (mirror or RAID-Z), 3479 * set vdev_cant_write so that we stop trying to allocate from it. 3480 / 3481* if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 3482 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 3483 vd->vdev_cant_write = B_TRUE; 3484 } 3485 3486 /* 3487 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future 3488 * attempts will ever succeed. In this case we set a persistent bit so 3489 * that we don't bother with it in the future. 
3490 / 3491* if ((zio->io_error == ENOTSUP \|\| zio->io_error == ENOTTY) && 3492 zio->io_type == ZIO_TYPE_IOCTL && 3493 zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) 3494 vd->vdev_nowritecache = B_TRUE; 3495 3496 if (zio->io_error) 3497 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3498 3499 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3500 zio->io_physdone != NULL) { 3501 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 3502 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 3503 zio->io_physdone(zio->io_logical); 3504 } 3505 3506 return (ZIO_PIPELINE_CONTINUE); 3507} 3508 3509void 3510zio_vdev_io_reissue(zio_t zio) 3511{ 3512* ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3513 ASSERT(zio->io_error == 0); 3514 3515 zio->io_stage >>= 1; 3516} 3517 3518void 3519zio_vdev_io_redone(zio_t zio) 3520{ 3521* ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 3522 3523 zio->io_stage >>= 1; 3524} 3525 3526void 3527zio_vdev_io_bypass(zio_t zio) 3528{ 3529* ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3530 ASSERT(zio->io_error == 0); 3531 3532 zio->io_flags \|= ZIO_FLAG_IO_BYPASS; 3533 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 3534} 3535 3536/* 3537 * ========================================================================== 3538 * Generate and verify checksums 3539 * ========================================================================== 3540 / 3541static int 3542zio_checksum_generate(zio_t zio) 3543{ 3544 blkptr_t bp = zio->io_bp; 3545* enum zio_checksum checksum; 3546 3547 if (bp == NULL) { 3548 /* 3549 * This is zio_write_phys(). 3550 * We're either generating a label checksum, or none at all. 
3551 / 3552* checksum = zio->io_prop.zp_checksum; 3553 3554 if (checksum == ZIO_CHECKSUM_OFF) 3555 return (ZIO_PIPELINE_CONTINUE); 3556 3557 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3558 } else { 3559 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3560 ASSERT(!IO_IS_ALLOCATING(zio)); 3561 checksum = ZIO_CHECKSUM_GANG_HEADER; 3562 } else { 3563 checksum = BP_GET_CHECKSUM(bp); 3564 } 3565 } 3566 3567 zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); 3568 3569 return (ZIO_PIPELINE_CONTINUE); 3570} 3571 3572static int 3573zio_checksum_verify(zio_t zio) 3574{ 3575* zio_bad_cksum_t info; 3576 blkptr_t bp = zio->io_bp; 3577* int error; 3578 3579 ASSERT(zio->io_vd != NULL); 3580 3581 if (bp == NULL) { 3582 /* 3583 * This is zio_read_phys(). 3584 * We're either verifying a label checksum, or nothing at all. 3585 / 3586* if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3587 return (ZIO_PIPELINE_CONTINUE); 3588 3589 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3590 } 3591 3592 if ((error = zio_checksum_error(zio, &info)) != 0) { 3593 zio->io_error = error; 3594 if (error == ECKSUM && 3595 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3596 zfs_ereport_start_checksum(zio->io_spa, 3597 zio->io_vd, zio, zio->io_offset, 3598 zio->io_size, NULL, &info); 3599 } 3600 } 3601 3602 return (ZIO_PIPELINE_CONTINUE); 3603} 3604 3605/* 3606 * Called by RAID-Z to ensure we don't compute the checksum twice. 3607 / 3608void 3609zio_checksum_verified(zio_t zio) 3610{ 3611 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3612} 3613 3614/* 3615 * ========================================================================== 3616 * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3617 * An error of 0 indicates success. ENXIO indicates whole-device failure, 3618 * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 3619 * indicate errors that are specific to one I/O, and most likely permanent. 
3620 * Any other error is presumed to be worse because we weren't expecting it. 3621 * ========================================================================== 3622 / 3623int 3624zio_worst_error(int e1, int e2) 3625{ 3626* static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3627 int r1, r2; 3628 3629 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3630 if (e1 == zio_error_rank[r1]) 3631 break; 3632 3633 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3634 if (e2 == zio_error_rank[r2]) 3635 break; 3636 3637 return (r1 > r2 ? e1 : e2); 3638} 3639 3640/* 3641 * ========================================================================== 3642 * I/O completion 3643 * ========================================================================== 3644 / 3645static int 3646zio_ready(zio_t zio) 3647{ 3648 blkptr_t bp = zio->io_bp; 3649* zio_t pio, pio_next; 3650 zio_link_t zl = NULL; 3651* 3652 if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT \| ZIO_CHILD_DDT_BIT, 3653 ZIO_WAIT_READY)) { 3654 return (ZIO_PIPELINE_STOP); 3655 } 3656 3657 if (zio->io_ready) { 3658 ASSERT(IO_IS_ALLOCATING(zio)); 3659 ASSERT(bp->blk_birth == zio->io_txg \|\| BP_IS_HOLE(bp) \|\| 3660 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3661 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3662 3663 zio->io_ready(zio); 3664 } 3665 3666 if (bp != NULL && bp != &zio->io_bp_copy) 3667 zio->io_bp_copy = bp; 3668* 3669 if (zio->io_error != 0) { 3670 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3671 3672 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 3673 ASSERT(IO_IS_ALLOCATING(zio)); 3674 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 3675 /* 3676 * We were unable to allocate anything, unreserve and 3677 * issue the next I/O to allocate. 
3678 / 3679* metaslab_class_throttle_unreserve( 3680 spa_normal_class(zio->io_spa), 3681 zio->io_prop.zp_copies, zio); 3682 zio_allocate_dispatch(zio->io_spa); 3683 } 3684 } 3685 3686 mutex_enter(&zio->io_lock); 3687 zio->io_state[ZIO_WAIT_READY] = 1; 3688 pio = zio_walk_parents(zio, &zl); 3689 mutex_exit(&zio->io_lock); 3690 3691 /* 3692 * As we notify zio's parents, new parents could be added. 3693 * New parents go to the head of zio's io_parent_list, however, 3694 * so we will (correctly) not notify them. The remainder of zio's 3695 * io_parent_list, from 'pio_next' onward, cannot change because 3696 * all parents must wait for us to be done before they can be done. 3697 / 3698* for (; pio != NULL; pio = pio_next) { 3699 pio_next = zio_walk_parents(zio, &zl); 3700 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3701 } 3702 3703 if (zio->io_flags & ZIO_FLAG_NODATA) { 3704 if (BP_IS_GANG(bp)) { 3705 zio->io_flags &= ~ZIO_FLAG_NODATA; 3706 } else { 3707 ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); 3708 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3709 } 3710 } 3711 3712 if (zio_injection_enabled && 3713 zio->io_spa->spa_syncing_txg == zio->io_txg) 3714 zio_handle_ignored_writes(zio); 3715 3716 return (ZIO_PIPELINE_CONTINUE); 3717} 3718 3719/* 3720 * Update the allocation throttle accounting. 
3721 / 3722static void 3723zio_dva_throttle_done(zio_t zio) 3724{ 3725 zio_t lio = zio->io_logical; 3726* zio_t pio = zio_unique_parent(zio); 3727* vdev_t vd = zio->io_vd; 3728* int flags = METASLAB_ASYNC_ALLOC; 3729 3730 ASSERT3P(zio->io_bp, !=, NULL); 3731 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3732 ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); 3733 ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 3734 ASSERT(vd != NULL); 3735 ASSERT3P(vd, ==, vd->vdev_top); 3736 ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR \| ZIO_FLAG_IO_RETRY))); 3737 ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); 3738 ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); 3739 ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); 3740 3741 /* 3742 * Parents of gang children can have two flavors -- ones that 3743 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) 3744 * and ones that allocated the constituent blocks. The allocation 3745 * throttle needs to know the allocating parent zio so we must find 3746 * it here. 3747 / 3748* if (pio->io_child_type == ZIO_CHILD_GANG) { 3749 /* 3750 * If our parent is a rewrite gang child then our grandparent 3751 * would have been the one that performed the allocation. 3752 / 3753* if (pio->io_flags & ZIO_FLAG_IO_REWRITE) 3754 pio = zio_unique_parent(pio); 3755 flags \|= METASLAB_GANG_CHILD; 3756 } 3757 3758 ASSERT(IO_IS_ALLOCATING(pio)); 3759 ASSERT3P(zio, !=, zio->io_logical); 3760 ASSERT(zio->io_logical != NULL); 3761 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); 3762 ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); 3763 3764 mutex_enter(&pio->io_lock); 3765 metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); 3766 mutex_exit(&pio->io_lock); 3767 3768 metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), 3769 1, pio); 3770 3771 /* 3772 * Call into the pipeline to see if there is more work that 3773 * needs to be done. If there is work to be done it will be 3774 * dispatched to another taskq thread. 
3775 / 3776* zio_allocate_dispatch(zio->io_spa); 3777} 3778 3779static int 3780zio_done(zio_t zio) 3781{ 3782* spa_t spa = zio->io_spa; 3783* zio_t lio = zio->io_logical; 3784* blkptr_t bp = zio->io_bp; 3785* vdev_t vd = zio->io_vd; 3786* uint64_t psize = zio->io_size; 3787 zio_t pio, pio_next; 3788 metaslab_class_t mc = spa_normal_class(spa); 3789* zio_link_t zl = NULL; 3790* 3791 /* 3792 * If our children haven't all completed, 3793 * wait for them and then repeat this pipeline stage. 3794 / 3795* if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { 3796 return (ZIO_PIPELINE_STOP); 3797 } 3798 3799 /* 3800 * If the allocation throttle is enabled, then update the accounting. 3801 * We only track child I/Os that are part of an allocating async 3802 * write. We must do this since the allocation is performed 3803 * by the logical I/O but the actual write is done by child I/Os. 3804 / 3805* if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && 3806 zio->io_child_type == ZIO_CHILD_VDEV) { 3807 ASSERT(mc->mc_alloc_throttle_enabled); 3808 zio_dva_throttle_done(zio); 3809 } 3810 3811 /* 3812 * If the allocation throttle is enabled, verify that 3813 * we have decremented the refcounts for every I/O that was throttled. 
3814 / 3815* if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 3816 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3817 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 3818 ASSERT(bp != NULL); 3819 metaslab_group_alloc_verify(spa, zio->io_bp, zio); 3820 VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); 3821 } 3822 3823 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3824 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3825 ASSERT(zio->io_children[c][w] == 0); 3826 3827 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3828 ASSERT(bp->blk_pad[0] == 0); 3829 ASSERT(bp->blk_pad[1] == 0); 3830 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 \|\| 3831 (bp == zio_unique_parent(zio)->io_bp)); 3832 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3833 zio->io_bp_override == NULL && 3834 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3835 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3836 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3837 ASSERT(BP_COUNT_GANG(bp) == 0 \|\| 3838 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3839 } 3840 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3841 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3842 } 3843 3844 /* 3845 * If there were child vdev/gang/ddt errors, they apply to us now. 3846 / 3847* zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3848 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3849 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3850 3851 /* 3852 * If the I/O on the transformed data was successful, generate any 3853 * checksum reports now while we still have the transformed data. 
3854 / 3855* if (zio->io_error == 0) { 3856 while (zio->io_cksum_report != NULL) { 3857 zio_cksum_report_t zcr = zio->io_cksum_report; 3858* uint64_t align = zcr->zcr_align; 3859 uint64_t asize = P2ROUNDUP(psize, align); 3860 char abuf = NULL; 3861* abd_t adata = zio->io_abd; 3862* 3863 if (asize != psize) { 3864 adata = abd_alloc_linear(asize, B_TRUE); 3865 abd_copy(adata, zio->io_abd, psize); 3866 abd_zero_off(adata, psize, asize - psize); 3867 } 3868 3869 if (adata != NULL) 3870 abuf = abd_borrow_buf_copy(adata, asize); 3871 3872 zio->io_cksum_report = zcr->zcr_next; 3873 zcr->zcr_next = NULL; 3874 zcr->zcr_finish(zcr, abuf); 3875 zfs_ereport_free_checksum(zcr); 3876 3877 if (adata != NULL) 3878 abd_return_buf(adata, abuf, asize); 3879 3880 if (asize != psize) 3881 abd_free(adata); 3882 } 3883 } 3884 3885 zio_pop_transforms(zio); /* note: may set zio->io_error / 3886* 3887 vdev_stat_update(zio, psize); 3888 3889 if (zio->io_error) { 3890 /* 3891 * If this I/O is attached to a particular vdev, 3892 * generate an error message describing the I/O failure 3893 * at the block level. We ignore these errors if the 3894 * device is currently unavailable. 3895 / 3896* if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3897 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3898 3899 if ((zio->io_error == EIO \|\| !(zio->io_flags & 3900 (ZIO_FLAG_SPECULATIVE \| ZIO_FLAG_DONT_PROPAGATE))) && 3901 zio == lio) { 3902 /* 3903 * For logical I/O requests, tell the SPA to log the 3904 * error and generate a logical data ereport. 3905 / 3906* spa_log_error(spa, zio); 3907 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3908 0, 0); 3909 } 3910 } 3911 3912 if (zio->io_error && zio == lio) { 3913 /* 3914 * Determine whether zio should be reexecuted. This will 3915 * propagate all the way to the root via zio_notify_parent(). 
3916 / 3917* ASSERT(vd == NULL && bp != NULL); 3918 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3919 3920 if (IO_IS_ALLOCATING(zio) && 3921 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3922 if (zio->io_error != ENOSPC) 3923 zio->io_reexecute \|= ZIO_REEXECUTE_NOW; 3924 else 3925 zio->io_reexecute \|= ZIO_REEXECUTE_SUSPEND; 3926 } 3927 3928 if ((zio->io_type == ZIO_TYPE_READ \|\| 3929 zio->io_type == ZIO_TYPE_FREE) && 3930 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3931 zio->io_error == ENXIO && 3932 spa_load_state(spa) == SPA_LOAD_NONE && 3933 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3934 zio->io_reexecute \|= ZIO_REEXECUTE_SUSPEND; 3935 3936 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3937 zio->io_reexecute \|= ZIO_REEXECUTE_SUSPEND; 3938 3939 /* 3940 * Here is a possibly good place to attempt to do 3941 * either combinatorial reconstruction or error correction 3942 * based on checksums. It also might be a good place 3943 * to send out preliminary ereports before we suspend 3944 * processing. 3945 / 3946* } 3947 3948 /* 3949 * If there were logical child errors, they apply to us now. 3950 * We defer this until now to avoid conflating logical child 3951 * errors with errors that happened to the zio itself when 3952 * updating vdev stats and reporting FMA events above. 3953 / 3954* zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3955 3956 if ((zio->io_error \|\| zio->io_reexecute) && 3957 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3958 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE \| ZIO_FLAG_NOPWRITE))) 3959 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3960 3961 zio_gang_tree_free(&zio->io_gang_tree); 3962 3963 /* 3964 * Godfather I/Os should never suspend. 3965 / 3966* if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3967 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3968 zio->io_reexecute = 0; 3969 3970 if (zio->io_reexecute) { 3971 /* 3972 * This is a logical I/O that wants to reexecute. 3973 * 3974 * Reexecute is top-down. 
When an i/o fails, if it's not 3975 * the root, it simply notifies its parent and sticks around. 3976 * The parent, seeing that it still has children in zio_done(), 3977 * does the same. This percolates all the way up to the root. 3978 * The root i/o will reexecute or suspend the entire tree. 3979 * 3980 * This approach ensures that zio_reexecute() honors 3981 * all the original i/o dependency relationships, e.g. 3982 * parents not executing until children are ready. 3983 / 3984* ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3985 3986 zio->io_gang_leader = NULL; 3987 3988 mutex_enter(&zio->io_lock); 3989 zio->io_state[ZIO_WAIT_DONE] = 1; 3990 mutex_exit(&zio->io_lock); 3991 3992 /* 3993 * "The Godfather" I/O monitors its children but is 3994 * not a true parent to them. It will track them through 3995 * the pipeline but severs its ties whenever they get into 3996 * trouble (e.g. suspended). This allows "The Godfather" 3997 * I/O to return status without blocking. 3998 / 3999* zl = NULL; 4000 for (pio = zio_walk_parents(zio, &zl); pio != NULL; 4001 pio = pio_next) { 4002 zio_link_t remove_zl = zl; 4003* pio_next = zio_walk_parents(zio, &zl); 4004 4005 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 4006 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 4007 zio_remove_child(pio, zio, remove_zl); 4008 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 4009 } 4010 } 4011 4012 if ((pio = zio_unique_parent(zio)) != NULL) { 4013 /* 4014 * We're not a root i/o, so there's nothing to do 4015 * but notify our parent. Don't propagate errors 4016 * upward since we haven't permanently failed yet. 4017 / 4018* ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 4019 zio->io_flags \|= ZIO_FLAG_DONT_PROPAGATE; 4020 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 4021 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 4022 /* 4023 * We'd fail again if we reexecuted now, so suspend 4024 * until conditions improve (e.g. device comes online). 
4025 / 4026* zio_suspend(spa, zio); 4027 } else { 4028 /* 4029 * Reexecution is potentially a huge amount of work. 4030 * Hand it off to the otherwise-unused claim taskq. 4031 / 4032#if defined(illumos) \|\| !defined(_KERNEL) 4033* ASSERT(zio->io_tqent.tqent_next == NULL); 4034#else 4035 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 4036#endif 4037 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 4038 ZIO_TASKQ_ISSUE, (task_func_t )zio_reexecute, zio, 4039* 0, &zio->io_tqent); 4040 } 4041 return (ZIO_PIPELINE_STOP); 4042 } 4043 4044 ASSERT(zio->io_child_count == 0); 4045 ASSERT(zio->io_reexecute == 0); 4046 ASSERT(zio->io_error == 0 \|\| (zio->io_flags & ZIO_FLAG_CANFAIL)); 4047 4048 /* 4049 * Report any checksum errors, since the I/O is complete. 4050 / 4051* while (zio->io_cksum_report != NULL) { 4052 zio_cksum_report_t zcr = zio->io_cksum_report; 4053* zio->io_cksum_report = zcr->zcr_next; 4054 zcr->zcr_next = NULL; 4055 zcr->zcr_finish(zcr, NULL); 4056 zfs_ereport_free_checksum(zcr); 4057 } 4058 4059 /* 4060 * It is the responsibility of the done callback to ensure that this 4061 * particular zio is no longer discoverable for adoption, and as 4062 * such, cannot acquire any new parents. 
4063 / 4064* if (zio->io_done) 4065 zio->io_done(zio); 4066 4067 mutex_enter(&zio->io_lock); 4068 zio->io_state[ZIO_WAIT_DONE] = 1; 4069 mutex_exit(&zio->io_lock); 4070 4071 zl = NULL; 4072 for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { 4073 zio_link_t remove_zl = zl; 4074* pio_next = zio_walk_parents(zio, &zl); 4075 zio_remove_child(pio, zio, remove_zl); 4076 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 4077 } 4078 4079 if (zio->io_waiter != NULL) { 4080 mutex_enter(&zio->io_lock); 4081 zio->io_executor = NULL; 4082 cv_broadcast(&zio->io_cv); 4083 mutex_exit(&zio->io_lock); 4084 } else { 4085 zio_destroy(zio); 4086 } 4087 4088 return (ZIO_PIPELINE_STOP); 4089} 4090 4091/* 4092 * ========================================================================== 4093 * I/O pipeline definition 4094 * ========================================================================== 4095 / 4096static zio_pipe_stage_t zio_pipeline[] = { 4097 NULL, 4098 zio_read_bp_init, 4099 zio_write_bp_init, 4100 zio_free_bp_init, 4101 zio_issue_async, 4102 zio_write_compress, 4103 zio_checksum_generate, 4104 zio_nop_write, 4105 zio_ddt_read_start, 4106 zio_ddt_read_done, 4107 zio_ddt_write, 4108 zio_ddt_free, 4109 zio_gang_assemble, 4110 zio_gang_issue, 4111 zio_dva_throttle, 4112 zio_dva_allocate, 4113 zio_dva_free, 4114 zio_dva_claim, 4115 zio_ready, 4116 zio_vdev_io_start, 4117 zio_vdev_io_done, 4118 zio_vdev_io_assess, 4119 zio_checksum_verify, 4120 zio_done 4121}; 4122 4123 4124 4125 4126/* 4127 * Compare two zbookmark_phys_t's to see which we would reach first in a 4128 * pre-order traversal of the object tree. 4129 * 4130 * This is simple in every case aside from the meta-dnode object. For all other 4131 * objects, we traverse them in order (object 1 before object 2, and so on). 4132 * However, all of these objects are traversed while traversing object 0, since 4133 * the data it points to is the list of objects. 
Thus, we need to convert to a 4134 * canonical representation so we can compare meta-dnode bookmarks to 4135 * non-meta-dnode bookmarks. 4136 * 4137 * We do this by calculating "equivalents" for each field of the zbookmark. 4138 * zbookmarks outside of the meta-dnode use their own object and level, and 4139 * calculate the level 0 equivalent (the first L0 blkid that is contained in the 4140 * blocks this bookmark refers to) by multiplying their blkid by their span 4141 * (the number of L0 blocks contained within one block at their level). 4142 * zbookmarks inside the meta-dnode calculate their object equivalent 4143 * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use 4144 * level + 1<<31 (any value larger than a level could ever be) for their level. 4145 * This causes them to always compare before a bookmark in their object 4146 * equivalent, compare appropriately to bookmarks in other objects, and to 4147 * compare appropriately to other bookmarks in the meta-dnode. 4148 / 4149int 4150zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, 4151* const zbookmark_phys_t zb1, const zbookmark_phys_t zb2) 4152{ 4153 /* 4154 * These variables represent the "equivalent" values for the zbookmark, 4155 * after converting zbookmarks inside the meta dnode to their 4156 * normal-object equivalents. 4157 / 4158* uint64_t zb1obj, zb2obj; 4159 uint64_t zb1L0, zb2L0; 4160 uint64_t zb1level, zb2level; 4161 4162 if (zb1->zb_object == zb2->zb_object && 4163 zb1->zb_level == zb2->zb_level && 4164 zb1->zb_blkid == zb2->zb_blkid) 4165 return (0); 4166 4167 /* 4168 * BP_SPANB calculates the span in blocks. 
4169 / 4170* zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); 4171 zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); 4172 4173 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 4174 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 4175 zb1L0 = 0; 4176 zb1level = zb1->zb_level + COMPARE_META_LEVEL; 4177 } else { 4178 zb1obj = zb1->zb_object; 4179 zb1level = zb1->zb_level; 4180 } 4181 4182 if (zb2->zb_object == DMU_META_DNODE_OBJECT) { 4183 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 4184 zb2L0 = 0; 4185 zb2level = zb2->zb_level + COMPARE_META_LEVEL; 4186 } else { 4187 zb2obj = zb2->zb_object; 4188 zb2level = zb2->zb_level; 4189 } 4190 4191 /* Now that we have a canonical representation, do the comparison. / 4192* if (zb1obj != zb2obj) 4193 return (zb1obj < zb2obj ? -1 : 1); 4194 else if (zb1L0 != zb2L0) 4195 return (zb1L0 < zb2L0 ? -1 : 1); 4196 else if (zb1level != zb2level) 4197 return (zb1level > zb2level ? -1 : 1); 4198 /* 4199 * This can (theoretically) happen if the bookmarks have the same object 4200 * and level, but different blkids, if the block sizes are not the same. 4201 * There is presently no way to change the indirect block sizes 4202 / 4203* return (0); 4204} 4205 4206/* 4207 * This function checks the following: given that last_block is the place that 4208 * our traversal stopped last time, does that guarantee that we've visited 4209 * every node under subtree_root? Therefore, we can't just use the raw output 4210 * of zbookmark_compare. We have to pass in a modified version of 4211 * subtree_root; by incrementing the block id, and then checking whether 4212 * last_block is before or equal to that, we can tell whether or not having 4213 * visited last_block implies that all of subtree_root's children have been 4214 * visited. 
4215 / 4216boolean_t 4217zbookmark_subtree_completed(const dnode_phys_t dnp, 4218 const zbookmark_phys_t subtree_root, const zbookmark_phys_t last_block) 4219{ 4220 zbookmark_phys_t mod_zb = subtree_root; 4221* mod_zb.zb_blkid++; 4222 ASSERT(last_block->zb_level == 0); 4223 4224 /* The objset_phys_t isn't before anything. / 4225* if (dnp == NULL) 4226 return (B_FALSE); 4227 4228 /* 4229 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the 4230 * data block size in sectors, because that variable is only used if 4231 * the bookmark refers to a block in the meta-dnode. Since we don't 4232 * know without examining it what object it refers to, and there's no 4233 * harm in passing in this value in other cases, we always pass it in. 4234 * 4235 * We pass in 0 for the indirect block size shift because zb2 must be 4236 * level 0. The indirect block size is only used to calculate the span 4237 * of the bookmark, but since the bookmark must be level 0, the span is 4238 * always 1, so the math works out. 4239 * 4240 * If you make changes to how the zbookmark_compare code works, be sure 4241 * to make sure that this code still works afterwards. 4242 / 4243* return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 4244 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, 4245 last_block) <= 0); 4246}