zil.c revision 343983
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23339105Smav * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24296519Smav * Copyright (c) 2014 Integros [integros.com] 25168404Spjd */ 26168404Spjd 27219089Spjd/* Portions Copyright 2010 Robert Milkowski */ 28219089Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/spa.h> 31332547Smav#include <sys/spa_impl.h> 32168404Spjd#include <sys/dmu.h> 33168404Spjd#include <sys/zap.h> 34168404Spjd#include <sys/arc.h> 35168404Spjd#include <sys/stat.h> 36168404Spjd#include <sys/resource.h> 37168404Spjd#include <sys/zil.h> 38168404Spjd#include <sys/zil_impl.h> 39168404Spjd#include <sys/dsl_dataset.h> 40219089Spjd#include <sys/vdev_impl.h> 41168404Spjd#include <sys/dmu_tx.h> 42219089Spjd#include <sys/dsl_pool.h> 43321610Smav#include <sys/abd.h> 44168404Spjd 45168404Spjd/* 46325132Savg * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system 47325132Savg * calls that change the file system. Each itx has enough information to 48325132Savg * be able to replay them after a system crash, power loss, or 49325132Savg * equivalent failure mode. These are stored in memory until either: 50168404Spjd * 51325132Savg * 1. they are committed to the pool by the DMU transaction group 52325132Savg * (txg), at which point they can be discarded; or 53325132Savg * 2. they are committed to the on-disk ZIL for the dataset being 54325132Savg * modified (e.g. due to an fsync, O_DSYNC, or other synchronous 55325132Savg * requirement). 56168404Spjd * 57325132Savg * In the event of a crash or power loss, the itxs contained by each 58325132Savg * dataset's on-disk ZIL will be replayed when that dataset is first 59325132Savg * instantiated (e.g. if the dataset is a normal filesystem, when it is 60325132Savg * first mounted). 61168404Spjd * 62325132Savg * As hinted at above, there is one ZIL per dataset (both the in-memory 63325132Savg * representation, and the on-disk representation). The on-disk format 64325132Savg * consists of 3 parts: 65325132Savg * 66325132Savg * - a single, per-dataset, ZIL header; which points to a chain of 67325132Savg * - zero or more ZIL blocks; each of which contains 68325132Savg * - zero or more ZIL records 69325132Savg * 70325132Savg * A ZIL record holds the information necessary to replay a single 71325132Savg * system call transaction.
A ZIL block can hold many ZIL records, and 72325132Savg * the blocks are chained together, similarly to a singly linked list. 73325132Savg * 74325132Savg * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL 75325132Savg * block in the chain, and the ZIL header points to the first block in 76325132Savg * the chain. 77325132Savg * 78325132Savg * Note, there is not a fixed place in the pool to hold these ZIL 79325132Savg * blocks; they are dynamically allocated and freed as needed from the 80325132Savg * blocks available on the pool, though they can be preferentially 81325132Savg * allocated from a dedicated "log" vdev. 82168404Spjd */ 83168404Spjd 84168404Spjd/* 85325132Savg * This controls the amount of time that a ZIL block (lwb) will remain 86325132Savg * "open" when it isn't "full", and it has a thread waiting for it to be 87325132Savg * committed to stable storage. Please refer to the zil_commit_waiter() 88325132Savg * function (and the comments within it) for more details. 89325132Savg */ 90325132Savgint zfs_commit_timeout_pct = 5; 91325132Savg 92325132Savg/* 93251631Sdelphij * Disable intent logging replay. This global ZIL switch affects all pools. 94168404Spjd */ 95251631Sdelphijint zil_replay_disable = 0; 96168404SpjdSYSCTL_DECL(_vfs_zfs); 97267992ShselaskySYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN, 98219089Spjd &zil_replay_disable, 0, "Disable intent logging replay"); 99168404Spjd 100168404Spjd/* 101168404Spjd * Tunable parameter for debugging or performance analysis. Setting 102168404Spjd * zfs_nocacheflush will cause corruption on power loss if a volatile 103168404Spjd * out-of-order write cache is enabled. 104168404Spjd */ 105168404Spjdboolean_t zfs_nocacheflush = B_FALSE; 106343983SoshogboSYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RWTUN, 107168404Spjd &zfs_nocacheflush, 0, "Disable cache flush"); 108249921Ssmhboolean_t zfs_trim_enabled = B_TRUE; 109249921SsmhSYSCTL_DECL(_vfs_zfs_trim); 110249921SsmhSYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0, 111249921Ssmh "Enable ZFS TRIM"); 112168404Spjd 113315441Smav/* 114315441Smav * Limit SLOG write size per commit executed with synchronous priority. 115321611Smav * Any writes above that will be executed with lower (asynchronous) priority 116321611Smav * to limit potential SLOG device abuse by single active ZIL writer. 
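 *
 * Editorial illustration (restating the check in zil_lwb_write_open()
 * later in this file, not new behavior): an lwb write keeps synchronous
 * priority unless it targets a slog device and the commit has already
 * pushed zl_cur_used past this threshold:
 *
 *	if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
 *		prio = ZIO_PRIORITY_SYNC_WRITE;
 *	else
 *		prio = ZIO_PRIORITY_ASYNC_WRITE;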
117315441Smav */ 118321611Smavuint64_t zil_slog_bulk = 768 * 1024; 119321611SmavSYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN, 120321611Smav &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority"); 121315441Smav 122168404Spjdstatic kmem_cache_t *zil_lwb_cache; 123325132Savgstatic kmem_cache_t *zil_zcw_cache; 124168404Spjd 125219089Spjd#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ 126219089Spjd sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) 127219089Spjd 128168404Spjdstatic int 129219089Spjdzil_bp_compare(const void *x1, const void *x2) 130168404Spjd{ 131219089Spjd const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; 132219089Spjd const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; 133168404Spjd 134339158Smav int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); 135339158Smav if (likely(cmp)) 136339158Smav return (cmp); 137168404Spjd 138339158Smav return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); 139168404Spjd} 140168404Spjd 141168404Spjdstatic void 142219089Spjdzil_bp_tree_init(zilog_t *zilog) 143168404Spjd{ 144219089Spjd avl_create(&zilog->zl_bp_tree, zil_bp_compare, 145219089Spjd sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); 146168404Spjd} 147168404Spjd 148168404Spjdstatic void 149219089Spjdzil_bp_tree_fini(zilog_t *zilog) 150168404Spjd{ 151219089Spjd avl_tree_t *t = &zilog->zl_bp_tree; 152219089Spjd zil_bp_node_t *zn; 153168404Spjd void *cookie = NULL; 154168404Spjd 155168404Spjd while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) 156219089Spjd kmem_free(zn, sizeof (zil_bp_node_t)); 157168404Spjd 158168404Spjd avl_destroy(t); 159168404Spjd} 160168404Spjd 161219089Spjdint 162219089Spjdzil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) 163168404Spjd{ 164219089Spjd avl_tree_t *t = &zilog->zl_bp_tree; 165268075Sdelphij const dva_t *dva; 166219089Spjd zil_bp_node_t *zn; 167168404Spjd avl_index_t where; 168168404Spjd 169268075Sdelphij if (BP_IS_EMBEDDED(bp)) 170268075Sdelphij return (0); 171268075Sdelphij 172268075Sdelphij dva = BP_IDENTITY(bp); 173268075Sdelphij 174168404Spjd if (avl_find(t, dva, &where) != NULL) 175249195Smm return (SET_ERROR(EEXIST)); 176168404Spjd 177219089Spjd zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); 178168404Spjd zn->zn_dva = *dva; 179168404Spjd avl_insert(t, zn, where); 180168404Spjd 181168404Spjd return (0); 182168404Spjd} 183168404Spjd 184168404Spjdstatic zil_header_t * 185168404Spjdzil_header_in_syncing_context(zilog_t *zilog) 186168404Spjd{ 187168404Spjd return ((zil_header_t *)zilog->zl_header); 188168404Spjd} 189168404Spjd 190168404Spjdstatic void 191168404Spjdzil_init_log_chain(zilog_t *zilog, blkptr_t *bp) 192168404Spjd{ 193168404Spjd zio_cksum_t *zc = &bp->blk_cksum; 194168404Spjd 195168404Spjd zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); 196168404Spjd zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); 197168404Spjd zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); 198168404Spjd zc->zc_word[ZIL_ZC_SEQ] = 1ULL; 199168404Spjd} 200168404Spjd 201168404Spjd/* 202219089Spjd * Read a log block and make sure it's valid. 
203168404Spjd */ 204168404Spjdstatic int 205219089Spjdzil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, 206219089Spjd char **end) 207168404Spjd{ 208219089Spjd enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 209275811Sdelphij arc_flags_t aflags = ARC_FLAG_WAIT; 210219089Spjd arc_buf_t *abuf = NULL; 211268123Sdelphij zbookmark_phys_t zb; 212168404Spjd int error; 213168404Spjd 214219089Spjd if (zilog->zl_header->zh_claim_txg == 0) 215219089Spjd zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 216168404Spjd 217219089Spjd if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 218219089Spjd zio_flags |= ZIO_FLAG_SPECULATIVE; 219168404Spjd 220219089Spjd SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], 221219089Spjd ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 222168404Spjd 223246666Smm error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 224219089Spjd ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 225219089Spjd 226168404Spjd if (error == 0) { 227168404Spjd zio_cksum_t cksum = bp->blk_cksum; 228168404Spjd 229168404Spjd /* 230185029Spjd * Validate the checksummed log block. 231185029Spjd * 232168404Spjd * Sequence numbers should be... sequential. The checksum 233168404Spjd * verifier for the next block should be bp's checksum plus 1. 234185029Spjd * 235185029Spjd * Also check the log chain linkage and size used. 236168404Spjd */ 237168404Spjd cksum.zc_word[ZIL_ZC_SEQ]++; 238168404Spjd 239219089Spjd if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 240219089Spjd zil_chain_t *zilc = abuf->b_data; 241219089Spjd char *lr = (char *)(zilc + 1); 242219089Spjd uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); 243219089Spjd 244219089Spjd if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 245219089Spjd sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { 246249195Smm error = SET_ERROR(ECKSUM); 247219089Spjd } else { 248274337Sdelphij ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); 249219089Spjd bcopy(lr, dst, len); 250219089Spjd *end = (char *)dst + len; 251219089Spjd *nbp = zilc->zc_next_blk; 252219089Spjd } 253219089Spjd } else { 254219089Spjd char *lr = abuf->b_data; 255219089Spjd uint64_t size = BP_GET_LSIZE(bp); 256219089Spjd zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; 257219089Spjd 258219089Spjd if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 259219089Spjd sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || 260219089Spjd (zilc->zc_nused > (size - sizeof (*zilc)))) { 261249195Smm error = SET_ERROR(ECKSUM); 262219089Spjd } else { 263274337Sdelphij ASSERT3U(zilc->zc_nused, <=, 264274337Sdelphij SPA_OLD_MAXBLOCKSIZE); 265219089Spjd bcopy(lr, dst, zilc->zc_nused); 266219089Spjd *end = (char *)dst + zilc->zc_nused; 267219089Spjd *nbp = zilc->zc_next_blk; 268219089Spjd } 269185029Spjd } 270168404Spjd 271307265Smav arc_buf_destroy(abuf, &abuf); 272168404Spjd } 273168404Spjd 274219089Spjd return (error); 275219089Spjd} 276168404Spjd 277219089Spjd/* 278219089Spjd * Read a TX_WRITE log data block. 
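 *
 * Note (editorial, describing the code below): a hole in lr_blkptr means
 * there is no data block to read, so the caller's buffer, when one is
 * supplied, is simply zero-filled and success is returned.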
279219089Spjd */ 280219089Spjdstatic int 281219089Spjdzil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) 282219089Spjd{ 283219089Spjd enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 284219089Spjd const blkptr_t *bp = &lr->lr_blkptr; 285275811Sdelphij arc_flags_t aflags = ARC_FLAG_WAIT; 286219089Spjd arc_buf_t *abuf = NULL; 287268123Sdelphij zbookmark_phys_t zb; 288219089Spjd int error; 289219089Spjd 290219089Spjd if (BP_IS_HOLE(bp)) { 291219089Spjd if (wbuf != NULL) 292219089Spjd bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); 293219089Spjd return (0); 294219089Spjd } 295219089Spjd 296219089Spjd if (zilog->zl_header->zh_claim_txg == 0) 297219089Spjd zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 298219089Spjd 299219089Spjd SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, 300219089Spjd ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 301219089Spjd 302246666Smm error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 303219089Spjd ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 304219089Spjd 305219089Spjd if (error == 0) { 306219089Spjd if (wbuf != NULL) 307219089Spjd bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); 308307265Smav arc_buf_destroy(abuf, &abuf); 309219089Spjd } 310219089Spjd 311168404Spjd return (error); 312168404Spjd} 313168404Spjd 314168404Spjd/* 315168404Spjd * Parse the intent log, and call parse_func for each valid record within. 316168404Spjd */ 317219089Spjdint 318168404Spjdzil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, 319168404Spjd zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) 320168404Spjd{ 321168404Spjd const zil_header_t *zh = zilog->zl_header; 322219089Spjd boolean_t claimed = !!zh->zh_claim_txg; 323219089Spjd uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; 324219089Spjd uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; 325219089Spjd uint64_t max_blk_seq = 0; 326219089Spjd uint64_t max_lr_seq = 0; 327219089Spjd uint64_t blk_count = 0; 328219089Spjd uint64_t lr_count = 0; 329219089Spjd blkptr_t blk, next_blk; 330168404Spjd char *lrbuf, *lrp; 331219089Spjd int error = 0; 332168404Spjd 333219089Spjd /* 334219089Spjd * Old logs didn't record the maximum zh_claim_lr_seq. 335219089Spjd */ 336219089Spjd if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 337219089Spjd claim_lr_seq = UINT64_MAX; 338168404Spjd 339168404Spjd /* 340168404Spjd * Starting at the block pointed to by zh_log we read the log chain. 341168404Spjd * For each block in the chain we strongly check that block to 342168404Spjd * ensure its validity. We stop when an invalid block is found. 343168404Spjd * For each block pointer in the chain we call parse_blk_func(). 344168404Spjd * For each record in each valid block we call parse_lr_func(). 345168404Spjd * If the log has been claimed, stop if we encounter a sequence 346168404Spjd * number greater than the highest claimed sequence number. 
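 *
 * A summary of the walk's termination (restating the loop below): when
 * the log has been claimed, claim_blk_seq and claim_lr_seq bound how far
 * we go; when it has not, both bounds are UINT64_MAX, so the walk ends
 * only at a hole, a block that fails validation, or a parse callback
 * error.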
347168404Spjd */ 348274337Sdelphij lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); 349219089Spjd zil_bp_tree_init(zilog); 350168404Spjd 351219089Spjd for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { 352219089Spjd uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; 353219089Spjd int reclen; 354219089Spjd char *end; 355219089Spjd 356219089Spjd if (blk_seq > claim_blk_seq) 357168404Spjd break; 358219089Spjd if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) 359219089Spjd break; 360219089Spjd ASSERT3U(max_blk_seq, <, blk_seq); 361219089Spjd max_blk_seq = blk_seq; 362219089Spjd blk_count++; 363168404Spjd 364219089Spjd if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) 365219089Spjd break; 366168404Spjd 367219089Spjd error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); 368248571Smm if (error != 0) 369168404Spjd break; 370168404Spjd 371219089Spjd for (lrp = lrbuf; lrp < end; lrp += reclen) { 372168404Spjd lr_t *lr = (lr_t *)lrp; 373168404Spjd reclen = lr->lrc_reclen; 374168404Spjd ASSERT3U(reclen, >=, sizeof (lr_t)); 375219089Spjd if (lr->lrc_seq > claim_lr_seq) 376219089Spjd goto done; 377219089Spjd if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) 378219089Spjd goto done; 379219089Spjd ASSERT3U(max_lr_seq, <, lr->lrc_seq); 380219089Spjd max_lr_seq = lr->lrc_seq; 381219089Spjd lr_count++; 382168404Spjd } 383168404Spjd } 384219089Spjddone: 385219089Spjd zilog->zl_parse_error = error; 386219089Spjd zilog->zl_parse_blk_seq = max_blk_seq; 387219089Spjd zilog->zl_parse_lr_seq = max_lr_seq; 388219089Spjd zilog->zl_parse_blk_count = blk_count; 389219089Spjd zilog->zl_parse_lr_count = lr_count; 390168404Spjd 391219089Spjd ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || 392219089Spjd (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); 393219089Spjd 394219089Spjd zil_bp_tree_fini(zilog); 395274337Sdelphij zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); 396219089Spjd 397219089Spjd return (error); 398168404Spjd} 399168404Spjd 400332547Smav/* ARGSUSED */ 401219089Spjdstatic int 402332547Smavzil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) 403332547Smav{ 404332547Smav ASSERT(!BP_IS_HOLE(bp)); 405332547Smav 406332547Smav /* 407332547Smav * As we call this function from the context of a rewind to a 408332547Smav * checkpoint, each ZIL block whose txg is later than the txg 409332547Smav * that we rewind to is invalid. Thus, we return -1 so 410332547Smav * zil_parse() doesn't attempt to read it. 411332547Smav */ 412332547Smav if (bp->blk_birth >= first_txg) 413332547Smav return (-1); 414332547Smav 415332547Smav if (zil_bp_tree_add(zilog, bp) != 0) 416332547Smav return (0); 417332547Smav 418332547Smav zio_free(zilog->zl_spa, first_txg, bp); 419332547Smav return (0); 420332547Smav} 421332547Smav 422332547Smav/* ARGSUSED */ 423332547Smavstatic int 424332547Smavzil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) 425332547Smav{ 426332547Smav return (0); 427332547Smav} 428332547Smav 429332547Smavstatic int 430168404Spjdzil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) 431168404Spjd{ 432168404Spjd /* 433168404Spjd * Claim log block if not already committed and not already claimed. 434219089Spjd * If tx == NULL, just verify that the block is claimable. 
435168404Spjd */ 436260150Sdelphij if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || 437260150Sdelphij zil_bp_tree_add(zilog, bp) != 0) 438219089Spjd return (0); 439219089Spjd 440219089Spjd return (zio_wait(zio_claim(NULL, zilog->zl_spa, 441219089Spjd tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, 442219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); 443168404Spjd} 444168404Spjd 445219089Spjdstatic int 446168404Spjdzil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) 447168404Spjd{ 448219089Spjd lr_write_t *lr = (lr_write_t *)lrc; 449219089Spjd int error; 450219089Spjd 451219089Spjd if (lrc->lrc_txtype != TX_WRITE) 452219089Spjd return (0); 453219089Spjd 454219089Spjd /* 455219089Spjd * If the block is not readable, don't claim it. This can happen 456219089Spjd * in normal operation when a log block is written to disk before 457219089Spjd * some of the dmu_sync() blocks it points to. In this case, the 458219089Spjd * transaction cannot have been committed to anyone (we would have 459219089Spjd * waited for all writes to be stable first), so it is semantically 460219089Spjd * correct to declare this the end of the log. 461219089Spjd */ 462219089Spjd if (lr->lr_blkptr.blk_birth >= first_txg && 463219089Spjd (error = zil_read_log_data(zilog, lr, NULL)) != 0) 464219089Spjd return (error); 465219089Spjd return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); 466168404Spjd} 467168404Spjd 468168404Spjd/* ARGSUSED */ 469219089Spjdstatic int 470168404Spjdzil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) 471168404Spjd{ 472332547Smav zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 473219089Spjd 474219089Spjd return (0); 475168404Spjd} 476168404Spjd 477219089Spjdstatic int 478168404Spjdzil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) 479168404Spjd{ 480219089Spjd lr_write_t *lr = (lr_write_t *)lrc; 481219089Spjd blkptr_t *bp = &lr->lr_blkptr; 482219089Spjd 483168404Spjd /* 484168404Spjd * If we previously claimed it, we need to free it. 
485168404Spjd */ 486219089Spjd if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && 487260150Sdelphij bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && 488260150Sdelphij !BP_IS_HOLE(bp)) 489219089Spjd zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 490219089Spjd 491219089Spjd return (0); 492219089Spjd} 493219089Spjd 494325132Savgstatic int 495325132Savgzil_lwb_vdev_compare(const void *x1, const void *x2) 496325132Savg{ 497325132Savg const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; 498325132Savg const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; 499325132Savg 500339158Smav return (AVL_CMP(v1, v2)); 501325132Savg} 502325132Savg 503219089Spjdstatic lwb_t * 504315441Smavzil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) 505219089Spjd{ 506219089Spjd lwb_t *lwb; 507219089Spjd 508219089Spjd lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); 509219089Spjd lwb->lwb_zilog = zilog; 510219089Spjd lwb->lwb_blk = *bp; 511315441Smav lwb->lwb_slog = slog; 512325132Savg lwb->lwb_state = LWB_STATE_CLOSED; 513219089Spjd lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); 514219089Spjd lwb->lwb_max_txg = txg; 515325132Savg lwb->lwb_write_zio = NULL; 516325132Savg lwb->lwb_root_zio = NULL; 517219089Spjd lwb->lwb_tx = NULL; 518325132Savg lwb->lwb_issued_timestamp = 0; 519219089Spjd if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 520219089Spjd lwb->lwb_nused = sizeof (zil_chain_t); 521219089Spjd lwb->lwb_sz = BP_GET_LSIZE(bp); 522219089Spjd } else { 523219089Spjd lwb->lwb_nused = 0; 524219089Spjd lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); 525168404Spjd } 526219089Spjd 527219089Spjd mutex_enter(&zilog->zl_lock); 528219089Spjd list_insert_tail(&zilog->zl_lwb_list, lwb); 529219089Spjd mutex_exit(&zilog->zl_lock); 530219089Spjd 531325132Savg ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); 532325132Savg ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); 533329486Smav VERIFY(list_is_empty(&lwb->lwb_waiters)); 534325132Savg 535219089Spjd return (lwb); 536168404Spjd} 537168404Spjd 538325132Savgstatic void 539325132Savgzil_free_lwb(zilog_t *zilog, lwb_t *lwb) 540325132Savg{ 541325132Savg ASSERT(MUTEX_HELD(&zilog->zl_lock)); 542325132Savg ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); 543329486Smav VERIFY(list_is_empty(&lwb->lwb_waiters)); 544325132Savg ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); 545325132Savg ASSERT3P(lwb->lwb_write_zio, ==, NULL); 546325132Savg ASSERT3P(lwb->lwb_root_zio, ==, NULL); 547329486Smav ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); 548329486Smav ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || 549329486Smav lwb->lwb_state == LWB_STATE_DONE); 550325132Savg 551325132Savg /* 552325132Savg * Clear the zilog's field to indicate this lwb is no longer 553325132Savg * valid, and prevent use-after-free errors. 554325132Savg */ 555325132Savg if (zilog->zl_last_lwb_opened == lwb) 556325132Savg zilog->zl_last_lwb_opened = NULL; 557325132Savg 558325132Savg kmem_cache_free(zil_lwb_cache, lwb); 559325132Savg} 560325132Savg 561168404Spjd/* 562239620Smm * Called when we create in-memory log transactions so that we know 563239620Smm * to cleanup the itxs at the end of spa_sync(). 
564239620Smm */ 565239620Smmvoid 566239620Smmzilog_dirty(zilog_t *zilog, uint64_t txg) 567239620Smm{ 568239620Smm dsl_pool_t *dp = zilog->zl_dmu_pool; 569239620Smm dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); 570239620Smm 571325132Savg ASSERT(spa_writeable(zilog->zl_spa)); 572325132Savg 573286575Smav if (ds->ds_is_snapshot) 574239620Smm panic("dirtying snapshot!"); 575239620Smm 576248571Smm if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { 577239620Smm /* up the hold count until we can be written out */ 578239620Smm dmu_buf_add_ref(ds->ds_dbuf, zilog); 579325132Savg 580325132Savg zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); 581239620Smm } 582239620Smm} 583239620Smm 584310515Savg/* 585310515Savg * Determine if the zil is dirty in the specified txg. Callers wanting to 586310515Savg * ensure that the dirty state does not change must hold the itxg_lock for 587310515Savg * the specified txg. Holding the lock will ensure that the zil cannot be 588310515Savg * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current 589310515Savg * state. 590310515Savg */ 591239620Smmboolean_t 592310515Savgzilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) 593310515Savg{ 594310515Savg dsl_pool_t *dp = zilog->zl_dmu_pool; 595310515Savg 596310515Savg if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) 597310515Savg return (B_TRUE); 598310515Savg return (B_FALSE); 599310515Savg} 600310515Savg 601310515Savg/* 602310515Savg * Determine if the zil is dirty. The zil is considered dirty if it has 603310515Savg * any pending itx records that have not been cleaned by zil_clean(). 604310515Savg */ 605310515Savgboolean_t 606239620Smmzilog_is_dirty(zilog_t *zilog) 607239620Smm{ 608239620Smm dsl_pool_t *dp = zilog->zl_dmu_pool; 609239620Smm 610239620Smm for (int t = 0; t < TXG_SIZE; t++) { 611239620Smm if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) 612239620Smm return (B_TRUE); 613239620Smm } 614239620Smm return (B_FALSE); 615239620Smm} 616239620Smm 617239620Smm/* 618168404Spjd * Create an on-disk intent log. 619168404Spjd */ 620219089Spjdstatic lwb_t * 621168404Spjdzil_create(zilog_t *zilog) 622168404Spjd{ 623168404Spjd const zil_header_t *zh = zilog->zl_header; 624219089Spjd lwb_t *lwb = NULL; 625168404Spjd uint64_t txg = 0; 626168404Spjd dmu_tx_t *tx = NULL; 627168404Spjd blkptr_t blk; 628168404Spjd int error = 0; 629315441Smav boolean_t slog = FALSE; 630168404Spjd 631168404Spjd /* 632168404Spjd * Wait for any previous destroy to complete. 
633168404Spjd */ 634168404Spjd txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 635168404Spjd 636168404Spjd ASSERT(zh->zh_claim_txg == 0); 637168404Spjd ASSERT(zh->zh_replay_seq == 0); 638168404Spjd 639168404Spjd blk = zh->zh_log; 640168404Spjd 641168404Spjd /* 642219089Spjd * Allocate an initial log block if: 643219089Spjd * - there isn't one already 644219089Spjd * - the existing block is the wrong endianness 645168404Spjd */ 646207908Smm if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { 647168404Spjd tx = dmu_tx_create(zilog->zl_os); 648325132Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 649168404Spjd dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 650168404Spjd txg = dmu_tx_get_txg(tx); 651168404Spjd 652207908Smm if (!BP_IS_HOLE(&blk)) { 653332547Smav zio_free(zilog->zl_spa, txg, &blk); 654207908Smm BP_ZERO(&blk); 655207908Smm } 656207908Smm 657339105Smav error = zio_alloc_zil(zilog->zl_spa, 658339105Smav zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, 659315441Smav ZIL_MIN_BLKSZ, &slog); 660168404Spjd 661168404Spjd if (error == 0) 662168404Spjd zil_init_log_chain(zilog, &blk); 663168404Spjd } 664168404Spjd 665168404Spjd /* 666325132Savg * Allocate a log write block (lwb) for the first log block. 667168404Spjd */ 668219089Spjd if (error == 0) 669315441Smav lwb = zil_alloc_lwb(zilog, &blk, slog, txg); 670168404Spjd 671168404Spjd /* 672168404Spjd * If we just allocated the first log block, commit our transaction 673168404Spjd * and wait for zil_sync() to stuff the block pointer into zh_log. 674168404Spjd * (zh is part of the MOS, so we cannot modify it in open context.) 675168404Spjd */ 676168404Spjd if (tx != NULL) { 677168404Spjd dmu_tx_commit(tx); 678168404Spjd txg_wait_synced(zilog->zl_dmu_pool, txg); 679168404Spjd } 680168404Spjd 681168404Spjd ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); 682219089Spjd 683219089Spjd return (lwb); 684168404Spjd} 685168404Spjd 686168404Spjd/* 687325132Savg * In one tx, free all log blocks and clear the log header. If keep_first 688325132Savg * is set, then we're replaying a log with no content. We want to keep the 689325132Savg * first block, however, so that the first synchronous transaction doesn't 690325132Savg * require a txg_wait_synced() in zil_create(). We don't need to 691325132Savg * txg_wait_synced() here either when keep_first is set, because both 692325132Savg * zil_create() and zil_destroy() will wait for any in-progress destroys 693325132Savg * to complete. 694168404Spjd */ 695168404Spjdvoid 696168404Spjdzil_destroy(zilog_t *zilog, boolean_t keep_first) 697168404Spjd{ 698168404Spjd const zil_header_t *zh = zilog->zl_header; 699168404Spjd lwb_t *lwb; 700168404Spjd dmu_tx_t *tx; 701168404Spjd uint64_t txg; 702168404Spjd 703168404Spjd /* 704168404Spjd * Wait for any previous destroy to complete.
705168404Spjd */ 706168404Spjd txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 707168404Spjd 708219089Spjd zilog->zl_old_header = *zh; /* debugging aid */ 709219089Spjd 710168404Spjd if (BP_IS_HOLE(&zh->zh_log)) 711168404Spjd return; 712168404Spjd 713168404Spjd tx = dmu_tx_create(zilog->zl_os); 714325132Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 715168404Spjd dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 716168404Spjd txg = dmu_tx_get_txg(tx); 717168404Spjd 718168404Spjd mutex_enter(&zilog->zl_lock); 719168404Spjd 720168404Spjd ASSERT3U(zilog->zl_destroy_txg, <, txg); 721168404Spjd zilog->zl_destroy_txg = txg; 722168404Spjd zilog->zl_keep_first = keep_first; 723168404Spjd 724168404Spjd if (!list_is_empty(&zilog->zl_lwb_list)) { 725168404Spjd ASSERT(zh->zh_claim_txg == 0); 726224526Smm VERIFY(!keep_first); 727168404Spjd while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 728168404Spjd list_remove(&zilog->zl_lwb_list, lwb); 729168404Spjd if (lwb->lwb_buf != NULL) 730168404Spjd zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 731325132Savg zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); 732325132Savg zil_free_lwb(zilog, lwb); 733168404Spjd } 734219089Spjd } else if (!keep_first) { 735239620Smm zil_destroy_sync(zilog, tx); 736168404Spjd } 737168404Spjd mutex_exit(&zilog->zl_lock); 738168404Spjd 739168404Spjd dmu_tx_commit(tx); 740185029Spjd} 741168404Spjd 742239620Smmvoid 743239620Smmzil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) 744239620Smm{ 745239620Smm ASSERT(list_is_empty(&zilog->zl_lwb_list)); 746239620Smm (void) zil_parse(zilog, zil_free_log_block, 747239620Smm zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); 748239620Smm} 749239620Smm 750168404Spjdint 751286686Smavzil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) 752168404Spjd{ 753168404Spjd dmu_tx_t *tx = txarg; 754168404Spjd zilog_t *zilog; 755332547Smav uint64_t first_txg; 756168404Spjd zil_header_t *zh; 757168404Spjd objset_t *os; 758168404Spjd int error; 759168404Spjd 760286686Smav error = dmu_objset_own_obj(dp, ds->ds_object, 761286686Smav DMU_OST_ANY, B_FALSE, FTAG, &os); 762248571Smm if (error != 0) { 763271534Sdelphij /* 764271534Sdelphij * EBUSY indicates that the objset is inconsistent, in which 765271534Sdelphij * case it can not have a ZIL. 766271534Sdelphij */ 767271534Sdelphij if (error != EBUSY) { 768286686Smav cmn_err(CE_WARN, "can't open objset for %llu, error %u", 769286686Smav (unsigned long long)ds->ds_object, error); 770271534Sdelphij } 771168404Spjd return (0); 772168404Spjd } 773168404Spjd 774168404Spjd zilog = dmu_objset_zil(os); 775168404Spjd zh = zil_header_in_syncing_context(zilog); 776332547Smav ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); 777332547Smav first_txg = spa_min_claim_txg(zilog->zl_spa); 778168404Spjd 779332547Smav /* 780332547Smav * If the spa_log_state is not set to be cleared, check whether 781332547Smav * the current uberblock is a checkpoint one and if the current 782332547Smav * header has been claimed before moving on. 783332547Smav * 784332547Smav * If the current uberblock is a checkpointed uberblock then 785332547Smav * one of the following scenarios took place: 786332547Smav * 787332547Smav * 1] We are currently rewinding to the checkpoint of the pool. 
788332547Smav * 2] We crashed in the middle of a checkpoint rewind but we 789332547Smav * did manage to write the checkpointed uberblock to the 790332547Smav * vdev labels, so when we tried to import the pool again 791332547Smav * the checkpointed uberblock was selected from the import 792332547Smav * procedure. 793332547Smav * 794332547Smav * In both cases we want to zero out all the ZIL blocks, except 795332547Smav * the ones that have been claimed at the time of the checkpoint 796332547Smav * (their zh_claim_txg != 0). The reason is that these blocks 797332547Smav * may be corrupted since we may have reused their locations on 798332547Smav * disk after we took the checkpoint. 799332547Smav * 800332547Smav * We could try to set spa_log_state to SPA_LOG_CLEAR earlier 801332547Smav * when we first figure out whether the current uberblock is 802332547Smav * checkpointed or not. Unfortunately, that would discard all 803332547Smav * the logs, including the ones that are claimed, and we would 804332547Smav * leak space. 805332547Smav */ 806332547Smav if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || 807332547Smav (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && 808332547Smav zh->zh_claim_txg == 0)) { 809332547Smav if (!BP_IS_HOLE(&zh->zh_log)) { 810332547Smav (void) zil_parse(zilog, zil_clear_log_block, 811332547Smav zil_noop_log_record, tx, first_txg); 812332547Smav } 813213197Smm BP_ZERO(&zh->zh_log); 814213197Smm dsl_dataset_dirty(dmu_objset_ds(os), tx); 815248571Smm dmu_objset_disown(os, FTAG); 816219089Spjd return (0); 817213197Smm } 818213197Smm 819168404Spjd /* 820332547Smav * If we are not rewinding and opening the pool normally, then 821332547Smav * the min_claim_txg should be equal to the first txg of the pool. 822332547Smav */ 823332547Smav ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); 824332547Smav 825332547Smav /* 826168404Spjd * Claim all log blocks if we haven't already done so, and remember 827168404Spjd * the highest claimed sequence number. This ensures that if we can 828168404Spjd * read only part of the log now (e.g. due to a missing device), 829168404Spjd * but we can read the entire log later, we will not try to replay 830168404Spjd * or destroy beyond the last block we successfully claimed. 831168404Spjd */ 832168404Spjd ASSERT3U(zh->zh_claim_txg, <=, first_txg); 833168404Spjd if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { 834219089Spjd (void) zil_parse(zilog, zil_claim_log_block, 835219089Spjd zil_claim_log_record, tx, first_txg); 836168404Spjd zh->zh_claim_txg = first_txg; 837219089Spjd zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; 838219089Spjd zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; 839219089Spjd if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) 840219089Spjd zh->zh_flags |= ZIL_REPLAY_NEEDED; 841219089Spjd zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; 842168404Spjd dsl_dataset_dirty(dmu_objset_ds(os), tx); 843168404Spjd } 844168404Spjd 845168404Spjd ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); 846248571Smm dmu_objset_disown(os, FTAG); 847168404Spjd return (0); 848168404Spjd} 849168404Spjd 850185029Spjd/* 851185029Spjd * Check the log by walking the log chain. 852185029Spjd * Checksum errors are ok as they indicate the end of the chain. 853185029Spjd * Any other error (no device or read failure) returns an error. 
854185029Spjd */ 855286686Smav/* ARGSUSED */ 856185029Spjdint 857286686Smavzil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) 858168404Spjd{ 859185029Spjd zilog_t *zilog; 860185029Spjd objset_t *os; 861219089Spjd blkptr_t *bp; 862185029Spjd int error; 863168404Spjd 864219089Spjd ASSERT(tx == NULL); 865219089Spjd 866286686Smav error = dmu_objset_from_ds(ds, &os); 867248571Smm if (error != 0) { 868286686Smav cmn_err(CE_WARN, "can't open objset %llu, error %d", 869286686Smav (unsigned long long)ds->ds_object, error); 870185029Spjd return (0); 871185029Spjd } 872168404Spjd 873185029Spjd zilog = dmu_objset_zil(os); 874219089Spjd bp = (blkptr_t *)&zilog->zl_header->zh_log; 875219089Spjd 876219089Spjd if (!BP_IS_HOLE(bp)) { 877219089Spjd vdev_t *vd; 878219089Spjd boolean_t valid = B_TRUE; 879219089Spjd 880332547Smav /* 881332547Smav * Check the first block and determine if it's on a log device 882332547Smav * which may have been removed or faulted prior to loading this 883332547Smav * pool. If so, there's no point in checking the rest of the 884332547Smav * log as its content should have already been synced to the 885332547Smav * pool. 886332547Smav */ 887219089Spjd spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); 888219089Spjd vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); 889219089Spjd if (vd->vdev_islog && vdev_is_dead(vd)) 890219089Spjd valid = vdev_log_state_valid(vd); 891219089Spjd spa_config_exit(os->os_spa, SCL_STATE, FTAG); 892219089Spjd 893286686Smav if (!valid) 894219089Spjd return (0); 895332547Smav 896332547Smav /* 897332547Smav * Check whether the current uberblock is checkpointed (e.g. 898332547Smav * we are rewinding) and whether the current header has been 899332547Smav * claimed or not. If it hasn't then skip verifying it. We 900332547Smav * do this because its ZIL blocks may be part of the pool's 901332547Smav * state before the rewind, which is no longer valid. 902332547Smav */ 903332547Smav zil_header_t *zh = zil_header_in_syncing_context(zilog); 904332547Smav if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && 905332547Smav zh->zh_claim_txg == 0) 906332547Smav return (0); 907168404Spjd } 908185029Spjd 909219089Spjd /* 910219089Spjd * Because tx == NULL, zil_claim_log_block() will not actually claim 911219089Spjd * any blocks, but just determine whether it is possible to do so. 912219089Spjd * In addition to checking the log chain, zil_claim_log_block() 913219089Spjd * will invoke zio_claim() with a done func of spa_claim_notify(), 914219089Spjd * which will update spa_max_claim_txg. See spa_load() for details. 915219089Spjd */ 916219089Spjd error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, 917332547Smav zilog->zl_header->zh_claim_txg ? -1ULL : 918332547Smav spa_min_claim_txg(os->os_spa)); 919219089Spjd 920219089Spjd return ((error == ECKSUM || error == ENOENT) ? 0 : error); 921168404Spjd} 922168404Spjd 923325132Savg/* 924325132Savg * When an itx is "skipped", this function is used to properly mark the 925325132Savg * waiter as "done, and signal any thread(s) waiting on it. An itx can 926325132Savg * be skipped (and not committed to an lwb) for a variety of reasons, 927325132Savg * one of them being that the itx was committed via spa_sync(), prior to 928325132Savg * it being committed to an lwb; this can happen if a thread calling 929325132Savg * zil_commit() is racing with spa_sync(). 
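 *
 * Editorial note: within this file a commit waiter is marked "done" in
 * one of two places; either here, when its itx is skipped, or in
 * zil_lwb_flush_vdevs_done(), once the lwb it was linked to is stable on
 * disk.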
930325132Savg */ 931325132Savgstatic void 932325132Savgzil_commit_waiter_skip(zil_commit_waiter_t *zcw) 933185029Spjd{ 934325132Savg mutex_enter(&zcw->zcw_lock); 935325132Savg ASSERT3B(zcw->zcw_done, ==, B_FALSE); 936325132Savg zcw->zcw_done = B_TRUE; 937325132Savg cv_broadcast(&zcw->zcw_cv); 938325132Savg mutex_exit(&zcw->zcw_lock); 939325132Savg} 940185029Spjd 941325132Savg/* 942325132Savg * This function is used when the given waiter is to be linked into an 943325132Savg * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. 944325132Savg * At this point, the waiter will no longer be referenced by the itx, 945325132Savg * and instead, will be referenced by the lwb. 946325132Savg */ 947325132Savgstatic void 948325132Savgzil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) 949325132Savg{ 950329486Smav /* 951329486Smav * The lwb_waiters field of the lwb is protected by the zilog's 952329486Smav * zl_lock, thus it must be held when calling this function. 953329486Smav */ 954329486Smav ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); 955329486Smav 956325132Savg mutex_enter(&zcw->zcw_lock); 957325132Savg ASSERT(!list_link_active(&zcw->zcw_node)); 958325132Savg ASSERT3P(zcw->zcw_lwb, ==, NULL); 959325132Savg ASSERT3P(lwb, !=, NULL); 960325132Savg ASSERT(lwb->lwb_state == LWB_STATE_OPENED || 961325132Savg lwb->lwb_state == LWB_STATE_ISSUED); 962185029Spjd 963325132Savg list_insert_tail(&lwb->lwb_waiters, zcw); 964325132Savg zcw->zcw_lwb = lwb; 965325132Savg mutex_exit(&zcw->zcw_lock); 966185029Spjd} 967185029Spjd 968325132Savg/* 969325132Savg * This function is used when zio_alloc_zil() fails to allocate a ZIL 970325132Savg * block, and the given waiter must be linked to the "nolwb waiters" 971325132Savg * list inside of zil_process_commit_list(). 972325132Savg */ 973325132Savgstatic void 974325132Savgzil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) 975325132Savg{ 976325132Savg mutex_enter(&zcw->zcw_lock); 977325132Savg ASSERT(!list_link_active(&zcw->zcw_node)); 978325132Savg ASSERT3P(zcw->zcw_lwb, ==, NULL); 979325132Savg list_insert_tail(nolwb, zcw); 980325132Savg mutex_exit(&zcw->zcw_lock); 981325132Savg} 982325132Savg 983168404Spjdvoid 984325132Savgzil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) 985168404Spjd{ 986325132Savg avl_tree_t *t = &lwb->lwb_vdev_tree; 987185029Spjd avl_index_t where; 988185029Spjd zil_vdev_node_t *zv, zvsearch; 989185029Spjd int ndvas = BP_GET_NDVAS(bp); 990185029Spjd int i; 991168404Spjd 992185029Spjd if (zfs_nocacheflush) 993185029Spjd return; 994168404Spjd 995325132Savg mutex_enter(&lwb->lwb_vdev_lock); 996185029Spjd for (i = 0; i < ndvas; i++) { 997185029Spjd zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 998185029Spjd if (avl_find(t, &zvsearch, &where) == NULL) { 999185029Spjd zv = kmem_alloc(sizeof (*zv), KM_SLEEP); 1000185029Spjd zv->zv_vdev = zvsearch.zv_vdev; 1001185029Spjd avl_insert(t, zv, where); 1002185029Spjd } 1003185029Spjd } 1004325132Savg mutex_exit(&lwb->lwb_vdev_lock); 1005168404Spjd} 1006168404Spjd 1007325132Savgvoid 1008325132Savgzil_lwb_add_txg(lwb_t *lwb, uint64_t txg) 1009325132Savg{ 1010325132Savg lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); 1011325132Savg} 1012325132Savg 1013325132Savg/* 1014325132Savg * This function is a called after all VDEVs associated with a given lwb 1015325132Savg * write have completed their DKIOCFLUSHWRITECACHE command; or as soon 1016325132Savg * as the lwb write completes, if "zfs_nocacheflush" is set. 
1017325132Savg * 1018325132Savg * The intention is for this function to be called as soon as the 1019325132Savg * contents of an lwb are considered "stable" on disk, and will survive 1020325132Savg * any sudden loss of power. At this point, any threads waiting for the 1021325132Savg * lwb to reach this state are signalled, and the "waiter" structures 1022325132Savg * are marked "done". 1023325132Savg */ 1024219089Spjdstatic void 1025325132Savgzil_lwb_flush_vdevs_done(zio_t *zio) 1026168404Spjd{ 1027325132Savg lwb_t *lwb = zio->io_private; 1028325132Savg zilog_t *zilog = lwb->lwb_zilog; 1029325132Savg dmu_tx_t *tx = lwb->lwb_tx; 1030325132Savg zil_commit_waiter_t *zcw; 1031168404Spjd 1032325132Savg spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); 1033168404Spjd 1034325132Savg zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 1035325132Savg 1036325132Savg mutex_enter(&zilog->zl_lock); 1037325132Savg 1038185029Spjd /* 1039325132Savg * Ensure the lwb buffer pointer is cleared before releasing the 1040325132Savg * txg. If we have had an allocation failure and the txg is 1041325132Savg * waiting to sync then we want zil_sync() to remove the lwb so 1042325132Savg * that it's not picked up as the next new one in 1043325132Savg * zil_process_commit_list(). zil_sync() will only remove the 1044325132Savg * lwb if lwb_buf is null. 1045185029Spjd */ 1046325132Savg lwb->lwb_buf = NULL; 1047325132Savg lwb->lwb_tx = NULL; 1048185029Spjd 1049325132Savg ASSERT3U(lwb->lwb_issued_timestamp, >, 0); 1050325132Savg zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; 1051185029Spjd 1052325132Savg lwb->lwb_root_zio = NULL; 1053325132Savg lwb->lwb_state = LWB_STATE_DONE; 1054325132Savg 1055325132Savg if (zilog->zl_last_lwb_opened == lwb) { 1056325132Savg /* 1057325132Savg * Remember the highest committed log sequence number 1058325132Savg * for ztest. We only update this value when all the log 1059325132Savg * writes succeeded, because ztest wants to ASSERT that 1060325132Savg * it got the whole log chain. 1061325132Savg */ 1062325132Savg zilog->zl_commit_lr_seq = zilog->zl_lr_seq; 1063168404Spjd } 1064168404Spjd 1065325132Savg while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { 1066325132Savg mutex_enter(&zcw->zcw_lock); 1067325132Savg 1068325132Savg ASSERT(list_link_active(&zcw->zcw_node)); 1069325132Savg list_remove(&lwb->lwb_waiters, zcw); 1070325132Savg 1071325132Savg ASSERT3P(zcw->zcw_lwb, ==, lwb); 1072325132Savg zcw->zcw_lwb = NULL; 1073325132Savg 1074325132Savg zcw->zcw_zio_error = zio->io_error; 1075325132Savg 1076325132Savg ASSERT3B(zcw->zcw_done, ==, B_FALSE); 1077325132Savg zcw->zcw_done = B_TRUE; 1078325132Savg cv_broadcast(&zcw->zcw_cv); 1079325132Savg 1080325132Savg mutex_exit(&zcw->zcw_lock); 1081325132Savg } 1082325132Savg 1083325132Savg mutex_exit(&zilog->zl_lock); 1084325132Savg 1085168404Spjd /* 1086325132Savg * Now that we've written this log block, we have a stable pointer 1087325132Savg * to the next block in the chain, so it's OK to let the txg in 1088325132Savg * which we allocated the next block sync. 1089168404Spjd */ 1090325132Savg dmu_tx_commit(tx); 1091168404Spjd} 1092168404Spjd 1093168404Spjd/* 1094325132Savg * This is called when an lwb write completes. This means, this specific 1095325132Savg * lwb was written to disk, and all dependent lwb have also been 1096325132Savg * written to disk. 1097325132Savg * 1098325132Savg * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to 1099325132Savg * the VDEVs involved in writing out this specific lwb. 
The lwb will be 1100325132Savg * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the 1101325132Savg * zio completion callback for the lwb's root zio. 1102168404Spjd */ 1103168404Spjdstatic void 1104168404Spjdzil_lwb_write_done(zio_t *zio) 1105168404Spjd{ 1106168404Spjd lwb_t *lwb = zio->io_private; 1107325132Savg spa_t *spa = zio->io_spa; 1108168404Spjd zilog_t *zilog = lwb->lwb_zilog; 1109325132Savg avl_tree_t *t = &lwb->lwb_vdev_tree; 1110325132Savg void *cookie = NULL; 1111325132Savg zil_vdev_node_t *zv; 1112168404Spjd 1113325132Savg ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); 1114325132Savg 1115185029Spjd ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1116185029Spjd ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); 1117185029Spjd ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 1118185029Spjd ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); 1119185029Spjd ASSERT(!BP_IS_GANG(zio->io_bp)); 1120185029Spjd ASSERT(!BP_IS_HOLE(zio->io_bp)); 1121268075Sdelphij ASSERT(BP_GET_FILL(zio->io_bp) == 0); 1122185029Spjd 1123321610Smav abd_put(zio->io_abd); 1124325132Savg 1125325132Savg ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); 1126325132Savg 1127168404Spjd mutex_enter(&zilog->zl_lock); 1128325132Savg lwb->lwb_write_zio = NULL; 1129219089Spjd mutex_exit(&zilog->zl_lock); 1130209962Smm 1131325132Savg if (avl_numnodes(t) == 0) 1132325132Savg return; 1133325132Savg 1134209962Smm /* 1135325132Savg * If there was an IO error, we're not going to call zio_flush() 1136325132Savg * on these vdevs, so we simply empty the tree and free the 1137325132Savg * nodes. We avoid calling zio_flush() since there isn't any 1138325132Savg * good reason for doing so, after the lwb block failed to be 1139325132Savg * written out. 1140209962Smm */ 1141325132Savg if (zio->io_error != 0) { 1142325132Savg while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) 1143325132Savg kmem_free(zv, sizeof (*zv)); 1144325132Savg return; 1145325132Savg } 1146325132Savg 1147325132Savg while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { 1148325132Savg vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); 1149325132Savg if (vd != NULL) 1150325132Savg zio_flush(lwb->lwb_root_zio, vd); 1151325132Savg kmem_free(zv, sizeof (*zv)); 1152325132Savg } 1153168404Spjd} 1154168404Spjd 1155168404Spjd/* 1156325132Savg * This function's purpose is to "open" an lwb such that it is ready to 1157325132Savg * accept new itxs being committed to it. To do this, the lwb's zio 1158325132Savg * structures are created, and linked to the lwb. This function is 1159325132Savg * idempotent; if the passed in lwb has already been opened, this 1160325132Savg * function is essentially a no-op. 
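 *
 * For reference, a summary of the lwb state transitions as implemented
 * in this file (descriptive only, no new behavior):
 *
 *	LWB_STATE_CLOSED	set in zil_alloc_lwb()
 *	LWB_STATE_OPENED	set here, in zil_lwb_write_open()
 *	LWB_STATE_ISSUED	set in zil_lwb_write_issue()
 *	LWB_STATE_DONE		set in zil_lwb_flush_vdevs_done()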
1161168404Spjd */ 1162168404Spjdstatic void 1163325132Savgzil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) 1164168404Spjd{ 1165268123Sdelphij zbookmark_phys_t zb; 1166315441Smav zio_priority_t prio; 1167168404Spjd 1168329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1169325132Savg ASSERT3P(lwb, !=, NULL); 1170325132Savg EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); 1171325132Savg EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); 1172325132Savg 1173219089Spjd SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], 1174219089Spjd ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 1175219089Spjd lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); 1176168404Spjd 1177325132Savg if (lwb->lwb_root_zio == NULL) { 1178321610Smav abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, 1179321610Smav BP_GET_LSIZE(&lwb->lwb_blk)); 1180325132Savg 1181321611Smav if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) 1182315441Smav prio = ZIO_PRIORITY_SYNC_WRITE; 1183315441Smav else 1184315441Smav prio = ZIO_PRIORITY_ASYNC_WRITE; 1185325132Savg 1186325132Savg lwb->lwb_root_zio = zio_root(zilog->zl_spa, 1187325132Savg zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); 1188325132Savg ASSERT3P(lwb->lwb_root_zio, !=, NULL); 1189325132Savg 1190325132Savg lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, 1191325132Savg zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, 1192325132Savg BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, 1193325132Savg prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); 1194325132Savg ASSERT3P(lwb->lwb_write_zio, !=, NULL); 1195325132Savg 1196325132Savg lwb->lwb_state = LWB_STATE_OPENED; 1197325132Savg 1198325132Savg mutex_enter(&zilog->zl_lock); 1199325132Savg 1200325132Savg /* 1201325132Savg * The zilog's "zl_last_lwb_opened" field is used to 1202325132Savg * build the lwb/zio dependency chain, which is used to 1203325132Savg * preserve the ordering of lwb completions that is 1204325132Savg * required by the semantics of the ZIL. Each new lwb 1205325132Savg * zio becomes a parent of the "previous" lwb zio, such 1206325132Savg * that the new lwb's zio cannot complete until the 1207325132Savg * "previous" lwb's zio completes. 1208325132Savg * 1209325132Savg * This is required by the semantics of zil_commit(); 1210325132Savg * the commit waiters attached to the lwbs will be woken 1211325132Savg * in the lwb zio's completion callback, so this zio 1212325132Savg * dependency graph ensures the waiters are woken in the 1213325132Savg * correct order (the same order the lwbs were created). 1214325132Savg */ 1215325132Savg lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; 1216325132Savg if (last_lwb_opened != NULL && 1217325132Savg last_lwb_opened->lwb_state != LWB_STATE_DONE) { 1218325132Savg ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || 1219325132Savg last_lwb_opened->lwb_state == LWB_STATE_ISSUED); 1220325132Savg ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); 1221325132Savg zio_add_child(lwb->lwb_root_zio, 1222325132Savg last_lwb_opened->lwb_root_zio); 1223325132Savg } 1224325132Savg zilog->zl_last_lwb_opened = lwb; 1225325132Savg 1226325132Savg mutex_exit(&zilog->zl_lock); 1227168404Spjd } 1228325132Savg 1229325132Savg ASSERT3P(lwb->lwb_root_zio, !=, NULL); 1230325132Savg ASSERT3P(lwb->lwb_write_zio, !=, NULL); 1231325132Savg ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 1232168404Spjd} 1233168404Spjd 1234168404Spjd/* 1235219089Spjd * Define a limited set of intent log block sizes. 
1236251631Sdelphij * 1237219089Spjd * These must be a multiple of 4KB. Note only the amount used (again 1238219089Spjd * aligned to 4KB) actually gets written. However, we can't always just 1239274337Sdelphij * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. 1240219089Spjd */ 1241219089Spjduint64_t zil_block_buckets[] = { 1242219089Spjd 4096, /* non TX_WRITE */ 1243219089Spjd 8192+4096, /* data base */ 1244219089Spjd 32*1024 + 4096, /* NFS writes */ 1245219089Spjd UINT64_MAX 1246219089Spjd}; 1247219089Spjd 1248219089Spjd/* 1249168404Spjd * Start a log block write and advance to the next log block. 1250168404Spjd * Calls are serialized. 1251168404Spjd */ 1252168404Spjdstatic lwb_t * 1253325132Savgzil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) 1254168404Spjd{ 1255219089Spjd lwb_t *nlwb = NULL; 1256219089Spjd zil_chain_t *zilc; 1257168404Spjd spa_t *spa = zilog->zl_spa; 1258219089Spjd blkptr_t *bp; 1259219089Spjd dmu_tx_t *tx; 1260168404Spjd uint64_t txg; 1261219089Spjd uint64_t zil_blksz, wsz; 1262219089Spjd int i, error; 1263315441Smav boolean_t slog; 1264168404Spjd 1265329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1266325132Savg ASSERT3P(lwb->lwb_root_zio, !=, NULL); 1267325132Savg ASSERT3P(lwb->lwb_write_zio, !=, NULL); 1268325132Savg ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 1269325132Savg 1270219089Spjd if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 1271219089Spjd zilc = (zil_chain_t *)lwb->lwb_buf; 1272219089Spjd bp = &zilc->zc_next_blk; 1273219089Spjd } else { 1274219089Spjd zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); 1275219089Spjd bp = &zilc->zc_next_blk; 1276219089Spjd } 1277168404Spjd 1278219089Spjd ASSERT(lwb->lwb_nused <= lwb->lwb_sz); 1279219089Spjd 1280168404Spjd /* 1281168404Spjd * Allocate the next block and save its address in this block 1282168404Spjd * before writing it in order to establish the log chain. 1283168404Spjd * Note that if the allocation of nlwb synced before we wrote 1284168404Spjd * the block that points at it (lwb), we'd leak it if we crashed. 1285219089Spjd * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 1286219089Spjd * We dirty the dataset to ensure that zil_sync() will be called 1287219089Spjd * to clean up in the event of allocation failure or I/O failure. 1288168404Spjd */ 1289325132Savg 1290219089Spjd tx = dmu_tx_create(zilog->zl_os); 1291328235Smav 1292328235Smav /* 1293330986Savg * Since we are not going to create any new dirty data, and we 1294330986Savg * can even help with clearing the existing dirty data, we 1295330986Savg * should not be subject to the dirty data based delays. We 1296330986Savg * use TXG_NOTHROTTLE to bypass the delay mechanism. 1297328235Smav */ 1298330986Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); 1299330986Savg 1300219089Spjd dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1301219089Spjd txg = dmu_tx_get_txg(tx); 1302168404Spjd 1303219089Spjd lwb->lwb_tx = tx; 1304219089Spjd 1305168404Spjd /* 1306219089Spjd * Log blocks are pre-allocated. Here we select the size of the next 1307219089Spjd * block, based on size used in the last block. 1308219089Spjd * - first find the smallest bucket that will fit the block from a 1309219089Spjd * limited set of block sizes. This is because it's faster to write 1310219089Spjd * blocks allocated from the same metaslab as they are adjacent or 1311219089Spjd * close. 1312219089Spjd * - next find the maximum from the new suggested size and an array of 1313219089Spjd * previous sizes. 
This lessens a picket fence effect of wrongly 1314219089Spjd * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k 1315219089Spjd * requests. 1316219089Spjd * 1317219089Spjd * Note we only write what is used, but we can't just allocate 1318219089Spjd * the maximum block size because we can exhaust the available 1319219089Spjd * pool log space. 1320168404Spjd */ 1321219089Spjd zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); 1322219089Spjd for (i = 0; zil_blksz > zil_block_buckets[i]; i++) 1323219089Spjd continue; 1324219089Spjd zil_blksz = zil_block_buckets[i]; 1325219089Spjd if (zil_blksz == UINT64_MAX) 1326274337Sdelphij zil_blksz = SPA_OLD_MAXBLOCKSIZE; 1327219089Spjd zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; 1328219089Spjd for (i = 0; i < ZIL_PREV_BLKS; i++) 1329219089Spjd zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); 1330219089Spjd zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); 1331168404Spjd 1332168404Spjd BP_ZERO(bp); 1333325132Savg 1334168404Spjd /* pass the old blkptr in order to spread log blocks across devs */ 1335339105Smav error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, 1336339105Smav txg, bp, &lwb->lwb_blk, zil_blksz, &slog); 1337248571Smm if (error == 0) { 1338219089Spjd ASSERT3U(bp->blk_birth, ==, txg); 1339219089Spjd bp->blk_cksum = lwb->lwb_blk.blk_cksum; 1340219089Spjd bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; 1341168404Spjd 1342168404Spjd /* 1343325132Savg * Allocate a new log write block (lwb). 1344168404Spjd */ 1345315441Smav nlwb = zil_alloc_lwb(zilog, bp, slog, txg); 1346168404Spjd } 1347168404Spjd 1348219089Spjd if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 1349219089Spjd /* For Slim ZIL only write what is used. */ 1350219089Spjd wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); 1351219089Spjd ASSERT3U(wsz, <=, lwb->lwb_sz); 1352325132Savg zio_shrink(lwb->lwb_write_zio, wsz); 1353168404Spjd 1354219089Spjd } else { 1355219089Spjd wsz = lwb->lwb_sz; 1356219089Spjd } 1357168404Spjd 1358219089Spjd zilc->zc_pad = 0; 1359219089Spjd zilc->zc_nused = lwb->lwb_nused; 1360219089Spjd zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; 1361168404Spjd 1362168404Spjd /* 1363219089Spjd * clear unused data for security 1364168404Spjd */ 1365219089Spjd bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); 1366168404Spjd 1367325132Savg spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); 1368168404Spjd 1369325132Savg zil_lwb_add_block(lwb, &lwb->lwb_blk); 1370325132Savg lwb->lwb_issued_timestamp = gethrtime(); 1371325132Savg lwb->lwb_state = LWB_STATE_ISSUED; 1372325132Savg 1373325132Savg zio_nowait(lwb->lwb_root_zio); 1374325132Savg zio_nowait(lwb->lwb_write_zio); 1375325132Savg 1376168404Spjd /* 1377219089Spjd * If there was an allocation failure then nlwb will be null which 1378219089Spjd * forces a txg_wait_synced(). 
1379168404Spjd */ 1380168404Spjd return (nlwb); 1381168404Spjd} 1382168404Spjd 1383168404Spjdstatic lwb_t * 1384168404Spjdzil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) 1385168404Spjd{ 1386321611Smav lr_t *lrcb, *lrc; 1387321611Smav lr_write_t *lrwb, *lrw; 1388219089Spjd char *lr_buf; 1389321611Smav uint64_t dlen, dnow, lwb_sp, reclen, txg; 1390168404Spjd 1391329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1392325132Savg ASSERT3P(lwb, !=, NULL); 1393325132Savg ASSERT3P(lwb->lwb_buf, !=, NULL); 1394219089Spjd 1395325132Savg zil_lwb_write_open(zilog, lwb); 1396168404Spjd 1397325132Savg lrc = &itx->itx_lr; 1398325132Savg lrw = (lr_write_t *)lrc; 1399325132Savg 1400325132Savg /* 1401325132Savg * A commit itx doesn't represent any on-disk state; instead 1402325132Savg * it's simply used as a place holder on the commit list, and 1403325132Savg * provides a mechanism for attaching a "commit waiter" onto the 1404325132Savg * correct lwb (such that the waiter can be signalled upon 1405325132Savg * completion of that lwb). Thus, we don't process this itx's 1406325132Savg * log record if it's a commit itx (these itx's don't have log 1407325132Savg * records), and instead link the itx's waiter onto the lwb's 1408325132Savg * list of waiters. 1409325132Savg * 1410325132Savg * For more details, see the comment above zil_commit(). 1411325132Savg */ 1412325132Savg if (lrc->lrc_txtype == TX_COMMIT) { 1413329486Smav mutex_enter(&zilog->zl_lock); 1414325132Savg zil_commit_waiter_link_lwb(itx->itx_private, lwb); 1415325132Savg itx->itx_private = NULL; 1416329486Smav mutex_exit(&zilog->zl_lock); 1417325132Savg return (lwb); 1418325132Savg } 1419325132Savg 1420321611Smav if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { 1421168404Spjd dlen = P2ROUNDUP_TYPED( 1422219089Spjd lrw->lr_length, sizeof (uint64_t), uint64_t); 1423321611Smav } else { 1424321611Smav dlen = 0; 1425321611Smav } 1426321611Smav reclen = lrc->lrc_reclen; 1427168404Spjd zilog->zl_cur_used += (reclen + dlen); 1428321611Smav txg = lrc->lrc_txg; 1429168404Spjd 1430325132Savg ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); 1431168404Spjd 1432315441Smavcont: 1433168404Spjd /* 1434168404Spjd * If this record won't fit in the current log block, start a new one. 1435321611Smav * For WR_NEED_COPY optimize layout for minimal number of chunks. 1436168404Spjd */ 1437315441Smav lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1438315441Smav if (reclen > lwb_sp || (reclen + dlen > lwb_sp && 1439321611Smav lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || 1440315441Smav lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { 1441325132Savg lwb = zil_lwb_write_issue(zilog, lwb); 1442168404Spjd if (lwb == NULL) 1443168404Spjd return (NULL); 1444325132Savg zil_lwb_write_open(zilog, lwb); 1445219089Spjd ASSERT(LWB_EMPTY(lwb)); 1446315441Smav lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1447321611Smav ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); 1448168404Spjd } 1449168404Spjd 1450315441Smav dnow = MIN(dlen, lwb_sp - reclen); 1451219089Spjd lr_buf = lwb->lwb_buf + lwb->lwb_nused; 1452219089Spjd bcopy(lrc, lr_buf, reclen); 1453321611Smav lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ 1454321611Smav lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ 1455168404Spjd 1456168404Spjd /* 1457168404Spjd * If it's a write, fetch the data or get its blkptr as appropriate. 
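 * Roughly, the three write states are handled as follows (this
 * paraphrases the code below rather than adding behavior): WR_COPIED
 * means the write's data was already copied into the log record
 * itself, so nothing further is needed here.  WR_NEED_COPY means the
 * data must be copied into this lwb's buffer now, directly after the
 * record (dbuf = lr_buf + reclen).  WR_INDIRECT means only a block
 * pointer is logged (dbuf == NULL), and zl_get_data() is expected to
 * fill it in, parenting any zio it creates to lwb_write_zio as
 * described in the comment further below.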
1458168404Spjd */ 1459168404Spjd if (lrc->lrc_txtype == TX_WRITE) { 1460168404Spjd if (txg > spa_freeze_txg(zilog->zl_spa)) 1461168404Spjd txg_wait_synced(zilog->zl_dmu_pool, txg); 1462168404Spjd if (itx->itx_wr_state != WR_COPIED) { 1463168404Spjd char *dbuf; 1464168404Spjd int error; 1465168404Spjd 1466315441Smav if (itx->itx_wr_state == WR_NEED_COPY) { 1467219089Spjd dbuf = lr_buf + reclen; 1468315441Smav lrcb->lrc_reclen += dnow; 1469315441Smav if (lrwb->lr_length > dnow) 1470315441Smav lrwb->lr_length = dnow; 1471315441Smav lrw->lr_offset += dnow; 1472315441Smav lrw->lr_length -= dnow; 1473168404Spjd } else { 1474168404Spjd ASSERT(itx->itx_wr_state == WR_INDIRECT); 1475168404Spjd dbuf = NULL; 1476168404Spjd } 1477325132Savg 1478325132Savg /* 1479325132Savg * We pass in the "lwb_write_zio" rather than 1480325132Savg * "lwb_root_zio" so that the "lwb_write_zio" 1481325132Savg * becomes the parent of any zio's created by 1482325132Savg * the "zl_get_data" callback. The vdevs are 1483325132Savg * flushed after the "lwb_write_zio" completes, 1484325132Savg * so we want to make sure that completion 1485325132Savg * callback waits for these additional zio's, 1486325132Savg * such that the vdevs used by those zio's will 1487325132Savg * be included in the lwb's vdev tree, and those 1488325132Savg * vdevs will be properly flushed. If we passed 1489325132Savg * in "lwb_root_zio" here, then these additional 1490325132Savg * vdevs may not be flushed; e.g. if these zio's 1491325132Savg * completed after "lwb_write_zio" completed. 1492325132Savg */ 1493325132Savg error = zilog->zl_get_data(itx->itx_private, 1494325132Savg lrwb, dbuf, lwb, lwb->lwb_write_zio); 1495325132Savg 1496214378Smm if (error == EIO) { 1497214378Smm txg_wait_synced(zilog->zl_dmu_pool, txg); 1498214378Smm return (lwb); 1499214378Smm } 1500248571Smm if (error != 0) { 1501168404Spjd ASSERT(error == ENOENT || error == EEXIST || 1502168404Spjd error == EALREADY); 1503168404Spjd return (lwb); 1504168404Spjd } 1505168404Spjd } 1506168404Spjd } 1507168404Spjd 1508219089Spjd /* 1509219089Spjd * We're actually making an entry, so update lrc_seq to be the 1510219089Spjd * log record sequence number. Note that this is generally not 1511219089Spjd * equal to the itx sequence number because not all transactions 1512219089Spjd * are synchronous, and sometimes spa_sync() gets there first. 
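 * (zl_lr_seq is advanced here, as records are committed to an lwb;
 * the itx's own lrc_seq was set to zero, purely defensively, back in
 * zil_itx_create().)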
1513219089Spjd */ 1514325132Savg lrcb->lrc_seq = ++zilog->zl_lr_seq; 1515315441Smav lwb->lwb_nused += reclen + dnow; 1516325132Savg 1517325132Savg zil_lwb_add_txg(lwb, txg); 1518325132Savg 1519219089Spjd ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); 1520240415Smm ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); 1521168404Spjd 1522315441Smav dlen -= dnow; 1523315441Smav if (dlen > 0) { 1524315441Smav zilog->zl_cur_used += reclen; 1525315441Smav goto cont; 1526315441Smav } 1527315441Smav 1528168404Spjd return (lwb); 1529168404Spjd} 1530168404Spjd 1531168404Spjditx_t * 1532185029Spjdzil_itx_create(uint64_t txtype, size_t lrsize) 1533168404Spjd{ 1534168404Spjd itx_t *itx; 1535168404Spjd 1536168404Spjd lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); 1537168404Spjd 1538168404Spjd itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); 1539168404Spjd itx->itx_lr.lrc_txtype = txtype; 1540168404Spjd itx->itx_lr.lrc_reclen = lrsize; 1541168404Spjd itx->itx_lr.lrc_seq = 0; /* defensive */ 1542219089Spjd itx->itx_sync = B_TRUE; /* default is synchronous */ 1543168404Spjd 1544168404Spjd return (itx); 1545168404Spjd} 1546168404Spjd 1547219089Spjdvoid 1548219089Spjdzil_itx_destroy(itx_t *itx) 1549168404Spjd{ 1550219089Spjd kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); 1551219089Spjd} 1552168404Spjd 1553219089Spjd/* 1554219089Spjd * Free up the sync and async itxs. The itxs_t has already been detached 1555219089Spjd * so no locks are needed. 1556219089Spjd */ 1557219089Spjdstatic void 1558219089Spjdzil_itxg_clean(itxs_t *itxs) 1559219089Spjd{ 1560219089Spjd itx_t *itx; 1561219089Spjd list_t *list; 1562219089Spjd avl_tree_t *t; 1563219089Spjd void *cookie; 1564219089Spjd itx_async_node_t *ian; 1565168404Spjd 1566219089Spjd list = &itxs->i_sync_list; 1567219089Spjd while ((itx = list_head(list)) != NULL) { 1568325132Savg /* 1569325132Savg * In the general case, commit itxs will not be found 1570325132Savg * here, as they'll be committed to an lwb via 1571325132Savg * zil_lwb_commit(), and free'd in that function. Having 1572325132Savg * said that, it is still possible for commit itxs to be 1573325132Savg * found here, due to the following race: 1574325132Savg * 1575325132Savg * - a thread calls zil_commit() which assigns the 1576325132Savg * commit itx to a per-txg i_sync_list 1577325132Savg * - zil_itxg_clean() is called (e.g. via spa_sync()) 1578325132Savg * while the waiter is still on the i_sync_list 1579325132Savg * 1580325132Savg * There's nothing to prevent syncing the txg while the 1581325132Savg * waiter is on the i_sync_list. This normally doesn't 1582325132Savg * happen because spa_sync() is slower than zil_commit(), 1583325132Savg * but if zil_commit() calls txg_wait_synced() (e.g. 1584325132Savg * because zil_create() or zil_commit_writer_stall() is 1585325132Savg * called) we will hit this case. 1586325132Savg */ 1587325132Savg if (itx->itx_lr.lrc_txtype == TX_COMMIT) 1588325132Savg zil_commit_waiter_skip(itx->itx_private); 1589325132Savg 1590219089Spjd list_remove(list, itx); 1591325132Savg zil_itx_destroy(itx); 1592219089Spjd } 1593168404Spjd 1594219089Spjd cookie = NULL; 1595219089Spjd t = &itxs->i_async_tree; 1596219089Spjd while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 1597219089Spjd list = &ian->ia_list; 1598219089Spjd while ((itx = list_head(list)) != NULL) { 1599219089Spjd list_remove(list, itx); 1600325132Savg /* commit itxs should never be on the async lists. 
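 * A TX_COMMIT itx is always created with itx_sync set (see
 * zil_commit_itx_assign()), so zil_itx_assign() can only ever place
 * it on the per-txg sync list.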
*/ 1601325132Savg ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 1602325132Savg zil_itx_destroy(itx); 1603219089Spjd } 1604219089Spjd list_destroy(list); 1605219089Spjd kmem_free(ian, sizeof (itx_async_node_t)); 1606219089Spjd } 1607219089Spjd avl_destroy(t); 1608219089Spjd 1609219089Spjd kmem_free(itxs, sizeof (itxs_t)); 1610168404Spjd} 1611168404Spjd 1612219089Spjdstatic int 1613219089Spjdzil_aitx_compare(const void *x1, const void *x2) 1614219089Spjd{ 1615219089Spjd const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; 1616219089Spjd const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; 1617219089Spjd 1618339158Smav return (AVL_CMP(o1, o2)); 1619219089Spjd} 1620219089Spjd 1621168404Spjd/* 1622219089Spjd * Remove all async itx with the given oid. 1623168404Spjd */ 1624168404Spjdstatic void 1625219089Spjdzil_remove_async(zilog_t *zilog, uint64_t oid) 1626168404Spjd{ 1627219089Spjd uint64_t otxg, txg; 1628219089Spjd itx_async_node_t *ian; 1629219089Spjd avl_tree_t *t; 1630219089Spjd avl_index_t where; 1631168404Spjd list_t clean_list; 1632168404Spjd itx_t *itx; 1633168404Spjd 1634219089Spjd ASSERT(oid != 0); 1635168404Spjd list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); 1636168404Spjd 1637219089Spjd if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 1638219089Spjd otxg = ZILTEST_TXG; 1639219089Spjd else 1640219089Spjd otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1641219089Spjd 1642219089Spjd for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 1643219089Spjd itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1644219089Spjd 1645219089Spjd mutex_enter(&itxg->itxg_lock); 1646219089Spjd if (itxg->itxg_txg != txg) { 1647219089Spjd mutex_exit(&itxg->itxg_lock); 1648219089Spjd continue; 1649219089Spjd } 1650219089Spjd 1651219089Spjd /* 1652219089Spjd * Locate the object node and append its list. 1653219089Spjd */ 1654219089Spjd t = &itxg->itxg_itxs->i_async_tree; 1655219089Spjd ian = avl_find(t, &oid, &where); 1656219089Spjd if (ian != NULL) 1657219089Spjd list_move_tail(&clean_list, &ian->ia_list); 1658219089Spjd mutex_exit(&itxg->itxg_lock); 1659168404Spjd } 1660219089Spjd while ((itx = list_head(&clean_list)) != NULL) { 1661219089Spjd list_remove(&clean_list, itx); 1662325132Savg /* commit itxs should never be on the async lists. */ 1663325132Savg ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 1664325132Savg zil_itx_destroy(itx); 1665219089Spjd } 1666219089Spjd list_destroy(&clean_list); 1667219089Spjd} 1668168404Spjd 1669219089Spjdvoid 1670219089Spjdzil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) 1671219089Spjd{ 1672219089Spjd uint64_t txg; 1673219089Spjd itxg_t *itxg; 1674219089Spjd itxs_t *itxs, *clean = NULL; 1675219089Spjd 1676168404Spjd /* 1677219089Spjd * Object ids can be re-instantiated in the next txg so 1678219089Spjd * remove any async transactions to avoid future leaks. 1679219089Spjd * This can happen if a fsync occurs on the re-instantiated 1680219089Spjd * object for a WR_INDIRECT or WR_NEED_COPY write, which gets 1681219089Spjd * the new file data and flushes a write record for the old object. 1682168404Spjd */ 1683219089Spjd if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) 1684219089Spjd zil_remove_async(zilog, itx->itx_oid); 1685219089Spjd 1686219089Spjd /* 1687219089Spjd * Ensure the data of a renamed file is committed before the rename. 
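 * Otherwise a replayed rename could be observed without the file data
 * that logically preceded it (breaking the common write-then-rename
 * pattern), so the file's async itxs are moved onto the sync list
 * below, ahead of assigning the rename record itself.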
1688219089Spjd */ 1689219089Spjd if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) 1690219089Spjd zil_async_to_sync(zilog, itx->itx_oid); 1691219089Spjd 1692239620Smm if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) 1693219089Spjd txg = ZILTEST_TXG; 1694219089Spjd else 1695219089Spjd txg = dmu_tx_get_txg(tx); 1696219089Spjd 1697219089Spjd itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1698219089Spjd mutex_enter(&itxg->itxg_lock); 1699219089Spjd itxs = itxg->itxg_itxs; 1700219089Spjd if (itxg->itxg_txg != txg) { 1701219089Spjd if (itxs != NULL) { 1702219089Spjd /* 1703219089Spjd * The zil_clean callback hasn't got around to cleaning 1704219089Spjd * this itxg. Save the itxs for release below. 1705219089Spjd * This should be rare. 1706219089Spjd */ 1707321611Smav zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " 1708321611Smav "txg %llu", itxg->itxg_txg); 1709219089Spjd clean = itxg->itxg_itxs; 1710219089Spjd } 1711219089Spjd itxg->itxg_txg = txg; 1712219089Spjd itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); 1713219089Spjd 1714219089Spjd list_create(&itxs->i_sync_list, sizeof (itx_t), 1715219089Spjd offsetof(itx_t, itx_node)); 1716219089Spjd avl_create(&itxs->i_async_tree, zil_aitx_compare, 1717219089Spjd sizeof (itx_async_node_t), 1718219089Spjd offsetof(itx_async_node_t, ia_node)); 1719168404Spjd } 1720219089Spjd if (itx->itx_sync) { 1721219089Spjd list_insert_tail(&itxs->i_sync_list, itx); 1722219089Spjd } else { 1723219089Spjd avl_tree_t *t = &itxs->i_async_tree; 1724219089Spjd uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; 1725219089Spjd itx_async_node_t *ian; 1726219089Spjd avl_index_t where; 1727168404Spjd 1728219089Spjd ian = avl_find(t, &foid, &where); 1729219089Spjd if (ian == NULL) { 1730219089Spjd ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); 1731219089Spjd list_create(&ian->ia_list, sizeof (itx_t), 1732219089Spjd offsetof(itx_t, itx_node)); 1733219089Spjd ian->ia_foid = foid; 1734219089Spjd avl_insert(t, ian, where); 1735219089Spjd } 1736219089Spjd list_insert_tail(&ian->ia_list, itx); 1737168404Spjd } 1738219089Spjd 1739219089Spjd itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); 1740325132Savg 1741325132Savg /* 1742325132Savg * We don't want to dirty the ZIL using ZILTEST_TXG, because 1743325132Savg * zil_clean() will never be called using ZILTEST_TXG. Thus, we 1744325132Savg * need to be careful to always dirty the ZIL using the "real" 1745325132Savg * TXG (not itxg_txg) even when the SPA is frozen. 1746325132Savg */ 1747325132Savg zilog_dirty(zilog, dmu_tx_get_txg(tx)); 1748219089Spjd mutex_exit(&itxg->itxg_lock); 1749219089Spjd 1750219089Spjd /* Release the old itxs now we've dropped the lock */ 1751219089Spjd if (clean != NULL) 1752219089Spjd zil_itxg_clean(clean); 1753168404Spjd} 1754168404Spjd 1755168404Spjd/* 1756168404Spjd * If there are any in-memory intent log transactions which have now been 1757239620Smm * synced then start up a taskq to free them. We should only do this after we 1758239620Smm * have written out the uberblocks (i.e. txg has been comitted) so that 1759239620Smm * don't inadvertently clean out in-memory log records that would be required 1760239620Smm * by zil_commit(). 
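 * The actual freeing is handed off to the pool's dp_zil_clean_taskq
 * whenever possible, falling back to freeing in-line if the dispatch
 * fails (which, as noted below, should be rare).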
1761168404Spjd */ 1762168404Spjdvoid 1763219089Spjdzil_clean(zilog_t *zilog, uint64_t synced_txg) 1764168404Spjd{ 1765219089Spjd itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; 1766219089Spjd itxs_t *clean_me; 1767168404Spjd 1768325132Savg ASSERT3U(synced_txg, <, ZILTEST_TXG); 1769325132Savg 1770219089Spjd mutex_enter(&itxg->itxg_lock); 1771219089Spjd if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { 1772219089Spjd mutex_exit(&itxg->itxg_lock); 1773219089Spjd return; 1774168404Spjd } 1775219089Spjd ASSERT3U(itxg->itxg_txg, <=, synced_txg); 1776324205Savg ASSERT3U(itxg->itxg_txg, !=, 0); 1777219089Spjd clean_me = itxg->itxg_itxs; 1778219089Spjd itxg->itxg_itxs = NULL; 1779219089Spjd itxg->itxg_txg = 0; 1780219089Spjd mutex_exit(&itxg->itxg_lock); 1781219089Spjd /* 1782219089Spjd * Preferably start a task queue to free up the old itxs but 1783219089Spjd * if taskq_dispatch can't allocate resources to do that then 1784219089Spjd * free it in-line. This should be rare. Note, using TQ_SLEEP 1785219089Spjd * created a bad performance problem. 1786219089Spjd */ 1787324205Savg ASSERT3P(zilog->zl_dmu_pool, !=, NULL); 1788324205Savg ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); 1789324205Savg if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, 1790219089Spjd (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) 1791219089Spjd zil_itxg_clean(clean_me); 1792168404Spjd} 1793168404Spjd 1794219089Spjd/* 1795325132Savg * This function will traverse the queue of itxs that need to be 1796325132Savg * committed, and move them onto the ZIL's zl_itx_commit_list. 1797219089Spjd */ 1798185029Spjdstatic void 1799219089Spjdzil_get_commit_list(zilog_t *zilog) 1800168404Spjd{ 1801219089Spjd uint64_t otxg, txg; 1802219089Spjd list_t *commit_list = &zilog->zl_itx_commit_list; 1803219089Spjd 1804329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1805325132Savg 1806219089Spjd if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 1807219089Spjd otxg = ZILTEST_TXG; 1808219089Spjd else 1809219089Spjd otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1810219089Spjd 1811310515Savg /* 1812310515Savg * This is inherently racy, since there is nothing to prevent 1813310515Savg * the last synced txg from changing. That's okay since we'll 1814310515Savg * only commit things in the future. 1815310515Savg */ 1816219089Spjd for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 1817219089Spjd itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1818219089Spjd 1819219089Spjd mutex_enter(&itxg->itxg_lock); 1820219089Spjd if (itxg->itxg_txg != txg) { 1821219089Spjd mutex_exit(&itxg->itxg_lock); 1822219089Spjd continue; 1823219089Spjd } 1824219089Spjd 1825310515Savg /* 1826310515Savg * If we're adding itx records to the zl_itx_commit_list, 1827310515Savg * then the zil better be dirty in this "txg". We can assert 1828310515Savg * that here since we're holding the itxg_lock which will 1829310515Savg * prevent spa_sync from cleaning it. Once we add the itxs 1830310515Savg * to the zl_itx_commit_list we must commit it to disk even 1831310515Savg * if it's unnecessary (i.e. the txg was synced). 
1832310515Savg */ 1833310515Savg ASSERT(zilog_is_dirty_in_txg(zilog, txg) || 1834310515Savg spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); 1835219089Spjd list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); 1836219089Spjd 1837219089Spjd mutex_exit(&itxg->itxg_lock); 1838219089Spjd } 1839219089Spjd} 1840219089Spjd 1841219089Spjd/* 1842219089Spjd * Move the async itxs for a specified object to commit into sync lists. 1843219089Spjd */ 1844308595Smavvoid 1845219089Spjdzil_async_to_sync(zilog_t *zilog, uint64_t foid) 1846219089Spjd{ 1847219089Spjd uint64_t otxg, txg; 1848219089Spjd itx_async_node_t *ian; 1849219089Spjd avl_tree_t *t; 1850219089Spjd avl_index_t where; 1851219089Spjd 1852219089Spjd if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 1853219089Spjd otxg = ZILTEST_TXG; 1854219089Spjd else 1855219089Spjd otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1856219089Spjd 1857310515Savg /* 1858310515Savg * This is inherently racy, since there is nothing to prevent 1859310515Savg * the last synced txg from changing. 1860310515Savg */ 1861219089Spjd for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 1862219089Spjd itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1863219089Spjd 1864219089Spjd mutex_enter(&itxg->itxg_lock); 1865219089Spjd if (itxg->itxg_txg != txg) { 1866219089Spjd mutex_exit(&itxg->itxg_lock); 1867219089Spjd continue; 1868219089Spjd } 1869219089Spjd 1870219089Spjd /* 1871219089Spjd * If a foid is specified then find that node and append its 1872219089Spjd * list. Otherwise walk the tree appending all the lists 1873219089Spjd * to the sync list. We add to the end rather than the 1874219089Spjd * beginning to ensure the create has happened. 1875219089Spjd */ 1876219089Spjd t = &itxg->itxg_itxs->i_async_tree; 1877219089Spjd if (foid != 0) { 1878219089Spjd ian = avl_find(t, &foid, &where); 1879219089Spjd if (ian != NULL) { 1880219089Spjd list_move_tail(&itxg->itxg_itxs->i_sync_list, 1881219089Spjd &ian->ia_list); 1882219089Spjd } 1883219089Spjd } else { 1884219089Spjd void *cookie = NULL; 1885219089Spjd 1886219089Spjd while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 1887219089Spjd list_move_tail(&itxg->itxg_itxs->i_sync_list, 1888219089Spjd &ian->ia_list); 1889219089Spjd list_destroy(&ian->ia_list); 1890219089Spjd kmem_free(ian, sizeof (itx_async_node_t)); 1891219089Spjd } 1892219089Spjd } 1893219089Spjd mutex_exit(&itxg->itxg_lock); 1894219089Spjd } 1895219089Spjd} 1896219089Spjd 1897325132Savg/* 1898325132Savg * This function will prune commit itxs that are at the head of the 1899325132Savg * commit list (it won't prune past the first non-commit itx), and 1900325132Savg * either: a) attach them to the last lwb that's still pending 1901325132Savg * completion, or b) skip them altogether. 1902325132Savg * 1903325132Savg * This is used as a performance optimization to prevent commit itxs 1904325132Savg * from generating new lwbs when it's unnecessary to do so. 
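 * For example, a burst of back-to-back zil_commit() calls with no
 * intervening writes produces a run of TX_COMMIT itxs at the head of
 * the commit list.  Rather than opening a new lwb just to carry those
 * waiters, each one is either attached to zl_last_lwb_opened (if that
 * lwb is still in flight) or skipped and marked done outright (if
 * every previously opened lwb has already completed).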
1905325132Savg */ 1906219089Spjdstatic void 1907325132Savgzil_prune_commit_list(zilog_t *zilog) 1908219089Spjd{ 1909219089Spjd itx_t *itx; 1910168404Spjd 1911329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1912168404Spjd 1913325132Savg while (itx = list_head(&zilog->zl_itx_commit_list)) { 1914325132Savg lr_t *lrc = &itx->itx_lr; 1915325132Savg if (lrc->lrc_txtype != TX_COMMIT) 1916325132Savg break; 1917219089Spjd 1918325132Savg mutex_enter(&zilog->zl_lock); 1919219089Spjd 1920325132Savg lwb_t *last_lwb = zilog->zl_last_lwb_opened; 1921325132Savg if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) { 1922325132Savg /* 1923325132Savg * All of the itxs this waiter was waiting on 1924325132Savg * must have already completed (or there were 1925325132Savg * never any itx's for it to wait on), so it's 1926325132Savg * safe to skip this waiter and mark it done. 1927325132Savg */ 1928325132Savg zil_commit_waiter_skip(itx->itx_private); 1929325132Savg } else { 1930325132Savg zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); 1931325132Savg itx->itx_private = NULL; 1932325132Savg } 1933325132Savg 1934325132Savg mutex_exit(&zilog->zl_lock); 1935325132Savg 1936325132Savg list_remove(&zilog->zl_itx_commit_list, itx); 1937325132Savg zil_itx_destroy(itx); 1938325132Savg } 1939325132Savg 1940325132Savg IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); 1941325132Savg} 1942325132Savg 1943325132Savgstatic void 1944325132Savgzil_commit_writer_stall(zilog_t *zilog) 1945325132Savg{ 1946219089Spjd /* 1947325132Savg * When zio_alloc_zil() fails to allocate the next lwb block on 1948325132Savg * disk, we must call txg_wait_synced() to ensure all of the 1949325132Savg * lwbs in the zilog's zl_lwb_list are synced and then freed (in 1950325132Savg * zil_sync()), such that any subsequent ZIL writer (i.e. a call 1951325132Savg * to zil_process_commit_list()) will have to call zil_create(), 1952325132Savg * and start a new ZIL chain. 1953325132Savg * 1954325132Savg * Since zil_alloc_zil() failed, the lwb that was previously 1955325132Savg * issued does not have a pointer to the "next" lwb on disk. 1956325132Savg * Thus, if another ZIL writer thread was to allocate the "next" 1957325132Savg * on-disk lwb, that block could be leaked in the event of a 1958325132Savg * crash (because the previous lwb on-disk would not point to 1959325132Savg * it). 1960325132Savg * 1961329485Smav * We must hold the zilog's zl_issuer_lock while we do this, to 1962325132Savg * ensure no new threads enter zil_process_commit_list() until 1963325132Savg * all lwb's in the zl_lwb_list have been synced and freed 1964325132Savg * (which is achieved via the txg_wait_synced() call). 1965325132Savg */ 1966329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1967325132Savg txg_wait_synced(zilog->zl_dmu_pool, 0); 1968325132Savg ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); 1969325132Savg} 1970325132Savg 1971325132Savg/* 1972325132Savg * This function will traverse the commit list, creating new lwbs as 1973325132Savg * needed, and committing the itxs from the commit list to these newly 1974325132Savg * created lwbs. Additionally, as a new lwb is created, the previous 1975325132Savg * lwb will be issued to the zio layer to be written to disk. 
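 * In outline, the code below takes the tail of zl_lwb_list (calling
 * zil_create() if the list is empty), walks the itxs on
 * zl_itx_commit_list in order, and passes each one to zil_lwb_commit()
 * whenever its txg has not yet synced, it is a TX_COMMIT itx, or the
 * pool is frozen.  If log block allocation failed along the way
 * (leaving lwb NULL), the writer is stalled via
 * zil_commit_writer_stall() and any "nolwb" waiters are skipped;
 * otherwise the final lwb is deliberately left OPENED, to be issued
 * later either by a subsequent call here or by zil_commit_waiter().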
1976325132Savg */ 1977325132Savgstatic void 1978325132Savgzil_process_commit_list(zilog_t *zilog) 1979325132Savg{ 1980325132Savg spa_t *spa = zilog->zl_spa; 1981325132Savg list_t nolwb_waiters; 1982325132Savg lwb_t *lwb; 1983325132Savg itx_t *itx; 1984325132Savg 1985329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1986325132Savg 1987325132Savg /* 1988219089Spjd * Return if there's nothing to commit before we dirty the fs by 1989219089Spjd * calling zil_create(). 1990219089Spjd */ 1991325132Savg if (list_head(&zilog->zl_itx_commit_list) == NULL) 1992219089Spjd return; 1993219089Spjd 1994325132Savg list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), 1995325132Savg offsetof(zil_commit_waiter_t, zcw_node)); 1996325132Savg 1997325132Savg lwb = list_tail(&zilog->zl_lwb_list); 1998325132Savg if (lwb == NULL) { 1999325132Savg lwb = zil_create(zilog); 2000168404Spjd } else { 2001325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 2002325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE); 2003168404Spjd } 2004168404Spjd 2005219089Spjd while (itx = list_head(&zilog->zl_itx_commit_list)) { 2006325132Savg lr_t *lrc = &itx->itx_lr; 2007325132Savg uint64_t txg = lrc->lrc_txg; 2008325132Savg 2009310515Savg ASSERT3U(txg, !=, 0); 2010168404Spjd 2011325132Savg if (lrc->lrc_txtype == TX_COMMIT) { 2012325132Savg DTRACE_PROBE2(zil__process__commit__itx, 2013325132Savg zilog_t *, zilog, itx_t *, itx); 2014325132Savg } else { 2015325132Savg DTRACE_PROBE2(zil__process__normal__itx, 2016325132Savg zilog_t *, zilog, itx_t *, itx); 2017325132Savg } 2018325132Savg 2019325132Savg boolean_t synced = txg <= spa_last_synced_txg(spa); 2020325132Savg boolean_t frozen = txg > spa_freeze_txg(spa); 2021325132Savg 2022329486Smav /* 2023329486Smav * If the txg of this itx has already been synced out, then 2024329486Smav * we don't need to commit this itx to an lwb. This is 2025329486Smav * because the data of this itx will have already been 2026329486Smav * written to the main pool. This is inherently racy, and 2027329486Smav * it's still ok to commit an itx whose txg has already 2028329486Smav * been synced; this will result in a write that's 2029329486Smav * unnecessary, but will do no harm. 2030329486Smav * 2031329486Smav * With that said, we always want to commit TX_COMMIT itxs 2032329486Smav * to an lwb, regardless of whether or not that itx's txg 2033329486Smav * has been synced out. We do this to ensure any OPENED lwb 2034329486Smav * will always have at least one zil_commit_waiter_t linked 2035329486Smav * to the lwb. 2036329486Smav * 2037329486Smav * As a counter-example, if we skipped TX_COMMIT itx's 2038329486Smav * whose txg had already been synced, the following 2039329486Smav * situation could occur if we happened to be racing with 2040329486Smav * spa_sync: 2041329486Smav * 2042329486Smav * 1. we commit a non-TX_COMMIT itx to an lwb, where the 2043329486Smav * itx's txg is 10 and the last synced txg is 9. 2044329486Smav * 2. spa_sync finishes syncing out txg 10. 2045329486Smav * 3. we move to the next itx in the list, it's a TX_COMMIT 2046329486Smav * whose txg is 10, so we skip it rather than committing 2047329486Smav * it to the lwb used in (1). 2048329486Smav * 2049329486Smav * If the itx that is skipped in (3) is the last TX_COMMIT 2050329486Smav * itx in the commit list, than it's possible for the lwb 2051329486Smav * used in (1) to remain in the OPENED state indefinitely. 
2052329486Smav * 2053329486Smav * To prevent the above scenario from occuring, ensuring 2054329486Smav * that once an lwb is OPENED it will transition to ISSUED 2055329486Smav * and eventually DONE, we always commit TX_COMMIT itx's to 2056329486Smav * an lwb here, even if that itx's txg has already been 2057329486Smav * synced. 2058329486Smav * 2059329486Smav * Finally, if the pool is frozen, we _always_ commit the 2060329486Smav * itx. The point of freezing the pool is to prevent data 2061329486Smav * from being written to the main pool via spa_sync, and 2062329486Smav * instead rely solely on the ZIL to persistently store the 2063329486Smav * data; i.e. when the pool is frozen, the last synced txg 2064329486Smav * value can't be trusted. 2065329486Smav */ 2066329486Smav if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { 2067325132Savg if (lwb != NULL) { 2068325132Savg lwb = zil_lwb_commit(zilog, itx, lwb); 2069325132Savg } else if (lrc->lrc_txtype == TX_COMMIT) { 2070325132Savg ASSERT3P(lwb, ==, NULL); 2071325132Savg zil_commit_waiter_link_nolwb( 2072325132Savg itx->itx_private, &nolwb_waiters); 2073325132Savg } 2074325132Savg } 2075325132Savg 2076219089Spjd list_remove(&zilog->zl_itx_commit_list, itx); 2077325132Savg zil_itx_destroy(itx); 2078168404Spjd } 2079168404Spjd 2080325132Savg if (lwb == NULL) { 2081325132Savg /* 2082325132Savg * This indicates zio_alloc_zil() failed to allocate the 2083325132Savg * "next" lwb on-disk. When this happens, we must stall 2084325132Savg * the ZIL write pipeline; see the comment within 2085325132Savg * zil_commit_writer_stall() for more details. 2086325132Savg */ 2087325132Savg zil_commit_writer_stall(zilog); 2088168404Spjd 2089325132Savg /* 2090325132Savg * Additionally, we have to signal and mark the "nolwb" 2091325132Savg * waiters as "done" here, since without an lwb, we 2092325132Savg * can't do this via zil_lwb_flush_vdevs_done() like 2093325132Savg * normal. 2094325132Savg */ 2095325132Savg zil_commit_waiter_t *zcw; 2096325132Savg while (zcw = list_head(&nolwb_waiters)) { 2097325132Savg zil_commit_waiter_skip(zcw); 2098325132Savg list_remove(&nolwb_waiters, zcw); 2099325132Savg } 2100325132Savg } else { 2101325132Savg ASSERT(list_is_empty(&nolwb_waiters)); 2102325132Savg ASSERT3P(lwb, !=, NULL); 2103325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 2104325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE); 2105168404Spjd 2106325132Savg /* 2107325132Savg * At this point, the ZIL block pointed at by the "lwb" 2108325132Savg * variable is in one of the following states: "closed" 2109325132Savg * or "open". 2110325132Savg * 2111325132Savg * If its "closed", then no itxs have been committed to 2112325132Savg * it, so there's no point in issuing its zio (i.e. 2113325132Savg * it's "empty"). 2114325132Savg * 2115325132Savg * If its "open" state, then it contains one or more 2116325132Savg * itxs that eventually need to be committed to stable 2117325132Savg * storage. In this case we intentionally do not issue 2118325132Savg * the lwb's zio to disk yet, and instead rely on one of 2119325132Savg * the following two mechanisms for issuing the zio: 2120325132Savg * 2121325132Savg * 1. Ideally, there will be more ZIL activity occuring 2122325132Savg * on the system, such that this function will be 2123325132Savg * immediately called again (not necessarily by the same 2124325132Savg * thread) and this lwb's zio will be issued via 2125325132Savg * zil_lwb_commit(). 
This way, the lwb is guaranteed to 2126325132Savg * be "full" when it is issued to disk, and we'll make 2127325132Savg * use of the lwb's size the best we can. 2128325132Savg * 2129325132Savg * 2. If there isn't sufficient ZIL activity occuring on 2130325132Savg * the system, such that this lwb's zio isn't issued via 2131325132Savg * zil_lwb_commit(), zil_commit_waiter() will issue the 2132325132Savg * lwb's zio. If this occurs, the lwb is not guaranteed 2133325132Savg * to be "full" by the time its zio is issued, and means 2134325132Savg * the size of the lwb was "too large" given the amount 2135325132Savg * of ZIL activity occuring on the system at that time. 2136325132Savg * 2137325132Savg * We do this for a couple of reasons: 2138325132Savg * 2139325132Savg * 1. To try and reduce the number of IOPs needed to 2140325132Savg * write the same number of itxs. If an lwb has space 2141325132Savg * available in it's buffer for more itxs, and more itxs 2142325132Savg * will be committed relatively soon (relative to the 2143325132Savg * latency of performing a write), then it's beneficial 2144325132Savg * to wait for these "next" itxs. This way, more itxs 2145325132Savg * can be committed to stable storage with fewer writes. 2146325132Savg * 2147325132Savg * 2. To try and use the largest lwb block size that the 2148325132Savg * incoming rate of itxs can support. Again, this is to 2149325132Savg * try and pack as many itxs into as few lwbs as 2150325132Savg * possible, without significantly impacting the latency 2151325132Savg * of each individual itx. 2152325132Savg */ 2153325132Savg } 2154325132Savg} 2155325132Savg 2156325132Savg/* 2157325132Savg * This function is responsible for ensuring the passed in commit waiter 2158325132Savg * (and associated commit itx) is committed to an lwb. If the waiter is 2159325132Savg * not already committed to an lwb, all itxs in the zilog's queue of 2160325132Savg * itxs will be processed. The assumption is the passed in waiter's 2161325132Savg * commit itx will found in the queue just like the other non-commit 2162325132Savg * itxs, such that when the entire queue is processed, the waiter will 2163325132Savg * have been commited to an lwb. 2164325132Savg * 2165325132Savg * The lwb associated with the passed in waiter is not guaranteed to 2166325132Savg * have been issued by the time this function completes. If the lwb is 2167325132Savg * not issued, we rely on future calls to zil_commit_writer() to issue 2168325132Savg * the lwb, or the timeout mechanism found in zil_commit_waiter(). 2169325132Savg */ 2170325132Savgstatic void 2171325132Savgzil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) 2172325132Savg{ 2173325132Savg ASSERT(!MUTEX_HELD(&zilog->zl_lock)); 2174325132Savg ASSERT(spa_writeable(zilog->zl_spa)); 2175325132Savg 2176329485Smav mutex_enter(&zilog->zl_issuer_lock); 2177325132Savg 2178325132Savg if (zcw->zcw_lwb != NULL || zcw->zcw_done) { 2179325132Savg /* 2180325132Savg * It's possible that, while we were waiting to acquire 2181329485Smav * the "zl_issuer_lock", another thread committed this 2182325132Savg * waiter to an lwb. If that occurs, we bail out early, 2183325132Savg * without processing any of the zilog's queue of itxs. 2184325132Savg * 2185325132Savg * On certain workloads and system configurations, the 2186329485Smav * "zl_issuer_lock" can become highly contended. In an 2187325132Savg * attempt to reduce this contention, we immediately drop 2188325132Savg * the lock if the waiter has already been processed. 
2189325132Savg * 2190325132Savg * We've measured this optimization to reduce CPU spent 2191325132Savg * contending on this lock by up to 5%, using a system 2192325132Savg * with 32 CPUs, low latency storage (~50 usec writes), 2193325132Savg * and 1024 threads performing sync writes. 2194325132Savg */ 2195325132Savg goto out; 2196325132Savg } 2197325132Savg 2198325132Savg zil_get_commit_list(zilog); 2199325132Savg zil_prune_commit_list(zilog); 2200325132Savg zil_process_commit_list(zilog); 2201325132Savg 2202325132Savgout: 2203329485Smav mutex_exit(&zilog->zl_issuer_lock); 2204325132Savg} 2205325132Savg 2206325132Savgstatic void 2207325132Savgzil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) 2208325132Savg{ 2209329485Smav ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); 2210325132Savg ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2211325132Savg ASSERT3B(zcw->zcw_done, ==, B_FALSE); 2212325132Savg 2213325132Savg lwb_t *lwb = zcw->zcw_lwb; 2214325132Savg ASSERT3P(lwb, !=, NULL); 2215325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); 2216325132Savg 2217168404Spjd /* 2218325132Savg * If the lwb has already been issued by another thread, we can 2219325132Savg * immediately return since there's no work to be done (the 2220325132Savg * point of this function is to issue the lwb). Additionally, we 2221329485Smav * do this prior to acquiring the zl_issuer_lock, to avoid 2222325132Savg * acquiring it when it's not necessary to do so. 2223168404Spjd */ 2224325132Savg if (lwb->lwb_state == LWB_STATE_ISSUED || 2225325132Savg lwb->lwb_state == LWB_STATE_DONE) 2226325132Savg return; 2227325132Savg 2228325132Savg /* 2229325132Savg * In order to call zil_lwb_write_issue() we must hold the 2230329485Smav * zilog's "zl_issuer_lock". We can't simply acquire that lock, 2231325132Savg * since we're already holding the commit waiter's "zcw_lock", 2232325132Savg * and those two locks are aquired in the opposite order 2233325132Savg * elsewhere. 2234325132Savg */ 2235325132Savg mutex_exit(&zcw->zcw_lock); 2236329485Smav mutex_enter(&zilog->zl_issuer_lock); 2237325132Savg mutex_enter(&zcw->zcw_lock); 2238325132Savg 2239325132Savg /* 2240325132Savg * Since we just dropped and re-acquired the commit waiter's 2241325132Savg * lock, we have to re-check to see if the waiter was marked 2242325132Savg * "done" during that process. If the waiter was marked "done", 2243325132Savg * the "lwb" pointer is no longer valid (it can be free'd after 2244325132Savg * the waiter is marked "done"), so without this check we could 2245325132Savg * wind up with a use-after-free error below. 2246325132Savg */ 2247325132Savg if (zcw->zcw_done) 2248325132Savg goto out; 2249325132Savg 2250325132Savg ASSERT3P(lwb, ==, zcw->zcw_lwb); 2251325132Savg 2252325132Savg /* 2253329486Smav * We've already checked this above, but since we hadn't acquired 2254329486Smav * the zilog's zl_issuer_lock, we have to perform this check a 2255329486Smav * second time while holding the lock. 2256329486Smav * 2257329486Smav * We don't need to hold the zl_lock since the lwb cannot transition 2258329486Smav * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb 2259329486Smav * _can_ transition from ISSUED to DONE, but it's OK to race with 2260329486Smav * that transition since we treat the lwb the same, whether it's in 2261329486Smav * the ISSUED or DONE states. 
2262329486Smav * 2263329486Smav * The important thing, is we treat the lwb differently depending on 2264329486Smav * if it's ISSUED or OPENED, and block any other threads that might 2265329486Smav * attempt to issue this lwb. For that reason we hold the 2266329486Smav * zl_issuer_lock when checking the lwb_state; we must not call 2267325132Savg * zil_lwb_write_issue() if the lwb had already been issued. 2268329486Smav * 2269329486Smav * See the comment above the lwb_state_t structure definition for 2270329486Smav * more details on the lwb states, and locking requirements. 2271325132Savg */ 2272325132Savg if (lwb->lwb_state == LWB_STATE_ISSUED || 2273325132Savg lwb->lwb_state == LWB_STATE_DONE) 2274325132Savg goto out; 2275325132Savg 2276325132Savg ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 2277325132Savg 2278325132Savg /* 2279325132Savg * As described in the comments above zil_commit_waiter() and 2280325132Savg * zil_process_commit_list(), we need to issue this lwb's zio 2281325132Savg * since we've reached the commit waiter's timeout and it still 2282325132Savg * hasn't been issued. 2283325132Savg */ 2284325132Savg lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); 2285325132Savg 2286339134Smav IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); 2287325132Savg 2288325132Savg /* 2289325132Savg * Since the lwb's zio hadn't been issued by the time this thread 2290325132Savg * reached its timeout, we reset the zilog's "zl_cur_used" field 2291325132Savg * to influence the zil block size selection algorithm. 2292325132Savg * 2293325132Savg * By having to issue the lwb's zio here, it means the size of the 2294325132Savg * lwb was too large, given the incoming throughput of itxs. By 2295325132Savg * setting "zl_cur_used" to zero, we communicate this fact to the 2296325132Savg * block size selection algorithm, so it can take this informaiton 2297325132Savg * into account, and potentially select a smaller size for the 2298325132Savg * next lwb block that is allocated. 2299325132Savg */ 2300325132Savg zilog->zl_cur_used = 0; 2301325132Savg 2302325132Savg if (nlwb == NULL) { 2303325132Savg /* 2304325132Savg * When zil_lwb_write_issue() returns NULL, this 2305325132Savg * indicates zio_alloc_zil() failed to allocate the 2306325132Savg * "next" lwb on-disk. When this occurs, the ZIL write 2307325132Savg * pipeline must be stalled; see the comment within the 2308325132Savg * zil_commit_writer_stall() function for more details. 2309325132Savg * 2310325132Savg * We must drop the commit waiter's lock prior to 2311325132Savg * calling zil_commit_writer_stall() or else we can wind 2312325132Savg * up with the following deadlock: 2313325132Savg * 2314325132Savg * - This thread is waiting for the txg to sync while 2315325132Savg * holding the waiter's lock; txg_wait_synced() is 2316325132Savg * used within txg_commit_writer_stall(). 2317325132Savg * 2318325132Savg * - The txg can't sync because it is waiting for this 2319325132Savg * lwb's zio callback to call dmu_tx_commit(). 
2320325132Savg * 2321325132Savg * - The lwb's zio callback can't call dmu_tx_commit() 2322325132Savg * because it's blocked trying to acquire the waiter's 2323325132Savg * lock, which occurs prior to calling dmu_tx_commit() 2324325132Savg */ 2325325132Savg mutex_exit(&zcw->zcw_lock); 2326325132Savg zil_commit_writer_stall(zilog); 2327325132Savg mutex_enter(&zcw->zcw_lock); 2328168404Spjd } 2329168404Spjd 2330325132Savgout: 2331329485Smav mutex_exit(&zilog->zl_issuer_lock); 2332325132Savg ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2333325132Savg} 2334168404Spjd 2335325132Savg/* 2336325132Savg * This function is responsible for performing the following two tasks: 2337325132Savg * 2338325132Savg * 1. its primary responsibility is to block until the given "commit 2339325132Savg * waiter" is considered "done". 2340325132Savg * 2341325132Savg * 2. its secondary responsibility is to issue the zio for the lwb that 2342325132Savg * the given "commit waiter" is waiting on, if this function has 2343325132Savg * waited "long enough" and the lwb is still in the "open" state. 2344325132Savg * 2345325132Savg * Given a sufficient amount of itxs being generated and written using 2346325132Savg * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() 2347325132Savg * function. If this does not occur, this secondary responsibility will 2348325132Savg * ensure the lwb is issued even if there is not other synchronous 2349325132Savg * activity on the system. 2350325132Savg * 2351325132Savg * For more details, see zil_process_commit_list(); more specifically, 2352325132Savg * the comment at the bottom of that function. 2353325132Savg */ 2354325132Savgstatic void 2355325132Savgzil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) 2356325132Savg{ 2357325132Savg ASSERT(!MUTEX_HELD(&zilog->zl_lock)); 2358329485Smav ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); 2359325132Savg ASSERT(spa_writeable(zilog->zl_spa)); 2360168404Spjd 2361325132Savg mutex_enter(&zcw->zcw_lock); 2362325132Savg 2363219089Spjd /* 2364325132Savg * The timeout is scaled based on the lwb latency to avoid 2365325132Savg * significantly impacting the latency of each individual itx. 2366325132Savg * For more details, see the comment at the bottom of the 2367325132Savg * zil_process_commit_list() function. 2368219089Spjd */ 2369325132Savg int pct = MAX(zfs_commit_timeout_pct, 1); 2370325132Savg#if defined(illumos) || !defined(_KERNEL) 2371325132Savg hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; 2372325132Savg hrtime_t wakeup = gethrtime() + sleep; 2373325132Savg#else 2374325132Savg sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100); 2375325132Savg sbintime_t wakeup = getsbinuptime() + sleep; 2376325132Savg#endif 2377325132Savg boolean_t timedout = B_FALSE; 2378325132Savg 2379325132Savg while (!zcw->zcw_done) { 2380325132Savg ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2381325132Savg 2382325132Savg lwb_t *lwb = zcw->zcw_lwb; 2383325132Savg 2384325132Savg /* 2385325132Savg * Usually, the waiter will have a non-NULL lwb field here, 2386325132Savg * but it's possible for it to be NULL as a result of 2387325132Savg * zil_commit() racing with spa_sync(). 2388325132Savg * 2389325132Savg * When zil_clean() is called, it's possible for the itxg 2390325132Savg * list (which may be cleaned via a taskq) to contain 2391325132Savg * commit itxs. When this occurs, the commit waiters linked 2392325132Savg * off of these commit itxs will not be committed to an 2393325132Savg * lwb. 
Additionally, these commit waiters will not be 2394325132Savg * marked done until zil_commit_waiter_skip() is called via 2395325132Savg * zil_itxg_clean(). 2396325132Savg * 2397325132Savg * Thus, it's possible for this commit waiter (i.e. the 2398325132Savg * "zcw" variable) to be found in this "in between" state; 2399325132Savg * where it's "zcw_lwb" field is NULL, and it hasn't yet 2400325132Savg * been skipped, so it's "zcw_done" field is still B_FALSE. 2401325132Savg */ 2402325132Savg IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); 2403325132Savg 2404325132Savg if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { 2405325132Savg ASSERT3B(timedout, ==, B_FALSE); 2406325132Savg 2407325132Savg /* 2408325132Savg * If the lwb hasn't been issued yet, then we 2409325132Savg * need to wait with a timeout, in case this 2410325132Savg * function needs to issue the lwb after the 2411325132Savg * timeout is reached; responsibility (2) from 2412325132Savg * the comment above this function. 2413325132Savg */ 2414325132Savg#if defined(illumos) || !defined(_KERNEL) 2415325132Savg clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, 2416325132Savg &zcw->zcw_lock, wakeup, USEC2NSEC(1), 2417325132Savg CALLOUT_FLAG_ABSOLUTE); 2418325132Savg 2419325132Savg if (timeleft >= 0 || zcw->zcw_done) 2420325132Savg continue; 2421325132Savg#else 2422325132Savg int wait_err = cv_timedwait_sbt(&zcw->zcw_cv, 2423325132Savg &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE); 2424325132Savg if (wait_err != EWOULDBLOCK || zcw->zcw_done) 2425325132Savg continue; 2426325132Savg#endif 2427325132Savg 2428325132Savg timedout = B_TRUE; 2429325132Savg zil_commit_waiter_timeout(zilog, zcw); 2430325132Savg 2431325132Savg if (!zcw->zcw_done) { 2432325132Savg /* 2433325132Savg * If the commit waiter has already been 2434325132Savg * marked "done", it's possible for the 2435325132Savg * waiter's lwb structure to have already 2436325132Savg * been freed. Thus, we can only reliably 2437325132Savg * make these assertions if the waiter 2438325132Savg * isn't done. 2439325132Savg */ 2440325132Savg ASSERT3P(lwb, ==, zcw->zcw_lwb); 2441325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); 2442325132Savg } 2443325132Savg } else { 2444325132Savg /* 2445325132Savg * If the lwb isn't open, then it must have already 2446325132Savg * been issued. In that case, there's no need to 2447325132Savg * use a timeout when waiting for the lwb to 2448325132Savg * complete. 2449325132Savg * 2450325132Savg * Additionally, if the lwb is NULL, the waiter 2451325132Savg * will soon be signalled and marked done via 2452325132Savg * zil_clean() and zil_itxg_clean(), so no timeout 2453325132Savg * is required. 
2454325132Savg */ 2455325132Savg 2456325132Savg IMPLY(lwb != NULL, 2457325132Savg lwb->lwb_state == LWB_STATE_ISSUED || 2458325132Savg lwb->lwb_state == LWB_STATE_DONE); 2459325132Savg cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); 2460325132Savg } 2461325132Savg } 2462325132Savg 2463325132Savg mutex_exit(&zcw->zcw_lock); 2464168404Spjd} 2465168404Spjd 2466325132Savgstatic zil_commit_waiter_t * 2467325132Savgzil_alloc_commit_waiter() 2468325132Savg{ 2469325132Savg zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); 2470325132Savg 2471325132Savg cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); 2472325132Savg mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); 2473325132Savg list_link_init(&zcw->zcw_node); 2474325132Savg zcw->zcw_lwb = NULL; 2475325132Savg zcw->zcw_done = B_FALSE; 2476325132Savg zcw->zcw_zio_error = 0; 2477325132Savg 2478325132Savg return (zcw); 2479325132Savg} 2480325132Savg 2481325132Savgstatic void 2482325132Savgzil_free_commit_waiter(zil_commit_waiter_t *zcw) 2483325132Savg{ 2484325132Savg ASSERT(!list_link_active(&zcw->zcw_node)); 2485325132Savg ASSERT3P(zcw->zcw_lwb, ==, NULL); 2486325132Savg ASSERT3B(zcw->zcw_done, ==, B_TRUE); 2487325132Savg mutex_destroy(&zcw->zcw_lock); 2488325132Savg cv_destroy(&zcw->zcw_cv); 2489325132Savg kmem_cache_free(zil_zcw_cache, zcw); 2490325132Savg} 2491325132Savg 2492168404Spjd/* 2493325132Savg * This function is used to create a TX_COMMIT itx and assign it. This 2494325132Savg * way, it will be linked into the ZIL's list of synchronous itxs, and 2495325132Savg * then later committed to an lwb (or skipped) when 2496325132Savg * zil_process_commit_list() is called. 2497325132Savg */ 2498325132Savgstatic void 2499325132Savgzil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) 2500325132Savg{ 2501325132Savg dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); 2502325132Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 2503325132Savg 2504325132Savg itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); 2505325132Savg itx->itx_sync = B_TRUE; 2506325132Savg itx->itx_private = zcw; 2507325132Savg 2508325132Savg zil_itx_assign(zilog, itx, tx); 2509325132Savg 2510325132Savg dmu_tx_commit(tx); 2511325132Savg} 2512325132Savg 2513325132Savg/* 2514325132Savg * Commit ZFS Intent Log transactions (itxs) to stable storage. 2515219089Spjd * 2516325132Savg * When writing ZIL transactions to the on-disk representation of the 2517325132Savg * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple 2518325132Savg * itxs can be committed to a single lwb. Once a lwb is written and 2519325132Savg * committed to stable storage (i.e. the lwb is written, and vdevs have 2520325132Savg * been flushed), each itx that was committed to that lwb is also 2521325132Savg * considered to be committed to stable storage. 2522219089Spjd * 2523325132Savg * When an itx is committed to an lwb, the log record (lr_t) contained 2524325132Savg * by the itx is copied into the lwb's zio buffer, and once this buffer 2525325132Savg * is written to disk, it becomes an on-disk ZIL block. 2526219089Spjd * 2527325132Savg * As itxs are generated, they're inserted into the ZIL's queue of 2528325132Savg * uncommitted itxs. The semantics of zil_commit() are such that it will 2529325132Savg * block until all itxs that were in the queue when it was called, are 2530325132Savg * committed to stable storage. 
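 * A typical consumer therefore follows a pattern along these lines
 * (purely illustrative; the real callers, such as the fsync path,
 * live outside this file):
 *
 *	itx = zil_itx_create(txtype, lrsize);
 *	... fill in the itx's log record ...
 *	zil_itx_assign(zilog, itx, tx);
 *	...
 *	zil_commit(zilog, foid);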
2531219089Spjd * 2532325132Savg * If "foid" is zero, this means all "synchronous" and "asynchronous" 2533325132Savg * itxs, for all objects in the dataset, will be committed to stable 2534325132Savg * storage prior to zil_commit() returning. If "foid" is non-zero, all 2535325132Savg * "synchronous" itxs for all objects, but only "asynchronous" itxs 2536325132Savg * that correspond to the foid passed in, will be committed to stable 2537325132Savg * storage prior to zil_commit() returning. 2538325132Savg * 2539325132Savg * Generally speaking, when zil_commit() is called, the consumer doesn't 2540325132Savg * actually care about _all_ of the uncommitted itxs. Instead, they're 2541325132Savg * simply trying to waiting for a specific itx to be committed to disk, 2542325132Savg * but the interface(s) for interacting with the ZIL don't allow such 2543325132Savg * fine-grained communication. A better interface would allow a consumer 2544325132Savg * to create and assign an itx, and then pass a reference to this itx to 2545325132Savg * zil_commit(); such that zil_commit() would return as soon as that 2546325132Savg * specific itx was committed to disk (instead of waiting for _all_ 2547325132Savg * itxs to be committed). 2548325132Savg * 2549325132Savg * When a thread calls zil_commit() a special "commit itx" will be 2550325132Savg * generated, along with a corresponding "waiter" for this commit itx. 2551325132Savg * zil_commit() will wait on this waiter's CV, such that when the waiter 2552325132Savg * is marked done, and signalled, zil_commit() will return. 2553325132Savg * 2554325132Savg * This commit itx is inserted into the queue of uncommitted itxs. This 2555325132Savg * provides an easy mechanism for determining which itxs were in the 2556325132Savg * queue prior to zil_commit() having been called, and which itxs were 2557325132Savg * added after zil_commit() was called. 2558325132Savg * 2559325132Savg * The commit it is special; it doesn't have any on-disk representation. 2560325132Savg * When a commit itx is "committed" to an lwb, the waiter associated 2561325132Savg * with it is linked onto the lwb's list of waiters. Then, when that lwb 2562325132Savg * completes, each waiter on the lwb's list is marked done and signalled 2563325132Savg * -- allowing the thread waiting on the waiter to return from zil_commit(). 2564325132Savg * 2565325132Savg * It's important to point out a few critical factors that allow us 2566325132Savg * to make use of the commit itxs, commit waiters, per-lwb lists of 2567325132Savg * commit waiters, and zio completion callbacks like we're doing: 2568325132Savg * 2569325132Savg * 1. The list of waiters for each lwb is traversed, and each commit 2570325132Savg * waiter is marked "done" and signalled, in the zio completion 2571325132Savg * callback of the lwb's zio[*]. 2572325132Savg * 2573325132Savg * * Actually, the waiters are signalled in the zio completion 2574325132Savg * callback of the root zio for the DKIOCFLUSHWRITECACHE commands 2575325132Savg * that are sent to the vdevs upon completion of the lwb zio. 2576325132Savg * 2577325132Savg * 2. When the itxs are inserted into the ZIL's queue of uncommitted 2578325132Savg * itxs, the order in which they are inserted is preserved[*]; as 2579325132Savg * itxs are added to the queue, they are added to the tail of 2580325132Savg * in-memory linked lists. 
2581325132Savg * 2582325132Savg * When committing the itxs to lwbs (to be written to disk), they 2583325132Savg * are committed in the same order in which the itxs were added to 2584325132Savg * the uncommitted queue's linked list(s); i.e. the linked list of 2585325132Savg * itxs to commit is traversed from head to tail, and each itx is 2586325132Savg * committed to an lwb in that order. 2587325132Savg * 2588325132Savg * * To clarify: 2589325132Savg * 2590325132Savg * - the order of "sync" itxs is preserved w.r.t. other 2591325132Savg * "sync" itxs, regardless of the corresponding objects. 2592325132Savg * - the order of "async" itxs is preserved w.r.t. other 2593325132Savg * "async" itxs corresponding to the same object. 2594325132Savg * - the order of "async" itxs is *not* preserved w.r.t. other 2595325132Savg * "async" itxs corresponding to different objects. 2596325132Savg * - the order of "sync" itxs w.r.t. "async" itxs (or vice 2597325132Savg * versa) is *not* preserved, even for itxs that correspond 2598325132Savg * to the same object. 2599325132Savg * 2600325132Savg * For more details, see: zil_itx_assign(), zil_async_to_sync(), 2601325132Savg * zil_get_commit_list(), and zil_process_commit_list(). 2602325132Savg * 2603325132Savg * 3. The lwbs represent a linked list of blocks on disk. Thus, any 2604325132Savg * lwb cannot be considered committed to stable storage, until its 2605325132Savg * "previous" lwb is also committed to stable storage. This fact, 2606325132Savg * coupled with the fact described above, means that itxs are 2607325132Savg * committed in (roughly) the order in which they were generated. 2608325132Savg * This is essential because itxs are dependent on prior itxs. 2609325132Savg * Thus, we *must not* deem an itx as being committed to stable 2610325132Savg * storage, until *all* prior itxs have also been committed to 2611325132Savg * stable storage. 2612325132Savg * 2613325132Savg * To enforce this ordering of lwb zio's, while still leveraging as 2614325132Savg * much of the underlying storage performance as possible, we rely 2615325132Savg * on two fundamental concepts: 2616325132Savg * 2617325132Savg * 1. The creation and issuance of lwb zio's is protected by 2618329485Smav * the zilog's "zl_issuer_lock", which ensures only a single 2619325132Savg * thread is creating and/or issuing lwb's at a time 2620325132Savg * 2. The "previous" lwb is a child of the "current" lwb 2621325132Savg * (leveraging the zio parent-child depenency graph) 2622325132Savg * 2623325132Savg * By relying on this parent-child zio relationship, we can have 2624325132Savg * many lwb zio's concurrently issued to the underlying storage, 2625325132Savg * but the order in which they complete will be the same order in 2626325132Savg * which they were created. 2627168404Spjd */ 2628168404Spjdvoid 2629219089Spjdzil_commit(zilog_t *zilog, uint64_t foid) 2630168404Spjd{ 2631325132Savg /* 2632325132Savg * We should never attempt to call zil_commit on a snapshot for 2633325132Savg * a couple of reasons: 2634325132Savg * 2635325132Savg * 1. A snapshot may never be modified, thus it cannot have any 2636325132Savg * in-flight itxs that would have modified the dataset. 2637325132Savg * 2638325132Savg * 2. By design, when zil_commit() is called, a commit itx will 2639325132Savg * be assigned to this zilog; as a result, the zilog will be 2640325132Savg * dirtied. 
2641325132Savg	 * checks in the code that enforce this invariant, and will
2642325132Savg	 * cause a panic if it's not upheld.
2643325132Savg	 */
2644325132Savg	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
2645219089Spjd
2646219089Spjd	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
2647168404Spjd		return;
2648168404Spjd
2649325132Savg	if (!spa_writeable(zilog->zl_spa)) {
2650325132Savg		/*
2651325132Savg		 * If the SPA is not writable, there should never be any
2652325132Savg		 * pending itxs waiting to be committed to disk. If that
2653325132Savg		 * weren't true, we'd skip writing those itxs out, and
2654325132Savg		 * would break the semantics of zil_commit(); thus, we're
2655325132Savg		 * verifying that truth before we return to the caller.
2656325132Savg		 */
2657325132Savg		ASSERT(list_is_empty(&zilog->zl_lwb_list));
2658325132Savg		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
2659325132Savg		for (int i = 0; i < TXG_SIZE; i++)
2660325132Savg			ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
2661325132Savg		return;
2662325132Savg	}
2663219089Spjd
2664325132Savg	/*
2665325132Savg	 * If the ZIL is suspended, we don't want to dirty it by calling
2666325132Savg	 * zil_commit_itx_assign() below, nor can we write out
2667325132Savg	 * lwbs as would be done in zil_commit_write(). Thus, we
2668325132Savg	 * simply rely on txg_wait_synced() to maintain the necessary
2669325132Savg	 * semantics, and avoid calling those functions altogether.
2670325132Savg	 */
2671325132Savg	if (zilog->zl_suspend > 0) {
2672325132Savg		txg_wait_synced(zilog->zl_dmu_pool, 0);
2673325132Savg		return;
2674168404Spjd	}
2675219089Spjd
2676329486Smav	zil_commit_impl(zilog, foid);
2677329486Smav}
2678329486Smav
2679329486Smavvoid
2680329486Smavzil_commit_impl(zilog_t *zilog, uint64_t foid)
2681329486Smav{
2682325132Savg	/*
2683325132Savg	 * Move the "async" itxs for the specified foid to the "sync"
2684325132Savg	 * queues, such that they will be later committed (or skipped)
2685325132Savg	 * to an lwb when zil_process_commit_list() is called.
2686325132Savg	 *
2687325132Savg	 * Since these "async" itxs must be committed prior to this
2688325132Savg	 * call to zil_commit returning, we must perform this operation
2689325132Savg	 * before we call zil_commit_itx_assign().
2690325132Savg	 */
2691325132Savg	zil_async_to_sync(zilog, foid);
2692219089Spjd
2693325132Savg	/*
2694325132Savg	 * We allocate a new "waiter" structure which will initially be
2695325132Savg	 * linked to the commit itx using the itx's "itx_private" field.
2696325132Savg	 * Since the commit itx doesn't represent any on-disk state,
2697325132Savg	 * when it's committed to an lwb, rather than copying its
2698325132Savg	 * lr_t into the lwb's buffer, the commit itx's "waiter" will be
2699325132Savg	 * added to the lwb's list of waiters. Then, when the lwb is
2700325132Savg	 * committed to stable storage, each waiter in the lwb's list of
2701325132Savg	 * waiters will be marked "done", and signalled.
2702325132Savg	 *
2703325132Savg	 * We must create the waiter and assign the commit itx prior to
2704325132Savg	 * calling zil_commit_writer(), or else our specific commit itx
2705325132Savg	 * is not guaranteed to be committed to an lwb prior to calling
2706325132Savg	 * zil_commit_waiter().
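	 *
	 * As a rough, illustrative sketch, the sequence performed below
	 * is (the parenthetical notes are approximations of each call's
	 * role, not part of the implementation):
	 *
	 *	zcw = zil_alloc_commit_waiter();
	 *	zil_commit_itx_assign(zilog, zcw);	(link zcw to a commit itx)
	 *	zil_commit_writer(zilog, zcw);		(issue lwbs as needed)
	 *	zil_commit_waiter(zilog, zcw);		(block until zcw is done)
	 *	if (zcw->zcw_zio_error != 0)
	 *		txg_wait_synced(zilog->zl_dmu_pool, 0);
	 *	zil_free_commit_waiter(zcw);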
2707325132Savg	 */
2708325132Savg	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
2709325132Savg	zil_commit_itx_assign(zilog, zcw);
2710219089Spjd
2711325132Savg	zil_commit_writer(zilog, zcw);
2712325132Savg	zil_commit_waiter(zilog, zcw);
2713325132Savg
2714325132Savg	if (zcw->zcw_zio_error != 0) {
2715325132Savg		/*
2716325132Savg		 * If there was an error writing out the ZIL blocks that
2717325132Savg		 * this thread is waiting on, then we fall back to
2718325132Savg		 * relying on spa_sync() to write out the data this
2719325132Savg		 * thread is waiting on. Obviously this has performance
2720325132Savg		 * implications, but the expectation is that this will be
2721325132Savg		 * an exceptional case and shouldn't occur often.
2722325132Savg		 */
2723325132Savg		DTRACE_PROBE2(zil__commit__io__error,
2724325132Savg		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
2725325132Savg		txg_wait_synced(zilog->zl_dmu_pool, 0);
2726325132Savg	}
2727325132Savg
2728325132Savg	zil_free_commit_waiter(zcw);
2729168404Spjd}
2730168404Spjd
2731168404Spjd/*
2732168404Spjd * Called in syncing context to free committed log blocks and update log header.
2733168404Spjd */
2734168404Spjdvoid
2735168404Spjdzil_sync(zilog_t *zilog, dmu_tx_t *tx)
2736168404Spjd{
2737168404Spjd	zil_header_t *zh = zil_header_in_syncing_context(zilog);
2738168404Spjd	uint64_t txg = dmu_tx_get_txg(tx);
2739168404Spjd	spa_t *spa = zilog->zl_spa;
2740219089Spjd	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
2741168404Spjd	lwb_t *lwb;
2742168404Spjd
2743209962Smm	/*
2744209962Smm	 * We don't zero out zl_destroy_txg, so make sure we don't try
2745209962Smm	 * to destroy it twice.
2746209962Smm	 */
2747209962Smm	if (spa_sync_pass(spa) != 1)
2748209962Smm		return;
2749209962Smm
2750168404Spjd	mutex_enter(&zilog->zl_lock);
2751168404Spjd
2752168404Spjd	ASSERT(zilog->zl_stop_sync == 0);
2753168404Spjd
2754219089Spjd	if (*replayed_seq != 0) {
2755219089Spjd		ASSERT(zh->zh_replay_seq < *replayed_seq);
2756219089Spjd		zh->zh_replay_seq = *replayed_seq;
2757219089Spjd		*replayed_seq = 0;
2758219089Spjd	}
2759168404Spjd
2760168404Spjd	if (zilog->zl_destroy_txg == txg) {
2761168404Spjd		blkptr_t blk = zh->zh_log;
2762168404Spjd
2763168404Spjd		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
2764168404Spjd
2765168404Spjd		bzero(zh, sizeof (zil_header_t));
2766209962Smm		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
2767168404Spjd
2768168404Spjd		if (zilog->zl_keep_first) {
2769168404Spjd			/*
2770168404Spjd			 * If this block was part of a log chain that couldn't
2771168404Spjd			 * be claimed because a device was missing during
2772168404Spjd			 * zil_claim(), but that device later returns,
2773168404Spjd			 * then this block could erroneously appear valid.
2774168404Spjd			 * To guard against this, assign a new GUID to the new
2775168404Spjd			 * log chain so it doesn't matter what blk points to.
2776168404Spjd */ 2777168404Spjd zil_init_log_chain(zilog, &blk); 2778168404Spjd zh->zh_log = blk; 2779168404Spjd } 2780168404Spjd } 2781168404Spjd 2782213197Smm while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 2783168404Spjd zh->zh_log = lwb->lwb_blk; 2784168404Spjd if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 2785168404Spjd break; 2786168404Spjd list_remove(&zilog->zl_lwb_list, lwb); 2787325132Savg zio_free(spa, txg, &lwb->lwb_blk); 2788325132Savg zil_free_lwb(zilog, lwb); 2789168404Spjd 2790168404Spjd /* 2791168404Spjd * If we don't have anything left in the lwb list then 2792168404Spjd * we've had an allocation failure and we need to zero 2793168404Spjd * out the zil_header blkptr so that we don't end 2794168404Spjd * up freeing the same block twice. 2795168404Spjd */ 2796168404Spjd if (list_head(&zilog->zl_lwb_list) == NULL) 2797168404Spjd BP_ZERO(&zh->zh_log); 2798168404Spjd } 2799168404Spjd mutex_exit(&zilog->zl_lock); 2800168404Spjd} 2801168404Spjd 2802325132Savg/* ARGSUSED */ 2803325132Savgstatic int 2804325132Savgzil_lwb_cons(void *vbuf, void *unused, int kmflag) 2805325132Savg{ 2806325132Savg lwb_t *lwb = vbuf; 2807325132Savg list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), 2808325132Savg offsetof(zil_commit_waiter_t, zcw_node)); 2809325132Savg avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, 2810325132Savg sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 2811325132Savg mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 2812325132Savg return (0); 2813325132Savg} 2814325132Savg 2815325132Savg/* ARGSUSED */ 2816325132Savgstatic void 2817325132Savgzil_lwb_dest(void *vbuf, void *unused) 2818325132Savg{ 2819325132Savg lwb_t *lwb = vbuf; 2820325132Savg mutex_destroy(&lwb->lwb_vdev_lock); 2821325132Savg avl_destroy(&lwb->lwb_vdev_tree); 2822325132Savg list_destroy(&lwb->lwb_waiters); 2823325132Savg} 2824325132Savg 2825168404Spjdvoid 2826168404Spjdzil_init(void) 2827168404Spjd{ 2828168404Spjd zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 2829325132Savg sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); 2830325132Savg 2831325132Savg zil_zcw_cache = kmem_cache_create("zil_zcw_cache", 2832325132Savg sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 2833168404Spjd} 2834168404Spjd 2835168404Spjdvoid 2836168404Spjdzil_fini(void) 2837168404Spjd{ 2838325132Savg kmem_cache_destroy(zil_zcw_cache); 2839168404Spjd kmem_cache_destroy(zil_lwb_cache); 2840168404Spjd} 2841168404Spjd 2842219089Spjdvoid 2843219089Spjdzil_set_sync(zilog_t *zilog, uint64_t sync) 2844219089Spjd{ 2845219089Spjd zilog->zl_sync = sync; 2846219089Spjd} 2847219089Spjd 2848219089Spjdvoid 2849219089Spjdzil_set_logbias(zilog_t *zilog, uint64_t logbias) 2850219089Spjd{ 2851219089Spjd zilog->zl_logbias = logbias; 2852219089Spjd} 2853219089Spjd 2854168404Spjdzilog_t * 2855168404Spjdzil_alloc(objset_t *os, zil_header_t *zh_phys) 2856168404Spjd{ 2857168404Spjd zilog_t *zilog; 2858168404Spjd 2859168404Spjd zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 2860168404Spjd 2861168404Spjd zilog->zl_header = zh_phys; 2862168404Spjd zilog->zl_os = os; 2863168404Spjd zilog->zl_spa = dmu_objset_spa(os); 2864168404Spjd zilog->zl_dmu_pool = dmu_objset_pool(os); 2865168404Spjd zilog->zl_destroy_txg = TXG_INITIAL - 1; 2866219089Spjd zilog->zl_logbias = dmu_objset_logbias(os); 2867219089Spjd zilog->zl_sync = dmu_objset_syncprop(os); 2868325132Savg zilog->zl_dirty_max_txg = 0; 2869325132Savg zilog->zl_last_lwb_opened = NULL; 2870325132Savg 
zilog->zl_last_lwb_latency = 0; 2871168404Spjd 2872168404Spjd mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 2873329485Smav mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); 2874168404Spjd 2875219089Spjd for (int i = 0; i < TXG_SIZE; i++) { 2876219089Spjd mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, 2877219089Spjd MUTEX_DEFAULT, NULL); 2878219089Spjd } 2879168404Spjd 2880168404Spjd list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 2881168404Spjd offsetof(lwb_t, lwb_node)); 2882168404Spjd 2883219089Spjd list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), 2884219089Spjd offsetof(itx_t, itx_node)); 2885219089Spjd 2886185029Spjd cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 2887185029Spjd 2888168404Spjd return (zilog); 2889168404Spjd} 2890168404Spjd 2891168404Spjdvoid 2892168404Spjdzil_free(zilog_t *zilog) 2893168404Spjd{ 2894168404Spjd zilog->zl_stop_sync = 1; 2895168404Spjd 2896248571Smm ASSERT0(zilog->zl_suspend); 2897248571Smm ASSERT0(zilog->zl_suspending); 2898248571Smm 2899224526Smm ASSERT(list_is_empty(&zilog->zl_lwb_list)); 2900168404Spjd list_destroy(&zilog->zl_lwb_list); 2901168404Spjd 2902219089Spjd ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); 2903219089Spjd list_destroy(&zilog->zl_itx_commit_list); 2904219089Spjd 2905219089Spjd for (int i = 0; i < TXG_SIZE; i++) { 2906219089Spjd /* 2907219089Spjd * It's possible for an itx to be generated that doesn't dirty 2908219089Spjd * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() 2909219089Spjd * callback to remove the entry. We remove those here. 2910219089Spjd * 2911219089Spjd * Also free up the ziltest itxs. 2912219089Spjd */ 2913219089Spjd if (zilog->zl_itxg[i].itxg_itxs) 2914219089Spjd zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); 2915219089Spjd mutex_destroy(&zilog->zl_itxg[i].itxg_lock); 2916219089Spjd } 2917219089Spjd 2918329485Smav mutex_destroy(&zilog->zl_issuer_lock); 2919168404Spjd mutex_destroy(&zilog->zl_lock); 2920168404Spjd 2921185029Spjd cv_destroy(&zilog->zl_cv_suspend); 2922185029Spjd 2923168404Spjd kmem_free(zilog, sizeof (zilog_t)); 2924168404Spjd} 2925168404Spjd 2926168404Spjd/* 2927168404Spjd * Open an intent log. 2928168404Spjd */ 2929168404Spjdzilog_t * 2930168404Spjdzil_open(objset_t *os, zil_get_data_t *get_data) 2931168404Spjd{ 2932168404Spjd zilog_t *zilog = dmu_objset_zil(os); 2933168404Spjd 2934325132Savg ASSERT3P(zilog->zl_get_data, ==, NULL); 2935325132Savg ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); 2936224526Smm ASSERT(list_is_empty(&zilog->zl_lwb_list)); 2937224526Smm 2938168404Spjd zilog->zl_get_data = get_data; 2939168404Spjd 2940168404Spjd return (zilog); 2941168404Spjd} 2942168404Spjd 2943168404Spjd/* 2944168404Spjd * Close an intent log. 
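 *
 * A minimal usage sketch, assuming the typical consumer pattern
 * (illustrative only; "my_get_data" stands in for whatever
 * zil_get_data_t callback the consumer supplies):
 *
 *	zilog_t *zilog = zil_open(os, my_get_data);
 *	... generate itxs and call zil_commit() as needed ...
 *	zil_close(zilog);
 *
 * zil_close() commits any remaining itxs, waits for the ZIL to be
 * clean, and frees the last lwb.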
2945168404Spjd */ 2946168404Spjdvoid 2947168404Spjdzil_close(zilog_t *zilog) 2948168404Spjd{ 2949224526Smm lwb_t *lwb; 2950325132Savg uint64_t txg; 2951219089Spjd 2952325132Savg if (!dmu_objset_is_snapshot(zilog->zl_os)) { 2953325132Savg zil_commit(zilog, 0); 2954325132Savg } else { 2955325132Savg ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); 2956325132Savg ASSERT0(zilog->zl_dirty_max_txg); 2957325132Savg ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); 2958325132Savg } 2959219089Spjd 2960219089Spjd mutex_enter(&zilog->zl_lock); 2961224526Smm lwb = list_tail(&zilog->zl_lwb_list); 2962325132Savg if (lwb == NULL) 2963325132Savg txg = zilog->zl_dirty_max_txg; 2964325132Savg else 2965325132Savg txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); 2966219089Spjd mutex_exit(&zilog->zl_lock); 2967325132Savg 2968325132Savg /* 2969325132Savg * We need to use txg_wait_synced() to wait long enough for the 2970325132Savg * ZIL to be clean, and to wait for all pending lwbs to be 2971325132Savg * written out. 2972325132Savg */ 2973325132Savg if (txg != 0) 2974168404Spjd txg_wait_synced(zilog->zl_dmu_pool, txg); 2975168404Spjd 2976310515Savg if (zilog_is_dirty(zilog)) 2977310515Savg zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); 2978310515Savg VERIFY(!zilog_is_dirty(zilog)); 2979310515Savg 2980168404Spjd zilog->zl_get_data = NULL; 2981224526Smm 2982224526Smm /* 2983325132Savg * We should have only one lwb left on the list; remove it now. 2984224526Smm */ 2985224526Smm mutex_enter(&zilog->zl_lock); 2986224526Smm lwb = list_head(&zilog->zl_lwb_list); 2987224526Smm if (lwb != NULL) { 2988325132Savg ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); 2989325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 2990224526Smm list_remove(&zilog->zl_lwb_list, lwb); 2991224526Smm zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 2992325132Savg zil_free_lwb(zilog, lwb); 2993224526Smm } 2994224526Smm mutex_exit(&zilog->zl_lock); 2995168404Spjd} 2996168404Spjd 2997248571Smmstatic char *suspend_tag = "zil suspending"; 2998248571Smm 2999168404Spjd/* 3000168404Spjd * Suspend an intent log. While in suspended mode, we still honor 3001168404Spjd * synchronous semantics, but we rely on txg_wait_synced() to do it. 3002248571Smm * On old version pools, we suspend the log briefly when taking a 3003248571Smm * snapshot so that it will have an empty intent log. 3004248571Smm * 3005248571Smm * Long holds are not really intended to be used the way we do here -- 3006248571Smm * held for such a short time. A concurrent caller of dsl_dataset_long_held() 3007248571Smm * could fail. Therefore we take pains to only put a long hold if it is 3008248571Smm * actually necessary. Fortunately, it will only be necessary if the 3009248571Smm * objset is currently mounted (or the ZVOL equivalent). In that case it 3010248571Smm * will already have a long hold, so we are not really making things any worse. 3011248571Smm * 3012248571Smm * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or 3013248571Smm * zvol_state_t), and use their mechanism to prevent their hold from being 3014248571Smm * dropped (e.g. VFS_HOLD()). However, that would be even more pain for 3015248571Smm * very little gain. 3016248571Smm * 3017248571Smm * if cookiep == NULL, this does both the suspend & resume. 3018248571Smm * Otherwise, it returns with the dataset "long held", and the cookie 3019248571Smm * should be passed into zil_resume(). 
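 *
 * A hedged usage sketch of the cookie-based form (illustrative only;
 * everything other than the calls shown is elided):
 *
 *	void *cookie;
 *	error = zil_suspend(osname, &cookie);
 *	if (error == 0) {
 *		... operate on the dataset with its ZIL suspended ...
 *		zil_resume(cookie);
 *	}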
3020168404Spjd */ 3021168404Spjdint 3022248571Smmzil_suspend(const char *osname, void **cookiep) 3023168404Spjd{ 3024248571Smm objset_t *os; 3025248571Smm zilog_t *zilog; 3026248571Smm const zil_header_t *zh; 3027248571Smm int error; 3028168404Spjd 3029248571Smm error = dmu_objset_hold(osname, suspend_tag, &os); 3030248571Smm if (error != 0) 3031248571Smm return (error); 3032248571Smm zilog = dmu_objset_zil(os); 3033248571Smm 3034168404Spjd mutex_enter(&zilog->zl_lock); 3035248571Smm zh = zilog->zl_header; 3036248571Smm 3037200724Sdelphij if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ 3038168404Spjd mutex_exit(&zilog->zl_lock); 3039248571Smm dmu_objset_rele(os, suspend_tag); 3040249195Smm return (SET_ERROR(EBUSY)); 3041168404Spjd } 3042248571Smm 3043248571Smm /* 3044248571Smm * Don't put a long hold in the cases where we can avoid it. This 3045248571Smm * is when there is no cookie so we are doing a suspend & resume 3046248571Smm * (i.e. called from zil_vdev_offline()), and there's nothing to do 3047248571Smm * for the suspend because it's already suspended, or there's no ZIL. 3048248571Smm */ 3049248571Smm if (cookiep == NULL && !zilog->zl_suspending && 3050248571Smm (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { 3051248571Smm mutex_exit(&zilog->zl_lock); 3052248571Smm dmu_objset_rele(os, suspend_tag); 3053248571Smm return (0); 3054248571Smm } 3055248571Smm 3056248571Smm dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); 3057248571Smm dsl_pool_rele(dmu_objset_pool(os), suspend_tag); 3058248571Smm 3059248571Smm zilog->zl_suspend++; 3060248571Smm 3061248571Smm if (zilog->zl_suspend > 1) { 3062168404Spjd /* 3063248571Smm * Someone else is already suspending it. 3064168404Spjd * Just wait for them to finish. 3065168404Spjd */ 3066248571Smm 3067168404Spjd while (zilog->zl_suspending) 3068168404Spjd cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); 3069168404Spjd mutex_exit(&zilog->zl_lock); 3070248571Smm 3071248571Smm if (cookiep == NULL) 3072248571Smm zil_resume(os); 3073248571Smm else 3074248571Smm *cookiep = os; 3075168404Spjd return (0); 3076168404Spjd } 3077248571Smm 3078248571Smm /* 3079248571Smm * If there is no pointer to an on-disk block, this ZIL must not 3080248571Smm * be active (e.g. filesystem not mounted), so there's nothing 3081248571Smm * to clean up. 3082248571Smm */ 3083248571Smm if (BP_IS_HOLE(&zh->zh_log)) { 3084248571Smm ASSERT(cookiep != NULL); /* fast path already handled */ 3085248571Smm 3086248571Smm *cookiep = os; 3087248571Smm mutex_exit(&zilog->zl_lock); 3088248571Smm return (0); 3089248571Smm } 3090248571Smm 3091168404Spjd zilog->zl_suspending = B_TRUE; 3092168404Spjd mutex_exit(&zilog->zl_lock); 3093168404Spjd 3094329486Smav /* 3095329486Smav * We need to use zil_commit_impl to ensure we wait for all 3096329486Smav * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed 3097329486Smav * to disk before proceeding. If we used zil_commit instead, it 3098329486Smav * would just call txg_wait_synced(), because zl_suspend is set. 3099329486Smav * txg_wait_synced() doesn't wait for these lwb's to be 3100329486Smav * LWB_STATE_DONE before returning. 3101329486Smav */ 3102329486Smav zil_commit_impl(zilog, 0); 3103168404Spjd 3104329486Smav /* 3105329486Smav * Now that we've ensured all lwb's are LWB_STATE_DONE, we use 3106329486Smav * txg_wait_synced() to ensure the data from the zilog has 3107329486Smav * migrated to the main pool before calling zil_destroy(). 
3108329486Smav */ 3109329486Smav txg_wait_synced(zilog->zl_dmu_pool, 0); 3110329486Smav 3111168404Spjd zil_destroy(zilog, B_FALSE); 3112168404Spjd 3113168404Spjd mutex_enter(&zilog->zl_lock); 3114168404Spjd zilog->zl_suspending = B_FALSE; 3115168404Spjd cv_broadcast(&zilog->zl_cv_suspend); 3116168404Spjd mutex_exit(&zilog->zl_lock); 3117168404Spjd 3118248571Smm if (cookiep == NULL) 3119248571Smm zil_resume(os); 3120248571Smm else 3121248571Smm *cookiep = os; 3122168404Spjd return (0); 3123168404Spjd} 3124168404Spjd 3125168404Spjdvoid 3126248571Smmzil_resume(void *cookie) 3127168404Spjd{ 3128248571Smm objset_t *os = cookie; 3129248571Smm zilog_t *zilog = dmu_objset_zil(os); 3130248571Smm 3131168404Spjd mutex_enter(&zilog->zl_lock); 3132168404Spjd ASSERT(zilog->zl_suspend != 0); 3133168404Spjd zilog->zl_suspend--; 3134168404Spjd mutex_exit(&zilog->zl_lock); 3135248571Smm dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); 3136248571Smm dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); 3137168404Spjd} 3138168404Spjd 3139219089Spjdtypedef struct zil_replay_arg { 3140219089Spjd zil_replay_func_t **zr_replay; 3141219089Spjd void *zr_arg; 3142219089Spjd boolean_t zr_byteswap; 3143219089Spjd char *zr_lr; 3144219089Spjd} zil_replay_arg_t; 3145219089Spjd 3146219089Spjdstatic int 3147219089Spjdzil_replay_error(zilog_t *zilog, lr_t *lr, int error) 3148209962Smm{ 3149307108Smav char name[ZFS_MAX_DATASET_NAME_LEN]; 3150209962Smm 3151219089Spjd zilog->zl_replaying_seq--; /* didn't actually replay this one */ 3152209962Smm 3153219089Spjd dmu_objset_name(zilog->zl_os, name); 3154209962Smm 3155219089Spjd cmn_err(CE_WARN, "ZFS replay transaction error %d, " 3156219089Spjd "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, 3157219089Spjd (u_longlong_t)lr->lrc_seq, 3158219089Spjd (u_longlong_t)(lr->lrc_txtype & ~TX_CI), 3159219089Spjd (lr->lrc_txtype & TX_CI) ? "CI" : ""); 3160219089Spjd 3161219089Spjd return (error); 3162209962Smm} 3163209962Smm 3164219089Spjdstatic int 3165168404Spjdzil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 3166168404Spjd{ 3167168404Spjd zil_replay_arg_t *zr = zra; 3168168404Spjd const zil_header_t *zh = zilog->zl_header; 3169168404Spjd uint64_t reclen = lr->lrc_reclen; 3170168404Spjd uint64_t txtype = lr->lrc_txtype; 3171219089Spjd int error = 0; 3172168404Spjd 3173219089Spjd zilog->zl_replaying_seq = lr->lrc_seq; 3174168404Spjd 3175219089Spjd if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 3176219089Spjd return (0); 3177219089Spjd 3178168404Spjd if (lr->lrc_txg < claim_txg) /* already committed */ 3179219089Spjd return (0); 3180168404Spjd 3181185029Spjd /* Strip case-insensitive bit, still present in log record */ 3182185029Spjd txtype &= ~TX_CI; 3183185029Spjd 3184219089Spjd if (txtype == 0 || txtype >= TX_MAX_TYPE) 3185219089Spjd return (zil_replay_error(zilog, lr, EINVAL)); 3186219089Spjd 3187219089Spjd /* 3188219089Spjd * If this record type can be logged out of order, the object 3189219089Spjd * (lr_foid) may no longer exist. That's legitimate, not an error. 3190219089Spjd */ 3191219089Spjd if (TX_OOO(txtype)) { 3192219089Spjd error = dmu_object_info(zilog->zl_os, 3193219089Spjd ((lr_ooo_t *)lr)->lr_foid, NULL); 3194219089Spjd if (error == ENOENT || error == EEXIST) 3195219089Spjd return (0); 3196209962Smm } 3197209962Smm 3198168404Spjd /* 3199168404Spjd * Make a copy of the data so we can revise and extend it. 
3200168404Spjd */ 3201219089Spjd bcopy(lr, zr->zr_lr, reclen); 3202168404Spjd 3203168404Spjd /* 3204219089Spjd * If this is a TX_WRITE with a blkptr, suck in the data. 3205219089Spjd */ 3206219089Spjd if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 3207219089Spjd error = zil_read_log_data(zilog, (lr_write_t *)lr, 3208219089Spjd zr->zr_lr + reclen); 3209248571Smm if (error != 0) 3210219089Spjd return (zil_replay_error(zilog, lr, error)); 3211219089Spjd } 3212219089Spjd 3213219089Spjd /* 3214168404Spjd * The log block containing this lr may have been byteswapped 3215168404Spjd * so that we can easily examine common fields like lrc_txtype. 3216219089Spjd * However, the log is a mix of different record types, and only the 3217168404Spjd * replay vectors know how to byteswap their records. Therefore, if 3218168404Spjd * the lr was byteswapped, undo it before invoking the replay vector. 3219168404Spjd */ 3220168404Spjd if (zr->zr_byteswap) 3221219089Spjd byteswap_uint64_array(zr->zr_lr, reclen); 3222168404Spjd 3223168404Spjd /* 3224168404Spjd * We must now do two things atomically: replay this log record, 3225209962Smm * and update the log header sequence number to reflect the fact that 3226209962Smm * we did so. At the end of each replay function the sequence number 3227209962Smm * is updated if we are in replay mode. 3228168404Spjd */ 3229219089Spjd error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); 3230248571Smm if (error != 0) { 3231168404Spjd /* 3232168404Spjd * The DMU's dnode layer doesn't see removes until the txg 3233168404Spjd * commits, so a subsequent claim can spuriously fail with 3234209962Smm * EEXIST. So if we receive any error we try syncing out 3235219089Spjd * any removes then retry the transaction. Note that we 3236219089Spjd * specify B_FALSE for byteswap now, so we don't do it twice. 3237168404Spjd */ 3238219089Spjd txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); 3239219089Spjd error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); 3240248571Smm if (error != 0) 3241219089Spjd return (zil_replay_error(zilog, lr, error)); 3242168404Spjd } 3243219089Spjd return (0); 3244168404Spjd} 3245168404Spjd 3246168404Spjd/* ARGSUSED */ 3247219089Spjdstatic int 3248168404Spjdzil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 3249168404Spjd{ 3250168404Spjd zilog->zl_replay_blks++; 3251219089Spjd 3252219089Spjd return (0); 3253168404Spjd} 3254168404Spjd 3255168404Spjd/* 3256168404Spjd * If this dataset has a non-empty intent log, replay it and destroy it. 3257168404Spjd */ 3258168404Spjdvoid 3259209962Smmzil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) 3260168404Spjd{ 3261168404Spjd zilog_t *zilog = dmu_objset_zil(os); 3262168404Spjd const zil_header_t *zh = zilog->zl_header; 3263168404Spjd zil_replay_arg_t zr; 3264168404Spjd 3265200724Sdelphij if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { 3266168404Spjd zil_destroy(zilog, B_TRUE); 3267168404Spjd return; 3268168404Spjd } 3269168404Spjd 3270168404Spjd zr.zr_replay = replay_func; 3271168404Spjd zr.zr_arg = arg; 3272168404Spjd zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); 3273219089Spjd zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); 3274168404Spjd 3275168404Spjd /* 3276168404Spjd * Wait for in-progress removes to sync before starting replay. 
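	 *
	 * This wait is the first step of the overall sequence implemented
	 * below, which is roughly (an illustrative summary of the code
	 * that follows, not additional logic):
	 *
	 *	txg_wait_synced(zilog->zl_dmu_pool, 0);
	 *	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record,
	 *	    &zr, zh->zh_claim_txg);
	 *	zil_destroy(zilog, B_FALSE);
	 *
	 * i.e. walk the on-disk block chain, applying each record through
	 * the zr_replay vector, and then destroy the now-replayed log.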
3277168404Spjd */ 3278168404Spjd txg_wait_synced(zilog->zl_dmu_pool, 0); 3279168404Spjd 3280209962Smm zilog->zl_replay = B_TRUE; 3281219089Spjd zilog->zl_replay_time = ddi_get_lbolt(); 3282168404Spjd ASSERT(zilog->zl_replay_blks == 0); 3283168404Spjd (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, 3284168404Spjd zh->zh_claim_txg); 3285219089Spjd kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); 3286168404Spjd 3287168404Spjd zil_destroy(zilog, B_FALSE); 3288185029Spjd txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 3289209962Smm zilog->zl_replay = B_FALSE; 3290168404Spjd} 3291168404Spjd 3292219089Spjdboolean_t 3293219089Spjdzil_replaying(zilog_t *zilog, dmu_tx_t *tx) 3294168404Spjd{ 3295219089Spjd if (zilog->zl_sync == ZFS_SYNC_DISABLED) 3296219089Spjd return (B_TRUE); 3297168404Spjd 3298219089Spjd if (zilog->zl_replay) { 3299219089Spjd dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 3300219089Spjd zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = 3301219089Spjd zilog->zl_replaying_seq; 3302219089Spjd return (B_TRUE); 3303168404Spjd } 3304168404Spjd 3305219089Spjd return (B_FALSE); 3306168404Spjd} 3307213197Smm 3308213197Smm/* ARGSUSED */ 3309213197Smmint 3310332525Smavzil_reset(const char *osname, void *arg) 3311213197Smm{ 3312213197Smm int error; 3313213197Smm 3314248571Smm error = zil_suspend(osname, NULL); 3315248571Smm if (error != 0) 3316249195Smm return (SET_ERROR(EEXIST)); 3317248571Smm return (0); 3318213197Smm} 3319
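
/*
 * Illustrative note (a sketch, not part of the implementation above):
 * consumers that log their own itxs typically consult zil_replaying()
 * before creating new log records, so that operations currently being
 * replayed from the on-disk ZIL are not logged a second time. A
 * hypothetical consumer-side fragment might look like:
 *
 *	if (!zil_replaying(zilog, tx)) {
 *		itx = ...;		(build the record to log)
 *		zil_itx_assign(zilog, itx, tx);
 *	}
 *
 * Both the fragment and the assumed zil_itx_assign() signature are
 * approximations; see the ordering discussion above zil_commit() for
 * how assigned itxs are ultimately committed.
 */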