/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/abd.h>

/*
 * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
 * calls that change the file system. Each itx has enough information to
 * be able to replay them after a system crash, power loss, or
 * equivalent failure mode. These are stored in memory until either:
 *
 *   1. they are committed to the pool by the DMU transaction group
 *      (txg), at which point they can be discarded; or
 *   2. they are committed to the on-disk ZIL for the dataset being
 *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
 *      requirement).
 *
 * In the event of a crash or power loss, the itxs contained by each
 * dataset's on-disk ZIL will be replayed when that dataset is first
 * instantiated (e.g. if the dataset is a normal filesystem, when it is
 * first mounted).
 *
 * As hinted at above, there is one ZIL per dataset (both the in-memory
 * representation, and the on-disk representation). The on-disk format
 * consists of 3 parts:
 *
 * 	- a single, per-dataset, ZIL header; which points to a chain of
 * 	- zero or more ZIL blocks; each of which contains
 * 	- zero or more ZIL records
 *
 * A ZIL record holds the information necessary to replay a single
 * system call transaction. A ZIL block can hold many ZIL records, and
 * the blocks are chained together, similarly to a singly linked list.
 *
 * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
 * block in the chain, and the ZIL header points to the first block in
 * the chain.
 *
 * Note, there is not a fixed place in the pool to hold these ZIL
 * blocks; they are dynamically allocated and freed as needed from the
 * blocks available on the pool, though they can be preferentially
 * allocated from a dedicated "log" vdev.
 */
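
/*
 * To make the chained layout concrete, a minimal illustrative sketch
 * (not code from this file) of how a reader follows the on-disk chain,
 * using the same names ("zh" for the header, "zilc" for a block's
 * zil_chain_t) that the functions below use:
 *
 *	blkptr_t blk = zh->zh_log;		// header -> first block
 *	while (!BP_IS_HOLE(&blk)) {
 *		// read the block, verify its checksum and sequence
 *		// number, then walk the lr_t records it contains
 *		blk = zilc->zc_next_blk;	// follow the chain
 *	}
 *
 * zil_parse() below implements exactly this walk, with the validity
 * checks performed by zil_read_log_block().
 */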

/*
 * This controls the amount of time that a ZIL block (lwb) will remain
 * "open" when it isn't "full", and it has a thread waiting for it to be
 * committed to stable storage. Please refer to the zil_commit_waiter()
 * function (and the comments within it) for more details.
 */
int zfs_commit_timeout_pct = 5;

/*
 * Disable intent logging replay. This global ZIL switch affects all pools.
 */
int zil_replay_disable = 0;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN,
    &zil_replay_disable, 0, "Disable intent logging replay");

/*
 * Tunable parameter for debugging or performance analysis. Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;
SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RWTUN,
    &zfs_nocacheflush, 0, "Disable cache flush");
boolean_t zfs_trim_enabled = B_TRUE;
SYSCTL_DECL(_vfs_zfs_trim);
SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
    "Enable ZFS TRIM");

/*
 * Limit SLOG write size per commit executed with synchronous priority.
 * Any writes above that will be executed with lower (asynchronous) priority
 * to limit potential SLOG device abuse by a single active ZIL writer.
 */
uint64_t zil_slog_bulk = 768 * 1024;
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN,
    &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority");
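
/*
 * For example, with the default 768KB setting, lwbs bound for a slog
 * device are issued at ZIO_PRIORITY_SYNC_WRITE while zl_cur_used is at
 * or below zil_slog_bulk; once a burst of commits pushes zl_cur_used
 * past the limit, subsequent lwbs are demoted to
 * ZIO_PRIORITY_ASYNC_WRITE (see the priority selection in
 * zil_lwb_write_open() below).
 */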

static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;

#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))

static int
zil_bp_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;

	int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
	if (likely(cmp))
		return (cmp);

	return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
}

static void
zil_bp_tree_init(zilog_t *zilog)
{
	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}

static void
zil_bp_tree_fini(zilog_t *zilog)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	zil_bp_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_bp_node_t));

	avl_destroy(t);
}

int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	const dva_t *dva;
	zil_bp_node_t *zn;
	avl_index_t where;

	if (BP_IS_EMBEDDED(bp))
		return (0);

	dva = BP_IDENTITY(bp);

	if (avl_find(t, dva, &where) != NULL)
		return (SET_ERROR(EEXIST));

	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);

	return (0);
}

static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}

static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}

/*
 * Read a log block and make sure it's valid.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
    char **end)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential. The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
			zil_chain_t *zilc = abuf->b_data;
			char *lr = (char *)(zilc + 1);
			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, len);
				*end = (char *)dst + len;
				*nbp = zilc->zc_next_blk;
			}
		} else {
			char *lr = abuf->b_data;
			uint64_t size = BP_GET_LSIZE(bp);
			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(zilc->zc_nused, <=,
				    SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, zilc->zc_nused);
				*end = (char *)dst + zilc->zc_nused;
				*nbp = zilc->zc_next_blk;
			}
		}

		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}
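
/*
 * An illustrative sketch (not from the original source) of the two
 * on-disk block layouts handled above:
 *
 *	ZIO_CHECKSUM_ZILOG2 (slim):   [ zil_chain_t | records ... ]
 *	ZIO_CHECKSUM_ZILOG (legacy):  [ records ... | zil_chain_t ]
 *
 * In both layouts zc_nused bounds the valid record bytes, and
 * zc_next_blk both links to the next block in the chain and carries
 * the expected checksum (sequence number) used to validate it.
 */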

/*
 * Read a TX_WRITE log data block.
 */
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	const blkptr_t *bp = &lr->lr_blkptr;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (BP_IS_HOLE(bp)) {
		if (wbuf != NULL)
			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
		return (0);
	}

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		if (wbuf != NULL)
			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}

/*
 * Parse the intent log, and call parse_func for each valid record within.
 */
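/*
 * A usage sketch, modeled on zil_destroy_sync() later in this file:
 * freeing an entire log is expressed as one parse pass, with one
 * callback invoked per chained block and one per valid record:
 *
 *	(void) zil_parse(zilog, zil_free_log_block,
 *	    zil_free_log_record, tx, zh->zh_claim_txg);
 *
 * A callback returning nonzero stops the walk, as does reaching the
 * end of the chain or an invalid block.
 */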
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	boolean_t claimed = !!zh->zh_claim_txg;
	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
	uint64_t max_blk_seq = 0;
	uint64_t max_lr_seq = 0;
	uint64_t blk_count = 0;
	uint64_t lr_count = 0;
	blkptr_t blk, next_blk;
	char *lrbuf, *lrp;
	int error = 0;

	/*
	 * Old logs didn't record the maximum zh_claim_lr_seq.
	 */
	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		claim_lr_seq = UINT64_MAX;

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity. We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
	 */
	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	zil_bp_tree_init(zilog);

	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
		int reclen;
		char *end;

		if (blk_seq > claim_blk_seq)
			break;
		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
			break;
		ASSERT3U(max_blk_seq, <, blk_seq);
		max_blk_seq = blk_seq;
		blk_count++;

		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
			break;

		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
		if (error != 0)
			break;

		for (lrp = lrbuf; lrp < end; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			if (lr->lrc_seq > claim_lr_seq)
				goto done;
			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
				goto done;
			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
			max_lr_seq = lr->lrc_seq;
			lr_count++;
		}
	}
done:
	zilog->zl_parse_error = error;
	zilog->zl_parse_blk_seq = max_blk_seq;
	zilog->zl_parse_lr_seq = max_lr_seq;
	zilog->zl_parse_blk_count = blk_count;
	zilog->zl_parse_lr_count = lr_count;

	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

	zil_bp_tree_fini(zilog);
	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);

	return (error);
}

/* ARGSUSED */
static int
zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	ASSERT(!BP_IS_HOLE(bp));

	/*
	 * As we call this function from the context of a rewind to a
	 * checkpoint, each ZIL block whose txg is later than the txg
	 * that we rewind to is invalid. Thus, we return -1 so
	 * zil_parse() doesn't attempt to read it.
	 */
	if (bp->blk_birth >= first_txg)
		return (-1);

	if (zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	zio_free(zilog->zl_spa, first_txg, bp);
	return (0);
}

/* ARGSUSED */
static int
zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	return (0);
}

static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	/*
	 * Claim log block if not already committed and not already claimed.
	 * If tx == NULL, just verify that the block is claimable.
	 */
	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
	    zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}

static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	int error;

	if (lrc->lrc_txtype != TX_WRITE)
		return (0);

	/*
	 * If the block is not readable, don't claim it. This can happen
	 * in normal operation when a log block is written to disk before
	 * some of the dmu_sync() blocks it points to. In this case, the
	 * transaction cannot have been committed to anyone (we would have
	 * waited for all writes to be stable first), so it is semantically
	 * correct to declare this the end of the log.
	 */
	if (lr->lr_blkptr.blk_birth >= first_txg &&
	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
		return (error);
	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}

/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	blkptr_t *bp = &lr->lr_blkptr;

	/*
	 * If we previously claimed it, we need to free it.
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
	    !BP_IS_HOLE(bp))
		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	return (AVL_CMP(v1, v2));
}

static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
{
	lwb_t *lwb;

	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	lwb->lwb_zilog = zilog;
	lwb->lwb_blk = *bp;
	lwb->lwb_slog = slog;
	lwb->lwb_state = LWB_STATE_CLOSED;
	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
	lwb->lwb_max_txg = txg;
	lwb->lwb_write_zio = NULL;
	lwb->lwb_root_zio = NULL;
	lwb->lwb_tx = NULL;
	lwb->lwb_issued_timestamp = 0;
	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
		lwb->lwb_nused = sizeof (zil_chain_t);
		lwb->lwb_sz = BP_GET_LSIZE(bp);
	} else {
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
	}

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, lwb);
	mutex_exit(&zilog->zl_lock);

	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
	VERIFY(list_is_empty(&lwb->lwb_waiters));

	return (lwb);
}

static void
zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
{
	ASSERT(MUTEX_HELD(&zilog->zl_lock));
	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
	VERIFY(list_is_empty(&lwb->lwb_waiters));
	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
	ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
	    lwb->lwb_state == LWB_STATE_DONE);

	/*
	 * Clear the zilog's field to indicate this lwb is no longer
	 * valid, and prevent use-after-free errors.
	 */
	if (zilog->zl_last_lwb_opened == lwb)
		zilog->zl_last_lwb_opened = NULL;

	kmem_cache_free(zil_lwb_cache, lwb);
}
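
/*
 * A summary sketch (not from the original source) of the lwb lifecycle
 * implemented by the functions in this file:
 *
 *	CLOSED -> OPENED -> ISSUED -> DONE
 *
 * zil_alloc_lwb() creates an lwb in the CLOSED state;
 * zil_lwb_write_open() builds its root/write zios and moves it to
 * OPENED; zil_lwb_write_issue() hands the block write to the zio
 * pipeline (ISSUED); and zil_lwb_flush_vdevs_done() marks it DONE,
 * signalling any commit waiters, once the write and the subsequent
 * cache flushes have completed.
 */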

/*
 * Called when we create in-memory log transactions so that we know
 * to clean up the itxs at the end of spa_sync().
 */
void
zilog_dirty(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;
	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);

	ASSERT(spa_writeable(zilog->zl_spa));

	if (ds->ds_is_snapshot)
		panic("dirtying snapshot!");

	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, zilog);

		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
	}
}

/*
 * Determine if the zil is dirty in the specified txg. Callers wanting to
 * ensure that the dirty state does not change must hold the itxg_lock for
 * the specified txg. Holding the lock will ensure that the zil cannot be
 * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
 * state.
 */
boolean_t
zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
		return (B_TRUE);
	return (B_FALSE);
}

/*
 * Determine if the zil is dirty. The zil is considered dirty if it has
 * any pending itx records that have not been cleaned by zil_clean().
 */
boolean_t
zilog_is_dirty(zilog_t *zilog)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}
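
/*
 * A usage sketch of the locking rule above (a hypothetical caller, not
 * code from this file): to test and act on the dirty state atomically,
 * take the per-txg itxg_lock around the check:
 *
 *	itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 *	mutex_enter(&itxg->itxg_lock);
 *	if (zilog_is_dirty_in_txg(zilog, txg)) {
 *		// the zil still has itxs pending for this txg
 *	}
 *	mutex_exit(&itxg->itxg_lock);
 */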

/*
 * Create an on-disk intent log.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb = NULL;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;
	boolean_t slog = FALSE;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * Allocate an initial log block if:
	 *    - there isn't one already
	 *    - the existing block is the wrong endianness
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free(zilog->zl_spa, txg, &blk);
			BP_ZERO(&blk);
		}

		error = zio_alloc_zil(zilog->zl_spa,
		    zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL,
		    ZIL_MIN_BLKSZ, &slog);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write block (lwb) for the first log block.
	 */
	if (error == 0)
		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

	return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header. If keep_first
 * is set, then we're replaying a log with no content. We want to keep the
 * first block, however, so that the first synchronous transaction doesn't
 * require a txg_wait_synced() in zil_create(). We don't need to
 * txg_wait_synced() here either when keep_first is set, because both
 * zil_create() and zil_destroy() will wait for any in-progress destroys
 * to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	zilog->zl_old_header = *zh;		/* debugging aid */

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);

	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		ASSERT(zh->zh_claim_txg == 0);
		VERIFY(!keep_first);
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			list_remove(&zilog->zl_lwb_list, lwb);
			if (lwb->lwb_buf != NULL)
				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
			zil_free_lwb(zilog, lwb);
		}
	} else if (!keep_first) {
		zil_destroy_sync(zilog, tx);
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}

void
zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	ASSERT(list_is_empty(&zilog->zl_lwb_list));
	(void) zil_parse(zilog, zil_free_log_block,
	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
}

int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
	dmu_tx_t *tx = txarg;
	zilog_t *zilog;
	uint64_t first_txg;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_own_obj(dp, ds->ds_object,
	    DMU_OST_ANY, B_FALSE, FTAG, &os);
	if (error != 0) {
		/*
		 * EBUSY indicates that the objset is inconsistent, in which
		 * case it cannot have a ZIL.
		 */
		if (error != EBUSY) {
			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
			    (unsigned long long)ds->ds_object, error);
		}
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);
	ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
	first_txg = spa_min_claim_txg(zilog->zl_spa);

	/*
	 * If the spa_log_state is not set to be cleared, check whether
	 * the current uberblock is a checkpoint one and if the current
	 * header has been claimed before moving on.
	 *
	 * If the current uberblock is a checkpointed uberblock then
	 * one of the following scenarios took place:
	 *
	 * 1] We are currently rewinding to the checkpoint of the pool.
	 * 2] We crashed in the middle of a checkpoint rewind but we
	 *    did manage to write the checkpointed uberblock to the
	 *    vdev labels, so when we tried to import the pool again
	 *    the checkpointed uberblock was selected from the import
	 *    procedure.
	 *
	 * In both cases we want to zero out all the ZIL blocks, except
	 * the ones that have been claimed at the time of the checkpoint
	 * (their zh_claim_txg != 0). The reason is that these blocks
	 * may be corrupted since we may have reused their locations on
	 * disk after we took the checkpoint.
	 *
	 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
	 * when we first figure out whether the current uberblock is
	 * checkpointed or not. Unfortunately, that would discard all
	 * the logs, including the ones that are claimed, and we would
	 * leak space.
	 */
	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
	    (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
	    zh->zh_claim_txg == 0)) {
		if (!BP_IS_HOLE(&zh->zh_log)) {
			(void) zil_parse(zilog, zil_clear_log_block,
			    zil_noop_log_record, tx, first_txg);
		}
		BP_ZERO(&zh->zh_log);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
		dmu_objset_disown(os, FTAG);
		return (0);
	}

	/*
	 * If we are not rewinding and opening the pool normally, then
	 * the min_claim_txg should be equal to the first txg of the pool.
	 */
	ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number. This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		(void) zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
			zh->zh_flags |= ZIL_REPLAY_NEEDED;
		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_disown(os, FTAG);
	return (0);
}

/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 */
/* ARGSUSED */
int
zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
	zilog_t *zilog;
	objset_t *os;
	blkptr_t *bp;
	int error;

	ASSERT(tx == NULL);

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0) {
		cmn_err(CE_WARN, "can't open objset %llu, error %d",
		    (unsigned long long)ds->ds_object, error);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	bp = (blkptr_t *)&zilog->zl_header->zh_log;

	if (!BP_IS_HOLE(bp)) {
		vdev_t *vd;
		boolean_t valid = B_TRUE;

		/*
		 * Check the first block and determine if it's on a log device
		 * which may have been removed or faulted prior to loading this
		 * pool. If so, there's no point in checking the rest of the
		 * log as its content should have already been synced to the
		 * pool.
		 */
		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
		if (vd->vdev_islog && vdev_is_dead(vd))
			valid = vdev_log_state_valid(vd);
		spa_config_exit(os->os_spa, SCL_STATE, FTAG);

		if (!valid)
			return (0);

		/*
		 * Check whether the current uberblock is checkpointed (e.g.
		 * we are rewinding) and whether the current header has been
		 * claimed or not. If it hasn't then skip verifying it. We
		 * do this because its ZIL blocks may be part of the pool's
		 * state before the rewind, which is no longer valid.
		 */
		zil_header_t *zh = zil_header_in_syncing_context(zilog);
		if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
		    zh->zh_claim_txg == 0)
			return (0);
	}

	/*
	 * Because tx == NULL, zil_claim_log_block() will not actually claim
	 * any blocks, but just determine whether it is possible to do so.
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg. See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL :
	    spa_min_claim_txg(os->os_spa));

	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}

/*
 * When an itx is "skipped", this function is used to properly mark the
 * waiter as "done", and signal any thread(s) waiting on it. An itx can
 * be skipped (and not committed to an lwb) for a variety of reasons,
 * one of them being that the itx was committed via spa_sync(), prior to
 * it being committed to an lwb; this can happen if a thread calling
 * zil_commit() is racing with spa_sync().
 */
static void
zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
{
	mutex_enter(&zcw->zcw_lock);
	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
	zcw->zcw_done = B_TRUE;
	cv_broadcast(&zcw->zcw_cv);
	mutex_exit(&zcw->zcw_lock);
}

/*
 * This function is used when the given waiter is to be linked into an
 * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb.
 * At this point, the waiter will no longer be referenced by the itx,
 * and instead, will be referenced by the lwb.
 */
static void
zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
{
	/*
	 * The lwb_waiters field of the lwb is protected by the zilog's
	 * zl_lock, thus it must be held when calling this function.
	 */
	ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));

	mutex_enter(&zcw->zcw_lock);
	ASSERT(!list_link_active(&zcw->zcw_node));
	ASSERT3P(zcw->zcw_lwb, ==, NULL);
	ASSERT3P(lwb, !=, NULL);
	ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
	    lwb->lwb_state == LWB_STATE_ISSUED);

	list_insert_tail(&lwb->lwb_waiters, zcw);
	zcw->zcw_lwb = lwb;
	mutex_exit(&zcw->zcw_lock);
}

/*
 * This function is used when zio_alloc_zil() fails to allocate a ZIL
 * block, and the given waiter must be linked to the "nolwb waiters"
 * list inside of zil_process_commit_list().
 */
static void
zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
{
	mutex_enter(&zcw->zcw_lock);
	ASSERT(!list_link_active(&zcw->zcw_node));
	ASSERT3P(zcw->zcw_lwb, ==, NULL);
	list_insert_tail(nolwb, zcw);
	mutex_exit(&zcw->zcw_lock);
}

void
zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
{
	avl_tree_t *t = &lwb->lwb_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	mutex_enter(&lwb->lwb_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&lwb->lwb_vdev_lock);
}

void
zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
{
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
}

/*
 * This function is called after all VDEVs associated with a given lwb
 * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
 * as the lwb write completes, if "zfs_nocacheflush" is set.
 *
 * The intention is for this function to be called as soon as the
 * contents of an lwb are considered "stable" on disk, and will survive
 * any sudden loss of power. At this point, any threads waiting for the
 * lwb to reach this state are signalled, and the "waiter" structures
 * are marked "done".
 */
static void
zil_lwb_flush_vdevs_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	dmu_tx_t *tx = lwb->lwb_tx;
	zil_commit_waiter_t *zcw;

	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);

	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);

	mutex_enter(&zilog->zl_lock);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing the
	 * txg. If we have had an allocation failure and the txg is
	 * waiting to sync then we want zil_sync() to remove the lwb so
	 * that it's not picked up as the next new one in
	 * zil_process_commit_list(). zil_sync() will only remove the
	 * lwb if lwb_buf is null.
	 */
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;

	ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
	zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;

	lwb->lwb_root_zio = NULL;
	lwb->lwb_state = LWB_STATE_DONE;

	if (zilog->zl_last_lwb_opened == lwb) {
		/*
		 * Remember the highest committed log sequence number
		 * for ztest. We only update this value when all the log
		 * writes succeeded, because ztest wants to ASSERT that
		 * it got the whole log chain.
		 */
		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
	}

	while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
		mutex_enter(&zcw->zcw_lock);

		ASSERT(list_link_active(&zcw->zcw_node));
		list_remove(&lwb->lwb_waiters, zcw);

		ASSERT3P(zcw->zcw_lwb, ==, lwb);
		zcw->zcw_lwb = NULL;

		zcw->zcw_zio_error = zio->io_error;

		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
		zcw->zcw_done = B_TRUE;
		cv_broadcast(&zcw->zcw_cv);

		mutex_exit(&zcw->zcw_lock);
	}

	mutex_exit(&zilog->zl_lock);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	dmu_tx_commit(tx);
}

/*
 * This is called when an lwb write completes. This means this specific
 * lwb was written to disk, and all dependent lwbs have also been
 * written to disk.
 *
 * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
 * the VDEVs involved in writing out this specific lwb. The lwb will be
 * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
 * zio completion callback for the lwb's root zio.
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	spa_t *spa = zio->io_spa;
	zilog_t *zilog = lwb->lwb_zilog;
	avl_tree_t *t = &lwb->lwb_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;

	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(BP_GET_FILL(zio->io_bp) == 0);

	abd_put(zio->io_abd);

	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);

	mutex_enter(&zilog->zl_lock);
	lwb->lwb_write_zio = NULL;
	mutex_exit(&zilog->zl_lock);

	if (avl_numnodes(t) == 0)
		return;

	/*
	 * If there was an IO error, we're not going to call zio_flush()
	 * on these vdevs, so we simply empty the tree and free the
	 * nodes. We avoid calling zio_flush() since there isn't any
	 * good reason for doing so, after the lwb block failed to be
	 * written out.
	 */
	if (zio->io_error != 0) {
		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
			kmem_free(zv, sizeof (*zv));
		return;
	}

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(lwb->lwb_root_zio, vd);
		kmem_free(zv, sizeof (*zv));
	}
}

/*
 * This function's purpose is to "open" an lwb such that it is ready to
 * accept new itxs being committed to it. To do this, the lwb's zio
 * structures are created, and linked to the lwb. This function is
 * idempotent; if the passed in lwb has already been opened, this
 * function is essentially a no-op.
 */
static void
zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
{
	zbookmark_phys_t zb;
	zio_priority_t prio;

	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
	ASSERT3P(lwb, !=, NULL);
	EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
	EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);

	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

	if (lwb->lwb_root_zio == NULL) {
		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
		    BP_GET_LSIZE(&lwb->lwb_blk));

		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
			prio = ZIO_PRIORITY_SYNC_WRITE;
		else
			prio = ZIO_PRIORITY_ASYNC_WRITE;

		lwb->lwb_root_zio = zio_root(zilog->zl_spa,
		    zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
		ASSERT3P(lwb->lwb_root_zio, !=, NULL);

		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
		    zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
		    BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
		    prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
		ASSERT3P(lwb->lwb_write_zio, !=, NULL);

		lwb->lwb_state = LWB_STATE_OPENED;

		mutex_enter(&zilog->zl_lock);

		/*
		 * The zilog's "zl_last_lwb_opened" field is used to
		 * build the lwb/zio dependency chain, which is used to
		 * preserve the ordering of lwb completions that is
		 * required by the semantics of the ZIL. Each new lwb
		 * zio becomes a parent of the "previous" lwb zio, such
		 * that the new lwb's zio cannot complete until the
		 * "previous" lwb's zio completes.
		 *
		 * This is required by the semantics of zil_commit();
		 * the commit waiters attached to the lwbs will be woken
		 * in the lwb zio's completion callback, so this zio
		 * dependency graph ensures the waiters are woken in the
		 * correct order (the same order the lwbs were created).
		 */
		lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
		if (last_lwb_opened != NULL &&
		    last_lwb_opened->lwb_state != LWB_STATE_DONE) {
			ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
			    last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
			ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
			zio_add_child(lwb->lwb_root_zio,
			    last_lwb_opened->lwb_root_zio);
		}
		zilog->zl_last_lwb_opened = lwb;

		mutex_exit(&zilog->zl_lock);
	}

	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
}

/*
 * Define a limited set of intent log block sizes.
 *
 * These must be a multiple of 4KB. Note only the amount used (again
 * aligned to 4KB) actually gets written. However, we can't always just
 * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
 */
struct {
	uint64_t	limit;
	uint64_t	blksz;
} zil_block_buckets[] = {
	{ 4096,			4096 },			/* non TX_WRITE */
	{ 8192 + 4096,		8192 + 4096 },		/* database */
	{ 32768 + 4096,		32768 + 4096 },		/* NFS writes */
	{ 65536 + 4096,		65536 + 4096 },		/* 64KB writes */
	{ 131072,		131072 },		/* < 128KB writes */
	{ 131072 + 4096,	65536 + 4096 },		/* 128KB writes */
	{ UINT64_MAX,		SPA_OLD_MAXBLOCKSIZE },	/* > 128KB writes */
};
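
/*
 * A worked example of the bucket selection performed in
 * zil_lwb_write_issue() below: a commit carrying roughly 20KB of
 * records (zil_blksz = zl_cur_used + sizeof (zil_chain_t)) exceeds the
 * 4KB and 12KB bucket limits but fits the 36KB "NFS writes" bucket, so
 * a 32768 + 4096 byte block is allocated; per the comment above, only
 * the 4KB-aligned used portion of it is actually written.
 */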

/*
 * Maximum block size used by the ZIL. This is picked up when the ZIL is
 * initialized. Otherwise this should not be used directly; see
 * zl_max_block_size instead.
 */
int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_maxblocksize, CTLFLAG_RWTUN,
    &zil_maxblocksize, 0, "Limit in bytes of ZIL log block size");

/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb = NULL;
	zil_chain_t *zilc;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp;
	dmu_tx_t *tx;
	uint64_t txg;
	uint64_t zil_blksz, wsz;
	int i, error;
	boolean_t slog;

	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		zilc = (zil_chain_t *)lwb->lwb_buf;
		bp = &zilc->zc_next_blk;
	} else {
		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
		bp = &zilc->zc_next_blk;
	}

	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
	 * We dirty the dataset to ensure that zil_sync() will be called
	 * to clean up in the event of allocation failure or I/O failure.
	 */

	tx = dmu_tx_create(zilog->zl_os);

	/*
	 * Since we are not going to create any new dirty data, and we
	 * can even help with clearing the existing dirty data, we
	 * should not be subject to the dirty data based delays. We
	 * use TXG_NOTHROTTLE to bypass the delay mechanism.
	 */
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));

	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	lwb->lwb_tx = tx;

	/*
	 * Log blocks are pre-allocated. Here we select the size of the next
	 * block, based on size used in the last block.
	 * - first find the smallest bucket that will fit the block from a
	 *   limited set of block sizes. This is because it's faster to write
	 *   blocks allocated from the same metaslab as they are adjacent or
	 *   close.
	 * - next find the maximum from the new suggested size and an array of
	 *   previous sizes. This lessens a picket fence effect of wrongly
	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
	 *   requests.
	 *
	 * Note we only write what is used, but we can't just allocate
	 * the maximum block size because we can exhaust the available
	 * pool log space.
	 */
	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
	for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
		continue;
	zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
	for (i = 0; i < ZIL_PREV_BLKS; i++)
		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

	BP_ZERO(bp);

	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
	    txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
	if (error == 0) {
		ASSERT3U(bp->blk_birth, ==, txg);
		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

		/*
		 * Allocate a new log write block (lwb).
		 */
		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
	}

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		/* For Slim ZIL only write what is used. */
		 */
1363219089Spjd		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
1364219089Spjd		ASSERT3U(wsz, <=, lwb->lwb_sz);
1365325132Savg		zio_shrink(lwb->lwb_write_zio, wsz);
1366168404Spjd
1367219089Spjd	} else {
1368219089Spjd		wsz = lwb->lwb_sz;
1369219089Spjd	}
1370168404Spjd
1371219089Spjd	zilc->zc_pad = 0;
1372219089Spjd	zilc->zc_nused = lwb->lwb_nused;
1373219089Spjd	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
1374168404Spjd
1375168404Spjd	/*
1376219089Spjd	 * clear unused data for security
1377168404Spjd	 */
1378219089Spjd	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
1379168404Spjd
1380325132Savg	spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
1381168404Spjd
1382325132Savg	zil_lwb_add_block(lwb, &lwb->lwb_blk);
1383325132Savg	lwb->lwb_issued_timestamp = gethrtime();
1384325132Savg	lwb->lwb_state = LWB_STATE_ISSUED;
1385325132Savg
1386325132Savg	zio_nowait(lwb->lwb_root_zio);
1387325132Savg	zio_nowait(lwb->lwb_write_zio);
1388325132Savg
1389168404Spjd	/*
1390219089Spjd	 * If there was an allocation failure then nlwb will be null which
1391219089Spjd	 * forces a txg_wait_synced().
1392168404Spjd	 */
1393168404Spjd	return (nlwb);
1394168404Spjd}
1395168404Spjd
1396359554Smav/*
1397359554Smav * Maximum amount of write data that can be put into a single log block.
1398359554Smav */
1399359554Smavuint64_t
1400359554Smavzil_max_log_data(zilog_t *zilog)
1401359554Smav{
1402359554Smav	return (zilog->zl_max_block_size -
1403359554Smav	    sizeof (zil_chain_t) - sizeof (lr_write_t));
1404359554Smav}
1405359554Smav
1406359554Smav/*
1407359554Smav * Maximum amount of log space we agree to waste in order to reduce the
1408359554Smav * number of WR_NEED_COPY chunks, and thus the zl_get_data() overhead (~12%).
1409359554Smav */
1410359554Smavstatic inline uint64_t
1411359554Smavzil_max_waste_space(zilog_t *zilog)
1412359554Smav{
1413359554Smav	return (zil_max_log_data(zilog) / 8);
1414359554Smav}
1415359554Smav
1416359554Smav/*
1417359554Smav * Maximum amount of write data for WR_COPIED. For correctness, consumers
1418359554Smav * must fall back to WR_NEED_COPY if we can't fit the entire record into one
1419359554Smav * maximum sized log block, because each WR_COPIED record must fit in a
1420359554Smav * single log block. For space efficiency, we want to fit two records into a
1421359554Smav * max-sized log block.
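 *
 * For example (a sketch, assuming the default 128KB zl_max_block_size):
 * each of the two records may then carry at most
 * (131072 - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t) bytes of
 * data, and a consumer might pick the write state (names illustrative)
 * roughly as:
 *
 *	if (resid <= zil_max_copied_data(zilog))
 *		wr_state = WR_COPIED;		(whole record, one block)
 *	else
 *		wr_state = WR_NEED_COPY;	(may span log blocks)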
1422359554Smav */ 1423359554Smavuint64_t 1424359554Smavzil_max_copied_data(zilog_t *zilog) 1425359554Smav{ 1426359554Smav return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - 1427359554Smav sizeof (lr_write_t)); 1428359554Smav} 1429359554Smav 1430168404Spjdstatic lwb_t * 1431168404Spjdzil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) 1432168404Spjd{ 1433321611Smav lr_t *lrcb, *lrc; 1434321611Smav lr_write_t *lrwb, *lrw; 1435219089Spjd char *lr_buf; 1436359554Smav uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data; 1437168404Spjd 1438329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1439325132Savg ASSERT3P(lwb, !=, NULL); 1440325132Savg ASSERT3P(lwb->lwb_buf, !=, NULL); 1441219089Spjd 1442325132Savg zil_lwb_write_open(zilog, lwb); 1443168404Spjd 1444325132Savg lrc = &itx->itx_lr; 1445325132Savg lrw = (lr_write_t *)lrc; 1446325132Savg 1447325132Savg /* 1448325132Savg * A commit itx doesn't represent any on-disk state; instead 1449325132Savg * it's simply used as a place holder on the commit list, and 1450325132Savg * provides a mechanism for attaching a "commit waiter" onto the 1451325132Savg * correct lwb (such that the waiter can be signalled upon 1452325132Savg * completion of that lwb). Thus, we don't process this itx's 1453325132Savg * log record if it's a commit itx (these itx's don't have log 1454325132Savg * records), and instead link the itx's waiter onto the lwb's 1455325132Savg * list of waiters. 1456325132Savg * 1457325132Savg * For more details, see the comment above zil_commit(). 1458325132Savg */ 1459325132Savg if (lrc->lrc_txtype == TX_COMMIT) { 1460329486Smav mutex_enter(&zilog->zl_lock); 1461325132Savg zil_commit_waiter_link_lwb(itx->itx_private, lwb); 1462325132Savg itx->itx_private = NULL; 1463329486Smav mutex_exit(&zilog->zl_lock); 1464325132Savg return (lwb); 1465325132Savg } 1466325132Savg 1467321611Smav if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { 1468168404Spjd dlen = P2ROUNDUP_TYPED( 1469219089Spjd lrw->lr_length, sizeof (uint64_t), uint64_t); 1470321611Smav } else { 1471321611Smav dlen = 0; 1472321611Smav } 1473321611Smav reclen = lrc->lrc_reclen; 1474168404Spjd zilog->zl_cur_used += (reclen + dlen); 1475321611Smav txg = lrc->lrc_txg; 1476168404Spjd 1477325132Savg ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); 1478168404Spjd 1479315441Smavcont: 1480168404Spjd /* 1481168404Spjd * If this record won't fit in the current log block, start a new one. 1482321611Smav * For WR_NEED_COPY optimize layout for minimal number of chunks. 1483168404Spjd */ 1484315441Smav lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1485359554Smav max_log_data = zil_max_log_data(zilog); 1486315441Smav if (reclen > lwb_sp || (reclen + dlen > lwb_sp && 1487359554Smav lwb_sp < zil_max_waste_space(zilog) && 1488359554Smav (dlen % max_log_data == 0 || 1489359554Smav lwb_sp < reclen + dlen % max_log_data))) { 1490325132Savg lwb = zil_lwb_write_issue(zilog, lwb); 1491168404Spjd if (lwb == NULL) 1492168404Spjd return (NULL); 1493325132Savg zil_lwb_write_open(zilog, lwb); 1494219089Spjd ASSERT(LWB_EMPTY(lwb)); 1495315441Smav lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1496359554Smav 1497359554Smav /* 1498359554Smav * There must be enough space in the new, empty log block to 1499359554Smav * hold reclen. For WR_COPIED, we need to fit the whole 1500359554Smav * record in one block, and reclen is the header size + the 1501359554Smav * data size. 
For WR_NEED_COPY, we can create multiple 1502359554Smav * records, splitting the data into multiple blocks, so we 1503359554Smav * only need to fit one word of data per block; in this case 1504359554Smav * reclen is just the header size (no data). 1505359554Smav */ 1506321611Smav ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); 1507168404Spjd } 1508168404Spjd 1509315441Smav dnow = MIN(dlen, lwb_sp - reclen); 1510219089Spjd lr_buf = lwb->lwb_buf + lwb->lwb_nused; 1511219089Spjd bcopy(lrc, lr_buf, reclen); 1512321611Smav lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ 1513321611Smav lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ 1514168404Spjd 1515168404Spjd /* 1516168404Spjd * If it's a write, fetch the data or get its blkptr as appropriate. 1517168404Spjd */ 1518168404Spjd if (lrc->lrc_txtype == TX_WRITE) { 1519168404Spjd if (txg > spa_freeze_txg(zilog->zl_spa)) 1520168404Spjd txg_wait_synced(zilog->zl_dmu_pool, txg); 1521168404Spjd if (itx->itx_wr_state != WR_COPIED) { 1522168404Spjd char *dbuf; 1523168404Spjd int error; 1524168404Spjd 1525315441Smav if (itx->itx_wr_state == WR_NEED_COPY) { 1526219089Spjd dbuf = lr_buf + reclen; 1527315441Smav lrcb->lrc_reclen += dnow; 1528315441Smav if (lrwb->lr_length > dnow) 1529315441Smav lrwb->lr_length = dnow; 1530315441Smav lrw->lr_offset += dnow; 1531315441Smav lrw->lr_length -= dnow; 1532168404Spjd } else { 1533168404Spjd ASSERT(itx->itx_wr_state == WR_INDIRECT); 1534168404Spjd dbuf = NULL; 1535168404Spjd } 1536325132Savg 1537325132Savg /* 1538325132Savg * We pass in the "lwb_write_zio" rather than 1539325132Savg * "lwb_root_zio" so that the "lwb_write_zio" 1540325132Savg * becomes the parent of any zio's created by 1541325132Savg * the "zl_get_data" callback. The vdevs are 1542325132Savg * flushed after the "lwb_write_zio" completes, 1543325132Savg * so we want to make sure that completion 1544325132Savg * callback waits for these additional zio's, 1545325132Savg * such that the vdevs used by those zio's will 1546325132Savg * be included in the lwb's vdev tree, and those 1547325132Savg * vdevs will be properly flushed. If we passed 1548325132Savg * in "lwb_root_zio" here, then these additional 1549325132Savg * vdevs may not be flushed; e.g. if these zio's 1550325132Savg * completed after "lwb_write_zio" completed. 1551325132Savg */ 1552325132Savg error = zilog->zl_get_data(itx->itx_private, 1553325132Savg lrwb, dbuf, lwb, lwb->lwb_write_zio); 1554325132Savg 1555214378Smm if (error == EIO) { 1556214378Smm txg_wait_synced(zilog->zl_dmu_pool, txg); 1557214378Smm return (lwb); 1558214378Smm } 1559248571Smm if (error != 0) { 1560168404Spjd ASSERT(error == ENOENT || error == EEXIST || 1561168404Spjd error == EALREADY); 1562168404Spjd return (lwb); 1563168404Spjd } 1564168404Spjd } 1565168404Spjd } 1566168404Spjd 1567219089Spjd /* 1568219089Spjd * We're actually making an entry, so update lrc_seq to be the 1569219089Spjd * log record sequence number. Note that this is generally not 1570219089Spjd * equal to the itx sequence number because not all transactions 1571219089Spjd * are synchronous, and sometimes spa_sync() gets there first. 
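	 *
	 * For example, if itxs A, B and C are created in that order but
	 * B's txg is synced out by spa_sync() before B is committed to
	 * an lwb, only A and C are written here, and they receive
	 * consecutive lrc_seq values despite the skipped itx between
	 * them.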
1572219089Spjd */ 1573325132Savg lrcb->lrc_seq = ++zilog->zl_lr_seq; 1574315441Smav lwb->lwb_nused += reclen + dnow; 1575325132Savg 1576325132Savg zil_lwb_add_txg(lwb, txg); 1577325132Savg 1578219089Spjd ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); 1579240415Smm ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); 1580168404Spjd 1581315441Smav dlen -= dnow; 1582315441Smav if (dlen > 0) { 1583315441Smav zilog->zl_cur_used += reclen; 1584315441Smav goto cont; 1585315441Smav } 1586315441Smav 1587168404Spjd return (lwb); 1588168404Spjd} 1589168404Spjd 1590168404Spjditx_t * 1591185029Spjdzil_itx_create(uint64_t txtype, size_t lrsize) 1592168404Spjd{ 1593168404Spjd itx_t *itx; 1594168404Spjd 1595168404Spjd lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); 1596168404Spjd 1597168404Spjd itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); 1598168404Spjd itx->itx_lr.lrc_txtype = txtype; 1599168404Spjd itx->itx_lr.lrc_reclen = lrsize; 1600168404Spjd itx->itx_lr.lrc_seq = 0; /* defensive */ 1601219089Spjd itx->itx_sync = B_TRUE; /* default is synchronous */ 1602168404Spjd 1603168404Spjd return (itx); 1604168404Spjd} 1605168404Spjd 1606219089Spjdvoid 1607219089Spjdzil_itx_destroy(itx_t *itx) 1608168404Spjd{ 1609219089Spjd kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); 1610219089Spjd} 1611168404Spjd 1612219089Spjd/* 1613219089Spjd * Free up the sync and async itxs. The itxs_t has already been detached 1614219089Spjd * so no locks are needed. 1615219089Spjd */ 1616219089Spjdstatic void 1617219089Spjdzil_itxg_clean(itxs_t *itxs) 1618219089Spjd{ 1619219089Spjd itx_t *itx; 1620219089Spjd list_t *list; 1621219089Spjd avl_tree_t *t; 1622219089Spjd void *cookie; 1623219089Spjd itx_async_node_t *ian; 1624168404Spjd 1625219089Spjd list = &itxs->i_sync_list; 1626219089Spjd while ((itx = list_head(list)) != NULL) { 1627325132Savg /* 1628325132Savg * In the general case, commit itxs will not be found 1629325132Savg * here, as they'll be committed to an lwb via 1630325132Savg * zil_lwb_commit(), and free'd in that function. Having 1631325132Savg * said that, it is still possible for commit itxs to be 1632325132Savg * found here, due to the following race: 1633325132Savg * 1634325132Savg * - a thread calls zil_commit() which assigns the 1635325132Savg * commit itx to a per-txg i_sync_list 1636325132Savg * - zil_itxg_clean() is called (e.g. via spa_sync()) 1637325132Savg * while the waiter is still on the i_sync_list 1638325132Savg * 1639325132Savg * There's nothing to prevent syncing the txg while the 1640325132Savg * waiter is on the i_sync_list. This normally doesn't 1641325132Savg * happen because spa_sync() is slower than zil_commit(), 1642325132Savg * but if zil_commit() calls txg_wait_synced() (e.g. 1643325132Savg * because zil_create() or zil_commit_writer_stall() is 1644325132Savg * called) we will hit this case. 1645325132Savg */ 1646325132Savg if (itx->itx_lr.lrc_txtype == TX_COMMIT) 1647325132Savg zil_commit_waiter_skip(itx->itx_private); 1648325132Savg 1649219089Spjd list_remove(list, itx); 1650325132Savg zil_itx_destroy(itx); 1651219089Spjd } 1652168404Spjd 1653219089Spjd cookie = NULL; 1654219089Spjd t = &itxs->i_async_tree; 1655219089Spjd while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 1656219089Spjd list = &ian->ia_list; 1657219089Spjd while ((itx = list_head(list)) != NULL) { 1658219089Spjd list_remove(list, itx); 1659325132Savg /* commit itxs should never be on the async lists. 
*/ 1660325132Savg ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 1661325132Savg zil_itx_destroy(itx); 1662219089Spjd } 1663219089Spjd list_destroy(list); 1664219089Spjd kmem_free(ian, sizeof (itx_async_node_t)); 1665219089Spjd } 1666219089Spjd avl_destroy(t); 1667219089Spjd 1668219089Spjd kmem_free(itxs, sizeof (itxs_t)); 1669168404Spjd} 1670168404Spjd 1671219089Spjdstatic int 1672219089Spjdzil_aitx_compare(const void *x1, const void *x2) 1673219089Spjd{ 1674219089Spjd const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; 1675219089Spjd const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; 1676219089Spjd 1677339158Smav return (AVL_CMP(o1, o2)); 1678219089Spjd} 1679219089Spjd 1680168404Spjd/* 1681219089Spjd * Remove all async itx with the given oid. 1682168404Spjd */ 1683168404Spjdstatic void 1684219089Spjdzil_remove_async(zilog_t *zilog, uint64_t oid) 1685168404Spjd{ 1686219089Spjd uint64_t otxg, txg; 1687219089Spjd itx_async_node_t *ian; 1688219089Spjd avl_tree_t *t; 1689219089Spjd avl_index_t where; 1690168404Spjd list_t clean_list; 1691168404Spjd itx_t *itx; 1692168404Spjd 1693219089Spjd ASSERT(oid != 0); 1694168404Spjd list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); 1695168404Spjd 1696219089Spjd if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 1697219089Spjd otxg = ZILTEST_TXG; 1698219089Spjd else 1699219089Spjd otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1700219089Spjd 1701219089Spjd for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 1702219089Spjd itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1703219089Spjd 1704219089Spjd mutex_enter(&itxg->itxg_lock); 1705219089Spjd if (itxg->itxg_txg != txg) { 1706219089Spjd mutex_exit(&itxg->itxg_lock); 1707219089Spjd continue; 1708219089Spjd } 1709219089Spjd 1710219089Spjd /* 1711219089Spjd * Locate the object node and append its list. 1712219089Spjd */ 1713219089Spjd t = &itxg->itxg_itxs->i_async_tree; 1714219089Spjd ian = avl_find(t, &oid, &where); 1715219089Spjd if (ian != NULL) 1716219089Spjd list_move_tail(&clean_list, &ian->ia_list); 1717219089Spjd mutex_exit(&itxg->itxg_lock); 1718168404Spjd } 1719219089Spjd while ((itx = list_head(&clean_list)) != NULL) { 1720219089Spjd list_remove(&clean_list, itx); 1721325132Savg /* commit itxs should never be on the async lists. */ 1722325132Savg ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 1723325132Savg zil_itx_destroy(itx); 1724219089Spjd } 1725219089Spjd list_destroy(&clean_list); 1726219089Spjd} 1727168404Spjd 1728219089Spjdvoid 1729219089Spjdzil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) 1730219089Spjd{ 1731219089Spjd uint64_t txg; 1732219089Spjd itxg_t *itxg; 1733219089Spjd itxs_t *itxs, *clean = NULL; 1734219089Spjd 1735168404Spjd /* 1736219089Spjd * Object ids can be re-instantiated in the next txg so 1737219089Spjd * remove any async transactions to avoid future leaks. 1738219089Spjd * This can happen if a fsync occurs on the re-instantiated 1739219089Spjd * object for a WR_INDIRECT or WR_NEED_COPY write, which gets 1740219089Spjd * the new file data and flushes a write record for the old object. 1741168404Spjd */ 1742219089Spjd if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) 1743219089Spjd zil_remove_async(zilog, itx->itx_oid); 1744219089Spjd 1745219089Spjd /* 1746219089Spjd * Ensure the data of a renamed file is committed before the rename. 
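	 *
	 * For example (a sketch of the replay hazard being avoided):
	 *
	 *	write(fd, buf, n);	(async itx for the file's object)
	 *	rename("a", "b");	(sync itx)
	 *	<crash and ZIL replay>
	 *
	 * Replaying the rename without the preceding write could expose
	 * the renamed file without its data, so the object's async itxs
	 * are moved to the sync list before the rename itx is assigned.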
1747219089Spjd	 */
1748219089Spjd	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
1749219089Spjd		zil_async_to_sync(zilog, itx->itx_oid);
1750219089Spjd
1751239620Smm	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
1752219089Spjd		txg = ZILTEST_TXG;
1753219089Spjd	else
1754219089Spjd		txg = dmu_tx_get_txg(tx);
1755219089Spjd
1756219089Spjd	itxg = &zilog->zl_itxg[txg & TXG_MASK];
1757219089Spjd	mutex_enter(&itxg->itxg_lock);
1758219089Spjd	itxs = itxg->itxg_itxs;
1759219089Spjd	if (itxg->itxg_txg != txg) {
1760219089Spjd		if (itxs != NULL) {
1761219089Spjd			/*
1762219089Spjd			 * The zil_clean callback hasn't got around to cleaning
1763219089Spjd			 * this itxg. Save the itxs for release below.
1764219089Spjd			 * This should be rare.
1765219089Spjd			 */
1766321611Smav			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
1767321611Smav			    "txg %llu", itxg->itxg_txg);
1768219089Spjd			clean = itxg->itxg_itxs;
1769219089Spjd		}
1770219089Spjd		itxg->itxg_txg = txg;
1771219089Spjd		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
1772219089Spjd
1773219089Spjd		list_create(&itxs->i_sync_list, sizeof (itx_t),
1774219089Spjd		    offsetof(itx_t, itx_node));
1775219089Spjd		avl_create(&itxs->i_async_tree, zil_aitx_compare,
1776219089Spjd		    sizeof (itx_async_node_t),
1777219089Spjd		    offsetof(itx_async_node_t, ia_node));
1778168404Spjd	}
1779219089Spjd	if (itx->itx_sync) {
1780219089Spjd		list_insert_tail(&itxs->i_sync_list, itx);
1781219089Spjd	} else {
1782219089Spjd		avl_tree_t *t = &itxs->i_async_tree;
1783219089Spjd		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
1784219089Spjd		itx_async_node_t *ian;
1785219089Spjd		avl_index_t where;
1786168404Spjd
1787219089Spjd		ian = avl_find(t, &foid, &where);
1788219089Spjd		if (ian == NULL) {
1789219089Spjd			ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
1790219089Spjd			list_create(&ian->ia_list, sizeof (itx_t),
1791219089Spjd			    offsetof(itx_t, itx_node));
1792219089Spjd			ian->ia_foid = foid;
1793219089Spjd			avl_insert(t, ian, where);
1794219089Spjd		}
1795219089Spjd		list_insert_tail(&ian->ia_list, itx);
1796168404Spjd	}
1797219089Spjd
1798219089Spjd	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
1799325132Savg
1800325132Savg	/*
1801325132Savg	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
1802325132Savg	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
1803325132Savg	 * need to be careful to always dirty the ZIL using the "real"
1804325132Savg	 * TXG (not itxg_txg) even when the SPA is frozen.
1805325132Savg	 */
1806325132Savg	zilog_dirty(zilog, dmu_tx_get_txg(tx));
1807219089Spjd	mutex_exit(&itxg->itxg_lock);
1808219089Spjd
1809219089Spjd	/* Release the old itxs now we've dropped the lock */
1810219089Spjd	if (clean != NULL)
1811219089Spjd		zil_itxg_clean(clean);
1812168404Spjd}
1813168404Spjd
1814168404Spjd/*
1815168404Spjd * If there are any in-memory intent log transactions which have now been
1816239620Smm * synced then start up a taskq to free them. We should only do this after we
1817239620Smm * have written out the uberblocks (i.e. txg has been committed) so that we
1818239620Smm * don't inadvertently clean out in-memory log records that would be required
1819239620Smm * by zil_commit().
1820168404Spjd */ 1821168404Spjdvoid 1822219089Spjdzil_clean(zilog_t *zilog, uint64_t synced_txg) 1823168404Spjd{ 1824219089Spjd itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; 1825219089Spjd itxs_t *clean_me; 1826168404Spjd 1827325132Savg ASSERT3U(synced_txg, <, ZILTEST_TXG); 1828325132Savg 1829219089Spjd mutex_enter(&itxg->itxg_lock); 1830219089Spjd if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { 1831219089Spjd mutex_exit(&itxg->itxg_lock); 1832219089Spjd return; 1833168404Spjd } 1834219089Spjd ASSERT3U(itxg->itxg_txg, <=, synced_txg); 1835324205Savg ASSERT3U(itxg->itxg_txg, !=, 0); 1836219089Spjd clean_me = itxg->itxg_itxs; 1837219089Spjd itxg->itxg_itxs = NULL; 1838219089Spjd itxg->itxg_txg = 0; 1839219089Spjd mutex_exit(&itxg->itxg_lock); 1840219089Spjd /* 1841219089Spjd * Preferably start a task queue to free up the old itxs but 1842219089Spjd * if taskq_dispatch can't allocate resources to do that then 1843219089Spjd * free it in-line. This should be rare. Note, using TQ_SLEEP 1844219089Spjd * created a bad performance problem. 1845219089Spjd */ 1846324205Savg ASSERT3P(zilog->zl_dmu_pool, !=, NULL); 1847324205Savg ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); 1848324205Savg if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, 1849219089Spjd (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) 1850219089Spjd zil_itxg_clean(clean_me); 1851168404Spjd} 1852168404Spjd 1853219089Spjd/* 1854325132Savg * This function will traverse the queue of itxs that need to be 1855325132Savg * committed, and move them onto the ZIL's zl_itx_commit_list. 1856219089Spjd */ 1857185029Spjdstatic void 1858219089Spjdzil_get_commit_list(zilog_t *zilog) 1859168404Spjd{ 1860219089Spjd uint64_t otxg, txg; 1861219089Spjd list_t *commit_list = &zilog->zl_itx_commit_list; 1862219089Spjd 1863329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1864325132Savg 1865219089Spjd if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 1866219089Spjd otxg = ZILTEST_TXG; 1867219089Spjd else 1868219089Spjd otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1869219089Spjd 1870310515Savg /* 1871310515Savg * This is inherently racy, since there is nothing to prevent 1872310515Savg * the last synced txg from changing. That's okay since we'll 1873310515Savg * only commit things in the future. 1874310515Savg */ 1875219089Spjd for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 1876219089Spjd itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1877219089Spjd 1878219089Spjd mutex_enter(&itxg->itxg_lock); 1879219089Spjd if (itxg->itxg_txg != txg) { 1880219089Spjd mutex_exit(&itxg->itxg_lock); 1881219089Spjd continue; 1882219089Spjd } 1883219089Spjd 1884310515Savg /* 1885310515Savg * If we're adding itx records to the zl_itx_commit_list, 1886310515Savg * then the zil better be dirty in this "txg". We can assert 1887310515Savg * that here since we're holding the itxg_lock which will 1888310515Savg * prevent spa_sync from cleaning it. Once we add the itxs 1889310515Savg * to the zl_itx_commit_list we must commit it to disk even 1890310515Savg * if it's unnecessary (i.e. the txg was synced). 
1891310515Savg		 */
1892310515Savg		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
1893310515Savg		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
1894219089Spjd		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
1895219089Spjd
1896219089Spjd		mutex_exit(&itxg->itxg_lock);
1897219089Spjd	}
1898219089Spjd}
1899219089Spjd
1900219089Spjd/*
1901219089Spjd * Move the async itxs for a specified object onto the sync lists, so
1902219089Spjd * they can be committed.
1903219089Spjd */
1903308595Smavvoid
1904219089Spjdzil_async_to_sync(zilog_t *zilog, uint64_t foid)
1905219089Spjd{
1906219089Spjd	uint64_t otxg, txg;
1907219089Spjd	itx_async_node_t *ian;
1908219089Spjd	avl_tree_t *t;
1909219089Spjd	avl_index_t where;
1910219089Spjd
1911219089Spjd	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
1912219089Spjd		otxg = ZILTEST_TXG;
1913219089Spjd	else
1914219089Spjd		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
1915219089Spjd
1916310515Savg	/*
1917310515Savg	 * This is inherently racy, since there is nothing to prevent
1918310515Savg	 * the last synced txg from changing.
1919310515Savg	 */
1920219089Spjd	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
1921219089Spjd		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
1922219089Spjd
1923219089Spjd		mutex_enter(&itxg->itxg_lock);
1924219089Spjd		if (itxg->itxg_txg != txg) {
1925219089Spjd			mutex_exit(&itxg->itxg_lock);
1926219089Spjd			continue;
1927219089Spjd		}
1928219089Spjd
1929219089Spjd		/*
1930219089Spjd		 * If a foid is specified then find that node and append its
1931219089Spjd		 * list. Otherwise walk the tree appending all the lists
1932219089Spjd		 * to the sync list. We add to the end rather than the
1933219089Spjd		 * beginning to ensure the create has happened.
1934219089Spjd		 */
1935219089Spjd		t = &itxg->itxg_itxs->i_async_tree;
1936219089Spjd		if (foid != 0) {
1937219089Spjd			ian = avl_find(t, &foid, &where);
1938219089Spjd			if (ian != NULL) {
1939219089Spjd				list_move_tail(&itxg->itxg_itxs->i_sync_list,
1940219089Spjd				    &ian->ia_list);
1941219089Spjd			}
1942219089Spjd		} else {
1943219089Spjd			void *cookie = NULL;
1944219089Spjd
1945219089Spjd			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
1946219089Spjd				list_move_tail(&itxg->itxg_itxs->i_sync_list,
1947219089Spjd				    &ian->ia_list);
1948219089Spjd				list_destroy(&ian->ia_list);
1949219089Spjd				kmem_free(ian, sizeof (itx_async_node_t));
1950219089Spjd			}
1951219089Spjd		}
1952219089Spjd		mutex_exit(&itxg->itxg_lock);
1953219089Spjd	}
1954219089Spjd}
1955219089Spjd
1956325132Savg/*
1957325132Savg * This function will prune commit itxs that are at the head of the
1958325132Savg * commit list (it won't prune past the first non-commit itx), and
1959325132Savg * either: a) attach them to the last lwb that's still pending
1960325132Savg * completion, or b) skip them altogether.
1961325132Savg *
1962325132Savg * This is used as a performance optimization to prevent commit itxs
1963325132Savg * from generating new lwbs when it's unnecessary to do so.
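 *
 * For example, given a commit list of [C1, C2, W1, C3], where the C's
 * are commit itxs and W1 is a TX_WRITE itx: C1 and C2 are pruned (each
 * is either linked to the last lwb still pending completion, or
 * skipped outright), while C3 is left in place because it follows a
 * non-commit itx.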
1964325132Savg */
1965219089Spjdstatic void
1966325132Savgzil_prune_commit_list(zilog_t *zilog)
1967219089Spjd{
1968219089Spjd	itx_t *itx;
1969168404Spjd
1970329485Smav	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
1971168404Spjd
1972325132Savg	while (itx = list_head(&zilog->zl_itx_commit_list)) {
1973325132Savg		lr_t *lrc = &itx->itx_lr;
1974325132Savg		if (lrc->lrc_txtype != TX_COMMIT)
1975325132Savg			break;
1976219089Spjd
1977325132Savg		mutex_enter(&zilog->zl_lock);
1978219089Spjd
1979325132Savg		lwb_t *last_lwb = zilog->zl_last_lwb_opened;
1980325132Savg		if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) {
1981325132Savg			/*
1982325132Savg			 * All of the itxs this waiter was waiting on
1983325132Savg			 * must have already completed (or there were
1984325132Savg			 * never any itx's for it to wait on), so it's
1985325132Savg			 * safe to skip this waiter and mark it done.
1986325132Savg			 */
1987325132Savg			zil_commit_waiter_skip(itx->itx_private);
1988325132Savg		} else {
1989325132Savg			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
1990325132Savg			itx->itx_private = NULL;
1991325132Savg		}
1992325132Savg
1993325132Savg		mutex_exit(&zilog->zl_lock);
1994325132Savg
1995325132Savg		list_remove(&zilog->zl_itx_commit_list, itx);
1996325132Savg		zil_itx_destroy(itx);
1997325132Savg	}
1998325132Savg
1999325132Savg	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
2000325132Savg}
2001325132Savg
2002325132Savgstatic void
2003325132Savgzil_commit_writer_stall(zilog_t *zilog)
2004325132Savg{
2005219089Spjd	/*
2006325132Savg	 * When zio_alloc_zil() fails to allocate the next lwb block on
2007325132Savg	 * disk, we must call txg_wait_synced() to ensure all of the
2008325132Savg	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
2009325132Savg	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
2010325132Savg	 * to zil_process_commit_list()) will have to call zil_create(),
2011325132Savg	 * and start a new ZIL chain.
2012325132Savg	 *
2013325132Savg	 * Since zio_alloc_zil() failed, the lwb that was previously
2014325132Savg	 * issued does not have a pointer to the "next" lwb on disk.
2015325132Savg	 * Thus, if another ZIL writer thread was to allocate the "next"
2016325132Savg	 * on-disk lwb, that block could be leaked in the event of a
2017325132Savg	 * crash (because the previous lwb on-disk would not point to
2018325132Savg	 * it).
2019325132Savg	 *
2020329485Smav	 * We must hold the zilog's zl_issuer_lock while we do this, to
2021325132Savg	 * ensure no new threads enter zil_process_commit_list() until
2022325132Savg	 * all lwb's in the zl_lwb_list have been synced and freed
2023325132Savg	 * (which is achieved via the txg_wait_synced() call).
2024325132Savg	 */
2025329485Smav	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
2026325132Savg	txg_wait_synced(zilog->zl_dmu_pool, 0);
2027325132Savg	ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
2028325132Savg}
2029325132Savg
2030325132Savg/*
2031325132Savg * This function will traverse the commit list, creating new lwbs as
2032325132Savg * needed, and committing the itxs from the commit list to these newly
2033325132Savg * created lwbs. Additionally, as a new lwb is created, the previous
2034325132Savg * lwb will be issued to the zio layer to be written to disk.
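 *
 * For example, when committing itxs I1..I4 where I3 does not fit into
 * the current lwb L1: I1 and I2 are copied into L1 by
 * zil_lwb_commit(), which then issues L1 via zil_lwb_write_issue()
 * (receiving a freshly allocated L2 in return) and copies I3 and I4
 * into L2.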
2035325132Savg */
2036325132Savgstatic void
2037325132Savgzil_process_commit_list(zilog_t *zilog)
2038325132Savg{
2039325132Savg	spa_t *spa = zilog->zl_spa;
2040325132Savg	list_t nolwb_waiters;
2041325132Savg	lwb_t *lwb;
2042325132Savg	itx_t *itx;
2043325132Savg
2044329485Smav	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
2045325132Savg
2046325132Savg	/*
2047219089Spjd	 * Return if there's nothing to commit before we dirty the fs by
2048219089Spjd	 * calling zil_create().
2049219089Spjd	 */
2050325132Savg	if (list_head(&zilog->zl_itx_commit_list) == NULL)
2051219089Spjd		return;
2052219089Spjd
2053325132Savg	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
2054325132Savg	    offsetof(zil_commit_waiter_t, zcw_node));
2055325132Savg
2056325132Savg	lwb = list_tail(&zilog->zl_lwb_list);
2057325132Savg	if (lwb == NULL) {
2058325132Savg		lwb = zil_create(zilog);
2059168404Spjd	} else {
2060325132Savg		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
2061325132Savg		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
2062168404Spjd	}
2063168404Spjd
2064219089Spjd	while (itx = list_head(&zilog->zl_itx_commit_list)) {
2065325132Savg		lr_t *lrc = &itx->itx_lr;
2066325132Savg		uint64_t txg = lrc->lrc_txg;
2067325132Savg
2068310515Savg		ASSERT3U(txg, !=, 0);
2069168404Spjd
2070325132Savg		if (lrc->lrc_txtype == TX_COMMIT) {
2071325132Savg			DTRACE_PROBE2(zil__process__commit__itx,
2072325132Savg			    zilog_t *, zilog, itx_t *, itx);
2073325132Savg		} else {
2074325132Savg			DTRACE_PROBE2(zil__process__normal__itx,
2075325132Savg			    zilog_t *, zilog, itx_t *, itx);
2076325132Savg		}
2077325132Savg
2078325132Savg		boolean_t synced = txg <= spa_last_synced_txg(spa);
2079325132Savg		boolean_t frozen = txg > spa_freeze_txg(spa);
2080325132Savg
2081329486Smav		/*
2082329486Smav		 * If the txg of this itx has already been synced out, then
2083329486Smav		 * we don't need to commit this itx to an lwb. This is
2084329486Smav		 * because the data of this itx will have already been
2085329486Smav		 * written to the main pool. This is inherently racy, and
2086329486Smav		 * it's still ok to commit an itx whose txg has already
2087329486Smav		 * been synced; this will result in a write that's
2088329486Smav		 * unnecessary, but will do no harm.
2089329486Smav		 *
2090329486Smav		 * With that said, we always want to commit TX_COMMIT itxs
2091329486Smav		 * to an lwb, regardless of whether or not that itx's txg
2092329486Smav		 * has been synced out. We do this to ensure any OPENED lwb
2093329486Smav		 * will always have at least one zil_commit_waiter_t linked
2094329486Smav		 * to the lwb.
2095329486Smav		 *
2096329486Smav		 * As a counter-example, if we skipped TX_COMMIT itx's
2097329486Smav		 * whose txg had already been synced, the following
2098329486Smav		 * situation could occur if we happened to be racing with
2099329486Smav		 * spa_sync:
2100329486Smav		 *
2101329486Smav		 * 1. we commit a non-TX_COMMIT itx to an lwb, where the
2102329486Smav		 *    itx's txg is 10 and the last synced txg is 9.
2103329486Smav		 * 2. spa_sync finishes syncing out txg 10.
2104329486Smav		 * 3. we move to the next itx in the list, it's a TX_COMMIT
2105329486Smav		 *    whose txg is 10, so we skip it rather than committing
2106329486Smav		 *    it to the lwb used in (1).
2107329486Smav		 *
2108329486Smav		 * If the itx that is skipped in (3) is the last TX_COMMIT
2109329486Smav		 * itx in the commit list, then it's possible for the lwb
2110329486Smav		 * used in (1) to remain in the OPENED state indefinitely.
2111329486Smav		 *
2112329486Smav		 * To prevent the above scenario from occurring, ensuring
2113329486Smav		 * that once an lwb is OPENED it will transition to ISSUED
2114329486Smav		 * and eventually DONE, we always commit TX_COMMIT itx's to
2115329486Smav		 * an lwb here, even if that itx's txg has already been
2116329486Smav		 * synced.
2117329486Smav		 *
2118329486Smav		 * Finally, if the pool is frozen, we _always_ commit the
2119329486Smav		 * itx.  The point of freezing the pool is to prevent data
2120329486Smav		 * from being written to the main pool via spa_sync, and
2121329486Smav		 * instead rely solely on the ZIL to persistently store the
2122329486Smav		 * data; i.e. when the pool is frozen, the last synced txg
2123329486Smav		 * value can't be trusted.
2124329486Smav		 */
2125329486Smav		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
2126325132Savg			if (lwb != NULL) {
2127325132Savg				lwb = zil_lwb_commit(zilog, itx, lwb);
2128325132Savg			} else if (lrc->lrc_txtype == TX_COMMIT) {
2129325132Savg				ASSERT3P(lwb, ==, NULL);
2130325132Savg				zil_commit_waiter_link_nolwb(
2131325132Savg				    itx->itx_private, &nolwb_waiters);
2132325132Savg			}
2133325132Savg		}
2134325132Savg
2135219089Spjd		list_remove(&zilog->zl_itx_commit_list, itx);
2136325132Savg		zil_itx_destroy(itx);
2137168404Spjd	}
2138168404Spjd
2139325132Savg	if (lwb == NULL) {
2140325132Savg		/*
2141325132Savg		 * This indicates zio_alloc_zil() failed to allocate the
2142325132Savg		 * "next" lwb on-disk. When this happens, we must stall
2143325132Savg		 * the ZIL write pipeline; see the comment within
2144325132Savg		 * zil_commit_writer_stall() for more details.
2145325132Savg		 */
2146325132Savg		zil_commit_writer_stall(zilog);
2147168404Spjd
2148325132Savg		/*
2149325132Savg		 * Additionally, we have to signal and mark the "nolwb"
2150325132Savg		 * waiters as "done" here, since without an lwb, we
2151325132Savg		 * can't do this via zil_lwb_flush_vdevs_done() like
2152325132Savg		 * normal.
2153325132Savg		 */
2154325132Savg		zil_commit_waiter_t *zcw;
2155325132Savg		while (zcw = list_head(&nolwb_waiters)) {
2156325132Savg			zil_commit_waiter_skip(zcw);
2157325132Savg			list_remove(&nolwb_waiters, zcw);
2158325132Savg		}
2159325132Savg	} else {
2160325132Savg		ASSERT(list_is_empty(&nolwb_waiters));
2161325132Savg		ASSERT3P(lwb, !=, NULL);
2162325132Savg		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
2163325132Savg		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
2164168404Spjd
2165325132Savg		/*
2166325132Savg		 * At this point, the ZIL block pointed at by the "lwb"
2167325132Savg		 * variable is in one of the following states: "closed"
2168325132Savg		 * or "open".
2169325132Savg		 *
2170325132Savg		 * If it's "closed", then no itxs have been committed to
2171325132Savg		 * it, so there's no point in issuing its zio (i.e.
2172325132Savg		 * it's "empty").
2173325132Savg		 *
2174325132Savg		 * If it's in the "open" state, then it contains one or more
2175325132Savg		 * itxs that eventually need to be committed to stable
2176325132Savg		 * storage. In this case we intentionally do not issue
2177325132Savg		 * the lwb's zio to disk yet, and instead rely on one of
2178325132Savg		 * the following two mechanisms for issuing the zio:
2179325132Savg		 *
2180325132Savg		 * 1. Ideally, there will be more ZIL activity occurring
2181325132Savg		 * on the system, such that this function will be
2182325132Savg		 * immediately called again (not necessarily by the same
2183325132Savg		 * thread) and this lwb's zio will be issued via
2184325132Savg		 * zil_lwb_commit().
This way, the lwb is guaranteed to
2185325132Savg		 * be "full" when it is issued to disk, and we'll make
2186325132Savg		 * use of the lwb's size the best we can.
2187325132Savg		 *
2188325132Savg		 * 2. If there isn't sufficient ZIL activity occurring on
2189325132Savg		 * the system, such that this lwb's zio isn't issued via
2190325132Savg		 * zil_lwb_commit(), zil_commit_waiter() will issue the
2191325132Savg		 * lwb's zio. If this occurs, the lwb is not guaranteed
2192325132Savg		 * to be "full" by the time its zio is issued, which means
2193325132Savg		 * the size of the lwb was "too large" given the amount
2194325132Savg		 * of ZIL activity occurring on the system at that time.
2195325132Savg		 *
2196325132Savg		 * We do this for a couple of reasons:
2197325132Savg		 *
2198325132Savg		 * 1. To try and reduce the number of IOPs needed to
2199325132Savg		 * write the same number of itxs. If an lwb has space
2200325132Savg		 * available in its buffer for more itxs, and more itxs
2201325132Savg		 * will be committed relatively soon (relative to the
2202325132Savg		 * latency of performing a write), then it's beneficial
2203325132Savg		 * to wait for these "next" itxs. This way, more itxs
2204325132Savg		 * can be committed to stable storage with fewer writes.
2205325132Savg		 *
2206325132Savg		 * 2. To try and use the largest lwb block size that the
2207325132Savg		 * incoming rate of itxs can support. Again, this is to
2208325132Savg		 * try and pack as many itxs into as few lwbs as
2209325132Savg		 * possible, without significantly impacting the latency
2210325132Savg		 * of each individual itx.
2211325132Savg		 */
2212325132Savg	}
2213325132Savg}
2214325132Savg
2215325132Savg/*
2216325132Savg * This function is responsible for ensuring the passed in commit waiter
2217325132Savg * (and associated commit itx) is committed to an lwb. If the waiter is
2218325132Savg * not already committed to an lwb, all itxs in the zilog's queue of
2219325132Savg * itxs will be processed. The assumption is the passed in waiter's
2220325132Savg * commit itx will be found in the queue just like the other non-commit
2221325132Savg * itxs, such that when the entire queue is processed, the waiter will
2222325132Savg * have been committed to an lwb.
2223325132Savg *
2224325132Savg * The lwb associated with the passed in waiter is not guaranteed to
2225325132Savg * have been issued by the time this function completes. If the lwb is
2226325132Savg * not issued, we rely on future calls to zil_commit_writer() to issue
2227325132Savg * the lwb, or the timeout mechanism found in zil_commit_waiter().
2228325132Savg */
2229325132Savgstatic void
2230325132Savgzil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
2231325132Savg{
2232325132Savg	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
2233325132Savg	ASSERT(spa_writeable(zilog->zl_spa));
2234325132Savg
2235329485Smav	mutex_enter(&zilog->zl_issuer_lock);
2236325132Savg
2237325132Savg	if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
2238325132Savg		/*
2239325132Savg		 * It's possible that, while we were waiting to acquire
2240329485Smav		 * the "zl_issuer_lock", another thread committed this
2241325132Savg		 * waiter to an lwb. If that occurs, we bail out early,
2242325132Savg		 * without processing any of the zilog's queue of itxs.
2243325132Savg		 *
2244325132Savg		 * On certain workloads and system configurations, the
2245329485Smav		 * "zl_issuer_lock" can become highly contended. In an
2246325132Savg		 * attempt to reduce this contention, we immediately drop
2247325132Savg		 * the lock if the waiter has already been processed.
2248325132Savg		 *
2249325132Savg		 * We've measured this optimization to reduce CPU spent
2250325132Savg		 * contending on this lock by up to 5%, using a system
2251325132Savg		 * with 32 CPUs, low latency storage (~50 usec writes),
2252325132Savg		 * and 1024 threads performing sync writes.
2253325132Savg		 */
2254325132Savg		goto out;
2255325132Savg	}
2256325132Savg
2257325132Savg	zil_get_commit_list(zilog);
2258325132Savg	zil_prune_commit_list(zilog);
2259325132Savg	zil_process_commit_list(zilog);
2260325132Savg
2261325132Savgout:
2262329485Smav	mutex_exit(&zilog->zl_issuer_lock);
2263325132Savg}
2264325132Savg
2265325132Savgstatic void
2266325132Savgzil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
2267325132Savg{
2268329485Smav	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
2269325132Savg	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
2270325132Savg	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
2271325132Savg
2272325132Savg	lwb_t *lwb = zcw->zcw_lwb;
2273325132Savg	ASSERT3P(lwb, !=, NULL);
2274325132Savg	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
2275325132Savg
2276168404Spjd	/*
2277325132Savg	 * If the lwb has already been issued by another thread, we can
2278325132Savg	 * immediately return since there's no work to be done (the
2279325132Savg	 * point of this function is to issue the lwb). Additionally, we
2280329485Smav	 * do this prior to acquiring the zl_issuer_lock, to avoid
2281325132Savg	 * acquiring it when it's not necessary to do so.
2282168404Spjd	 */
2283325132Savg	if (lwb->lwb_state == LWB_STATE_ISSUED ||
2284325132Savg	    lwb->lwb_state == LWB_STATE_DONE)
2285325132Savg		return;
2286325132Savg
2287325132Savg	/*
2288325132Savg	 * In order to call zil_lwb_write_issue() we must hold the
2289329485Smav	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
2290325132Savg	 * since we're already holding the commit waiter's "zcw_lock",
2291325132Savg	 * and those two locks are acquired in the opposite order
2292325132Savg	 * elsewhere.
2293325132Savg	 */
2294325132Savg	mutex_exit(&zcw->zcw_lock);
2295329485Smav	mutex_enter(&zilog->zl_issuer_lock);
2296325132Savg	mutex_enter(&zcw->zcw_lock);
2297325132Savg
2298325132Savg	/*
2299325132Savg	 * Since we just dropped and re-acquired the commit waiter's
2300325132Savg	 * lock, we have to re-check to see if the waiter was marked
2301325132Savg	 * "done" during that process. If the waiter was marked "done",
2302325132Savg	 * the "lwb" pointer is no longer valid (it can be free'd after
2303325132Savg	 * the waiter is marked "done"), so without this check we could
2304325132Savg	 * wind up with a use-after-free error below.
2305325132Savg	 */
2306325132Savg	if (zcw->zcw_done)
2307325132Savg		goto out;
2308325132Savg
2309325132Savg	ASSERT3P(lwb, ==, zcw->zcw_lwb);
2310325132Savg
2311325132Savg	/*
2312329486Smav	 * We've already checked this above, but since we hadn't acquired
2313329486Smav	 * the zilog's zl_issuer_lock, we have to perform this check a
2314329486Smav	 * second time while holding the lock.
2315329486Smav	 *
2316329486Smav	 * We don't need to hold the zl_lock since the lwb cannot transition
2317329486Smav	 * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
2318329486Smav	 * _can_ transition from ISSUED to DONE, but it's OK to race with
2319329486Smav	 * that transition since we treat the lwb the same, whether it's in
2320329486Smav	 * the ISSUED or DONE states.
2321329486Smav	 *
2322329486Smav	 * The important thing is that we treat the lwb differently depending on
2323329486Smav	 * if it's ISSUED or OPENED, and block any other threads that might
2324329486Smav	 * attempt to issue this lwb. For that reason we hold the
2325329486Smav	 * zl_issuer_lock when checking the lwb_state; we must not call
2326325132Savg	 * zil_lwb_write_issue() if the lwb had already been issued.
2327329486Smav	 *
2328329486Smav	 * See the comment above the lwb_state_t structure definition for
2329329486Smav	 * more details on the lwb states, and locking requirements.
2330325132Savg	 */
2331325132Savg	if (lwb->lwb_state == LWB_STATE_ISSUED ||
2332325132Savg	    lwb->lwb_state == LWB_STATE_DONE)
2333325132Savg		goto out;
2334325132Savg
2335325132Savg	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
2336325132Savg
2337325132Savg	/*
2338325132Savg	 * As described in the comments above zil_commit_waiter() and
2339325132Savg	 * zil_process_commit_list(), we need to issue this lwb's zio
2340325132Savg	 * since we've reached the commit waiter's timeout and it still
2341325132Savg	 * hasn't been issued.
2342325132Savg	 */
2343325132Savg	lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
2344325132Savg
2345339134Smav	IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
2346325132Savg
2347325132Savg	/*
2348325132Savg	 * Since the lwb's zio hadn't been issued by the time this thread
2349325132Savg	 * reached its timeout, we reset the zilog's "zl_cur_used" field
2350325132Savg	 * to influence the zil block size selection algorithm.
2351325132Savg	 *
2352325132Savg	 * By having to issue the lwb's zio here, it means the size of the
2353325132Savg	 * lwb was too large, given the incoming throughput of itxs. By
2354325132Savg	 * setting "zl_cur_used" to zero, we communicate this fact to the
2355325132Savg	 * block size selection algorithm, so it can take this information
2356325132Savg	 * into account, and potentially select a smaller size for the
2357325132Savg	 * next lwb block that is allocated.
2358325132Savg	 */
2359325132Savg	zilog->zl_cur_used = 0;
2360325132Savg
2361325132Savg	if (nlwb == NULL) {
2362325132Savg		/*
2363325132Savg		 * When zil_lwb_write_issue() returns NULL, this
2364325132Savg		 * indicates zio_alloc_zil() failed to allocate the
2365325132Savg		 * "next" lwb on-disk. When this occurs, the ZIL write
2366325132Savg		 * pipeline must be stalled; see the comment within the
2367325132Savg		 * zil_commit_writer_stall() function for more details.
2368325132Savg		 *
2369325132Savg		 * We must drop the commit waiter's lock prior to
2370325132Savg		 * calling zil_commit_writer_stall() or else we can wind
2371325132Savg		 * up with the following deadlock:
2372325132Savg		 *
2373325132Savg		 * - This thread is waiting for the txg to sync while
2374325132Savg		 *   holding the waiter's lock; txg_wait_synced() is
2375325132Savg		 *   used within zil_commit_writer_stall().
2376325132Savg		 *
2377325132Savg		 * - The txg can't sync because it is waiting for this
2378325132Savg		 *   lwb's zio callback to call dmu_tx_commit().
2379325132Savg		 *
2380325132Savg		 * - The lwb's zio callback can't call dmu_tx_commit()
2381325132Savg		 *   because it's blocked trying to acquire the waiter's
2382325132Savg		 *   lock, which occurs prior to calling dmu_tx_commit().
2383325132Savg		 */
2384325132Savg		mutex_exit(&zcw->zcw_lock);
2385325132Savg		zil_commit_writer_stall(zilog);
2386325132Savg		mutex_enter(&zcw->zcw_lock);
2387168404Spjd	}
2388168404Spjd
2389325132Savgout:
2390329485Smav	mutex_exit(&zilog->zl_issuer_lock);
2391325132Savg	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
2392325132Savg}
2393168404Spjd
2394325132Savg/*
2395325132Savg * This function is responsible for performing the following two tasks:
2396325132Savg *
2397325132Savg * 1. its primary responsibility is to block until the given "commit
2398325132Savg *    waiter" is considered "done".
2399325132Savg *
2400325132Savg * 2. its secondary responsibility is to issue the zio for the lwb that
2401325132Savg *    the given "commit waiter" is waiting on, if this function has
2402325132Savg *    waited "long enough" and the lwb is still in the "open" state.
2403325132Savg *
2404325132Savg * Given a sufficient number of itxs being generated and written using
2405325132Savg * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
2406325132Savg * function. If this does not occur, this secondary responsibility will
2407325132Savg * ensure the lwb is issued even if there is no other synchronous
2408325132Savg * activity on the system.
2409325132Savg *
2410325132Savg * For more details, see zil_process_commit_list(); more specifically,
2411325132Savg * the comment at the bottom of that function.
2412325132Savg */
2413325132Savgstatic void
2414325132Savgzil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
2415325132Savg{
2416325132Savg	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
2417329485Smav	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
2418325132Savg	ASSERT(spa_writeable(zilog->zl_spa));
2419168404Spjd
2420325132Savg	mutex_enter(&zcw->zcw_lock);
2421325132Savg
2422219089Spjd	/*
2423325132Savg	 * The timeout is scaled based on the lwb latency to avoid
2424325132Savg	 * significantly impacting the latency of each individual itx.
2425325132Savg	 * For more details, see the comment at the bottom of the
2426325132Savg	 * zil_process_commit_list() function.
2427219089Spjd	 */
2428325132Savg	int pct = MAX(zfs_commit_timeout_pct, 1);
2429325132Savg#if defined(illumos) || !defined(_KERNEL)
2430325132Savg	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
2431325132Savg	hrtime_t wakeup = gethrtime() + sleep;
2432325132Savg#else
2433325132Savg	sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100);
2434325132Savg	sbintime_t wakeup = getsbinuptime() + sleep;
2435325132Savg#endif
2436325132Savg	boolean_t timedout = B_FALSE;
2437325132Savg
2438325132Savg	while (!zcw->zcw_done) {
2439325132Savg		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
2440325132Savg
2441325132Savg		lwb_t *lwb = zcw->zcw_lwb;
2442325132Savg
2443325132Savg		/*
2444325132Savg		 * Usually, the waiter will have a non-NULL lwb field here,
2445325132Savg		 * but it's possible for it to be NULL as a result of
2446325132Savg		 * zil_commit() racing with spa_sync().
2447325132Savg		 *
2448325132Savg		 * When zil_clean() is called, it's possible for the itxg
2449325132Savg		 * list (which may be cleaned via a taskq) to contain
2450325132Savg		 * commit itxs. When this occurs, the commit waiters linked
2451325132Savg		 * off of these commit itxs will not be committed to an
2452325132Savg		 * lwb.
Additionally, these commit waiters will not be
2453325132Savg		 * marked done until zil_commit_waiter_skip() is called via
2454325132Savg		 * zil_itxg_clean().
2455325132Savg		 *
2456325132Savg		 * Thus, it's possible for this commit waiter (i.e. the
2457325132Savg		 * "zcw" variable) to be found in this "in between" state;
2458325132Savg		 * where its "zcw_lwb" field is NULL, and it hasn't yet
2459325132Savg		 * been skipped, so its "zcw_done" field is still B_FALSE.
2460325132Savg		 */
2461325132Savg		IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
2462325132Savg
2463325132Savg		if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
2464325132Savg			ASSERT3B(timedout, ==, B_FALSE);
2465325132Savg
2466325132Savg			/*
2467325132Savg			 * If the lwb hasn't been issued yet, then we
2468325132Savg			 * need to wait with a timeout, in case this
2469325132Savg			 * function needs to issue the lwb after the
2470325132Savg			 * timeout is reached; responsibility (2) from
2471325132Savg			 * the comment above this function.
2472325132Savg			 */
2473325132Savg#if defined(illumos) || !defined(_KERNEL)
2474325132Savg			clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv,
2475325132Savg			    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
2476325132Savg			    CALLOUT_FLAG_ABSOLUTE);
2477325132Savg
2478325132Savg			if (timeleft >= 0 || zcw->zcw_done)
2479325132Savg				continue;
2480325132Savg#else
2481325132Savg			int wait_err = cv_timedwait_sbt(&zcw->zcw_cv,
2482325132Savg			    &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE);
2483325132Savg			if (wait_err != EWOULDBLOCK || zcw->zcw_done)
2484325132Savg				continue;
2485325132Savg#endif
2486325132Savg
2487325132Savg			timedout = B_TRUE;
2488325132Savg			zil_commit_waiter_timeout(zilog, zcw);
2489325132Savg
2490325132Savg			if (!zcw->zcw_done) {
2491325132Savg				/*
2492325132Savg				 * If the commit waiter has already been
2493325132Savg				 * marked "done", it's possible for the
2494325132Savg				 * waiter's lwb structure to have already
2495325132Savg				 * been freed.  Thus, we can only reliably
2496325132Savg				 * make these assertions if the waiter
2497325132Savg				 * isn't done.
2498325132Savg				 */
2499325132Savg				ASSERT3P(lwb, ==, zcw->zcw_lwb);
2500325132Savg				ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
2501325132Savg			}
2502325132Savg		} else {
2503325132Savg			/*
2504325132Savg			 * If the lwb isn't open, then it must have already
2505325132Savg			 * been issued. In that case, there's no need to
2506325132Savg			 * use a timeout when waiting for the lwb to
2507325132Savg			 * complete.
2508325132Savg			 *
2509325132Savg			 * Additionally, if the lwb is NULL, the waiter
2510325132Savg			 * will soon be signalled and marked done via
2511325132Savg			 * zil_clean() and zil_itxg_clean(), so no timeout
2512325132Savg			 * is required.
2513325132Savg */ 2514325132Savg 2515325132Savg IMPLY(lwb != NULL, 2516325132Savg lwb->lwb_state == LWB_STATE_ISSUED || 2517325132Savg lwb->lwb_state == LWB_STATE_DONE); 2518325132Savg cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); 2519325132Savg } 2520325132Savg } 2521325132Savg 2522325132Savg mutex_exit(&zcw->zcw_lock); 2523168404Spjd} 2524168404Spjd 2525325132Savgstatic zil_commit_waiter_t * 2526325132Savgzil_alloc_commit_waiter() 2527325132Savg{ 2528325132Savg zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); 2529325132Savg 2530325132Savg cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); 2531325132Savg mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); 2532325132Savg list_link_init(&zcw->zcw_node); 2533325132Savg zcw->zcw_lwb = NULL; 2534325132Savg zcw->zcw_done = B_FALSE; 2535325132Savg zcw->zcw_zio_error = 0; 2536325132Savg 2537325132Savg return (zcw); 2538325132Savg} 2539325132Savg 2540325132Savgstatic void 2541325132Savgzil_free_commit_waiter(zil_commit_waiter_t *zcw) 2542325132Savg{ 2543325132Savg ASSERT(!list_link_active(&zcw->zcw_node)); 2544325132Savg ASSERT3P(zcw->zcw_lwb, ==, NULL); 2545325132Savg ASSERT3B(zcw->zcw_done, ==, B_TRUE); 2546325132Savg mutex_destroy(&zcw->zcw_lock); 2547325132Savg cv_destroy(&zcw->zcw_cv); 2548325132Savg kmem_cache_free(zil_zcw_cache, zcw); 2549325132Savg} 2550325132Savg 2551168404Spjd/* 2552325132Savg * This function is used to create a TX_COMMIT itx and assign it. This 2553325132Savg * way, it will be linked into the ZIL's list of synchronous itxs, and 2554325132Savg * then later committed to an lwb (or skipped) when 2555325132Savg * zil_process_commit_list() is called. 2556325132Savg */ 2557325132Savgstatic void 2558325132Savgzil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) 2559325132Savg{ 2560325132Savg dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); 2561325132Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 2562325132Savg 2563325132Savg itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); 2564325132Savg itx->itx_sync = B_TRUE; 2565325132Savg itx->itx_private = zcw; 2566325132Savg 2567325132Savg zil_itx_assign(zilog, itx, tx); 2568325132Savg 2569325132Savg dmu_tx_commit(tx); 2570325132Savg} 2571325132Savg 2572325132Savg/* 2573325132Savg * Commit ZFS Intent Log transactions (itxs) to stable storage. 2574219089Spjd * 2575325132Savg * When writing ZIL transactions to the on-disk representation of the 2576325132Savg * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple 2577325132Savg * itxs can be committed to a single lwb. Once a lwb is written and 2578325132Savg * committed to stable storage (i.e. the lwb is written, and vdevs have 2579325132Savg * been flushed), each itx that was committed to that lwb is also 2580325132Savg * considered to be committed to stable storage. 2581219089Spjd * 2582325132Savg * When an itx is committed to an lwb, the log record (lr_t) contained 2583325132Savg * by the itx is copied into the lwb's zio buffer, and once this buffer 2584325132Savg * is written to disk, it becomes an on-disk ZIL block. 2585219089Spjd * 2586325132Savg * As itxs are generated, they're inserted into the ZIL's queue of 2587325132Savg * uncommitted itxs. The semantics of zil_commit() are such that it will 2588325132Savg * block until all itxs that were in the queue when it was called, are 2589325132Savg * committed to stable storage. 
2590219089Spjd *
2591325132Savg * If "foid" is zero, this means all "synchronous" and "asynchronous"
2592325132Savg * itxs, for all objects in the dataset, will be committed to stable
2593325132Savg * storage prior to zil_commit() returning. If "foid" is non-zero, all
2594325132Savg * "synchronous" itxs for all objects, but only "asynchronous" itxs
2595325132Savg * that correspond to the foid passed in, will be committed to stable
2596325132Savg * storage prior to zil_commit() returning.
2597325132Savg *
2598325132Savg * Generally speaking, when zil_commit() is called, the consumer doesn't
2599325132Savg * actually care about _all_ of the uncommitted itxs. Instead, they're
2600325132Savg * simply trying to wait for a specific itx to be committed to disk,
2601325132Savg * but the interface(s) for interacting with the ZIL don't allow such
2602325132Savg * fine-grained communication. A better interface would allow a consumer
2603325132Savg * to create and assign an itx, and then pass a reference to this itx to
2604325132Savg * zil_commit(); such that zil_commit() would return as soon as that
2605325132Savg * specific itx was committed to disk (instead of waiting for _all_
2606325132Savg * itxs to be committed).
2607325132Savg *
2608325132Savg * When a thread calls zil_commit() a special "commit itx" will be
2609325132Savg * generated, along with a corresponding "waiter" for this commit itx.
2610325132Savg * zil_commit() will wait on this waiter's CV, such that when the waiter
2611325132Savg * is marked done, and signalled, zil_commit() will return.
2612325132Savg *
2613325132Savg * This commit itx is inserted into the queue of uncommitted itxs. This
2614325132Savg * provides an easy mechanism for determining which itxs were in the
2615325132Savg * queue prior to zil_commit() having been called, and which itxs were
2616325132Savg * added after zil_commit() was called.
2617325132Savg *
2618325132Savg * The commit itx is special; it doesn't have any on-disk representation.
2619325132Savg * When a commit itx is "committed" to an lwb, the waiter associated
2620325132Savg * with it is linked onto the lwb's list of waiters. Then, when that lwb
2621325132Savg * completes, each waiter on the lwb's list is marked done and signalled
2622325132Savg * -- allowing the thread waiting on the waiter to return from zil_commit().
2623325132Savg *
2624325132Savg * It's important to point out a few critical factors that allow us
2625325132Savg * to make use of the commit itxs, commit waiters, per-lwb lists of
2626325132Savg * commit waiters, and zio completion callbacks like we're doing:
2627325132Savg *
2628325132Savg *   1. The list of waiters for each lwb is traversed, and each commit
2629325132Savg *      waiter is marked "done" and signalled, in the zio completion
2630325132Savg *      callback of the lwb's zio[*].
2631325132Savg *
2632325132Savg *      * Actually, the waiters are signalled in the zio completion
2633325132Savg *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
2634325132Savg *        that are sent to the vdevs upon completion of the lwb zio.
2635325132Savg *
2636325132Savg *   2. When the itxs are inserted into the ZIL's queue of uncommitted
2637325132Savg *      itxs, the order in which they are inserted is preserved[*]; as
2638325132Savg *      itxs are added to the queue, they are added to the tail of
2639325132Savg *      in-memory linked lists.
2640325132Savg *
2641325132Savg *      When committing the itxs to lwbs (to be written to disk), they
2642325132Savg *      are committed in the same order in which the itxs were added to
2643325132Savg *      the uncommitted queue's linked list(s); i.e. the linked list of
2644325132Savg *      itxs to commit is traversed from head to tail, and each itx is
2645325132Savg *      committed to an lwb in that order.
2646325132Savg *
2647325132Savg *      * To clarify:
2648325132Savg *
2649325132Savg *        - the order of "sync" itxs is preserved w.r.t. other
2650325132Savg *          "sync" itxs, regardless of the corresponding objects.
2651325132Savg *        - the order of "async" itxs is preserved w.r.t. other
2652325132Savg *          "async" itxs corresponding to the same object.
2653325132Savg *        - the order of "async" itxs is *not* preserved w.r.t. other
2654325132Savg *          "async" itxs corresponding to different objects.
2655325132Savg *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
2656325132Savg *          versa) is *not* preserved, even for itxs that correspond
2657325132Savg *          to the same object.
2658325132Savg *
2659325132Savg *        For more details, see: zil_itx_assign(), zil_async_to_sync(),
2660325132Savg *        zil_get_commit_list(), and zil_process_commit_list().
2661325132Savg *
2662325132Savg *   3. The lwbs represent a linked list of blocks on disk. Thus, any
2663325132Savg *      lwb cannot be considered committed to stable storage until its
2664325132Savg *      "previous" lwb is also committed to stable storage. This fact,
2665325132Savg *      coupled with the fact described above, means that itxs are
2666325132Savg *      committed in (roughly) the order in which they were generated.
2667325132Savg *      This is essential because itxs are dependent on prior itxs.
2668325132Savg *      Thus, we *must not* deem an itx as being committed to stable
2669325132Savg *      storage until *all* prior itxs have also been committed to
2670325132Savg *      stable storage.
2671325132Savg *
2672325132Savg *      To enforce this ordering of lwb zio's, while still leveraging as
2673325132Savg *      much of the underlying storage performance as possible, we rely
2674325132Savg *      on two fundamental concepts:
2675325132Savg *
2676325132Savg *          1. The creation and issuance of lwb zio's is protected by
2677329485Smav *             the zilog's "zl_issuer_lock", which ensures only a single
2678325132Savg *             thread is creating and/or issuing lwb's at a time.
2679325132Savg *          2. The "previous" lwb is a child of the "current" lwb
2680325132Savg *             (leveraging the zio parent-child dependency graph).
2681325132Savg *
2682325132Savg *      By relying on this parent-child zio relationship, we can have
2683325132Savg *      many lwb zio's concurrently issued to the underlying storage,
2684325132Savg *      but the order in which they complete will be the same order in
2685325132Savg *      which they were created.
2686168404Spjd */
2687168404Spjdvoid
2688219089Spjdzil_commit(zilog_t *zilog, uint64_t foid)
2689168404Spjd{
2690325132Savg	/*
2691325132Savg	 * We should never attempt to call zil_commit() on a snapshot for
2692325132Savg	 * a couple of reasons:
2693325132Savg	 *
2694325132Savg	 * 1. A snapshot may never be modified, thus it cannot have any
2695325132Savg	 *    in-flight itxs that would have modified the dataset.
2696325132Savg	 *
2697325132Savg	 * 2. By design, when zil_commit() is called, a commit itx will
2698325132Savg	 *    be assigned to this zilog; as a result, the zilog will be
2699325132Savg	 *    dirtied. We must not dirty the zilog of a snapshot; there
2700325132Savg	 *    are checks in the code that enforce this invariant, and will
2701325132Savg	 *    cause a panic if it's not upheld.
2702325132Savg	 */
2703325132Savg	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
2704219089Spjd
2705219089Spjd	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
2706168404Spjd		return;
2707168404Spjd
2708325132Savg	if (!spa_writeable(zilog->zl_spa)) {
2709325132Savg		/*
2710325132Savg		 * If the SPA is not writable, there should never be any
2711325132Savg		 * pending itxs waiting to be committed to disk. If that
2712325132Savg		 * weren't true, we'd skip writing those itxs out, and
2713325132Savg		 * would break the semantics of zil_commit(); thus, we're
2714325132Savg		 * verifying that truth before we return to the caller.
2715325132Savg		 */
2716325132Savg		ASSERT(list_is_empty(&zilog->zl_lwb_list));
2717325132Savg		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
2718325132Savg		for (int i = 0; i < TXG_SIZE; i++)
2719325132Savg			ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
2720325132Savg		return;
2721325132Savg	}
2722219089Spjd
2723325132Savg	/*
2724325132Savg	 * If the ZIL is suspended, we don't want to dirty it by calling
2725325132Savg	 * zil_commit_itx_assign() below, nor can we write out
2726325132Savg	 * lwbs as would be done in zil_commit_writer(). Thus, we
2727325132Savg	 * simply rely on txg_wait_synced() to maintain the necessary
2728325132Savg	 * semantics, and avoid calling those functions altogether.
2729325132Savg	 */
2730325132Savg	if (zilog->zl_suspend > 0) {
2731325132Savg		txg_wait_synced(zilog->zl_dmu_pool, 0);
2732325132Savg		return;
2733168404Spjd	}
2734219089Spjd
2735329486Smav	zil_commit_impl(zilog, foid);
2736329486Smav}
2737329486Smav
2738329486Smavvoid
2739329486Smavzil_commit_impl(zilog_t *zilog, uint64_t foid)
2740329486Smav{
2741325132Savg	/*
2742325132Savg	 * Move the "async" itxs for the specified foid to the "sync"
2743325132Savg	 * queues, such that they will be later committed (or skipped)
2744325132Savg	 * to an lwb when zil_process_commit_list() is called.
2745325132Savg	 *
2746325132Savg	 * Since these "async" itxs must be committed prior to this
2747325132Savg	 * call to zil_commit() returning, we must perform this operation
2748325132Savg	 * before we call zil_commit_itx_assign().
2749325132Savg	 */
2750325132Savg	zil_async_to_sync(zilog, foid);
2751219089Spjd
2752325132Savg	/*
2753325132Savg	 * We allocate a new "waiter" structure which will initially be
2754325132Savg	 * linked to the commit itx using the itx's "itx_private" field.
2755325132Savg	 * Since the commit itx doesn't represent any on-disk state,
2756325132Savg	 * when it's committed to an lwb, rather than copying its
2757325132Savg	 * lr_t into the lwb's buffer, the commit itx's "waiter" will be
2758325132Savg	 * added to the lwb's list of waiters. Then, when the lwb is
2759325132Savg	 * committed to stable storage, each waiter in the lwb's list of
2760325132Savg	 * waiters will be marked "done", and signalled.
2761325132Savg	 *
2762325132Savg	 * We must create the waiter and assign the commit itx prior to
2763325132Savg	 * calling zil_commit_writer(), or else our specific commit itx
2764325132Savg	 * is not guaranteed to be committed to an lwb prior to calling
2765325132Savg	 * zil_commit_waiter().
2766325132Savg	 */
2767325132Savg	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
2768325132Savg	zil_commit_itx_assign(zilog, zcw);
2769219089Spjd
2770325132Savg	zil_commit_writer(zilog, zcw);
2771325132Savg	zil_commit_waiter(zilog, zcw);
2772325132Savg
2773325132Savg	if (zcw->zcw_zio_error != 0) {
2774325132Savg		/*
2775325132Savg		 * If there was an error writing out the ZIL blocks that
2776325132Savg		 * this thread is waiting on, then we fall back to
2777325132Savg		 * relying on spa_sync() to write out the data this
2778325132Savg		 * thread is waiting on. Obviously this has performance
2779325132Savg		 * implications, but this is expected to be an
2780325132Savg		 * exceptional case that shouldn't occur often.
2781325132Savg		 */
2782325132Savg		DTRACE_PROBE2(zil__commit__io__error,
2783325132Savg		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
2784325132Savg		txg_wait_synced(zilog->zl_dmu_pool, 0);
2785325132Savg	}
2786325132Savg
2787325132Savg	zil_free_commit_waiter(zcw);
2788168404Spjd}
2789168404Spjd
2790168404Spjd/*
2791168404Spjd * Called in syncing context to free committed log blocks and update the log header.
2792168404Spjd */
2793168404Spjdvoid
2794168404Spjdzil_sync(zilog_t *zilog, dmu_tx_t *tx)
2795168404Spjd{
2796168404Spjd	zil_header_t *zh = zil_header_in_syncing_context(zilog);
2797168404Spjd	uint64_t txg = dmu_tx_get_txg(tx);
2798168404Spjd	spa_t *spa = zilog->zl_spa;
2799219089Spjd	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
2800168404Spjd	lwb_t *lwb;
2801168404Spjd
2802209962Smm	/*
2803209962Smm	 * We don't zero out zl_destroy_txg, so make sure we don't try
2804209962Smm	 * to destroy it twice.
2805209962Smm	 */
2806209962Smm	if (spa_sync_pass(spa) != 1)
2807209962Smm		return;
2808209962Smm
2809168404Spjd	mutex_enter(&zilog->zl_lock);
2810168404Spjd
2811168404Spjd	ASSERT(zilog->zl_stop_sync == 0);
2812168404Spjd
2813219089Spjd	if (*replayed_seq != 0) {
2814219089Spjd		ASSERT(zh->zh_replay_seq < *replayed_seq);
2815219089Spjd		zh->zh_replay_seq = *replayed_seq;
2816219089Spjd		*replayed_seq = 0;
2817219089Spjd	}
2818168404Spjd
2819168404Spjd	if (zilog->zl_destroy_txg == txg) {
2820168404Spjd		blkptr_t blk = zh->zh_log;
2821168404Spjd
2822168404Spjd		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
2823168404Spjd
2824168404Spjd		bzero(zh, sizeof (zil_header_t));
2825209962Smm		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
2826168404Spjd
2827168404Spjd		if (zilog->zl_keep_first) {
2828168404Spjd			/*
2829168404Spjd			 * If this block was part of a log chain that couldn't
2830168404Spjd			 * be claimed because a device was missing during
2831168404Spjd			 * zil_claim(), but that device later returns,
2832168404Spjd			 * then this block could erroneously appear valid.
2833168404Spjd			 * To guard against this, assign a new GUID to the new
2834168404Spjd			 * log chain so it doesn't matter what blk points to.
2835168404Spjd			 */
2836168404Spjd			zil_init_log_chain(zilog, &blk);
2837168404Spjd			zh->zh_log = blk;
2838168404Spjd		}
2839168404Spjd	}
2840168404Spjd
2841213197Smm	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
2842168404Spjd		zh->zh_log = lwb->lwb_blk;
2843168404Spjd		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
2844168404Spjd			break;
2845168404Spjd		list_remove(&zilog->zl_lwb_list, lwb);
2846325132Savg		zio_free(spa, txg, &lwb->lwb_blk);
2847325132Savg		zil_free_lwb(zilog, lwb);
2848168404Spjd
2849168404Spjd		/*
2850168404Spjd		 * If we don't have anything left in the lwb list then
2851168404Spjd		 * we've had an allocation failure and we need to zero
2852168404Spjd		 * out the zil_header blkptr so that we don't end
2853168404Spjd		 * up freeing the same block twice.
2854168404Spjd		 */
2855168404Spjd		if (list_head(&zilog->zl_lwb_list) == NULL)
2856168404Spjd			BP_ZERO(&zh->zh_log);
2857168404Spjd	}
2858168404Spjd	mutex_exit(&zilog->zl_lock);
2859168404Spjd}
2860168404Spjd
2861325132Savg/* ARGSUSED */
2862325132Savgstatic int
2863325132Savgzil_lwb_cons(void *vbuf, void *unused, int kmflag)
2864325132Savg{
2865325132Savg	lwb_t *lwb = vbuf;
2866325132Savg	list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
2867325132Savg	    offsetof(zil_commit_waiter_t, zcw_node));
2868325132Savg	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
2869325132Savg	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
2870325132Savg	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
2871325132Savg	return (0);
2872325132Savg}
2873325132Savg
2874325132Savg/* ARGSUSED */
2875325132Savgstatic void
2876325132Savgzil_lwb_dest(void *vbuf, void *unused)
2877325132Savg{
2878325132Savg	lwb_t *lwb = vbuf;
2879325132Savg	mutex_destroy(&lwb->lwb_vdev_lock);
2880325132Savg	avl_destroy(&lwb->lwb_vdev_tree);
2881325132Savg	list_destroy(&lwb->lwb_waiters);
2882325132Savg}
2883325132Savg
2884168404Spjdvoid
2885168404Spjdzil_init(void)
2886168404Spjd{
2887168404Spjd	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
2888325132Savg	    sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
2889325132Savg
2890325132Savg	zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
2891325132Savg	    sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2892168404Spjd}
2893168404Spjd
2894168404Spjdvoid
2895168404Spjdzil_fini(void)
2896168404Spjd{
2897325132Savg	kmem_cache_destroy(zil_zcw_cache);
2898168404Spjd	kmem_cache_destroy(zil_lwb_cache);
2899168404Spjd}
2900168404Spjd
2901219089Spjdvoid
2902219089Spjdzil_set_sync(zilog_t *zilog, uint64_t sync)
2903219089Spjd{
2904219089Spjd	zilog->zl_sync = sync;
2905219089Spjd}
2906219089Spjd
2907219089Spjdvoid
2908219089Spjdzil_set_logbias(zilog_t *zilog, uint64_t logbias)
2909219089Spjd{
2910219089Spjd	zilog->zl_logbias = logbias;
2911219089Spjd}
2912219089Spjd
2913168404Spjdzilog_t *
2914168404Spjdzil_alloc(objset_t *os, zil_header_t *zh_phys)
2915168404Spjd{
2916168404Spjd	zilog_t *zilog;
2917168404Spjd
2918168404Spjd	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
2919168404Spjd
2920168404Spjd	zilog->zl_header = zh_phys;
2921168404Spjd	zilog->zl_os = os;
2922168404Spjd	zilog->zl_spa = dmu_objset_spa(os);
2923168404Spjd	zilog->zl_dmu_pool = dmu_objset_pool(os);
2924168404Spjd	zilog->zl_destroy_txg = TXG_INITIAL - 1;
2925219089Spjd	zilog->zl_logbias = dmu_objset_logbias(os);
2926219089Spjd	zilog->zl_sync = dmu_objset_syncprop(os);
2927325132Savg	zilog->zl_dirty_max_txg = 0;
2928325132Savg	zilog->zl_last_lwb_opened = NULL;
2929325132Savg	zilog->zl_last_lwb_latency = 0;
2930359554Smav	zilog->zl_max_block_size = zil_maxblocksize;
2931168404Spjd
2932168404Spjd	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
2933329485Smav	mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
2934168404Spjd
2935219089Spjd	for (int i = 0; i < TXG_SIZE; i++) {
2936219089Spjd		mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
2937219089Spjd		    MUTEX_DEFAULT, NULL);
2938219089Spjd	}
2939168404Spjd
2940168404Spjd	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
2941168404Spjd	    offsetof(lwb_t, lwb_node));
2942168404Spjd
2943219089Spjd	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
2944219089Spjd	    offsetof(itx_t, itx_node));
2945219089Spjd
2946185029Spjd	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
2947185029Spjd
2948168404Spjd	return (zilog);
2949168404Spjd}
2950168404Spjd
2951168404Spjdvoid
2952168404Spjdzil_free(zilog_t *zilog)
2953168404Spjd{
2954168404Spjd	zilog->zl_stop_sync = 1;
2955168404Spjd
2956248571Smm	ASSERT0(zilog->zl_suspend);
2957248571Smm	ASSERT0(zilog->zl_suspending);
2958248571Smm
2959224526Smm	ASSERT(list_is_empty(&zilog->zl_lwb_list));
2960168404Spjd	list_destroy(&zilog->zl_lwb_list);
2961168404Spjd
2962219089Spjd	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
2963219089Spjd	list_destroy(&zilog->zl_itx_commit_list);
2964219089Spjd
2965219089Spjd	for (int i = 0; i < TXG_SIZE; i++) {
2966219089Spjd		/*
2967219089Spjd		 * It's possible for an itx to be generated that doesn't dirty
2968219089Spjd		 * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
2969219089Spjd		 * callback to remove the entry. We remove those here.
2970219089Spjd		 *
2971219089Spjd		 * Also free up the ziltest itxs.
2972219089Spjd		 */
2973219089Spjd		if (zilog->zl_itxg[i].itxg_itxs)
2974219089Spjd			zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
2975219089Spjd		mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
2976219089Spjd	}
2977219089Spjd
2978329485Smav	mutex_destroy(&zilog->zl_issuer_lock);
2979168404Spjd	mutex_destroy(&zilog->zl_lock);
2980168404Spjd
2981185029Spjd	cv_destroy(&zilog->zl_cv_suspend);
2982185029Spjd
2983168404Spjd	kmem_free(zilog, sizeof (zilog_t));
2984168404Spjd}
2985168404Spjd
2986168404Spjd/*
2987168404Spjd * Open an intent log.
2988168404Spjd */
2989168404Spjdzilog_t *
2990168404Spjdzil_open(objset_t *os, zil_get_data_t *get_data)
2991168404Spjd{
2992168404Spjd	zilog_t *zilog = dmu_objset_zil(os);
2993168404Spjd
2994325132Savg	ASSERT3P(zilog->zl_get_data, ==, NULL);
2995325132Savg	ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
2996224526Smm	ASSERT(list_is_empty(&zilog->zl_lwb_list));
2997224526Smm
2998168404Spjd	zilog->zl_get_data = get_data;
2999168404Spjd
3000168404Spjd	return (zilog);
3001168404Spjd}
3002168404Spjd
3003168404Spjd/*
3004168404Spjd * Close an intent log.
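 *
 * As the code below shows, this commits any remaining itxs (for a
 * non-snapshot dataset), waits for the relevant txg(s) to sync so all
 * pending lwbs reach stable storage, and then frees the last lwb.
 * Callers are expected to ensure no new itxs will be generated once
 * this runs (e.g. because the dataset is being unmounted).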
3005168404Spjd */
3006168404Spjdvoid
3007168404Spjdzil_close(zilog_t *zilog)
3008168404Spjd{
3009224526Smm	lwb_t *lwb;
3010325132Savg	uint64_t txg;
3011219089Spjd
3012325132Savg	if (!dmu_objset_is_snapshot(zilog->zl_os)) {
3013325132Savg		zil_commit(zilog, 0);
3014325132Savg	} else {
3015325132Savg		ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
3016325132Savg		ASSERT0(zilog->zl_dirty_max_txg);
3017325132Savg		ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
3018325132Savg	}
3019219089Spjd
3020219089Spjd	mutex_enter(&zilog->zl_lock);
3021224526Smm	lwb = list_tail(&zilog->zl_lwb_list);
3022325132Savg	if (lwb == NULL)
3023325132Savg		txg = zilog->zl_dirty_max_txg;
3024325132Savg	else
3025325132Savg		txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
3026219089Spjd	mutex_exit(&zilog->zl_lock);
3027325132Savg
3028325132Savg	/*
3029325132Savg	 * We need to use txg_wait_synced() to wait long enough for the
3030325132Savg	 * ZIL to be clean, and to wait for all pending lwbs to be
3031325132Savg	 * written out.
3032325132Savg	 */
3033325132Savg	if (txg != 0)
3034168404Spjd		txg_wait_synced(zilog->zl_dmu_pool, txg);
3035168404Spjd
3036310515Savg	if (zilog_is_dirty(zilog))
3037310515Savg		zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
3038310515Savg	VERIFY(!zilog_is_dirty(zilog));
3039310515Savg
3040168404Spjd	zilog->zl_get_data = NULL;
3041224526Smm
3042224526Smm	/*
3043325132Savg	 * We should have only one lwb left on the list; remove it now.
3044224526Smm	 */
3045224526Smm	mutex_enter(&zilog->zl_lock);
3046224526Smm	lwb = list_head(&zilog->zl_lwb_list);
3047224526Smm	if (lwb != NULL) {
3048325132Savg		ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
3049325132Savg		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
3050224526Smm		list_remove(&zilog->zl_lwb_list, lwb);
3051224526Smm		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
3052325132Savg		zil_free_lwb(zilog, lwb);
3053224526Smm	}
3054224526Smm	mutex_exit(&zilog->zl_lock);
3055168404Spjd}
3056168404Spjd
3057248571Smmstatic char *suspend_tag = "zil suspending";
3058248571Smm
3059168404Spjd/*
3060168404Spjd * Suspend an intent log. While in suspended mode, we still honor
3061168404Spjd * synchronous semantics, but we rely on txg_wait_synced() to do it.
3062248571Smm * On old-version pools, we suspend the log briefly when taking a
3063248571Smm * snapshot so that it will have an empty intent log.
3064248571Smm *
3065248571Smm * Long holds are not really intended to be used the way we do here --
3066248571Smm * held for such a short time. A concurrent caller of dsl_dataset_long_held()
3067248571Smm * could fail. Therefore we take pains to only put a long hold if it is
3068248571Smm * actually necessary. Fortunately, it will only be necessary if the
3069248571Smm * objset is currently mounted (or the ZVOL equivalent). In that case it
3070248571Smm * will already have a long hold, so we are not really making things any worse.
3071248571Smm *
3072248571Smm * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
3073248571Smm * zvol_state_t), and use their mechanism to prevent their hold from being
3074248571Smm * dropped (e.g. VFS_HOLD()). However, that would be even more pain for
3075248571Smm * very little gain.
3076248571Smm *
3077248571Smm * If cookiep == NULL, this does both the suspend & resume.
3078248571Smm * Otherwise, it returns with the dataset "long held", and the cookie
3079248571Smm * should be passed into zil_resume().
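 *
 * A minimal usage sketch (hypothetical caller; "osname" is assumed to
 * name an existing dataset):
 *
 *	void *cookie;
 *	int error = zil_suspend(osname, &cookie);
 *	if (error == 0) {
 *		... work while the ZIL is suspended ...
 *		zil_resume(cookie);
 *	}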
3080168404Spjd */
3081168404Spjdint
3082248571Smmzil_suspend(const char *osname, void **cookiep)
3083168404Spjd{
3084248571Smm	objset_t *os;
3085248571Smm	zilog_t *zilog;
3086248571Smm	const zil_header_t *zh;
3087248571Smm	int error;
3088168404Spjd
3089248571Smm	error = dmu_objset_hold(osname, suspend_tag, &os);
3090248571Smm	if (error != 0)
3091248571Smm		return (error);
3092248571Smm	zilog = dmu_objset_zil(os);
3093248571Smm
3094168404Spjd	mutex_enter(&zilog->zl_lock);
3095248571Smm	zh = zilog->zl_header;
3096248571Smm
3097200724Sdelphij	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
3098168404Spjd		mutex_exit(&zilog->zl_lock);
3099248571Smm		dmu_objset_rele(os, suspend_tag);
3100249195Smm		return (SET_ERROR(EBUSY));
3101168404Spjd	}
3102248571Smm
3103248571Smm	/*
3104248571Smm	 * Don't put a long hold in the cases where we can avoid it. This
3105248571Smm	 * is when there is no cookie so we are doing a suspend & resume
3106248571Smm	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
3107248571Smm	 * for the suspend because it's already suspended, or there's no ZIL.
3108248571Smm	 */
3109248571Smm	if (cookiep == NULL && !zilog->zl_suspending &&
3110248571Smm	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
3111248571Smm		mutex_exit(&zilog->zl_lock);
3112248571Smm		dmu_objset_rele(os, suspend_tag);
3113248571Smm		return (0);
3114248571Smm	}
3115248571Smm
3116248571Smm	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
3117248571Smm	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
3118248571Smm
3119248571Smm	zilog->zl_suspend++;
3120248571Smm
3121248571Smm	if (zilog->zl_suspend > 1) {
3122168404Spjd		/*
3123248571Smm		 * Someone else is already suspending it.
3124168404Spjd		 * Just wait for them to finish.
3125168404Spjd		 */
3126248571Smm
3127168404Spjd		while (zilog->zl_suspending)
3128168404Spjd			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
3129168404Spjd		mutex_exit(&zilog->zl_lock);
3130248571Smm
3131248571Smm		if (cookiep == NULL)
3132248571Smm			zil_resume(os);
3133248571Smm		else
3134248571Smm			*cookiep = os;
3135168404Spjd		return (0);
3136168404Spjd	}
3137248571Smm
3138248571Smm	/*
3139248571Smm	 * If there is no pointer to an on-disk block, this ZIL must not
3140248571Smm	 * be active (e.g. filesystem not mounted), so there's nothing
3141248571Smm	 * to clean up.
3142248571Smm	 */
3143248571Smm	if (BP_IS_HOLE(&zh->zh_log)) {
3144248571Smm		ASSERT(cookiep != NULL);	/* fast path already handled */
3145248571Smm
3146248571Smm		*cookiep = os;
3147248571Smm		mutex_exit(&zilog->zl_lock);
3148248571Smm		return (0);
3149248571Smm	}
3150248571Smm
3151168404Spjd	zilog->zl_suspending = B_TRUE;
3152168404Spjd	mutex_exit(&zilog->zl_lock);
3153168404Spjd
3154329486Smav	/*
3155329486Smav	 * We need to use zil_commit_impl() to ensure we wait for all
3156329486Smav	 * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed
3157329486Smav	 * to disk before proceeding. If we used zil_commit() instead, it
3158329486Smav	 * would just call txg_wait_synced(), because zl_suspend is set.
3159329486Smav	 * txg_wait_synced() doesn't wait for these lwb's to be
3160329486Smav	 * LWB_STATE_DONE before returning.
3161329486Smav	 */
3162329486Smav	zil_commit_impl(zilog, 0);
3163168404Spjd
3164329486Smav	/*
3165329486Smav	 * Now that we've ensured all lwb's are LWB_STATE_DONE, we use
3166329486Smav	 * txg_wait_synced() to ensure the data from the zilog has
3167329486Smav	 * migrated to the main pool before calling zil_destroy().
3168329486Smav	 */
3169329486Smav	txg_wait_synced(zilog->zl_dmu_pool, 0);
3170329486Smav
3171168404Spjd	zil_destroy(zilog, B_FALSE);
3172168404Spjd
3173168404Spjd	mutex_enter(&zilog->zl_lock);
3174168404Spjd	zilog->zl_suspending = B_FALSE;
3175168404Spjd	cv_broadcast(&zilog->zl_cv_suspend);
3176168404Spjd	mutex_exit(&zilog->zl_lock);
3177168404Spjd
3178248571Smm	if (cookiep == NULL)
3179248571Smm		zil_resume(os);
3180248571Smm	else
3181248571Smm		*cookiep = os;
3182168404Spjd	return (0);
3183168404Spjd}
3184168404Spjd
3185168404Spjdvoid
3186248571Smmzil_resume(void *cookie)
3187168404Spjd{
3188248571Smm	objset_t *os = cookie;
3189248571Smm	zilog_t *zilog = dmu_objset_zil(os);
3190248571Smm
3191168404Spjd	mutex_enter(&zilog->zl_lock);
3192168404Spjd	ASSERT(zilog->zl_suspend != 0);
3193168404Spjd	zilog->zl_suspend--;
3194168404Spjd	mutex_exit(&zilog->zl_lock);
3195248571Smm	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
3196248571Smm	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
3197168404Spjd}
3198168404Spjd
3199219089Spjdtypedef struct zil_replay_arg {
3200219089Spjd	zil_replay_func_t **zr_replay;
3201219089Spjd	void		*zr_arg;
3202219089Spjd	boolean_t	zr_byteswap;
3203219089Spjd	char		*zr_lr;
3204219089Spjd} zil_replay_arg_t;
3205219089Spjd
3206219089Spjdstatic int
3207219089Spjdzil_replay_error(zilog_t *zilog, lr_t *lr, int error)
3208209962Smm{
3209307108Smav	char name[ZFS_MAX_DATASET_NAME_LEN];
3210209962Smm
3211219089Spjd	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
3212209962Smm
3213219089Spjd	dmu_objset_name(zilog->zl_os, name);
3214209962Smm
3215219089Spjd	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
3216219089Spjd	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
3217219089Spjd	    (u_longlong_t)lr->lrc_seq,
3218219089Spjd	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
3219219089Spjd	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
3220219089Spjd
3221219089Spjd	return (error);
3222209962Smm}
3223209962Smm
3224219089Spjdstatic int
3225168404Spjdzil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
3226168404Spjd{
3227168404Spjd	zil_replay_arg_t *zr = zra;
3228168404Spjd	const zil_header_t *zh = zilog->zl_header;
3229168404Spjd	uint64_t reclen = lr->lrc_reclen;
3230168404Spjd	uint64_t txtype = lr->lrc_txtype;
3231219089Spjd	int error = 0;
3232168404Spjd
3233219089Spjd	zilog->zl_replaying_seq = lr->lrc_seq;
3234168404Spjd
3235219089Spjd	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
3236219089Spjd		return (0);
3237219089Spjd
3238168404Spjd	if (lr->lrc_txg < claim_txg)		/* already committed */
3239219089Spjd		return (0);
3240168404Spjd
3241185029Spjd	/* Strip case-insensitive bit, still present in log record */
3242185029Spjd	txtype &= ~TX_CI;
3243185029Spjd
3244219089Spjd	if (txtype == 0 || txtype >= TX_MAX_TYPE)
3245219089Spjd		return (zil_replay_error(zilog, lr, EINVAL));
3246219089Spjd
3247219089Spjd	/*
3248219089Spjd	 * If this record type can be logged out of order, the object
3249219089Spjd	 * (lr_foid) may no longer exist. That's legitimate, not an error.
3250219089Spjd	 */
3251219089Spjd	if (TX_OOO(txtype)) {
3252219089Spjd		error = dmu_object_info(zilog->zl_os,
3253219089Spjd		    ((lr_ooo_t *)lr)->lr_foid, NULL);
3254219089Spjd		if (error == ENOENT || error == EEXIST)
3255219089Spjd			return (0);
3256209962Smm	}
3257209962Smm
3258168404Spjd	/*
3259168404Spjd	 * Make a copy of the data so we can revise and extend it.
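	 * (The zr_lr buffer was sized at 2 * SPA_MAXBLOCKSIZE in
	 * zil_replay(), which leaves room for a TX_WRITE record's
	 * payload to be appended after the record itself; see the
	 * zil_read_log_data() call below.)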
3260168404Spjd	 */
3261219089Spjd	bcopy(lr, zr->zr_lr, reclen);
3262168404Spjd
3263168404Spjd	/*
3264219089Spjd	 * If this is a TX_WRITE with a blkptr, suck in the data.
3265219089Spjd	 */
3266219089Spjd	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
3267219089Spjd		error = zil_read_log_data(zilog, (lr_write_t *)lr,
3268219089Spjd		    zr->zr_lr + reclen);
3269248571Smm		if (error != 0)
3270219089Spjd			return (zil_replay_error(zilog, lr, error));
3271219089Spjd	}
3272219089Spjd
3273219089Spjd	/*
3274168404Spjd	 * The log block containing this lr may have been byteswapped
3275168404Spjd	 * so that we can easily examine common fields like lrc_txtype.
3276219089Spjd	 * However, the log is a mix of different record types, and only the
3277168404Spjd	 * replay vectors know how to byteswap their records. Therefore, if
3278168404Spjd	 * the lr was byteswapped, undo it before invoking the replay vector.
3279168404Spjd	 */
3280168404Spjd	if (zr->zr_byteswap)
3281219089Spjd		byteswap_uint64_array(zr->zr_lr, reclen);
3282168404Spjd
3283168404Spjd	/*
3284168404Spjd	 * We must now do two things atomically: replay this log record,
3285209962Smm	 * and update the log header sequence number to reflect the fact that
3286209962Smm	 * we did so. At the end of each replay function the sequence number
3287209962Smm	 * is updated if we are in replay mode.
3288168404Spjd	 */
3289219089Spjd	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
3290248571Smm	if (error != 0) {
3291168404Spjd		/*
3292168404Spjd		 * The DMU's dnode layer doesn't see removes until the txg
3293168404Spjd		 * commits, so a subsequent claim can spuriously fail with
3294209962Smm		 * EEXIST. So if we receive any error, we try syncing out
3295219089Spjd		 * any removes and then retry the transaction. Note that we
3296219089Spjd		 * specify B_FALSE for byteswap now, so we don't do it twice.
3297168404Spjd		 */
3298219089Spjd		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
3299219089Spjd		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
3300248571Smm		if (error != 0)
3301219089Spjd			return (zil_replay_error(zilog, lr, error));
3302168404Spjd	}
3303219089Spjd	return (0);
3304168404Spjd}
3305168404Spjd
3306168404Spjd/* ARGSUSED */
3307219089Spjdstatic int
3308168404Spjdzil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
3309168404Spjd{
3310168404Spjd	zilog->zl_replay_blks++;
3311219089Spjd
3312219089Spjd	return (0);
3313168404Spjd}
3314168404Spjd
3315168404Spjd/*
3316168404Spjd * If this dataset has a non-empty intent log, replay it and destroy it.
3317168404Spjd */
3318168404Spjdvoid
3319209962Smmzil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
3320168404Spjd{
3321168404Spjd	zilog_t *zilog = dmu_objset_zil(os);
3322168404Spjd	const zil_header_t *zh = zilog->zl_header;
3323168404Spjd	zil_replay_arg_t zr;
3324168404Spjd
3325200724Sdelphij	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
3326168404Spjd		zil_destroy(zilog, B_TRUE);
3327168404Spjd		return;
3328168404Spjd	}
3329168404Spjd
3330168404Spjd	zr.zr_replay = replay_func;
3331168404Spjd	zr.zr_arg = arg;
3332168404Spjd	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
3333219089Spjd	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
3334168404Spjd
3335168404Spjd	/*
3336168404Spjd	 * Wait for in-progress removes to sync before starting replay.
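	 * (Otherwise a replayed operation could spuriously collide with
	 * an object whose remove hasn't synced yet; see the EEXIST
	 * handling in zil_replay_log_record().)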
3337168404Spjd	 */
3338168404Spjd	txg_wait_synced(zilog->zl_dmu_pool, 0);
3339168404Spjd
3340209962Smm	zilog->zl_replay = B_TRUE;
3341219089Spjd	zilog->zl_replay_time = ddi_get_lbolt();
3342168404Spjd	ASSERT(zilog->zl_replay_blks == 0);
3343168404Spjd	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
3344168404Spjd	    zh->zh_claim_txg);
3345219089Spjd	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
3346168404Spjd
3347168404Spjd	zil_destroy(zilog, B_FALSE);
3348185029Spjd	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
3349209962Smm	zilog->zl_replay = B_FALSE;
3350168404Spjd}
3351168404Spjd
3352219089Spjdboolean_t
3353219089Spjdzil_replaying(zilog_t *zilog, dmu_tx_t *tx)
3354168404Spjd{
3355219089Spjd	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
3356219089Spjd		return (B_TRUE);
3357168404Spjd
3358219089Spjd	if (zilog->zl_replay) {
3359219089Spjd		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
3360219089Spjd		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
3361219089Spjd		    zilog->zl_replaying_seq;
3362219089Spjd		return (B_TRUE);
3363168404Spjd	}
3364168404Spjd
3365219089Spjd	return (B_FALSE);
3366168404Spjd}
3367213197Smm
3368213197Smm/* ARGSUSED */
3369213197Smmint
3370332525Smavzil_reset(const char *osname, void *arg)
3371213197Smm{
3372213197Smm	int error;
3373213197Smm
3374248571Smm	error = zil_suspend(osname, NULL);
3375248571Smm	if (error != 0)
3376249195Smm		return (SET_ERROR(EEXIST));
3377248571Smm	return (0);
3378213197Smm}
3379