zil.c revision 339134
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/abd.h>

/*
 * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
 * calls that change the file system. Each itx has enough information to
 * be able to replay them after a system crash, power loss, or
 * equivalent failure mode. These are stored in memory until either:
 *
 * 1. they are committed to the pool by the DMU transaction group
 *    (txg), at which point they can be discarded; or
 * 2. they are committed to the on-disk ZIL for the dataset being
 *    modified (e.g. due to an fsync, O_DSYNC, or other synchronous
 *    requirement).
 *
 * In the event of a crash or power loss, the itxs contained by each
 * dataset's on-disk ZIL will be replayed when that dataset is first
 * instantiated (e.g. if the dataset is a normal filesystem, when it is
 * first mounted).
 *
 * As hinted at above, there is one ZIL per dataset (both the in-memory
 * representation, and the on-disk representation). The on-disk format
 * consists of 3 parts:
 *
 * 	- a single, per-dataset, ZIL header; which points to a chain of
 * 	- zero or more ZIL blocks; each of which contains
 * 	- zero or more ZIL records
 *
 * A ZIL record holds the information necessary to replay a single
 * system call transaction. A ZIL block can hold many ZIL records, and
 * the blocks are chained together, similarly to a singly linked list.
 *
 * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
 * block in the chain, and the ZIL header points to the first block in
 * the chain.
 *
 * Note, there is not a fixed place in the pool to hold these ZIL
 * blocks; they are dynamically allocated and freed as needed from the
 * blocks available on the pool, though they can be preferentially
 * allocated from a dedicated "log" vdev.
 */

/*
 * This controls the amount of time that a ZIL block (lwb) will remain
 * "open" when it isn't "full", and it has a thread waiting for it to be
 * committed to stable storage. Please refer to the zil_commit_waiter()
 * function (and the comments within it) for more details.
 */
int zfs_commit_timeout_pct = 5;

/*
 * Disable intent logging replay. This global ZIL switch affects all pools.
 */
int zil_replay_disable = 0;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN,
    &zil_replay_disable, 0, "Disable intent logging replay");

/*
 * Tunable parameter for debugging or performance analysis. Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;
SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
    &zfs_nocacheflush, 0, "Disable cache flush");
boolean_t zfs_trim_enabled = B_TRUE;
SYSCTL_DECL(_vfs_zfs_trim);
SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
    "Enable ZFS TRIM");

/*
 * Limit SLOG write size per commit executed with synchronous priority.
 * Any writes above that will be executed with lower (asynchronous) priority
 * to limit potential SLOG device abuse by a single active ZIL writer.
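 *
 * (The priority switch itself happens in zil_lwb_write_open(): while the
 * running zl_cur_used counter is at or below zil_slog_bulk, lwb writes
 * to a slog device are issued at ZIO_PRIORITY_SYNC_WRITE; once it
 * exceeds the limit they drop to ZIO_PRIORITY_ASYNC_WRITE.)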
117315441Smav */ 118321611Smavuint64_t zil_slog_bulk = 768 * 1024; 119321611SmavSYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN, 120321611Smav &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority"); 121315441Smav 122168404Spjdstatic kmem_cache_t *zil_lwb_cache; 123325132Savgstatic kmem_cache_t *zil_zcw_cache; 124168404Spjd 125219089Spjd#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ 126219089Spjd sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) 127219089Spjd 128168404Spjdstatic int 129219089Spjdzil_bp_compare(const void *x1, const void *x2) 130168404Spjd{ 131219089Spjd const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; 132219089Spjd const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; 133168404Spjd 134168404Spjd if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) 135168404Spjd return (-1); 136168404Spjd if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) 137168404Spjd return (1); 138168404Spjd 139168404Spjd if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) 140168404Spjd return (-1); 141168404Spjd if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) 142168404Spjd return (1); 143168404Spjd 144168404Spjd return (0); 145168404Spjd} 146168404Spjd 147168404Spjdstatic void 148219089Spjdzil_bp_tree_init(zilog_t *zilog) 149168404Spjd{ 150219089Spjd avl_create(&zilog->zl_bp_tree, zil_bp_compare, 151219089Spjd sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); 152168404Spjd} 153168404Spjd 154168404Spjdstatic void 155219089Spjdzil_bp_tree_fini(zilog_t *zilog) 156168404Spjd{ 157219089Spjd avl_tree_t *t = &zilog->zl_bp_tree; 158219089Spjd zil_bp_node_t *zn; 159168404Spjd void *cookie = NULL; 160168404Spjd 161168404Spjd while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) 162219089Spjd kmem_free(zn, sizeof (zil_bp_node_t)); 163168404Spjd 164168404Spjd avl_destroy(t); 165168404Spjd} 166168404Spjd 167219089Spjdint 168219089Spjdzil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) 169168404Spjd{ 170219089Spjd avl_tree_t *t = &zilog->zl_bp_tree; 171268075Sdelphij const dva_t *dva; 172219089Spjd zil_bp_node_t *zn; 173168404Spjd avl_index_t where; 174168404Spjd 175268075Sdelphij if (BP_IS_EMBEDDED(bp)) 176268075Sdelphij return (0); 177268075Sdelphij 178268075Sdelphij dva = BP_IDENTITY(bp); 179268075Sdelphij 180168404Spjd if (avl_find(t, dva, &where) != NULL) 181249195Smm return (SET_ERROR(EEXIST)); 182168404Spjd 183219089Spjd zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); 184168404Spjd zn->zn_dva = *dva; 185168404Spjd avl_insert(t, zn, where); 186168404Spjd 187168404Spjd return (0); 188168404Spjd} 189168404Spjd 190168404Spjdstatic zil_header_t * 191168404Spjdzil_header_in_syncing_context(zilog_t *zilog) 192168404Spjd{ 193168404Spjd return ((zil_header_t *)zilog->zl_header); 194168404Spjd} 195168404Spjd 196168404Spjdstatic void 197168404Spjdzil_init_log_chain(zilog_t *zilog, blkptr_t *bp) 198168404Spjd{ 199168404Spjd zio_cksum_t *zc = &bp->blk_cksum; 200168404Spjd 201168404Spjd zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); 202168404Spjd zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); 203168404Spjd zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); 204168404Spjd zc->zc_word[ZIL_ZC_SEQ] = 1ULL; 205168404Spjd} 206168404Spjd 207168404Spjd/* 208219089Spjd * Read a log block and make sure it's valid. 
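 *
 * "Valid" here means the block passes the checksum-chain test: when a
 * block is written, the block pointer it embeds for its successor
 * (zc_next_blk) carries this block's checksum with the ZIL_ZC_SEQ word
 * incremented by one, so a stale block left over from an older chain
 * fails the comparison and is treated as the end of the log.  A
 * simplified sketch of the check performed below (not a separate
 * helper in this file):
 *
 *	zio_cksum_t expect = bp->blk_cksum;
 *	expect.zc_word[ZIL_ZC_SEQ]++;
 *	if (bcmp(&expect, &zilc->zc_next_blk.blk_cksum, sizeof (expect)) ||
 *	    BP_IS_HOLE(&zilc->zc_next_blk))
 *		error = SET_ERROR(ECKSUM);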
209168404Spjd */ 210168404Spjdstatic int 211219089Spjdzil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, 212219089Spjd char **end) 213168404Spjd{ 214219089Spjd enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 215275811Sdelphij arc_flags_t aflags = ARC_FLAG_WAIT; 216219089Spjd arc_buf_t *abuf = NULL; 217268123Sdelphij zbookmark_phys_t zb; 218168404Spjd int error; 219168404Spjd 220219089Spjd if (zilog->zl_header->zh_claim_txg == 0) 221219089Spjd zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 222168404Spjd 223219089Spjd if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 224219089Spjd zio_flags |= ZIO_FLAG_SPECULATIVE; 225168404Spjd 226219089Spjd SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], 227219089Spjd ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 228168404Spjd 229246666Smm error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 230219089Spjd ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 231219089Spjd 232168404Spjd if (error == 0) { 233168404Spjd zio_cksum_t cksum = bp->blk_cksum; 234168404Spjd 235168404Spjd /* 236185029Spjd * Validate the checksummed log block. 237185029Spjd * 238168404Spjd * Sequence numbers should be... sequential. The checksum 239168404Spjd * verifier for the next block should be bp's checksum plus 1. 240185029Spjd * 241185029Spjd * Also check the log chain linkage and size used. 242168404Spjd */ 243168404Spjd cksum.zc_word[ZIL_ZC_SEQ]++; 244168404Spjd 245219089Spjd if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 246219089Spjd zil_chain_t *zilc = abuf->b_data; 247219089Spjd char *lr = (char *)(zilc + 1); 248219089Spjd uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); 249219089Spjd 250219089Spjd if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 251219089Spjd sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { 252249195Smm error = SET_ERROR(ECKSUM); 253219089Spjd } else { 254274337Sdelphij ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); 255219089Spjd bcopy(lr, dst, len); 256219089Spjd *end = (char *)dst + len; 257219089Spjd *nbp = zilc->zc_next_blk; 258219089Spjd } 259219089Spjd } else { 260219089Spjd char *lr = abuf->b_data; 261219089Spjd uint64_t size = BP_GET_LSIZE(bp); 262219089Spjd zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; 263219089Spjd 264219089Spjd if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 265219089Spjd sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || 266219089Spjd (zilc->zc_nused > (size - sizeof (*zilc)))) { 267249195Smm error = SET_ERROR(ECKSUM); 268219089Spjd } else { 269274337Sdelphij ASSERT3U(zilc->zc_nused, <=, 270274337Sdelphij SPA_OLD_MAXBLOCKSIZE); 271219089Spjd bcopy(lr, dst, zilc->zc_nused); 272219089Spjd *end = (char *)dst + zilc->zc_nused; 273219089Spjd *nbp = zilc->zc_next_blk; 274219089Spjd } 275185029Spjd } 276168404Spjd 277307265Smav arc_buf_destroy(abuf, &abuf); 278168404Spjd } 279168404Spjd 280219089Spjd return (error); 281219089Spjd} 282168404Spjd 283219089Spjd/* 284219089Spjd * Read a TX_WRITE log data block. 
285219089Spjd */ 286219089Spjdstatic int 287219089Spjdzil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) 288219089Spjd{ 289219089Spjd enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 290219089Spjd const blkptr_t *bp = &lr->lr_blkptr; 291275811Sdelphij arc_flags_t aflags = ARC_FLAG_WAIT; 292219089Spjd arc_buf_t *abuf = NULL; 293268123Sdelphij zbookmark_phys_t zb; 294219089Spjd int error; 295219089Spjd 296219089Spjd if (BP_IS_HOLE(bp)) { 297219089Spjd if (wbuf != NULL) 298219089Spjd bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); 299219089Spjd return (0); 300219089Spjd } 301219089Spjd 302219089Spjd if (zilog->zl_header->zh_claim_txg == 0) 303219089Spjd zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 304219089Spjd 305219089Spjd SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, 306219089Spjd ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 307219089Spjd 308246666Smm error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 309219089Spjd ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 310219089Spjd 311219089Spjd if (error == 0) { 312219089Spjd if (wbuf != NULL) 313219089Spjd bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); 314307265Smav arc_buf_destroy(abuf, &abuf); 315219089Spjd } 316219089Spjd 317168404Spjd return (error); 318168404Spjd} 319168404Spjd 320168404Spjd/* 321168404Spjd * Parse the intent log, and call parse_func for each valid record within. 322168404Spjd */ 323219089Spjdint 324168404Spjdzil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, 325168404Spjd zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) 326168404Spjd{ 327168404Spjd const zil_header_t *zh = zilog->zl_header; 328219089Spjd boolean_t claimed = !!zh->zh_claim_txg; 329219089Spjd uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; 330219089Spjd uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; 331219089Spjd uint64_t max_blk_seq = 0; 332219089Spjd uint64_t max_lr_seq = 0; 333219089Spjd uint64_t blk_count = 0; 334219089Spjd uint64_t lr_count = 0; 335219089Spjd blkptr_t blk, next_blk; 336168404Spjd char *lrbuf, *lrp; 337219089Spjd int error = 0; 338168404Spjd 339219089Spjd /* 340219089Spjd * Old logs didn't record the maximum zh_claim_lr_seq. 341219089Spjd */ 342219089Spjd if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 343219089Spjd claim_lr_seq = UINT64_MAX; 344168404Spjd 345168404Spjd /* 346168404Spjd * Starting at the block pointed to by zh_log we read the log chain. 347168404Spjd * For each block in the chain we strongly check that block to 348168404Spjd * ensure its validity. We stop when an invalid block is found. 349168404Spjd * For each block pointer in the chain we call parse_blk_func(). 350168404Spjd * For each record in each valid block we call parse_lr_func(). 351168404Spjd * If the log has been claimed, stop if we encounter a sequence 352168404Spjd * number greater than the highest claimed sequence number. 
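 *
 * For reference, the two callbacks have the following shapes (the
 * names below are illustrative only; see zil_claim_log_block() and
 * zil_claim_log_record() in this file for concrete examples), and a
 * nonzero return from either one stops the walk:
 *
 *	int my_blk_cb(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg);
 *	int my_lr_cb(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t txg);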
353168404Spjd */ 354274337Sdelphij lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); 355219089Spjd zil_bp_tree_init(zilog); 356168404Spjd 357219089Spjd for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { 358219089Spjd uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; 359219089Spjd int reclen; 360219089Spjd char *end; 361219089Spjd 362219089Spjd if (blk_seq > claim_blk_seq) 363168404Spjd break; 364219089Spjd if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) 365219089Spjd break; 366219089Spjd ASSERT3U(max_blk_seq, <, blk_seq); 367219089Spjd max_blk_seq = blk_seq; 368219089Spjd blk_count++; 369168404Spjd 370219089Spjd if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) 371219089Spjd break; 372168404Spjd 373219089Spjd error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); 374248571Smm if (error != 0) 375168404Spjd break; 376168404Spjd 377219089Spjd for (lrp = lrbuf; lrp < end; lrp += reclen) { 378168404Spjd lr_t *lr = (lr_t *)lrp; 379168404Spjd reclen = lr->lrc_reclen; 380168404Spjd ASSERT3U(reclen, >=, sizeof (lr_t)); 381219089Spjd if (lr->lrc_seq > claim_lr_seq) 382219089Spjd goto done; 383219089Spjd if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) 384219089Spjd goto done; 385219089Spjd ASSERT3U(max_lr_seq, <, lr->lrc_seq); 386219089Spjd max_lr_seq = lr->lrc_seq; 387219089Spjd lr_count++; 388168404Spjd } 389168404Spjd } 390219089Spjddone: 391219089Spjd zilog->zl_parse_error = error; 392219089Spjd zilog->zl_parse_blk_seq = max_blk_seq; 393219089Spjd zilog->zl_parse_lr_seq = max_lr_seq; 394219089Spjd zilog->zl_parse_blk_count = blk_count; 395219089Spjd zilog->zl_parse_lr_count = lr_count; 396168404Spjd 397219089Spjd ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || 398219089Spjd (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); 399219089Spjd 400219089Spjd zil_bp_tree_fini(zilog); 401274337Sdelphij zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); 402219089Spjd 403219089Spjd return (error); 404168404Spjd} 405168404Spjd 406332547Smav/* ARGSUSED */ 407219089Spjdstatic int 408332547Smavzil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) 409332547Smav{ 410332547Smav ASSERT(!BP_IS_HOLE(bp)); 411332547Smav 412332547Smav /* 413332547Smav * As we call this function from the context of a rewind to a 414332547Smav * checkpoint, each ZIL block whose txg is later than the txg 415332547Smav * that we rewind to is invalid. Thus, we return -1 so 416332547Smav * zil_parse() doesn't attempt to read it. 417332547Smav */ 418332547Smav if (bp->blk_birth >= first_txg) 419332547Smav return (-1); 420332547Smav 421332547Smav if (zil_bp_tree_add(zilog, bp) != 0) 422332547Smav return (0); 423332547Smav 424332547Smav zio_free(zilog->zl_spa, first_txg, bp); 425332547Smav return (0); 426332547Smav} 427332547Smav 428332547Smav/* ARGSUSED */ 429332547Smavstatic int 430332547Smavzil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) 431332547Smav{ 432332547Smav return (0); 433332547Smav} 434332547Smav 435332547Smavstatic int 436168404Spjdzil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) 437168404Spjd{ 438168404Spjd /* 439168404Spjd * Claim log block if not already committed and not already claimed. 440219089Spjd * If tx == NULL, just verify that the block is claimable. 
441168404Spjd */ 442260150Sdelphij if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || 443260150Sdelphij zil_bp_tree_add(zilog, bp) != 0) 444219089Spjd return (0); 445219089Spjd 446219089Spjd return (zio_wait(zio_claim(NULL, zilog->zl_spa, 447219089Spjd tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, 448219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); 449168404Spjd} 450168404Spjd 451219089Spjdstatic int 452168404Spjdzil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) 453168404Spjd{ 454219089Spjd lr_write_t *lr = (lr_write_t *)lrc; 455219089Spjd int error; 456219089Spjd 457219089Spjd if (lrc->lrc_txtype != TX_WRITE) 458219089Spjd return (0); 459219089Spjd 460219089Spjd /* 461219089Spjd * If the block is not readable, don't claim it. This can happen 462219089Spjd * in normal operation when a log block is written to disk before 463219089Spjd * some of the dmu_sync() blocks it points to. In this case, the 464219089Spjd * transaction cannot have been committed to anyone (we would have 465219089Spjd * waited for all writes to be stable first), so it is semantically 466219089Spjd * correct to declare this the end of the log. 467219089Spjd */ 468219089Spjd if (lr->lr_blkptr.blk_birth >= first_txg && 469219089Spjd (error = zil_read_log_data(zilog, lr, NULL)) != 0) 470219089Spjd return (error); 471219089Spjd return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); 472168404Spjd} 473168404Spjd 474168404Spjd/* ARGSUSED */ 475219089Spjdstatic int 476168404Spjdzil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) 477168404Spjd{ 478332547Smav zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 479219089Spjd 480219089Spjd return (0); 481168404Spjd} 482168404Spjd 483219089Spjdstatic int 484168404Spjdzil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) 485168404Spjd{ 486219089Spjd lr_write_t *lr = (lr_write_t *)lrc; 487219089Spjd blkptr_t *bp = &lr->lr_blkptr; 488219089Spjd 489168404Spjd /* 490168404Spjd * If we previously claimed it, we need to free it. 
491168404Spjd */ 492219089Spjd if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && 493260150Sdelphij bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && 494260150Sdelphij !BP_IS_HOLE(bp)) 495219089Spjd zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 496219089Spjd 497219089Spjd return (0); 498219089Spjd} 499219089Spjd 500325132Savgstatic int 501325132Savgzil_lwb_vdev_compare(const void *x1, const void *x2) 502325132Savg{ 503325132Savg const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; 504325132Savg const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; 505325132Savg 506325132Savg if (v1 < v2) 507325132Savg return (-1); 508325132Savg if (v1 > v2) 509325132Savg return (1); 510325132Savg 511325132Savg return (0); 512325132Savg} 513325132Savg 514219089Spjdstatic lwb_t * 515315441Smavzil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) 516219089Spjd{ 517219089Spjd lwb_t *lwb; 518219089Spjd 519219089Spjd lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); 520219089Spjd lwb->lwb_zilog = zilog; 521219089Spjd lwb->lwb_blk = *bp; 522315441Smav lwb->lwb_slog = slog; 523325132Savg lwb->lwb_state = LWB_STATE_CLOSED; 524219089Spjd lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); 525219089Spjd lwb->lwb_max_txg = txg; 526325132Savg lwb->lwb_write_zio = NULL; 527325132Savg lwb->lwb_root_zio = NULL; 528219089Spjd lwb->lwb_tx = NULL; 529325132Savg lwb->lwb_issued_timestamp = 0; 530219089Spjd if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 531219089Spjd lwb->lwb_nused = sizeof (zil_chain_t); 532219089Spjd lwb->lwb_sz = BP_GET_LSIZE(bp); 533219089Spjd } else { 534219089Spjd lwb->lwb_nused = 0; 535219089Spjd lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); 536168404Spjd } 537219089Spjd 538219089Spjd mutex_enter(&zilog->zl_lock); 539219089Spjd list_insert_tail(&zilog->zl_lwb_list, lwb); 540219089Spjd mutex_exit(&zilog->zl_lock); 541219089Spjd 542325132Savg ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); 543325132Savg ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); 544329486Smav VERIFY(list_is_empty(&lwb->lwb_waiters)); 545325132Savg 546219089Spjd return (lwb); 547168404Spjd} 548168404Spjd 549325132Savgstatic void 550325132Savgzil_free_lwb(zilog_t *zilog, lwb_t *lwb) 551325132Savg{ 552325132Savg ASSERT(MUTEX_HELD(&zilog->zl_lock)); 553325132Savg ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); 554329486Smav VERIFY(list_is_empty(&lwb->lwb_waiters)); 555325132Savg ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); 556325132Savg ASSERT3P(lwb->lwb_write_zio, ==, NULL); 557325132Savg ASSERT3P(lwb->lwb_root_zio, ==, NULL); 558329486Smav ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); 559329486Smav ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || 560329486Smav lwb->lwb_state == LWB_STATE_DONE); 561325132Savg 562325132Savg /* 563325132Savg * Clear the zilog's field to indicate this lwb is no longer 564325132Savg * valid, and prevent use-after-free errors. 565325132Savg */ 566325132Savg if (zilog->zl_last_lwb_opened == lwb) 567325132Savg zilog->zl_last_lwb_opened = NULL; 568325132Savg 569325132Savg kmem_cache_free(zil_lwb_cache, lwb); 570325132Savg} 571325132Savg 572168404Spjd/* 573239620Smm * Called when we create in-memory log transactions so that we know 574239620Smm * to cleanup the itxs at the end of spa_sync(). 
575239620Smm */ 576239620Smmvoid 577239620Smmzilog_dirty(zilog_t *zilog, uint64_t txg) 578239620Smm{ 579239620Smm dsl_pool_t *dp = zilog->zl_dmu_pool; 580239620Smm dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); 581239620Smm 582325132Savg ASSERT(spa_writeable(zilog->zl_spa)); 583325132Savg 584286575Smav if (ds->ds_is_snapshot) 585239620Smm panic("dirtying snapshot!"); 586239620Smm 587248571Smm if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { 588239620Smm /* up the hold count until we can be written out */ 589239620Smm dmu_buf_add_ref(ds->ds_dbuf, zilog); 590325132Savg 591325132Savg zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); 592239620Smm } 593239620Smm} 594239620Smm 595310515Savg/* 596310515Savg * Determine if the zil is dirty in the specified txg. Callers wanting to 597310515Savg * ensure that the dirty state does not change must hold the itxg_lock for 598310515Savg * the specified txg. Holding the lock will ensure that the zil cannot be 599310515Savg * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current 600310515Savg * state. 601310515Savg */ 602239620Smmboolean_t 603310515Savgzilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) 604310515Savg{ 605310515Savg dsl_pool_t *dp = zilog->zl_dmu_pool; 606310515Savg 607310515Savg if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) 608310515Savg return (B_TRUE); 609310515Savg return (B_FALSE); 610310515Savg} 611310515Savg 612310515Savg/* 613310515Savg * Determine if the zil is dirty. The zil is considered dirty if it has 614310515Savg * any pending itx records that have not been cleaned by zil_clean(). 615310515Savg */ 616310515Savgboolean_t 617239620Smmzilog_is_dirty(zilog_t *zilog) 618239620Smm{ 619239620Smm dsl_pool_t *dp = zilog->zl_dmu_pool; 620239620Smm 621239620Smm for (int t = 0; t < TXG_SIZE; t++) { 622239620Smm if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) 623239620Smm return (B_TRUE); 624239620Smm } 625239620Smm return (B_FALSE); 626239620Smm} 627239620Smm 628239620Smm/* 629168404Spjd * Create an on-disk intent log. 630168404Spjd */ 631219089Spjdstatic lwb_t * 632168404Spjdzil_create(zilog_t *zilog) 633168404Spjd{ 634168404Spjd const zil_header_t *zh = zilog->zl_header; 635219089Spjd lwb_t *lwb = NULL; 636168404Spjd uint64_t txg = 0; 637168404Spjd dmu_tx_t *tx = NULL; 638168404Spjd blkptr_t blk; 639168404Spjd int error = 0; 640315441Smav boolean_t slog = FALSE; 641168404Spjd 642168404Spjd /* 643168404Spjd * Wait for any previous destroy to complete. 
644168404Spjd */ 645168404Spjd txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 646168404Spjd 647168404Spjd ASSERT(zh->zh_claim_txg == 0); 648168404Spjd ASSERT(zh->zh_replay_seq == 0); 649168404Spjd 650168404Spjd blk = zh->zh_log; 651168404Spjd 652168404Spjd /* 653219089Spjd * Allocate an initial log block if: 654219089Spjd * - there isn't one already 655219089Spjd * - the existing block is the wrong endianess 656168404Spjd */ 657207908Smm if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { 658168404Spjd tx = dmu_tx_create(zilog->zl_os); 659325132Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 660168404Spjd dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 661168404Spjd txg = dmu_tx_get_txg(tx); 662168404Spjd 663207908Smm if (!BP_IS_HOLE(&blk)) { 664332547Smav zio_free(zilog->zl_spa, txg, &blk); 665207908Smm BP_ZERO(&blk); 666207908Smm } 667207908Smm 668339105Smav error = zio_alloc_zil(zilog->zl_spa, 669339105Smav zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, 670315441Smav ZIL_MIN_BLKSZ, &slog); 671168404Spjd 672168404Spjd if (error == 0) 673168404Spjd zil_init_log_chain(zilog, &blk); 674168404Spjd } 675168404Spjd 676168404Spjd /* 677325132Savg * Allocate a log write block (lwb) for the first log block. 678168404Spjd */ 679219089Spjd if (error == 0) 680315441Smav lwb = zil_alloc_lwb(zilog, &blk, slog, txg); 681168404Spjd 682168404Spjd /* 683168404Spjd * If we just allocated the first log block, commit our transaction 684168404Spjd * and wait for zil_sync() to stuff the block poiner into zh_log. 685168404Spjd * (zh is part of the MOS, so we cannot modify it in open context.) 686168404Spjd */ 687168404Spjd if (tx != NULL) { 688168404Spjd dmu_tx_commit(tx); 689168404Spjd txg_wait_synced(zilog->zl_dmu_pool, txg); 690168404Spjd } 691168404Spjd 692168404Spjd ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); 693219089Spjd 694219089Spjd return (lwb); 695168404Spjd} 696168404Spjd 697168404Spjd/* 698325132Savg * In one tx, free all log blocks and clear the log header. If keep_first 699325132Savg * is set, then we're replaying a log with no content. We want to keep the 700325132Savg * first block, however, so that the first synchronous transaction doesn't 701325132Savg * require a txg_wait_synced() in zil_create(). We don't need to 702325132Savg * txg_wait_synced() here either when keep_first is set, because both 703325132Savg * zil_create() and zil_destroy() will wait for any in-progress destroys 704325132Savg * to complete. 705168404Spjd */ 706168404Spjdvoid 707168404Spjdzil_destroy(zilog_t *zilog, boolean_t keep_first) 708168404Spjd{ 709168404Spjd const zil_header_t *zh = zilog->zl_header; 710168404Spjd lwb_t *lwb; 711168404Spjd dmu_tx_t *tx; 712168404Spjd uint64_t txg; 713168404Spjd 714168404Spjd /* 715168404Spjd * Wait for any previous destroy to complete. 
716168404Spjd */ 717168404Spjd txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 718168404Spjd 719219089Spjd zilog->zl_old_header = *zh; /* debugging aid */ 720219089Spjd 721168404Spjd if (BP_IS_HOLE(&zh->zh_log)) 722168404Spjd return; 723168404Spjd 724168404Spjd tx = dmu_tx_create(zilog->zl_os); 725325132Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 726168404Spjd dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 727168404Spjd txg = dmu_tx_get_txg(tx); 728168404Spjd 729168404Spjd mutex_enter(&zilog->zl_lock); 730168404Spjd 731168404Spjd ASSERT3U(zilog->zl_destroy_txg, <, txg); 732168404Spjd zilog->zl_destroy_txg = txg; 733168404Spjd zilog->zl_keep_first = keep_first; 734168404Spjd 735168404Spjd if (!list_is_empty(&zilog->zl_lwb_list)) { 736168404Spjd ASSERT(zh->zh_claim_txg == 0); 737224526Smm VERIFY(!keep_first); 738168404Spjd while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 739168404Spjd list_remove(&zilog->zl_lwb_list, lwb); 740168404Spjd if (lwb->lwb_buf != NULL) 741168404Spjd zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 742325132Savg zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); 743325132Savg zil_free_lwb(zilog, lwb); 744168404Spjd } 745219089Spjd } else if (!keep_first) { 746239620Smm zil_destroy_sync(zilog, tx); 747168404Spjd } 748168404Spjd mutex_exit(&zilog->zl_lock); 749168404Spjd 750168404Spjd dmu_tx_commit(tx); 751185029Spjd} 752168404Spjd 753239620Smmvoid 754239620Smmzil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) 755239620Smm{ 756239620Smm ASSERT(list_is_empty(&zilog->zl_lwb_list)); 757239620Smm (void) zil_parse(zilog, zil_free_log_block, 758239620Smm zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); 759239620Smm} 760239620Smm 761168404Spjdint 762286686Smavzil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) 763168404Spjd{ 764168404Spjd dmu_tx_t *tx = txarg; 765168404Spjd zilog_t *zilog; 766332547Smav uint64_t first_txg; 767168404Spjd zil_header_t *zh; 768168404Spjd objset_t *os; 769168404Spjd int error; 770168404Spjd 771286686Smav error = dmu_objset_own_obj(dp, ds->ds_object, 772286686Smav DMU_OST_ANY, B_FALSE, FTAG, &os); 773248571Smm if (error != 0) { 774271534Sdelphij /* 775271534Sdelphij * EBUSY indicates that the objset is inconsistent, in which 776271534Sdelphij * case it can not have a ZIL. 777271534Sdelphij */ 778271534Sdelphij if (error != EBUSY) { 779286686Smav cmn_err(CE_WARN, "can't open objset for %llu, error %u", 780286686Smav (unsigned long long)ds->ds_object, error); 781271534Sdelphij } 782168404Spjd return (0); 783168404Spjd } 784168404Spjd 785168404Spjd zilog = dmu_objset_zil(os); 786168404Spjd zh = zil_header_in_syncing_context(zilog); 787332547Smav ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); 788332547Smav first_txg = spa_min_claim_txg(zilog->zl_spa); 789168404Spjd 790332547Smav /* 791332547Smav * If the spa_log_state is not set to be cleared, check whether 792332547Smav * the current uberblock is a checkpoint one and if the current 793332547Smav * header has been claimed before moving on. 794332547Smav * 795332547Smav * If the current uberblock is a checkpointed uberblock then 796332547Smav * one of the following scenarios took place: 797332547Smav * 798332547Smav * 1] We are currently rewinding to the checkpoint of the pool. 
799332547Smav * 2] We crashed in the middle of a checkpoint rewind but we 800332547Smav * did manage to write the checkpointed uberblock to the 801332547Smav * vdev labels, so when we tried to import the pool again 802332547Smav * the checkpointed uberblock was selected from the import 803332547Smav * procedure. 804332547Smav * 805332547Smav * In both cases we want to zero out all the ZIL blocks, except 806332547Smav * the ones that have been claimed at the time of the checkpoint 807332547Smav * (their zh_claim_txg != 0). The reason is that these blocks 808332547Smav * may be corrupted since we may have reused their locations on 809332547Smav * disk after we took the checkpoint. 810332547Smav * 811332547Smav * We could try to set spa_log_state to SPA_LOG_CLEAR earlier 812332547Smav * when we first figure out whether the current uberblock is 813332547Smav * checkpointed or not. Unfortunately, that would discard all 814332547Smav * the logs, including the ones that are claimed, and we would 815332547Smav * leak space. 816332547Smav */ 817332547Smav if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || 818332547Smav (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && 819332547Smav zh->zh_claim_txg == 0)) { 820332547Smav if (!BP_IS_HOLE(&zh->zh_log)) { 821332547Smav (void) zil_parse(zilog, zil_clear_log_block, 822332547Smav zil_noop_log_record, tx, first_txg); 823332547Smav } 824213197Smm BP_ZERO(&zh->zh_log); 825213197Smm dsl_dataset_dirty(dmu_objset_ds(os), tx); 826248571Smm dmu_objset_disown(os, FTAG); 827219089Spjd return (0); 828213197Smm } 829213197Smm 830168404Spjd /* 831332547Smav * If we are not rewinding and opening the pool normally, then 832332547Smav * the min_claim_txg should be equal to the first txg of the pool. 833332547Smav */ 834332547Smav ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); 835332547Smav 836332547Smav /* 837168404Spjd * Claim all log blocks if we haven't already done so, and remember 838168404Spjd * the highest claimed sequence number. This ensures that if we can 839168404Spjd * read only part of the log now (e.g. due to a missing device), 840168404Spjd * but we can read the entire log later, we will not try to replay 841168404Spjd * or destroy beyond the last block we successfully claimed. 842168404Spjd */ 843168404Spjd ASSERT3U(zh->zh_claim_txg, <=, first_txg); 844168404Spjd if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { 845219089Spjd (void) zil_parse(zilog, zil_claim_log_block, 846219089Spjd zil_claim_log_record, tx, first_txg); 847168404Spjd zh->zh_claim_txg = first_txg; 848219089Spjd zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; 849219089Spjd zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; 850219089Spjd if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) 851219089Spjd zh->zh_flags |= ZIL_REPLAY_NEEDED; 852219089Spjd zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; 853168404Spjd dsl_dataset_dirty(dmu_objset_ds(os), tx); 854168404Spjd } 855168404Spjd 856168404Spjd ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); 857248571Smm dmu_objset_disown(os, FTAG); 858168404Spjd return (0); 859168404Spjd} 860168404Spjd 861185029Spjd/* 862185029Spjd * Check the log by walking the log chain. 863185029Spjd * Checksum errors are ok as they indicate the end of the chain. 864185029Spjd * Any other error (no device or read failure) returns an error. 
865185029Spjd */ 866286686Smav/* ARGSUSED */ 867185029Spjdint 868286686Smavzil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) 869168404Spjd{ 870185029Spjd zilog_t *zilog; 871185029Spjd objset_t *os; 872219089Spjd blkptr_t *bp; 873185029Spjd int error; 874168404Spjd 875219089Spjd ASSERT(tx == NULL); 876219089Spjd 877286686Smav error = dmu_objset_from_ds(ds, &os); 878248571Smm if (error != 0) { 879286686Smav cmn_err(CE_WARN, "can't open objset %llu, error %d", 880286686Smav (unsigned long long)ds->ds_object, error); 881185029Spjd return (0); 882185029Spjd } 883168404Spjd 884185029Spjd zilog = dmu_objset_zil(os); 885219089Spjd bp = (blkptr_t *)&zilog->zl_header->zh_log; 886219089Spjd 887219089Spjd if (!BP_IS_HOLE(bp)) { 888219089Spjd vdev_t *vd; 889219089Spjd boolean_t valid = B_TRUE; 890219089Spjd 891332547Smav /* 892332547Smav * Check the first block and determine if it's on a log device 893332547Smav * which may have been removed or faulted prior to loading this 894332547Smav * pool. If so, there's no point in checking the rest of the 895332547Smav * log as its content should have already been synced to the 896332547Smav * pool. 897332547Smav */ 898219089Spjd spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); 899219089Spjd vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); 900219089Spjd if (vd->vdev_islog && vdev_is_dead(vd)) 901219089Spjd valid = vdev_log_state_valid(vd); 902219089Spjd spa_config_exit(os->os_spa, SCL_STATE, FTAG); 903219089Spjd 904286686Smav if (!valid) 905219089Spjd return (0); 906332547Smav 907332547Smav /* 908332547Smav * Check whether the current uberblock is checkpointed (e.g. 909332547Smav * we are rewinding) and whether the current header has been 910332547Smav * claimed or not. If it hasn't then skip verifying it. We 911332547Smav * do this because its ZIL blocks may be part of the pool's 912332547Smav * state before the rewind, which is no longer valid. 913332547Smav */ 914332547Smav zil_header_t *zh = zil_header_in_syncing_context(zilog); 915332547Smav if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && 916332547Smav zh->zh_claim_txg == 0) 917332547Smav return (0); 918168404Spjd } 919185029Spjd 920219089Spjd /* 921219089Spjd * Because tx == NULL, zil_claim_log_block() will not actually claim 922219089Spjd * any blocks, but just determine whether it is possible to do so. 923219089Spjd * In addition to checking the log chain, zil_claim_log_block() 924219089Spjd * will invoke zio_claim() with a done func of spa_claim_notify(), 925219089Spjd * which will update spa_max_claim_txg. See spa_load() for details. 926219089Spjd */ 927219089Spjd error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, 928332547Smav zilog->zl_header->zh_claim_txg ? -1ULL : 929332547Smav spa_min_claim_txg(os->os_spa)); 930219089Spjd 931219089Spjd return ((error == ECKSUM || error == ENOENT) ? 0 : error); 932168404Spjd} 933168404Spjd 934325132Savg/* 935325132Savg * When an itx is "skipped", this function is used to properly mark the 936325132Savg * waiter as "done, and signal any thread(s) waiting on it. An itx can 937325132Savg * be skipped (and not committed to an lwb) for a variety of reasons, 938325132Savg * one of them being that the itx was committed via spa_sync(), prior to 939325132Savg * it being committed to an lwb; this can happen if a thread calling 940325132Savg * zil_commit() is racing with spa_sync(). 
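 *
 * A waiter is marked "done" either here or in zil_lwb_flush_vdevs_done(),
 * once the lwb it is attached to (and that lwb's vdev cache flushes)
 * have completed.  Conceptually, the thread that called zil_commit()
 * blocks on the waiter roughly as follows (simplified; the actual wait
 * in zil_commit_waiter() also enforces the zfs_commit_timeout_pct
 * deadline):
 *
 *	mutex_enter(&zcw->zcw_lock);
 *	while (!zcw->zcw_done)
 *		cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
 *	mutex_exit(&zcw->zcw_lock);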
941325132Savg */ 942325132Savgstatic void 943325132Savgzil_commit_waiter_skip(zil_commit_waiter_t *zcw) 944185029Spjd{ 945325132Savg mutex_enter(&zcw->zcw_lock); 946325132Savg ASSERT3B(zcw->zcw_done, ==, B_FALSE); 947325132Savg zcw->zcw_done = B_TRUE; 948325132Savg cv_broadcast(&zcw->zcw_cv); 949325132Savg mutex_exit(&zcw->zcw_lock); 950325132Savg} 951185029Spjd 952325132Savg/* 953325132Savg * This function is used when the given waiter is to be linked into an 954325132Savg * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. 955325132Savg * At this point, the waiter will no longer be referenced by the itx, 956325132Savg * and instead, will be referenced by the lwb. 957325132Savg */ 958325132Savgstatic void 959325132Savgzil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) 960325132Savg{ 961329486Smav /* 962329486Smav * The lwb_waiters field of the lwb is protected by the zilog's 963329486Smav * zl_lock, thus it must be held when calling this function. 964329486Smav */ 965329486Smav ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); 966329486Smav 967325132Savg mutex_enter(&zcw->zcw_lock); 968325132Savg ASSERT(!list_link_active(&zcw->zcw_node)); 969325132Savg ASSERT3P(zcw->zcw_lwb, ==, NULL); 970325132Savg ASSERT3P(lwb, !=, NULL); 971325132Savg ASSERT(lwb->lwb_state == LWB_STATE_OPENED || 972325132Savg lwb->lwb_state == LWB_STATE_ISSUED); 973185029Spjd 974325132Savg list_insert_tail(&lwb->lwb_waiters, zcw); 975325132Savg zcw->zcw_lwb = lwb; 976325132Savg mutex_exit(&zcw->zcw_lock); 977185029Spjd} 978185029Spjd 979325132Savg/* 980325132Savg * This function is used when zio_alloc_zil() fails to allocate a ZIL 981325132Savg * block, and the given waiter must be linked to the "nolwb waiters" 982325132Savg * list inside of zil_process_commit_list(). 983325132Savg */ 984325132Savgstatic void 985325132Savgzil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) 986325132Savg{ 987325132Savg mutex_enter(&zcw->zcw_lock); 988325132Savg ASSERT(!list_link_active(&zcw->zcw_node)); 989325132Savg ASSERT3P(zcw->zcw_lwb, ==, NULL); 990325132Savg list_insert_tail(nolwb, zcw); 991325132Savg mutex_exit(&zcw->zcw_lock); 992325132Savg} 993325132Savg 994168404Spjdvoid 995325132Savgzil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) 996168404Spjd{ 997325132Savg avl_tree_t *t = &lwb->lwb_vdev_tree; 998185029Spjd avl_index_t where; 999185029Spjd zil_vdev_node_t *zv, zvsearch; 1000185029Spjd int ndvas = BP_GET_NDVAS(bp); 1001185029Spjd int i; 1002168404Spjd 1003185029Spjd if (zfs_nocacheflush) 1004185029Spjd return; 1005168404Spjd 1006325132Savg mutex_enter(&lwb->lwb_vdev_lock); 1007185029Spjd for (i = 0; i < ndvas; i++) { 1008185029Spjd zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 1009185029Spjd if (avl_find(t, &zvsearch, &where) == NULL) { 1010185029Spjd zv = kmem_alloc(sizeof (*zv), KM_SLEEP); 1011185029Spjd zv->zv_vdev = zvsearch.zv_vdev; 1012185029Spjd avl_insert(t, zv, where); 1013185029Spjd } 1014185029Spjd } 1015325132Savg mutex_exit(&lwb->lwb_vdev_lock); 1016168404Spjd} 1017168404Spjd 1018325132Savgvoid 1019325132Savgzil_lwb_add_txg(lwb_t *lwb, uint64_t txg) 1020325132Savg{ 1021325132Savg lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); 1022325132Savg} 1023325132Savg 1024325132Savg/* 1025325132Savg * This function is a called after all VDEVs associated with a given lwb 1026325132Savg * write have completed their DKIOCFLUSHWRITECACHE command; or as soon 1027325132Savg * as the lwb write completes, if "zfs_nocacheflush" is set. 
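 *
 * (The plumbing for this is in zil_lwb_write_done(): the cache-flush
 * zios it issues via zio_flush() are created as children of
 * lwb_root_zio, and this function is that root zio's completion
 * callback, so it cannot run until every flush has finished.)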
1028325132Savg * 1029325132Savg * The intention is for this function to be called as soon as the 1030325132Savg * contents of an lwb are considered "stable" on disk, and will survive 1031325132Savg * any sudden loss of power. At this point, any threads waiting for the 1032325132Savg * lwb to reach this state are signalled, and the "waiter" structures 1033325132Savg * are marked "done". 1034325132Savg */ 1035219089Spjdstatic void 1036325132Savgzil_lwb_flush_vdevs_done(zio_t *zio) 1037168404Spjd{ 1038325132Savg lwb_t *lwb = zio->io_private; 1039325132Savg zilog_t *zilog = lwb->lwb_zilog; 1040325132Savg dmu_tx_t *tx = lwb->lwb_tx; 1041325132Savg zil_commit_waiter_t *zcw; 1042168404Spjd 1043325132Savg spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); 1044168404Spjd 1045325132Savg zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 1046325132Savg 1047325132Savg mutex_enter(&zilog->zl_lock); 1048325132Savg 1049185029Spjd /* 1050325132Savg * Ensure the lwb buffer pointer is cleared before releasing the 1051325132Savg * txg. If we have had an allocation failure and the txg is 1052325132Savg * waiting to sync then we want zil_sync() to remove the lwb so 1053325132Savg * that it's not picked up as the next new one in 1054325132Savg * zil_process_commit_list(). zil_sync() will only remove the 1055325132Savg * lwb if lwb_buf is null. 1056185029Spjd */ 1057325132Savg lwb->lwb_buf = NULL; 1058325132Savg lwb->lwb_tx = NULL; 1059185029Spjd 1060325132Savg ASSERT3U(lwb->lwb_issued_timestamp, >, 0); 1061325132Savg zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; 1062185029Spjd 1063325132Savg lwb->lwb_root_zio = NULL; 1064325132Savg lwb->lwb_state = LWB_STATE_DONE; 1065325132Savg 1066325132Savg if (zilog->zl_last_lwb_opened == lwb) { 1067325132Savg /* 1068325132Savg * Remember the highest committed log sequence number 1069325132Savg * for ztest. We only update this value when all the log 1070325132Savg * writes succeeded, because ztest wants to ASSERT that 1071325132Savg * it got the whole log chain. 1072325132Savg */ 1073325132Savg zilog->zl_commit_lr_seq = zilog->zl_lr_seq; 1074168404Spjd } 1075168404Spjd 1076325132Savg while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { 1077325132Savg mutex_enter(&zcw->zcw_lock); 1078325132Savg 1079325132Savg ASSERT(list_link_active(&zcw->zcw_node)); 1080325132Savg list_remove(&lwb->lwb_waiters, zcw); 1081325132Savg 1082325132Savg ASSERT3P(zcw->zcw_lwb, ==, lwb); 1083325132Savg zcw->zcw_lwb = NULL; 1084325132Savg 1085325132Savg zcw->zcw_zio_error = zio->io_error; 1086325132Savg 1087325132Savg ASSERT3B(zcw->zcw_done, ==, B_FALSE); 1088325132Savg zcw->zcw_done = B_TRUE; 1089325132Savg cv_broadcast(&zcw->zcw_cv); 1090325132Savg 1091325132Savg mutex_exit(&zcw->zcw_lock); 1092325132Savg } 1093325132Savg 1094325132Savg mutex_exit(&zilog->zl_lock); 1095325132Savg 1096168404Spjd /* 1097325132Savg * Now that we've written this log block, we have a stable pointer 1098325132Savg * to the next block in the chain, so it's OK to let the txg in 1099325132Savg * which we allocated the next block sync. 1100168404Spjd */ 1101325132Savg dmu_tx_commit(tx); 1102168404Spjd} 1103168404Spjd 1104168404Spjd/* 1105325132Savg * This is called when an lwb write completes. This means, this specific 1106325132Savg * lwb was written to disk, and all dependent lwb have also been 1107325132Savg * written to disk. 1108325132Savg * 1109325132Savg * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to 1110325132Savg * the VDEVs involved in writing out this specific lwb. 
The lwb will be 1111325132Savg * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the 1112325132Savg * zio completion callback for the lwb's root zio. 1113168404Spjd */ 1114168404Spjdstatic void 1115168404Spjdzil_lwb_write_done(zio_t *zio) 1116168404Spjd{ 1117168404Spjd lwb_t *lwb = zio->io_private; 1118325132Savg spa_t *spa = zio->io_spa; 1119168404Spjd zilog_t *zilog = lwb->lwb_zilog; 1120325132Savg avl_tree_t *t = &lwb->lwb_vdev_tree; 1121325132Savg void *cookie = NULL; 1122325132Savg zil_vdev_node_t *zv; 1123168404Spjd 1124325132Savg ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); 1125325132Savg 1126185029Spjd ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1127185029Spjd ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); 1128185029Spjd ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 1129185029Spjd ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); 1130185029Spjd ASSERT(!BP_IS_GANG(zio->io_bp)); 1131185029Spjd ASSERT(!BP_IS_HOLE(zio->io_bp)); 1132268075Sdelphij ASSERT(BP_GET_FILL(zio->io_bp) == 0); 1133185029Spjd 1134321610Smav abd_put(zio->io_abd); 1135325132Savg 1136325132Savg ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); 1137325132Savg 1138168404Spjd mutex_enter(&zilog->zl_lock); 1139325132Savg lwb->lwb_write_zio = NULL; 1140219089Spjd mutex_exit(&zilog->zl_lock); 1141209962Smm 1142325132Savg if (avl_numnodes(t) == 0) 1143325132Savg return; 1144325132Savg 1145209962Smm /* 1146325132Savg * If there was an IO error, we're not going to call zio_flush() 1147325132Savg * on these vdevs, so we simply empty the tree and free the 1148325132Savg * nodes. We avoid calling zio_flush() since there isn't any 1149325132Savg * good reason for doing so, after the lwb block failed to be 1150325132Savg * written out. 1151209962Smm */ 1152325132Savg if (zio->io_error != 0) { 1153325132Savg while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) 1154325132Savg kmem_free(zv, sizeof (*zv)); 1155325132Savg return; 1156325132Savg } 1157325132Savg 1158325132Savg while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { 1159325132Savg vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); 1160325132Savg if (vd != NULL) 1161325132Savg zio_flush(lwb->lwb_root_zio, vd); 1162325132Savg kmem_free(zv, sizeof (*zv)); 1163325132Savg } 1164168404Spjd} 1165168404Spjd 1166168404Spjd/* 1167325132Savg * This function's purpose is to "open" an lwb such that it is ready to 1168325132Savg * accept new itxs being committed to it. To do this, the lwb's zio 1169325132Savg * structures are created, and linked to the lwb. This function is 1170325132Savg * idempotent; if the passed in lwb has already been opened, this 1171325132Savg * function is essentially a no-op. 
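 *
 * In terms of the lwb_state values used in this file, an lwb moves
 * through roughly the following lifecycle:
 *
 *	LWB_STATE_CLOSED	(set in zil_alloc_lwb())
 *	  -> LWB_STATE_OPENED	(set here, once both zios are created)
 *	  -> LWB_STATE_ISSUED	(set in zil_lwb_write_issue())
 *	  -> LWB_STATE_DONE	(set in zil_lwb_flush_vdevs_done())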
1172168404Spjd */ 1173168404Spjdstatic void 1174325132Savgzil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) 1175168404Spjd{ 1176268123Sdelphij zbookmark_phys_t zb; 1177315441Smav zio_priority_t prio; 1178168404Spjd 1179329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1180325132Savg ASSERT3P(lwb, !=, NULL); 1181325132Savg EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); 1182325132Savg EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); 1183325132Savg 1184219089Spjd SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], 1185219089Spjd ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 1186219089Spjd lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); 1187168404Spjd 1188325132Savg if (lwb->lwb_root_zio == NULL) { 1189321610Smav abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, 1190321610Smav BP_GET_LSIZE(&lwb->lwb_blk)); 1191325132Savg 1192321611Smav if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) 1193315441Smav prio = ZIO_PRIORITY_SYNC_WRITE; 1194315441Smav else 1195315441Smav prio = ZIO_PRIORITY_ASYNC_WRITE; 1196325132Savg 1197325132Savg lwb->lwb_root_zio = zio_root(zilog->zl_spa, 1198325132Savg zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); 1199325132Savg ASSERT3P(lwb->lwb_root_zio, !=, NULL); 1200325132Savg 1201325132Savg lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, 1202325132Savg zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, 1203325132Savg BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, 1204325132Savg prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); 1205325132Savg ASSERT3P(lwb->lwb_write_zio, !=, NULL); 1206325132Savg 1207325132Savg lwb->lwb_state = LWB_STATE_OPENED; 1208325132Savg 1209325132Savg mutex_enter(&zilog->zl_lock); 1210325132Savg 1211325132Savg /* 1212325132Savg * The zilog's "zl_last_lwb_opened" field is used to 1213325132Savg * build the lwb/zio dependency chain, which is used to 1214325132Savg * preserve the ordering of lwb completions that is 1215325132Savg * required by the semantics of the ZIL. Each new lwb 1216325132Savg * zio becomes a parent of the "previous" lwb zio, such 1217325132Savg * that the new lwb's zio cannot complete until the 1218325132Savg * "previous" lwb's zio completes. 1219325132Savg * 1220325132Savg * This is required by the semantics of zil_commit(); 1221325132Savg * the commit waiters attached to the lwbs will be woken 1222325132Savg * in the lwb zio's completion callback, so this zio 1223325132Savg * dependency graph ensures the waiters are woken in the 1224325132Savg * correct order (the same order the lwbs were created). 1225325132Savg */ 1226325132Savg lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; 1227325132Savg if (last_lwb_opened != NULL && 1228325132Savg last_lwb_opened->lwb_state != LWB_STATE_DONE) { 1229325132Savg ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || 1230325132Savg last_lwb_opened->lwb_state == LWB_STATE_ISSUED); 1231325132Savg ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); 1232325132Savg zio_add_child(lwb->lwb_root_zio, 1233325132Savg last_lwb_opened->lwb_root_zio); 1234325132Savg } 1235325132Savg zilog->zl_last_lwb_opened = lwb; 1236325132Savg 1237325132Savg mutex_exit(&zilog->zl_lock); 1238168404Spjd } 1239325132Savg 1240325132Savg ASSERT3P(lwb->lwb_root_zio, !=, NULL); 1241325132Savg ASSERT3P(lwb->lwb_write_zio, !=, NULL); 1242325132Savg ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 1243168404Spjd} 1244168404Spjd 1245168404Spjd/* 1246219089Spjd * Define a limited set of intent log block sizes. 
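 *
 * For example, with the bucket values below and the selection loop in
 * zil_lwb_write_issue(), a commit that has accumulated roughly 6KB so
 * far picks the 12KB (8192+4096) bucket, and anything above 36KB falls
 * through to SPA_OLD_MAXBLOCKSIZE (before the max against recently
 * used block sizes that is applied there).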
1247251631Sdelphij * 1248219089Spjd * These must be a multiple of 4KB. Note only the amount used (again 1249219089Spjd * aligned to 4KB) actually gets written. However, we can't always just 1250274337Sdelphij * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. 1251219089Spjd */ 1252219089Spjduint64_t zil_block_buckets[] = { 1253219089Spjd 4096, /* non TX_WRITE */ 1254219089Spjd 8192+4096, /* data base */ 1255219089Spjd 32*1024 + 4096, /* NFS writes */ 1256219089Spjd UINT64_MAX 1257219089Spjd}; 1258219089Spjd 1259219089Spjd/* 1260168404Spjd * Start a log block write and advance to the next log block. 1261168404Spjd * Calls are serialized. 1262168404Spjd */ 1263168404Spjdstatic lwb_t * 1264325132Savgzil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) 1265168404Spjd{ 1266219089Spjd lwb_t *nlwb = NULL; 1267219089Spjd zil_chain_t *zilc; 1268168404Spjd spa_t *spa = zilog->zl_spa; 1269219089Spjd blkptr_t *bp; 1270219089Spjd dmu_tx_t *tx; 1271168404Spjd uint64_t txg; 1272219089Spjd uint64_t zil_blksz, wsz; 1273219089Spjd int i, error; 1274315441Smav boolean_t slog; 1275168404Spjd 1276329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1277325132Savg ASSERT3P(lwb->lwb_root_zio, !=, NULL); 1278325132Savg ASSERT3P(lwb->lwb_write_zio, !=, NULL); 1279325132Savg ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 1280325132Savg 1281219089Spjd if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 1282219089Spjd zilc = (zil_chain_t *)lwb->lwb_buf; 1283219089Spjd bp = &zilc->zc_next_blk; 1284219089Spjd } else { 1285219089Spjd zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); 1286219089Spjd bp = &zilc->zc_next_blk; 1287219089Spjd } 1288168404Spjd 1289219089Spjd ASSERT(lwb->lwb_nused <= lwb->lwb_sz); 1290219089Spjd 1291168404Spjd /* 1292168404Spjd * Allocate the next block and save its address in this block 1293168404Spjd * before writing it in order to establish the log chain. 1294168404Spjd * Note that if the allocation of nlwb synced before we wrote 1295168404Spjd * the block that points at it (lwb), we'd leak it if we crashed. 1296219089Spjd * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 1297219089Spjd * We dirty the dataset to ensure that zil_sync() will be called 1298219089Spjd * to clean up in the event of allocation failure or I/O failure. 1299168404Spjd */ 1300325132Savg 1301219089Spjd tx = dmu_tx_create(zilog->zl_os); 1302328235Smav 1303328235Smav /* 1304330986Savg * Since we are not going to create any new dirty data, and we 1305330986Savg * can even help with clearing the existing dirty data, we 1306330986Savg * should not be subject to the dirty data based delays. We 1307330986Savg * use TXG_NOTHROTTLE to bypass the delay mechanism. 1308328235Smav */ 1309330986Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); 1310330986Savg 1311219089Spjd dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1312219089Spjd txg = dmu_tx_get_txg(tx); 1313168404Spjd 1314219089Spjd lwb->lwb_tx = tx; 1315219089Spjd 1316168404Spjd /* 1317219089Spjd * Log blocks are pre-allocated. Here we select the size of the next 1318219089Spjd * block, based on size used in the last block. 1319219089Spjd * - first find the smallest bucket that will fit the block from a 1320219089Spjd * limited set of block sizes. This is because it's faster to write 1321219089Spjd * blocks allocated from the same metaslab as they are adjacent or 1322219089Spjd * close. 1323219089Spjd * - next find the maximum from the new suggested size and an array of 1324219089Spjd * previous sizes. 
This lessens a picket fence effect of wrongly 1325219089Spjd * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k 1326219089Spjd * requests. 1327219089Spjd * 1328219089Spjd * Note we only write what is used, but we can't just allocate 1329219089Spjd * the maximum block size because we can exhaust the available 1330219089Spjd * pool log space. 1331168404Spjd */ 1332219089Spjd zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); 1333219089Spjd for (i = 0; zil_blksz > zil_block_buckets[i]; i++) 1334219089Spjd continue; 1335219089Spjd zil_blksz = zil_block_buckets[i]; 1336219089Spjd if (zil_blksz == UINT64_MAX) 1337274337Sdelphij zil_blksz = SPA_OLD_MAXBLOCKSIZE; 1338219089Spjd zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; 1339219089Spjd for (i = 0; i < ZIL_PREV_BLKS; i++) 1340219089Spjd zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); 1341219089Spjd zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); 1342168404Spjd 1343168404Spjd BP_ZERO(bp); 1344325132Savg 1345168404Spjd /* pass the old blkptr in order to spread log blocks across devs */ 1346339105Smav error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, 1347339105Smav txg, bp, &lwb->lwb_blk, zil_blksz, &slog); 1348248571Smm if (error == 0) { 1349219089Spjd ASSERT3U(bp->blk_birth, ==, txg); 1350219089Spjd bp->blk_cksum = lwb->lwb_blk.blk_cksum; 1351219089Spjd bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; 1352168404Spjd 1353168404Spjd /* 1354325132Savg * Allocate a new log write block (lwb). 1355168404Spjd */ 1356315441Smav nlwb = zil_alloc_lwb(zilog, bp, slog, txg); 1357168404Spjd } 1358168404Spjd 1359219089Spjd if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 1360219089Spjd /* For Slim ZIL only write what is used. */ 1361219089Spjd wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); 1362219089Spjd ASSERT3U(wsz, <=, lwb->lwb_sz); 1363325132Savg zio_shrink(lwb->lwb_write_zio, wsz); 1364168404Spjd 1365219089Spjd } else { 1366219089Spjd wsz = lwb->lwb_sz; 1367219089Spjd } 1368168404Spjd 1369219089Spjd zilc->zc_pad = 0; 1370219089Spjd zilc->zc_nused = lwb->lwb_nused; 1371219089Spjd zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; 1372168404Spjd 1373168404Spjd /* 1374219089Spjd * clear unused data for security 1375168404Spjd */ 1376219089Spjd bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); 1377168404Spjd 1378325132Savg spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); 1379168404Spjd 1380325132Savg zil_lwb_add_block(lwb, &lwb->lwb_blk); 1381325132Savg lwb->lwb_issued_timestamp = gethrtime(); 1382325132Savg lwb->lwb_state = LWB_STATE_ISSUED; 1383325132Savg 1384325132Savg zio_nowait(lwb->lwb_root_zio); 1385325132Savg zio_nowait(lwb->lwb_write_zio); 1386325132Savg 1387168404Spjd /* 1388219089Spjd * If there was an allocation failure then nlwb will be null which 1389219089Spjd * forces a txg_wait_synced(). 
1390168404Spjd */ 1391168404Spjd return (nlwb); 1392168404Spjd} 1393168404Spjd 1394168404Spjdstatic lwb_t * 1395168404Spjdzil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) 1396168404Spjd{ 1397321611Smav lr_t *lrcb, *lrc; 1398321611Smav lr_write_t *lrwb, *lrw; 1399219089Spjd char *lr_buf; 1400321611Smav uint64_t dlen, dnow, lwb_sp, reclen, txg; 1401168404Spjd 1402329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1403325132Savg ASSERT3P(lwb, !=, NULL); 1404325132Savg ASSERT3P(lwb->lwb_buf, !=, NULL); 1405219089Spjd 1406325132Savg zil_lwb_write_open(zilog, lwb); 1407168404Spjd 1408325132Savg lrc = &itx->itx_lr; 1409325132Savg lrw = (lr_write_t *)lrc; 1410325132Savg 1411325132Savg /* 1412325132Savg * A commit itx doesn't represent any on-disk state; instead 1413325132Savg * it's simply used as a place holder on the commit list, and 1414325132Savg * provides a mechanism for attaching a "commit waiter" onto the 1415325132Savg * correct lwb (such that the waiter can be signalled upon 1416325132Savg * completion of that lwb). Thus, we don't process this itx's 1417325132Savg * log record if it's a commit itx (these itx's don't have log 1418325132Savg * records), and instead link the itx's waiter onto the lwb's 1419325132Savg * list of waiters. 1420325132Savg * 1421325132Savg * For more details, see the comment above zil_commit(). 1422325132Savg */ 1423325132Savg if (lrc->lrc_txtype == TX_COMMIT) { 1424329486Smav mutex_enter(&zilog->zl_lock); 1425325132Savg zil_commit_waiter_link_lwb(itx->itx_private, lwb); 1426325132Savg itx->itx_private = NULL; 1427329486Smav mutex_exit(&zilog->zl_lock); 1428325132Savg return (lwb); 1429325132Savg } 1430325132Savg 1431321611Smav if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { 1432168404Spjd dlen = P2ROUNDUP_TYPED( 1433219089Spjd lrw->lr_length, sizeof (uint64_t), uint64_t); 1434321611Smav } else { 1435321611Smav dlen = 0; 1436321611Smav } 1437321611Smav reclen = lrc->lrc_reclen; 1438168404Spjd zilog->zl_cur_used += (reclen + dlen); 1439321611Smav txg = lrc->lrc_txg; 1440168404Spjd 1441325132Savg ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); 1442168404Spjd 1443315441Smavcont: 1444168404Spjd /* 1445168404Spjd * If this record won't fit in the current log block, start a new one. 1446321611Smav * For WR_NEED_COPY optimize layout for minimal number of chunks. 1447168404Spjd */ 1448315441Smav lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1449315441Smav if (reclen > lwb_sp || (reclen + dlen > lwb_sp && 1450321611Smav lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || 1451315441Smav lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { 1452325132Savg lwb = zil_lwb_write_issue(zilog, lwb); 1453168404Spjd if (lwb == NULL) 1454168404Spjd return (NULL); 1455325132Savg zil_lwb_write_open(zilog, lwb); 1456219089Spjd ASSERT(LWB_EMPTY(lwb)); 1457315441Smav lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1458321611Smav ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); 1459168404Spjd } 1460168404Spjd 1461315441Smav dnow = MIN(dlen, lwb_sp - reclen); 1462219089Spjd lr_buf = lwb->lwb_buf + lwb->lwb_nused; 1463219089Spjd bcopy(lrc, lr_buf, reclen); 1464321611Smav lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ 1465321611Smav lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ 1466168404Spjd 1467168404Spjd /* 1468168404Spjd * If it's a write, fetch the data or get its blkptr as appropriate. 
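 * There are three cases: WR_COPIED itxs already carry the data in
 * their log record, so nothing more is needed here; WR_NEED_COPY itxs
 * have their data copied into the lwb buffer (dbuf) via the
 * zl_get_data callback; and WR_INDIRECT itxs pass a NULL dbuf so that
 * zl_get_data supplies a block pointer to the data instead of the
 * data itself.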
1469168404Spjd */ 1470168404Spjd if (lrc->lrc_txtype == TX_WRITE) { 1471168404Spjd if (txg > spa_freeze_txg(zilog->zl_spa)) 1472168404Spjd txg_wait_synced(zilog->zl_dmu_pool, txg); 1473168404Spjd if (itx->itx_wr_state != WR_COPIED) { 1474168404Spjd char *dbuf; 1475168404Spjd int error; 1476168404Spjd 1477315441Smav if (itx->itx_wr_state == WR_NEED_COPY) { 1478219089Spjd dbuf = lr_buf + reclen; 1479315441Smav lrcb->lrc_reclen += dnow; 1480315441Smav if (lrwb->lr_length > dnow) 1481315441Smav lrwb->lr_length = dnow; 1482315441Smav lrw->lr_offset += dnow; 1483315441Smav lrw->lr_length -= dnow; 1484168404Spjd } else { 1485168404Spjd ASSERT(itx->itx_wr_state == WR_INDIRECT); 1486168404Spjd dbuf = NULL; 1487168404Spjd } 1488325132Savg 1489325132Savg /* 1490325132Savg * We pass in the "lwb_write_zio" rather than 1491325132Savg * "lwb_root_zio" so that the "lwb_write_zio" 1492325132Savg * becomes the parent of any zio's created by 1493325132Savg * the "zl_get_data" callback. The vdevs are 1494325132Savg * flushed after the "lwb_write_zio" completes, 1495325132Savg * so we want to make sure that completion 1496325132Savg * callback waits for these additional zio's, 1497325132Savg * such that the vdevs used by those zio's will 1498325132Savg * be included in the lwb's vdev tree, and those 1499325132Savg * vdevs will be properly flushed. If we passed 1500325132Savg * in "lwb_root_zio" here, then these additional 1501325132Savg * vdevs may not be flushed; e.g. if these zio's 1502325132Savg * completed after "lwb_write_zio" completed. 1503325132Savg */ 1504325132Savg error = zilog->zl_get_data(itx->itx_private, 1505325132Savg lrwb, dbuf, lwb, lwb->lwb_write_zio); 1506325132Savg 1507214378Smm if (error == EIO) { 1508214378Smm txg_wait_synced(zilog->zl_dmu_pool, txg); 1509214378Smm return (lwb); 1510214378Smm } 1511248571Smm if (error != 0) { 1512168404Spjd ASSERT(error == ENOENT || error == EEXIST || 1513168404Spjd error == EALREADY); 1514168404Spjd return (lwb); 1515168404Spjd } 1516168404Spjd } 1517168404Spjd } 1518168404Spjd 1519219089Spjd /* 1520219089Spjd * We're actually making an entry, so update lrc_seq to be the 1521219089Spjd * log record sequence number. Note that this is generally not 1522219089Spjd * equal to the itx sequence number because not all transactions 1523219089Spjd * are synchronous, and sometimes spa_sync() gets there first. 
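 * The assignment below updates lrcb, the copy of the record that was
 * bcopy'd into the lwb buffer above, so the sequence number written
 * here is the one that actually reaches disk.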
1524219089Spjd */ 1525325132Savg lrcb->lrc_seq = ++zilog->zl_lr_seq; 1526315441Smav lwb->lwb_nused += reclen + dnow; 1527325132Savg 1528325132Savg zil_lwb_add_txg(lwb, txg); 1529325132Savg 1530219089Spjd ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); 1531240415Smm ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); 1532168404Spjd 1533315441Smav dlen -= dnow; 1534315441Smav if (dlen > 0) { 1535315441Smav zilog->zl_cur_used += reclen; 1536315441Smav goto cont; 1537315441Smav } 1538315441Smav 1539168404Spjd return (lwb); 1540168404Spjd} 1541168404Spjd 1542168404Spjditx_t * 1543185029Spjdzil_itx_create(uint64_t txtype, size_t lrsize) 1544168404Spjd{ 1545168404Spjd itx_t *itx; 1546168404Spjd 1547168404Spjd lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); 1548168404Spjd 1549168404Spjd itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); 1550168404Spjd itx->itx_lr.lrc_txtype = txtype; 1551168404Spjd itx->itx_lr.lrc_reclen = lrsize; 1552168404Spjd itx->itx_lr.lrc_seq = 0; /* defensive */ 1553219089Spjd itx->itx_sync = B_TRUE; /* default is synchronous */ 1554168404Spjd 1555168404Spjd return (itx); 1556168404Spjd} 1557168404Spjd 1558219089Spjdvoid 1559219089Spjdzil_itx_destroy(itx_t *itx) 1560168404Spjd{ 1561219089Spjd kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); 1562219089Spjd} 1563168404Spjd 1564219089Spjd/* 1565219089Spjd * Free up the sync and async itxs. The itxs_t has already been detached 1566219089Spjd * so no locks are needed. 1567219089Spjd */ 1568219089Spjdstatic void 1569219089Spjdzil_itxg_clean(itxs_t *itxs) 1570219089Spjd{ 1571219089Spjd itx_t *itx; 1572219089Spjd list_t *list; 1573219089Spjd avl_tree_t *t; 1574219089Spjd void *cookie; 1575219089Spjd itx_async_node_t *ian; 1576168404Spjd 1577219089Spjd list = &itxs->i_sync_list; 1578219089Spjd while ((itx = list_head(list)) != NULL) { 1579325132Savg /* 1580325132Savg * In the general case, commit itxs will not be found 1581325132Savg * here, as they'll be committed to an lwb via 1582325132Savg * zil_lwb_commit(), and free'd in that function. Having 1583325132Savg * said that, it is still possible for commit itxs to be 1584325132Savg * found here, due to the following race: 1585325132Savg * 1586325132Savg * - a thread calls zil_commit() which assigns the 1587325132Savg * commit itx to a per-txg i_sync_list 1588325132Savg * - zil_itxg_clean() is called (e.g. via spa_sync()) 1589325132Savg * while the waiter is still on the i_sync_list 1590325132Savg * 1591325132Savg * There's nothing to prevent syncing the txg while the 1592325132Savg * waiter is on the i_sync_list. This normally doesn't 1593325132Savg * happen because spa_sync() is slower than zil_commit(), 1594325132Savg * but if zil_commit() calls txg_wait_synced() (e.g. 1595325132Savg * because zil_create() or zil_commit_writer_stall() is 1596325132Savg * called) we will hit this case. 1597325132Savg */ 1598325132Savg if (itx->itx_lr.lrc_txtype == TX_COMMIT) 1599325132Savg zil_commit_waiter_skip(itx->itx_private); 1600325132Savg 1601219089Spjd list_remove(list, itx); 1602325132Savg zil_itx_destroy(itx); 1603219089Spjd } 1604168404Spjd 1605219089Spjd cookie = NULL; 1606219089Spjd t = &itxs->i_async_tree; 1607219089Spjd while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 1608219089Spjd list = &ian->ia_list; 1609219089Spjd while ((itx = list_head(list)) != NULL) { 1610219089Spjd list_remove(list, itx); 1611325132Savg /* commit itxs should never be on the async lists. 
*/ 1612325132Savg ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 1613325132Savg zil_itx_destroy(itx); 1614219089Spjd } 1615219089Spjd list_destroy(list); 1616219089Spjd kmem_free(ian, sizeof (itx_async_node_t)); 1617219089Spjd } 1618219089Spjd avl_destroy(t); 1619219089Spjd 1620219089Spjd kmem_free(itxs, sizeof (itxs_t)); 1621168404Spjd} 1622168404Spjd 1623219089Spjdstatic int 1624219089Spjdzil_aitx_compare(const void *x1, const void *x2) 1625219089Spjd{ 1626219089Spjd const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; 1627219089Spjd const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; 1628219089Spjd 1629219089Spjd if (o1 < o2) 1630219089Spjd return (-1); 1631219089Spjd if (o1 > o2) 1632219089Spjd return (1); 1633219089Spjd 1634219089Spjd return (0); 1635219089Spjd} 1636219089Spjd 1637168404Spjd/* 1638219089Spjd * Remove all async itx with the given oid. 1639168404Spjd */ 1640168404Spjdstatic void 1641219089Spjdzil_remove_async(zilog_t *zilog, uint64_t oid) 1642168404Spjd{ 1643219089Spjd uint64_t otxg, txg; 1644219089Spjd itx_async_node_t *ian; 1645219089Spjd avl_tree_t *t; 1646219089Spjd avl_index_t where; 1647168404Spjd list_t clean_list; 1648168404Spjd itx_t *itx; 1649168404Spjd 1650219089Spjd ASSERT(oid != 0); 1651168404Spjd list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); 1652168404Spjd 1653219089Spjd if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 1654219089Spjd otxg = ZILTEST_TXG; 1655219089Spjd else 1656219089Spjd otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1657219089Spjd 1658219089Spjd for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 1659219089Spjd itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1660219089Spjd 1661219089Spjd mutex_enter(&itxg->itxg_lock); 1662219089Spjd if (itxg->itxg_txg != txg) { 1663219089Spjd mutex_exit(&itxg->itxg_lock); 1664219089Spjd continue; 1665219089Spjd } 1666219089Spjd 1667219089Spjd /* 1668219089Spjd * Locate the object node and append its list. 1669219089Spjd */ 1670219089Spjd t = &itxg->itxg_itxs->i_async_tree; 1671219089Spjd ian = avl_find(t, &oid, &where); 1672219089Spjd if (ian != NULL) 1673219089Spjd list_move_tail(&clean_list, &ian->ia_list); 1674219089Spjd mutex_exit(&itxg->itxg_lock); 1675168404Spjd } 1676219089Spjd while ((itx = list_head(&clean_list)) != NULL) { 1677219089Spjd list_remove(&clean_list, itx); 1678325132Savg /* commit itxs should never be on the async lists. */ 1679325132Savg ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 1680325132Savg zil_itx_destroy(itx); 1681219089Spjd } 1682219089Spjd list_destroy(&clean_list); 1683219089Spjd} 1684168404Spjd 1685219089Spjdvoid 1686219089Spjdzil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) 1687219089Spjd{ 1688219089Spjd uint64_t txg; 1689219089Spjd itxg_t *itxg; 1690219089Spjd itxs_t *itxs, *clean = NULL; 1691219089Spjd 1692168404Spjd /* 1693219089Spjd * Object ids can be re-instantiated in the next txg so 1694219089Spjd * remove any async transactions to avoid future leaks. 1695219089Spjd * This can happen if a fsync occurs on the re-instantiated 1696219089Spjd * object for a WR_INDIRECT or WR_NEED_COPY write, which gets 1697219089Spjd * the new file data and flushes a write record for the old object. 1698168404Spjd */ 1699219089Spjd if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) 1700219089Spjd zil_remove_async(zilog, itx->itx_oid); 1701219089Spjd 1702219089Spjd /* 1703219089Spjd * Ensure the data of a renamed file is committed before the rename. 
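 * This is done by moving the renamed file's async itxs onto the sync
 * lists (via zil_async_to_sync() below), so its outstanding writes are
 * committed along with the rename record itself.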
1704219089Spjd */ 1705219089Spjd if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) 1706219089Spjd zil_async_to_sync(zilog, itx->itx_oid); 1707219089Spjd 1708239620Smm if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) 1709219089Spjd txg = ZILTEST_TXG; 1710219089Spjd else 1711219089Spjd txg = dmu_tx_get_txg(tx); 1712219089Spjd 1713219089Spjd itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1714219089Spjd mutex_enter(&itxg->itxg_lock); 1715219089Spjd itxs = itxg->itxg_itxs; 1716219089Spjd if (itxg->itxg_txg != txg) { 1717219089Spjd if (itxs != NULL) { 1718219089Spjd /* 1719219089Spjd * The zil_clean callback hasn't got around to cleaning 1720219089Spjd * this itxg. Save the itxs for release below. 1721219089Spjd * This should be rare. 1722219089Spjd */ 1723321611Smav zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " 1724321611Smav "txg %llu", itxg->itxg_txg); 1725219089Spjd clean = itxg->itxg_itxs; 1726219089Spjd } 1727219089Spjd itxg->itxg_txg = txg; 1728219089Spjd itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); 1729219089Spjd 1730219089Spjd list_create(&itxs->i_sync_list, sizeof (itx_t), 1731219089Spjd offsetof(itx_t, itx_node)); 1732219089Spjd avl_create(&itxs->i_async_tree, zil_aitx_compare, 1733219089Spjd sizeof (itx_async_node_t), 1734219089Spjd offsetof(itx_async_node_t, ia_node)); 1735168404Spjd } 1736219089Spjd if (itx->itx_sync) { 1737219089Spjd list_insert_tail(&itxs->i_sync_list, itx); 1738219089Spjd } else { 1739219089Spjd avl_tree_t *t = &itxs->i_async_tree; 1740219089Spjd uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; 1741219089Spjd itx_async_node_t *ian; 1742219089Spjd avl_index_t where; 1743168404Spjd 1744219089Spjd ian = avl_find(t, &foid, &where); 1745219089Spjd if (ian == NULL) { 1746219089Spjd ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); 1747219089Spjd list_create(&ian->ia_list, sizeof (itx_t), 1748219089Spjd offsetof(itx_t, itx_node)); 1749219089Spjd ian->ia_foid = foid; 1750219089Spjd avl_insert(t, ian, where); 1751219089Spjd } 1752219089Spjd list_insert_tail(&ian->ia_list, itx); 1753168404Spjd } 1754219089Spjd 1755219089Spjd itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); 1756325132Savg 1757325132Savg /* 1758325132Savg * We don't want to dirty the ZIL using ZILTEST_TXG, because 1759325132Savg * zil_clean() will never be called using ZILTEST_TXG. Thus, we 1760325132Savg * need to be careful to always dirty the ZIL using the "real" 1761325132Savg * TXG (not itxg_txg) even when the SPA is frozen. 1762325132Savg */ 1763325132Savg zilog_dirty(zilog, dmu_tx_get_txg(tx)); 1764219089Spjd mutex_exit(&itxg->itxg_lock); 1765219089Spjd 1766219089Spjd /* Release the old itxs now we've dropped the lock */ 1767219089Spjd if (clean != NULL) 1768219089Spjd zil_itxg_clean(clean); 1769168404Spjd} 1770168404Spjd 1771168404Spjd/* 1772168404Spjd * If there are any in-memory intent log transactions which have now been 1773239620Smm * synced then start up a taskq to free them. We should only do this after we 1774239620Smm * have written out the uberblocks (i.e. txg has been comitted) so that 1775239620Smm * don't inadvertently clean out in-memory log records that would be required 1776239620Smm * by zil_commit(). 
1777168404Spjd */ 1778168404Spjdvoid 1779219089Spjdzil_clean(zilog_t *zilog, uint64_t synced_txg) 1780168404Spjd{ 1781219089Spjd itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; 1782219089Spjd itxs_t *clean_me; 1783168404Spjd 1784325132Savg ASSERT3U(synced_txg, <, ZILTEST_TXG); 1785325132Savg 1786219089Spjd mutex_enter(&itxg->itxg_lock); 1787219089Spjd if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { 1788219089Spjd mutex_exit(&itxg->itxg_lock); 1789219089Spjd return; 1790168404Spjd } 1791219089Spjd ASSERT3U(itxg->itxg_txg, <=, synced_txg); 1792324205Savg ASSERT3U(itxg->itxg_txg, !=, 0); 1793219089Spjd clean_me = itxg->itxg_itxs; 1794219089Spjd itxg->itxg_itxs = NULL; 1795219089Spjd itxg->itxg_txg = 0; 1796219089Spjd mutex_exit(&itxg->itxg_lock); 1797219089Spjd /* 1798219089Spjd * Preferably start a task queue to free up the old itxs but 1799219089Spjd * if taskq_dispatch can't allocate resources to do that then 1800219089Spjd * free it in-line. This should be rare. Note, using TQ_SLEEP 1801219089Spjd * created a bad performance problem. 1802219089Spjd */ 1803324205Savg ASSERT3P(zilog->zl_dmu_pool, !=, NULL); 1804324205Savg ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); 1805324205Savg if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, 1806219089Spjd (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) 1807219089Spjd zil_itxg_clean(clean_me); 1808168404Spjd} 1809168404Spjd 1810219089Spjd/* 1811325132Savg * This function will traverse the queue of itxs that need to be 1812325132Savg * committed, and move them onto the ZIL's zl_itx_commit_list. 1813219089Spjd */ 1814185029Spjdstatic void 1815219089Spjdzil_get_commit_list(zilog_t *zilog) 1816168404Spjd{ 1817219089Spjd uint64_t otxg, txg; 1818219089Spjd list_t *commit_list = &zilog->zl_itx_commit_list; 1819219089Spjd 1820329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1821325132Savg 1822219089Spjd if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 1823219089Spjd otxg = ZILTEST_TXG; 1824219089Spjd else 1825219089Spjd otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1826219089Spjd 1827310515Savg /* 1828310515Savg * This is inherently racy, since there is nothing to prevent 1829310515Savg * the last synced txg from changing. That's okay since we'll 1830310515Savg * only commit things in the future. 1831310515Savg */ 1832219089Spjd for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 1833219089Spjd itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1834219089Spjd 1835219089Spjd mutex_enter(&itxg->itxg_lock); 1836219089Spjd if (itxg->itxg_txg != txg) { 1837219089Spjd mutex_exit(&itxg->itxg_lock); 1838219089Spjd continue; 1839219089Spjd } 1840219089Spjd 1841310515Savg /* 1842310515Savg * If we're adding itx records to the zl_itx_commit_list, 1843310515Savg * then the zil better be dirty in this "txg". We can assert 1844310515Savg * that here since we're holding the itxg_lock which will 1845310515Savg * prevent spa_sync from cleaning it. Once we add the itxs 1846310515Savg * to the zl_itx_commit_list we must commit it to disk even 1847310515Savg * if it's unnecessary (i.e. the txg was synced). 
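 * (Committing an itx whose txg has already synced merely produces a
 * redundant, harmless write; see the comment in
 * zil_process_commit_list() for details.)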
1848310515Savg */ 1849310515Savg ASSERT(zilog_is_dirty_in_txg(zilog, txg) || 1850310515Savg spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); 1851219089Spjd list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); 1852219089Spjd 1853219089Spjd mutex_exit(&itxg->itxg_lock); 1854219089Spjd } 1855219089Spjd} 1856219089Spjd 1857219089Spjd/* 1858219089Spjd * Move the async itxs for a specified object to commit into sync lists. 1859219089Spjd */ 1860308595Smavvoid 1861219089Spjdzil_async_to_sync(zilog_t *zilog, uint64_t foid) 1862219089Spjd{ 1863219089Spjd uint64_t otxg, txg; 1864219089Spjd itx_async_node_t *ian; 1865219089Spjd avl_tree_t *t; 1866219089Spjd avl_index_t where; 1867219089Spjd 1868219089Spjd if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 1869219089Spjd otxg = ZILTEST_TXG; 1870219089Spjd else 1871219089Spjd otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1872219089Spjd 1873310515Savg /* 1874310515Savg * This is inherently racy, since there is nothing to prevent 1875310515Savg * the last synced txg from changing. 1876310515Savg */ 1877219089Spjd for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 1878219089Spjd itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 1879219089Spjd 1880219089Spjd mutex_enter(&itxg->itxg_lock); 1881219089Spjd if (itxg->itxg_txg != txg) { 1882219089Spjd mutex_exit(&itxg->itxg_lock); 1883219089Spjd continue; 1884219089Spjd } 1885219089Spjd 1886219089Spjd /* 1887219089Spjd * If a foid is specified then find that node and append its 1888219089Spjd * list. Otherwise walk the tree appending all the lists 1889219089Spjd * to the sync list. We add to the end rather than the 1890219089Spjd * beginning to ensure the create has happened. 1891219089Spjd */ 1892219089Spjd t = &itxg->itxg_itxs->i_async_tree; 1893219089Spjd if (foid != 0) { 1894219089Spjd ian = avl_find(t, &foid, &where); 1895219089Spjd if (ian != NULL) { 1896219089Spjd list_move_tail(&itxg->itxg_itxs->i_sync_list, 1897219089Spjd &ian->ia_list); 1898219089Spjd } 1899219089Spjd } else { 1900219089Spjd void *cookie = NULL; 1901219089Spjd 1902219089Spjd while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 1903219089Spjd list_move_tail(&itxg->itxg_itxs->i_sync_list, 1904219089Spjd &ian->ia_list); 1905219089Spjd list_destroy(&ian->ia_list); 1906219089Spjd kmem_free(ian, sizeof (itx_async_node_t)); 1907219089Spjd } 1908219089Spjd } 1909219089Spjd mutex_exit(&itxg->itxg_lock); 1910219089Spjd } 1911219089Spjd} 1912219089Spjd 1913325132Savg/* 1914325132Savg * This function will prune commit itxs that are at the head of the 1915325132Savg * commit list (it won't prune past the first non-commit itx), and 1916325132Savg * either: a) attach them to the last lwb that's still pending 1917325132Savg * completion, or b) skip them altogether. 1918325132Savg * 1919325132Savg * This is used as a performance optimization to prevent commit itxs 1920325132Savg * from generating new lwbs when it's unnecessary to do so. 
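 * For example, when many threads call zil_commit() concurrently, later
 * callers may find nothing but TX_COMMIT itxs at the head of the list.
 * Rather than creating a new (empty) lwb just to carry their waiters,
 * we link those waiters onto the last lwb still awaiting completion,
 * or simply mark them done if no such lwb exists.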
1921325132Savg */ 1922219089Spjdstatic void 1923325132Savgzil_prune_commit_list(zilog_t *zilog) 1924219089Spjd{ 1925219089Spjd itx_t *itx; 1926168404Spjd 1927329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1928168404Spjd 1929325132Savg while (itx = list_head(&zilog->zl_itx_commit_list)) { 1930325132Savg lr_t *lrc = &itx->itx_lr; 1931325132Savg if (lrc->lrc_txtype != TX_COMMIT) 1932325132Savg break; 1933219089Spjd 1934325132Savg mutex_enter(&zilog->zl_lock); 1935219089Spjd 1936325132Savg lwb_t *last_lwb = zilog->zl_last_lwb_opened; 1937325132Savg if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) { 1938325132Savg /* 1939325132Savg * All of the itxs this waiter was waiting on 1940325132Savg * must have already completed (or there were 1941325132Savg * never any itx's for it to wait on), so it's 1942325132Savg * safe to skip this waiter and mark it done. 1943325132Savg */ 1944325132Savg zil_commit_waiter_skip(itx->itx_private); 1945325132Savg } else { 1946325132Savg zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); 1947325132Savg itx->itx_private = NULL; 1948325132Savg } 1949325132Savg 1950325132Savg mutex_exit(&zilog->zl_lock); 1951325132Savg 1952325132Savg list_remove(&zilog->zl_itx_commit_list, itx); 1953325132Savg zil_itx_destroy(itx); 1954325132Savg } 1955325132Savg 1956325132Savg IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); 1957325132Savg} 1958325132Savg 1959325132Savgstatic void 1960325132Savgzil_commit_writer_stall(zilog_t *zilog) 1961325132Savg{ 1962219089Spjd /* 1963325132Savg * When zio_alloc_zil() fails to allocate the next lwb block on 1964325132Savg * disk, we must call txg_wait_synced() to ensure all of the 1965325132Savg * lwbs in the zilog's zl_lwb_list are synced and then freed (in 1966325132Savg * zil_sync()), such that any subsequent ZIL writer (i.e. a call 1967325132Savg * to zil_process_commit_list()) will have to call zil_create(), 1968325132Savg * and start a new ZIL chain. 1969325132Savg * 1970325132Savg * Since zil_alloc_zil() failed, the lwb that was previously 1971325132Savg * issued does not have a pointer to the "next" lwb on disk. 1972325132Savg * Thus, if another ZIL writer thread was to allocate the "next" 1973325132Savg * on-disk lwb, that block could be leaked in the event of a 1974325132Savg * crash (because the previous lwb on-disk would not point to 1975325132Savg * it). 1976325132Savg * 1977329485Smav * We must hold the zilog's zl_issuer_lock while we do this, to 1978325132Savg * ensure no new threads enter zil_process_commit_list() until 1979325132Savg * all lwb's in the zl_lwb_list have been synced and freed 1980325132Savg * (which is achieved via the txg_wait_synced() call). 1981325132Savg */ 1982329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 1983325132Savg txg_wait_synced(zilog->zl_dmu_pool, 0); 1984325132Savg ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); 1985325132Savg} 1986325132Savg 1987325132Savg/* 1988325132Savg * This function will traverse the commit list, creating new lwbs as 1989325132Savg * needed, and committing the itxs from the commit list to these newly 1990325132Savg * created lwbs. Additionally, as a new lwb is created, the previous 1991325132Savg * lwb will be issued to the zio layer to be written to disk. 
1992325132Savg */ 1993325132Savgstatic void 1994325132Savgzil_process_commit_list(zilog_t *zilog) 1995325132Savg{ 1996325132Savg spa_t *spa = zilog->zl_spa; 1997325132Savg list_t nolwb_waiters; 1998325132Savg lwb_t *lwb; 1999325132Savg itx_t *itx; 2000325132Savg 2001329485Smav ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 2002325132Savg 2003325132Savg /* 2004219089Spjd * Return if there's nothing to commit before we dirty the fs by 2005219089Spjd * calling zil_create(). 2006219089Spjd */ 2007325132Savg if (list_head(&zilog->zl_itx_commit_list) == NULL) 2008219089Spjd return; 2009219089Spjd 2010325132Savg list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), 2011325132Savg offsetof(zil_commit_waiter_t, zcw_node)); 2012325132Savg 2013325132Savg lwb = list_tail(&zilog->zl_lwb_list); 2014325132Savg if (lwb == NULL) { 2015325132Savg lwb = zil_create(zilog); 2016168404Spjd } else { 2017325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 2018325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE); 2019168404Spjd } 2020168404Spjd 2021219089Spjd while (itx = list_head(&zilog->zl_itx_commit_list)) { 2022325132Savg lr_t *lrc = &itx->itx_lr; 2023325132Savg uint64_t txg = lrc->lrc_txg; 2024325132Savg 2025310515Savg ASSERT3U(txg, !=, 0); 2026168404Spjd 2027325132Savg if (lrc->lrc_txtype == TX_COMMIT) { 2028325132Savg DTRACE_PROBE2(zil__process__commit__itx, 2029325132Savg zilog_t *, zilog, itx_t *, itx); 2030325132Savg } else { 2031325132Savg DTRACE_PROBE2(zil__process__normal__itx, 2032325132Savg zilog_t *, zilog, itx_t *, itx); 2033325132Savg } 2034325132Savg 2035325132Savg boolean_t synced = txg <= spa_last_synced_txg(spa); 2036325132Savg boolean_t frozen = txg > spa_freeze_txg(spa); 2037325132Savg 2038329486Smav /* 2039329486Smav * If the txg of this itx has already been synced out, then 2040329486Smav * we don't need to commit this itx to an lwb. This is 2041329486Smav * because the data of this itx will have already been 2042329486Smav * written to the main pool. This is inherently racy, and 2043329486Smav * it's still ok to commit an itx whose txg has already 2044329486Smav * been synced; this will result in a write that's 2045329486Smav * unnecessary, but will do no harm. 2046329486Smav * 2047329486Smav * With that said, we always want to commit TX_COMMIT itxs 2048329486Smav * to an lwb, regardless of whether or not that itx's txg 2049329486Smav * has been synced out. We do this to ensure any OPENED lwb 2050329486Smav * will always have at least one zil_commit_waiter_t linked 2051329486Smav * to the lwb. 2052329486Smav * 2053329486Smav * As a counter-example, if we skipped TX_COMMIT itx's 2054329486Smav * whose txg had already been synced, the following 2055329486Smav * situation could occur if we happened to be racing with 2056329486Smav * spa_sync: 2057329486Smav * 2058329486Smav * 1. we commit a non-TX_COMMIT itx to an lwb, where the 2059329486Smav * itx's txg is 10 and the last synced txg is 9. 2060329486Smav * 2. spa_sync finishes syncing out txg 10. 2061329486Smav * 3. we move to the next itx in the list, it's a TX_COMMIT 2062329486Smav * whose txg is 10, so we skip it rather than committing 2063329486Smav * it to the lwb used in (1). 2064329486Smav * 2065329486Smav * If the itx that is skipped in (3) is the last TX_COMMIT 2066329486Smav * itx in the commit list, than it's possible for the lwb 2067329486Smav * used in (1) to remain in the OPENED state indefinitely. 
2068329486Smav * 2069329486Smav * To prevent the above scenario from occuring, ensuring 2070329486Smav * that once an lwb is OPENED it will transition to ISSUED 2071329486Smav * and eventually DONE, we always commit TX_COMMIT itx's to 2072329486Smav * an lwb here, even if that itx's txg has already been 2073329486Smav * synced. 2074329486Smav * 2075329486Smav * Finally, if the pool is frozen, we _always_ commit the 2076329486Smav * itx. The point of freezing the pool is to prevent data 2077329486Smav * from being written to the main pool via spa_sync, and 2078329486Smav * instead rely solely on the ZIL to persistently store the 2079329486Smav * data; i.e. when the pool is frozen, the last synced txg 2080329486Smav * value can't be trusted. 2081329486Smav */ 2082329486Smav if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { 2083325132Savg if (lwb != NULL) { 2084325132Savg lwb = zil_lwb_commit(zilog, itx, lwb); 2085325132Savg } else if (lrc->lrc_txtype == TX_COMMIT) { 2086325132Savg ASSERT3P(lwb, ==, NULL); 2087325132Savg zil_commit_waiter_link_nolwb( 2088325132Savg itx->itx_private, &nolwb_waiters); 2089325132Savg } 2090325132Savg } 2091325132Savg 2092219089Spjd list_remove(&zilog->zl_itx_commit_list, itx); 2093325132Savg zil_itx_destroy(itx); 2094168404Spjd } 2095168404Spjd 2096325132Savg if (lwb == NULL) { 2097325132Savg /* 2098325132Savg * This indicates zio_alloc_zil() failed to allocate the 2099325132Savg * "next" lwb on-disk. When this happens, we must stall 2100325132Savg * the ZIL write pipeline; see the comment within 2101325132Savg * zil_commit_writer_stall() for more details. 2102325132Savg */ 2103325132Savg zil_commit_writer_stall(zilog); 2104168404Spjd 2105325132Savg /* 2106325132Savg * Additionally, we have to signal and mark the "nolwb" 2107325132Savg * waiters as "done" here, since without an lwb, we 2108325132Savg * can't do this via zil_lwb_flush_vdevs_done() like 2109325132Savg * normal. 2110325132Savg */ 2111325132Savg zil_commit_waiter_t *zcw; 2112325132Savg while (zcw = list_head(&nolwb_waiters)) { 2113325132Savg zil_commit_waiter_skip(zcw); 2114325132Savg list_remove(&nolwb_waiters, zcw); 2115325132Savg } 2116325132Savg } else { 2117325132Savg ASSERT(list_is_empty(&nolwb_waiters)); 2118325132Savg ASSERT3P(lwb, !=, NULL); 2119325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 2120325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE); 2121168404Spjd 2122325132Savg /* 2123325132Savg * At this point, the ZIL block pointed at by the "lwb" 2124325132Savg * variable is in one of the following states: "closed" 2125325132Savg * or "open". 2126325132Savg * 2127325132Savg * If its "closed", then no itxs have been committed to 2128325132Savg * it, so there's no point in issuing its zio (i.e. 2129325132Savg * it's "empty"). 2130325132Savg * 2131325132Savg * If its "open" state, then it contains one or more 2132325132Savg * itxs that eventually need to be committed to stable 2133325132Savg * storage. In this case we intentionally do not issue 2134325132Savg * the lwb's zio to disk yet, and instead rely on one of 2135325132Savg * the following two mechanisms for issuing the zio: 2136325132Savg * 2137325132Savg * 1. Ideally, there will be more ZIL activity occuring 2138325132Savg * on the system, such that this function will be 2139325132Savg * immediately called again (not necessarily by the same 2140325132Savg * thread) and this lwb's zio will be issued via 2141325132Savg * zil_lwb_commit(). 
This way, the lwb is guaranteed to 2142325132Savg * be "full" when it is issued to disk, and we'll make 2143325132Savg * use of the lwb's size the best we can. 2144325132Savg * 2145325132Savg * 2. If there isn't sufficient ZIL activity occuring on 2146325132Savg * the system, such that this lwb's zio isn't issued via 2147325132Savg * zil_lwb_commit(), zil_commit_waiter() will issue the 2148325132Savg * lwb's zio. If this occurs, the lwb is not guaranteed 2149325132Savg * to be "full" by the time its zio is issued, and means 2150325132Savg * the size of the lwb was "too large" given the amount 2151325132Savg * of ZIL activity occuring on the system at that time. 2152325132Savg * 2153325132Savg * We do this for a couple of reasons: 2154325132Savg * 2155325132Savg * 1. To try and reduce the number of IOPs needed to 2156325132Savg * write the same number of itxs. If an lwb has space 2157325132Savg * available in it's buffer for more itxs, and more itxs 2158325132Savg * will be committed relatively soon (relative to the 2159325132Savg * latency of performing a write), then it's beneficial 2160325132Savg * to wait for these "next" itxs. This way, more itxs 2161325132Savg * can be committed to stable storage with fewer writes. 2162325132Savg * 2163325132Savg * 2. To try and use the largest lwb block size that the 2164325132Savg * incoming rate of itxs can support. Again, this is to 2165325132Savg * try and pack as many itxs into as few lwbs as 2166325132Savg * possible, without significantly impacting the latency 2167325132Savg * of each individual itx. 2168325132Savg */ 2169325132Savg } 2170325132Savg} 2171325132Savg 2172325132Savg/* 2173325132Savg * This function is responsible for ensuring the passed in commit waiter 2174325132Savg * (and associated commit itx) is committed to an lwb. If the waiter is 2175325132Savg * not already committed to an lwb, all itxs in the zilog's queue of 2176325132Savg * itxs will be processed. The assumption is the passed in waiter's 2177325132Savg * commit itx will found in the queue just like the other non-commit 2178325132Savg * itxs, such that when the entire queue is processed, the waiter will 2179325132Savg * have been commited to an lwb. 2180325132Savg * 2181325132Savg * The lwb associated with the passed in waiter is not guaranteed to 2182325132Savg * have been issued by the time this function completes. If the lwb is 2183325132Savg * not issued, we rely on future calls to zil_commit_writer() to issue 2184325132Savg * the lwb, or the timeout mechanism found in zil_commit_waiter(). 2185325132Savg */ 2186325132Savgstatic void 2187325132Savgzil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) 2188325132Savg{ 2189325132Savg ASSERT(!MUTEX_HELD(&zilog->zl_lock)); 2190325132Savg ASSERT(spa_writeable(zilog->zl_spa)); 2191325132Savg 2192329485Smav mutex_enter(&zilog->zl_issuer_lock); 2193325132Savg 2194325132Savg if (zcw->zcw_lwb != NULL || zcw->zcw_done) { 2195325132Savg /* 2196325132Savg * It's possible that, while we were waiting to acquire 2197329485Smav * the "zl_issuer_lock", another thread committed this 2198325132Savg * waiter to an lwb. If that occurs, we bail out early, 2199325132Savg * without processing any of the zilog's queue of itxs. 2200325132Savg * 2201325132Savg * On certain workloads and system configurations, the 2202329485Smav * "zl_issuer_lock" can become highly contended. In an 2203325132Savg * attempt to reduce this contention, we immediately drop 2204325132Savg * the lock if the waiter has already been processed. 
2205325132Savg * 2206325132Savg * We've measured this optimization to reduce CPU spent 2207325132Savg * contending on this lock by up to 5%, using a system 2208325132Savg * with 32 CPUs, low latency storage (~50 usec writes), 2209325132Savg * and 1024 threads performing sync writes. 2210325132Savg */ 2211325132Savg goto out; 2212325132Savg } 2213325132Savg 2214325132Savg zil_get_commit_list(zilog); 2215325132Savg zil_prune_commit_list(zilog); 2216325132Savg zil_process_commit_list(zilog); 2217325132Savg 2218325132Savgout: 2219329485Smav mutex_exit(&zilog->zl_issuer_lock); 2220325132Savg} 2221325132Savg 2222325132Savgstatic void 2223325132Savgzil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) 2224325132Savg{ 2225329485Smav ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); 2226325132Savg ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2227325132Savg ASSERT3B(zcw->zcw_done, ==, B_FALSE); 2228325132Savg 2229325132Savg lwb_t *lwb = zcw->zcw_lwb; 2230325132Savg ASSERT3P(lwb, !=, NULL); 2231325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); 2232325132Savg 2233168404Spjd /* 2234325132Savg * If the lwb has already been issued by another thread, we can 2235325132Savg * immediately return since there's no work to be done (the 2236325132Savg * point of this function is to issue the lwb). Additionally, we 2237329485Smav * do this prior to acquiring the zl_issuer_lock, to avoid 2238325132Savg * acquiring it when it's not necessary to do so. 2239168404Spjd */ 2240325132Savg if (lwb->lwb_state == LWB_STATE_ISSUED || 2241325132Savg lwb->lwb_state == LWB_STATE_DONE) 2242325132Savg return; 2243325132Savg 2244325132Savg /* 2245325132Savg * In order to call zil_lwb_write_issue() we must hold the 2246329485Smav * zilog's "zl_issuer_lock". We can't simply acquire that lock, 2247325132Savg * since we're already holding the commit waiter's "zcw_lock", 2248325132Savg * and those two locks are aquired in the opposite order 2249325132Savg * elsewhere. 2250325132Savg */ 2251325132Savg mutex_exit(&zcw->zcw_lock); 2252329485Smav mutex_enter(&zilog->zl_issuer_lock); 2253325132Savg mutex_enter(&zcw->zcw_lock); 2254325132Savg 2255325132Savg /* 2256325132Savg * Since we just dropped and re-acquired the commit waiter's 2257325132Savg * lock, we have to re-check to see if the waiter was marked 2258325132Savg * "done" during that process. If the waiter was marked "done", 2259325132Savg * the "lwb" pointer is no longer valid (it can be free'd after 2260325132Savg * the waiter is marked "done"), so without this check we could 2261325132Savg * wind up with a use-after-free error below. 2262325132Savg */ 2263325132Savg if (zcw->zcw_done) 2264325132Savg goto out; 2265325132Savg 2266325132Savg ASSERT3P(lwb, ==, zcw->zcw_lwb); 2267325132Savg 2268325132Savg /* 2269329486Smav * We've already checked this above, but since we hadn't acquired 2270329486Smav * the zilog's zl_issuer_lock, we have to perform this check a 2271329486Smav * second time while holding the lock. 2272329486Smav * 2273329486Smav * We don't need to hold the zl_lock since the lwb cannot transition 2274329486Smav * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb 2275329486Smav * _can_ transition from ISSUED to DONE, but it's OK to race with 2276329486Smav * that transition since we treat the lwb the same, whether it's in 2277329486Smav * the ISSUED or DONE states. 
2278329486Smav * 2279329486Smav * The important thing, is we treat the lwb differently depending on 2280329486Smav * if it's ISSUED or OPENED, and block any other threads that might 2281329486Smav * attempt to issue this lwb. For that reason we hold the 2282329486Smav * zl_issuer_lock when checking the lwb_state; we must not call 2283325132Savg * zil_lwb_write_issue() if the lwb had already been issued. 2284329486Smav * 2285329486Smav * See the comment above the lwb_state_t structure definition for 2286329486Smav * more details on the lwb states, and locking requirements. 2287325132Savg */ 2288325132Savg if (lwb->lwb_state == LWB_STATE_ISSUED || 2289325132Savg lwb->lwb_state == LWB_STATE_DONE) 2290325132Savg goto out; 2291325132Savg 2292325132Savg ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 2293325132Savg 2294325132Savg /* 2295325132Savg * As described in the comments above zil_commit_waiter() and 2296325132Savg * zil_process_commit_list(), we need to issue this lwb's zio 2297325132Savg * since we've reached the commit waiter's timeout and it still 2298325132Savg * hasn't been issued. 2299325132Savg */ 2300325132Savg lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); 2301325132Savg 2302339134Smav IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); 2303325132Savg 2304325132Savg /* 2305325132Savg * Since the lwb's zio hadn't been issued by the time this thread 2306325132Savg * reached its timeout, we reset the zilog's "zl_cur_used" field 2307325132Savg * to influence the zil block size selection algorithm. 2308325132Savg * 2309325132Savg * By having to issue the lwb's zio here, it means the size of the 2310325132Savg * lwb was too large, given the incoming throughput of itxs. By 2311325132Savg * setting "zl_cur_used" to zero, we communicate this fact to the 2312325132Savg * block size selection algorithm, so it can take this informaiton 2313325132Savg * into account, and potentially select a smaller size for the 2314325132Savg * next lwb block that is allocated. 2315325132Savg */ 2316325132Savg zilog->zl_cur_used = 0; 2317325132Savg 2318325132Savg if (nlwb == NULL) { 2319325132Savg /* 2320325132Savg * When zil_lwb_write_issue() returns NULL, this 2321325132Savg * indicates zio_alloc_zil() failed to allocate the 2322325132Savg * "next" lwb on-disk. When this occurs, the ZIL write 2323325132Savg * pipeline must be stalled; see the comment within the 2324325132Savg * zil_commit_writer_stall() function for more details. 2325325132Savg * 2326325132Savg * We must drop the commit waiter's lock prior to 2327325132Savg * calling zil_commit_writer_stall() or else we can wind 2328325132Savg * up with the following deadlock: 2329325132Savg * 2330325132Savg * - This thread is waiting for the txg to sync while 2331325132Savg * holding the waiter's lock; txg_wait_synced() is 2332325132Savg * used within txg_commit_writer_stall(). 2333325132Savg * 2334325132Savg * - The txg can't sync because it is waiting for this 2335325132Savg * lwb's zio callback to call dmu_tx_commit(). 
2336325132Savg * 2337325132Savg * - The lwb's zio callback can't call dmu_tx_commit() 2338325132Savg * because it's blocked trying to acquire the waiter's 2339325132Savg * lock, which occurs prior to calling dmu_tx_commit() 2340325132Savg */ 2341325132Savg mutex_exit(&zcw->zcw_lock); 2342325132Savg zil_commit_writer_stall(zilog); 2343325132Savg mutex_enter(&zcw->zcw_lock); 2344168404Spjd } 2345168404Spjd 2346325132Savgout: 2347329485Smav mutex_exit(&zilog->zl_issuer_lock); 2348325132Savg ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2349325132Savg} 2350168404Spjd 2351325132Savg/* 2352325132Savg * This function is responsible for performing the following two tasks: 2353325132Savg * 2354325132Savg * 1. its primary responsibility is to block until the given "commit 2355325132Savg * waiter" is considered "done". 2356325132Savg * 2357325132Savg * 2. its secondary responsibility is to issue the zio for the lwb that 2358325132Savg * the given "commit waiter" is waiting on, if this function has 2359325132Savg * waited "long enough" and the lwb is still in the "open" state. 2360325132Savg * 2361325132Savg * Given a sufficient amount of itxs being generated and written using 2362325132Savg * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() 2363325132Savg * function. If this does not occur, this secondary responsibility will 2364325132Savg * ensure the lwb is issued even if there is not other synchronous 2365325132Savg * activity on the system. 2366325132Savg * 2367325132Savg * For more details, see zil_process_commit_list(); more specifically, 2368325132Savg * the comment at the bottom of that function. 2369325132Savg */ 2370325132Savgstatic void 2371325132Savgzil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) 2372325132Savg{ 2373325132Savg ASSERT(!MUTEX_HELD(&zilog->zl_lock)); 2374329485Smav ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); 2375325132Savg ASSERT(spa_writeable(zilog->zl_spa)); 2376168404Spjd 2377325132Savg mutex_enter(&zcw->zcw_lock); 2378325132Savg 2379219089Spjd /* 2380325132Savg * The timeout is scaled based on the lwb latency to avoid 2381325132Savg * significantly impacting the latency of each individual itx. 2382325132Savg * For more details, see the comment at the bottom of the 2383325132Savg * zil_process_commit_list() function. 2384219089Spjd */ 2385325132Savg int pct = MAX(zfs_commit_timeout_pct, 1); 2386325132Savg#if defined(illumos) || !defined(_KERNEL) 2387325132Savg hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; 2388325132Savg hrtime_t wakeup = gethrtime() + sleep; 2389325132Savg#else 2390325132Savg sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100); 2391325132Savg sbintime_t wakeup = getsbinuptime() + sleep; 2392325132Savg#endif 2393325132Savg boolean_t timedout = B_FALSE; 2394325132Savg 2395325132Savg while (!zcw->zcw_done) { 2396325132Savg ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2397325132Savg 2398325132Savg lwb_t *lwb = zcw->zcw_lwb; 2399325132Savg 2400325132Savg /* 2401325132Savg * Usually, the waiter will have a non-NULL lwb field here, 2402325132Savg * but it's possible for it to be NULL as a result of 2403325132Savg * zil_commit() racing with spa_sync(). 2404325132Savg * 2405325132Savg * When zil_clean() is called, it's possible for the itxg 2406325132Savg * list (which may be cleaned via a taskq) to contain 2407325132Savg * commit itxs. When this occurs, the commit waiters linked 2408325132Savg * off of these commit itxs will not be committed to an 2409325132Savg * lwb. 
Additionally, these commit waiters will not be 2410325132Savg * marked done until zil_commit_waiter_skip() is called via 2411325132Savg * zil_itxg_clean(). 2412325132Savg * 2413325132Savg * Thus, it's possible for this commit waiter (i.e. the 2414325132Savg * "zcw" variable) to be found in this "in between" state; 2415325132Savg * where it's "zcw_lwb" field is NULL, and it hasn't yet 2416325132Savg * been skipped, so it's "zcw_done" field is still B_FALSE. 2417325132Savg */ 2418325132Savg IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); 2419325132Savg 2420325132Savg if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { 2421325132Savg ASSERT3B(timedout, ==, B_FALSE); 2422325132Savg 2423325132Savg /* 2424325132Savg * If the lwb hasn't been issued yet, then we 2425325132Savg * need to wait with a timeout, in case this 2426325132Savg * function needs to issue the lwb after the 2427325132Savg * timeout is reached; responsibility (2) from 2428325132Savg * the comment above this function. 2429325132Savg */ 2430325132Savg#if defined(illumos) || !defined(_KERNEL) 2431325132Savg clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, 2432325132Savg &zcw->zcw_lock, wakeup, USEC2NSEC(1), 2433325132Savg CALLOUT_FLAG_ABSOLUTE); 2434325132Savg 2435325132Savg if (timeleft >= 0 || zcw->zcw_done) 2436325132Savg continue; 2437325132Savg#else 2438325132Savg int wait_err = cv_timedwait_sbt(&zcw->zcw_cv, 2439325132Savg &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE); 2440325132Savg if (wait_err != EWOULDBLOCK || zcw->zcw_done) 2441325132Savg continue; 2442325132Savg#endif 2443325132Savg 2444325132Savg timedout = B_TRUE; 2445325132Savg zil_commit_waiter_timeout(zilog, zcw); 2446325132Savg 2447325132Savg if (!zcw->zcw_done) { 2448325132Savg /* 2449325132Savg * If the commit waiter has already been 2450325132Savg * marked "done", it's possible for the 2451325132Savg * waiter's lwb structure to have already 2452325132Savg * been freed. Thus, we can only reliably 2453325132Savg * make these assertions if the waiter 2454325132Savg * isn't done. 2455325132Savg */ 2456325132Savg ASSERT3P(lwb, ==, zcw->zcw_lwb); 2457325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); 2458325132Savg } 2459325132Savg } else { 2460325132Savg /* 2461325132Savg * If the lwb isn't open, then it must have already 2462325132Savg * been issued. In that case, there's no need to 2463325132Savg * use a timeout when waiting for the lwb to 2464325132Savg * complete. 2465325132Savg * 2466325132Savg * Additionally, if the lwb is NULL, the waiter 2467325132Savg * will soon be signalled and marked done via 2468325132Savg * zil_clean() and zil_itxg_clean(), so no timeout 2469325132Savg * is required. 
2470325132Savg */ 2471325132Savg 2472325132Savg IMPLY(lwb != NULL, 2473325132Savg lwb->lwb_state == LWB_STATE_ISSUED || 2474325132Savg lwb->lwb_state == LWB_STATE_DONE); 2475325132Savg cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); 2476325132Savg } 2477325132Savg } 2478325132Savg 2479325132Savg mutex_exit(&zcw->zcw_lock); 2480168404Spjd} 2481168404Spjd 2482325132Savgstatic zil_commit_waiter_t * 2483325132Savgzil_alloc_commit_waiter() 2484325132Savg{ 2485325132Savg zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); 2486325132Savg 2487325132Savg cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); 2488325132Savg mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); 2489325132Savg list_link_init(&zcw->zcw_node); 2490325132Savg zcw->zcw_lwb = NULL; 2491325132Savg zcw->zcw_done = B_FALSE; 2492325132Savg zcw->zcw_zio_error = 0; 2493325132Savg 2494325132Savg return (zcw); 2495325132Savg} 2496325132Savg 2497325132Savgstatic void 2498325132Savgzil_free_commit_waiter(zil_commit_waiter_t *zcw) 2499325132Savg{ 2500325132Savg ASSERT(!list_link_active(&zcw->zcw_node)); 2501325132Savg ASSERT3P(zcw->zcw_lwb, ==, NULL); 2502325132Savg ASSERT3B(zcw->zcw_done, ==, B_TRUE); 2503325132Savg mutex_destroy(&zcw->zcw_lock); 2504325132Savg cv_destroy(&zcw->zcw_cv); 2505325132Savg kmem_cache_free(zil_zcw_cache, zcw); 2506325132Savg} 2507325132Savg 2508168404Spjd/* 2509325132Savg * This function is used to create a TX_COMMIT itx and assign it. This 2510325132Savg * way, it will be linked into the ZIL's list of synchronous itxs, and 2511325132Savg * then later committed to an lwb (or skipped) when 2512325132Savg * zil_process_commit_list() is called. 2513325132Savg */ 2514325132Savgstatic void 2515325132Savgzil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) 2516325132Savg{ 2517325132Savg dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); 2518325132Savg VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 2519325132Savg 2520325132Savg itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); 2521325132Savg itx->itx_sync = B_TRUE; 2522325132Savg itx->itx_private = zcw; 2523325132Savg 2524325132Savg zil_itx_assign(zilog, itx, tx); 2525325132Savg 2526325132Savg dmu_tx_commit(tx); 2527325132Savg} 2528325132Savg 2529325132Savg/* 2530325132Savg * Commit ZFS Intent Log transactions (itxs) to stable storage. 2531219089Spjd * 2532325132Savg * When writing ZIL transactions to the on-disk representation of the 2533325132Savg * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple 2534325132Savg * itxs can be committed to a single lwb. Once a lwb is written and 2535325132Savg * committed to stable storage (i.e. the lwb is written, and vdevs have 2536325132Savg * been flushed), each itx that was committed to that lwb is also 2537325132Savg * considered to be committed to stable storage. 2538219089Spjd * 2539325132Savg * When an itx is committed to an lwb, the log record (lr_t) contained 2540325132Savg * by the itx is copied into the lwb's zio buffer, and once this buffer 2541325132Savg * is written to disk, it becomes an on-disk ZIL block. 2542219089Spjd * 2543325132Savg * As itxs are generated, they're inserted into the ZIL's queue of 2544325132Savg * uncommitted itxs. The semantics of zil_commit() are such that it will 2545325132Savg * block until all itxs that were in the queue when it was called, are 2546325132Savg * committed to stable storage. 
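 * For example, if thread A queues an itx and then calls zil_commit()
 * while thread B queues another itx concurrently, A's call is only
 * obligated to persist the itxs that were already in the queue when it
 * was invoked; whether B's itx is also covered depends on timing.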
2547219089Spjd *
2548325132Savg * If "foid" is zero, this means all "synchronous" and "asynchronous"
2549325132Savg * itxs, for all objects in the dataset, will be committed to stable
2550325132Savg * storage prior to zil_commit() returning. If "foid" is non-zero, all
2551325132Savg * "synchronous" itxs for all objects, but only "asynchronous" itxs
2552325132Savg * that correspond to the foid passed in, will be committed to stable
2553325132Savg * storage prior to zil_commit() returning.
2554325132Savg *
2555325132Savg * Generally speaking, when zil_commit() is called, the consumer doesn't
2556325132Savg * actually care about _all_ of the uncommitted itxs. Instead, they're
2557325132Savg * simply trying to wait for a specific itx to be committed to disk,
2558325132Savg * but the interface(s) for interacting with the ZIL don't allow such
2559325132Savg * fine-grained communication. A better interface would allow a consumer
2560325132Savg * to create and assign an itx, and then pass a reference to this itx to
2561325132Savg * zil_commit(); such that zil_commit() would return as soon as that
2562325132Savg * specific itx was committed to disk (instead of waiting for _all_
2563325132Savg * itxs to be committed).
2564325132Savg *
2565325132Savg * When a thread calls zil_commit(), a special "commit itx" will be
2566325132Savg * generated, along with a corresponding "waiter" for this commit itx.
2567325132Savg * zil_commit() will wait on this waiter's CV, such that when the waiter
2568325132Savg * is marked done, and signalled, zil_commit() will return.
2569325132Savg *
2570325132Savg * This commit itx is inserted into the queue of uncommitted itxs. This
2571325132Savg * provides an easy mechanism for determining which itxs were in the
2572325132Savg * queue prior to zil_commit() having been called, and which itxs were
2573325132Savg * added after zil_commit() was called.
2574325132Savg *
2575325132Savg * The commit itx is special; it doesn't have any on-disk representation.
2576325132Savg * When a commit itx is "committed" to an lwb, the waiter associated
2577325132Savg * with it is linked onto the lwb's list of waiters. Then, when that lwb
2578325132Savg * completes, each waiter on the lwb's list is marked done and signalled
2579325132Savg * -- allowing the thread waiting on the waiter to return from zil_commit().
2580325132Savg *
2581325132Savg * It's important to point out a few critical factors that allow us
2582325132Savg * to make use of the commit itxs, commit waiters, per-lwb lists of
2583325132Savg * commit waiters, and zio completion callbacks like we're doing:
2584325132Savg *
2585325132Savg * 1. The list of waiters for each lwb is traversed, and each commit
2586325132Savg * waiter is marked "done" and signalled, in the zio completion
2587325132Savg * callback of the lwb's zio[*].
2588325132Savg *
2589325132Savg * * Actually, the waiters are signalled in the zio completion
2590325132Savg * callback of the root zio for the DKIOCFLUSHWRITECACHE commands
2591325132Savg * that are sent to the vdevs upon completion of the lwb zio.
2592325132Savg *
2593325132Savg * 2. When the itxs are inserted into the ZIL's queue of uncommitted
2594325132Savg * itxs, the order in which they are inserted is preserved[*]; as
2595325132Savg * itxs are added to the queue, they are added to the tail of
2596325132Savg * in-memory linked lists.
2597325132Savg *
2598325132Savg * When committing the itxs to lwbs (to be written to disk), they
2599325132Savg * are committed in the same order in which the itxs were added to
2600325132Savg * the uncommitted queue's linked list(s); i.e. the linked list of
2601325132Savg * itxs to commit is traversed from head to tail, and each itx is
2602325132Savg * committed to an lwb in that order.
2603325132Savg *
2604325132Savg * * To clarify:
2605325132Savg *
2606325132Savg * - the order of "sync" itxs is preserved w.r.t. other
2607325132Savg * "sync" itxs, regardless of the corresponding objects.
2608325132Savg * - the order of "async" itxs is preserved w.r.t. other
2609325132Savg * "async" itxs corresponding to the same object.
2610325132Savg * - the order of "async" itxs is *not* preserved w.r.t. other
2611325132Savg * "async" itxs corresponding to different objects.
2612325132Savg * - the order of "sync" itxs w.r.t. "async" itxs (or vice
2613325132Savg * versa) is *not* preserved, even for itxs that correspond
2614325132Savg * to the same object.
2615325132Savg *
2616325132Savg * For more details, see: zil_itx_assign(), zil_async_to_sync(),
2617325132Savg * zil_get_commit_list(), and zil_process_commit_list().
2618325132Savg *
2619325132Savg * 3. The lwbs represent a linked list of blocks on disk. Thus, any
2620325132Savg * lwb cannot be considered committed to stable storage, until its
2621325132Savg * "previous" lwb is also committed to stable storage. This fact,
2622325132Savg * coupled with the fact described above, means that itxs are
2623325132Savg * committed in (roughly) the order in which they were generated.
2624325132Savg * This is essential because itxs are dependent on prior itxs.
2625325132Savg * Thus, we *must not* deem an itx as being committed to stable
2626325132Savg * storage, until *all* prior itxs have also been committed to
2627325132Savg * stable storage.
2628325132Savg *
2629325132Savg * To enforce this ordering of lwb zio's, while still leveraging as
2630325132Savg * much of the underlying storage performance as possible, we rely
2631325132Savg * on two fundamental concepts:
2632325132Savg *
2633325132Savg * 1. The creation and issuance of lwb zio's is protected by
2634329485Smav * the zilog's "zl_issuer_lock", which ensures only a single
2635325132Savg * thread is creating and/or issuing lwb's at a time
2636325132Savg * 2. The "previous" lwb is a child of the "current" lwb
2637325132Savg * (leveraging the zio parent-child dependency graph)
2638325132Savg *
2639325132Savg * By relying on this parent-child zio relationship, we can have
2640325132Savg * many lwb zio's concurrently issued to the underlying storage,
2641325132Savg * but the order in which they complete will be the same order in
2642325132Savg * which they were created.
2643168404Spjd */
2644168404Spjdvoid
2645219089Spjdzil_commit(zilog_t *zilog, uint64_t foid)
2646168404Spjd{
2647325132Savg /*
2648325132Savg * We should never attempt to call zil_commit on a snapshot for
2649325132Savg * a couple of reasons:
2650325132Savg *
2651325132Savg * 1. A snapshot may never be modified, thus it cannot have any
2652325132Savg * in-flight itxs that would have modified the dataset.
2653325132Savg *
2654325132Savg * 2. By design, when zil_commit() is called, a commit itx will
2655325132Savg * be assigned to this zilog; as a result, the zilog will be
2656325132Savg * dirtied. We must not dirty the zilog of a snapshot; there are
2657325132Savg * checks in the code that enforce this invariant, and will
2658325132Savg * cause a panic if it's not upheld.
2659325132Savg */
2660325132Savg ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
2661219089Spjd
2662219089Spjd if (zilog->zl_sync == ZFS_SYNC_DISABLED)
2663168404Spjd return;
2664168404Spjd
2665325132Savg if (!spa_writeable(zilog->zl_spa)) {
2666325132Savg /*
2667325132Savg * If the SPA is not writable, there should never be any
2668325132Savg * pending itxs waiting to be committed to disk. If that
2669325132Savg * weren't true, we'd skip writing those itxs out, and
2670325132Savg * would break the semantics of zil_commit(); thus, we're
2671325132Savg * verifying that truth before we return to the caller.
2672325132Savg */
2673325132Savg ASSERT(list_is_empty(&zilog->zl_lwb_list));
2674325132Savg ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
2675325132Savg for (int i = 0; i < TXG_SIZE; i++)
2676325132Savg ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
2677325132Savg return;
2678325132Savg }
2679219089Spjd
2680325132Savg /*
2681325132Savg * If the ZIL is suspended, we don't want to dirty it by calling
2682325132Savg * zil_commit_itx_assign() below, nor can we write out
2683325132Savg * lwbs like would be done in zil_commit_write(). Thus, we
2684325132Savg * simply rely on txg_wait_synced() to maintain the necessary
2685325132Savg * semantics, and avoid calling those functions altogether.
2686325132Savg */
2687325132Savg if (zilog->zl_suspend > 0) {
2688325132Savg txg_wait_synced(zilog->zl_dmu_pool, 0);
2689325132Savg return;
2690168404Spjd }
2691219089Spjd
2692329486Smav zil_commit_impl(zilog, foid);
2693329486Smav}
2694329486Smav
2695329486Smavvoid
2696329486Smavzil_commit_impl(zilog_t *zilog, uint64_t foid)
2697329486Smav{
2698325132Savg /*
2699325132Savg * Move the "async" itxs for the specified foid to the "sync"
2700325132Savg * queues, such that they will be later committed (or skipped)
2701325132Savg * to an lwb when zil_process_commit_list() is called.
2702325132Savg *
2703325132Savg * Since these "async" itxs must be committed prior to this
2704325132Savg * call to zil_commit returning, we must perform this operation
2705325132Savg * before we call zil_commit_itx_assign().
2706325132Savg */
2707325132Savg zil_async_to_sync(zilog, foid);
2708219089Spjd
2709325132Savg /*
2710325132Savg * We allocate a new "waiter" structure which will initially be
2711325132Savg * linked to the commit itx using the itx's "itx_private" field.
2712325132Savg * Since the commit itx doesn't represent any on-disk state,
2713325132Savg * when it's committed to an lwb, rather than copying its
2714325132Savg * lr_t into the lwb's buffer, the commit itx's "waiter" will be
2715325132Savg * added to the lwb's list of waiters. Then, when the lwb is
2716325132Savg * committed to stable storage, each waiter in the lwb's list of
2717325132Savg * waiters will be marked "done", and signalled.
2718325132Savg *
2719325132Savg * We must create the waiter and assign the commit itx prior to
2720325132Savg * calling zil_commit_writer(), or else our specific commit itx
2721325132Savg * is not guaranteed to be committed to an lwb prior to calling
2722325132Savg * zil_commit_waiter().
2723325132Savg */
2724325132Savg zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
2725325132Savg zil_commit_itx_assign(zilog, zcw);
2726219089Spjd
2727325132Savg zil_commit_writer(zilog, zcw);
2728325132Savg zil_commit_waiter(zilog, zcw);
2729325132Savg
2730325132Savg if (zcw->zcw_zio_error != 0) {
2731325132Savg /*
2732325132Savg * If there was an error writing out the ZIL blocks that
2733325132Savg * this thread is waiting on, then we fall back to
2734325132Savg * relying on spa_sync() to write out the data this
2735325132Savg * thread is waiting on. Obviously this has performance
2736325132Savg * implications, but the expectation is for this to be
2737325132Savg * an exceptional case, and shouldn't occur often.
2738325132Savg */
2739325132Savg DTRACE_PROBE2(zil__commit__io__error,
2740325132Savg zilog_t *, zilog, zil_commit_waiter_t *, zcw);
2741325132Savg txg_wait_synced(zilog->zl_dmu_pool, 0);
2742325132Savg }
2743325132Savg
2744325132Savg zil_free_commit_waiter(zcw);
2745168404Spjd}
2746168404Spjd
2747168404Spjd/*
2748168404Spjd * Called in syncing context to free committed log blocks and update log header.
2749168404Spjd */
2750168404Spjdvoid
2751168404Spjdzil_sync(zilog_t *zilog, dmu_tx_t *tx)
2752168404Spjd{
2753168404Spjd zil_header_t *zh = zil_header_in_syncing_context(zilog);
2754168404Spjd uint64_t txg = dmu_tx_get_txg(tx);
2755168404Spjd spa_t *spa = zilog->zl_spa;
2756219089Spjd uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
2757168404Spjd lwb_t *lwb;
2758168404Spjd
2759209962Smm /*
2760209962Smm * We don't zero out zl_destroy_txg, so make sure we don't try
2761209962Smm * to destroy it twice.
2762209962Smm */
2763209962Smm if (spa_sync_pass(spa) != 1)
2764209962Smm return;
2765209962Smm
2766168404Spjd mutex_enter(&zilog->zl_lock);
2767168404Spjd
2768168404Spjd ASSERT(zilog->zl_stop_sync == 0);
2769168404Spjd
2770219089Spjd if (*replayed_seq != 0) {
2771219089Spjd ASSERT(zh->zh_replay_seq < *replayed_seq);
2772219089Spjd zh->zh_replay_seq = *replayed_seq;
2773219089Spjd *replayed_seq = 0;
2774219089Spjd }
2775168404Spjd
2776168404Spjd if (zilog->zl_destroy_txg == txg) {
2777168404Spjd blkptr_t blk = zh->zh_log;
2778168404Spjd
2779168404Spjd ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
2780168404Spjd
2781168404Spjd bzero(zh, sizeof (zil_header_t));
2782209962Smm bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
2783168404Spjd
2784168404Spjd if (zilog->zl_keep_first) {
2785168404Spjd /*
2786168404Spjd * If this block was part of a log chain that couldn't
2787168404Spjd * be claimed because a device was missing during
2788168404Spjd * zil_claim(), but that device later returns,
2789168404Spjd * then this block could erroneously appear valid.
2790168404Spjd * To guard against this, assign a new GUID to the new
2791168404Spjd * log chain so it doesn't matter what blk points to.
2792168404Spjd */ 2793168404Spjd zil_init_log_chain(zilog, &blk); 2794168404Spjd zh->zh_log = blk; 2795168404Spjd } 2796168404Spjd } 2797168404Spjd 2798213197Smm while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 2799168404Spjd zh->zh_log = lwb->lwb_blk; 2800168404Spjd if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 2801168404Spjd break; 2802168404Spjd list_remove(&zilog->zl_lwb_list, lwb); 2803325132Savg zio_free(spa, txg, &lwb->lwb_blk); 2804325132Savg zil_free_lwb(zilog, lwb); 2805168404Spjd 2806168404Spjd /* 2807168404Spjd * If we don't have anything left in the lwb list then 2808168404Spjd * we've had an allocation failure and we need to zero 2809168404Spjd * out the zil_header blkptr so that we don't end 2810168404Spjd * up freeing the same block twice. 2811168404Spjd */ 2812168404Spjd if (list_head(&zilog->zl_lwb_list) == NULL) 2813168404Spjd BP_ZERO(&zh->zh_log); 2814168404Spjd } 2815168404Spjd mutex_exit(&zilog->zl_lock); 2816168404Spjd} 2817168404Spjd 2818325132Savg/* ARGSUSED */ 2819325132Savgstatic int 2820325132Savgzil_lwb_cons(void *vbuf, void *unused, int kmflag) 2821325132Savg{ 2822325132Savg lwb_t *lwb = vbuf; 2823325132Savg list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), 2824325132Savg offsetof(zil_commit_waiter_t, zcw_node)); 2825325132Savg avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, 2826325132Savg sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 2827325132Savg mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 2828325132Savg return (0); 2829325132Savg} 2830325132Savg 2831325132Savg/* ARGSUSED */ 2832325132Savgstatic void 2833325132Savgzil_lwb_dest(void *vbuf, void *unused) 2834325132Savg{ 2835325132Savg lwb_t *lwb = vbuf; 2836325132Savg mutex_destroy(&lwb->lwb_vdev_lock); 2837325132Savg avl_destroy(&lwb->lwb_vdev_tree); 2838325132Savg list_destroy(&lwb->lwb_waiters); 2839325132Savg} 2840325132Savg 2841168404Spjdvoid 2842168404Spjdzil_init(void) 2843168404Spjd{ 2844168404Spjd zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 2845325132Savg sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); 2846325132Savg 2847325132Savg zil_zcw_cache = kmem_cache_create("zil_zcw_cache", 2848325132Savg sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 2849168404Spjd} 2850168404Spjd 2851168404Spjdvoid 2852168404Spjdzil_fini(void) 2853168404Spjd{ 2854325132Savg kmem_cache_destroy(zil_zcw_cache); 2855168404Spjd kmem_cache_destroy(zil_lwb_cache); 2856168404Spjd} 2857168404Spjd 2858219089Spjdvoid 2859219089Spjdzil_set_sync(zilog_t *zilog, uint64_t sync) 2860219089Spjd{ 2861219089Spjd zilog->zl_sync = sync; 2862219089Spjd} 2863219089Spjd 2864219089Spjdvoid 2865219089Spjdzil_set_logbias(zilog_t *zilog, uint64_t logbias) 2866219089Spjd{ 2867219089Spjd zilog->zl_logbias = logbias; 2868219089Spjd} 2869219089Spjd 2870168404Spjdzilog_t * 2871168404Spjdzil_alloc(objset_t *os, zil_header_t *zh_phys) 2872168404Spjd{ 2873168404Spjd zilog_t *zilog; 2874168404Spjd 2875168404Spjd zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 2876168404Spjd 2877168404Spjd zilog->zl_header = zh_phys; 2878168404Spjd zilog->zl_os = os; 2879168404Spjd zilog->zl_spa = dmu_objset_spa(os); 2880168404Spjd zilog->zl_dmu_pool = dmu_objset_pool(os); 2881168404Spjd zilog->zl_destroy_txg = TXG_INITIAL - 1; 2882219089Spjd zilog->zl_logbias = dmu_objset_logbias(os); 2883219089Spjd zilog->zl_sync = dmu_objset_syncprop(os); 2884325132Savg zilog->zl_dirty_max_txg = 0; 2885325132Savg zilog->zl_last_lwb_opened = NULL; 2886325132Savg 
zilog->zl_last_lwb_latency = 0; 2887168404Spjd 2888168404Spjd mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 2889329485Smav mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); 2890168404Spjd 2891219089Spjd for (int i = 0; i < TXG_SIZE; i++) { 2892219089Spjd mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, 2893219089Spjd MUTEX_DEFAULT, NULL); 2894219089Spjd } 2895168404Spjd 2896168404Spjd list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 2897168404Spjd offsetof(lwb_t, lwb_node)); 2898168404Spjd 2899219089Spjd list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), 2900219089Spjd offsetof(itx_t, itx_node)); 2901219089Spjd 2902185029Spjd cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 2903185029Spjd 2904168404Spjd return (zilog); 2905168404Spjd} 2906168404Spjd 2907168404Spjdvoid 2908168404Spjdzil_free(zilog_t *zilog) 2909168404Spjd{ 2910168404Spjd zilog->zl_stop_sync = 1; 2911168404Spjd 2912248571Smm ASSERT0(zilog->zl_suspend); 2913248571Smm ASSERT0(zilog->zl_suspending); 2914248571Smm 2915224526Smm ASSERT(list_is_empty(&zilog->zl_lwb_list)); 2916168404Spjd list_destroy(&zilog->zl_lwb_list); 2917168404Spjd 2918219089Spjd ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); 2919219089Spjd list_destroy(&zilog->zl_itx_commit_list); 2920219089Spjd 2921219089Spjd for (int i = 0; i < TXG_SIZE; i++) { 2922219089Spjd /* 2923219089Spjd * It's possible for an itx to be generated that doesn't dirty 2924219089Spjd * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() 2925219089Spjd * callback to remove the entry. We remove those here. 2926219089Spjd * 2927219089Spjd * Also free up the ziltest itxs. 2928219089Spjd */ 2929219089Spjd if (zilog->zl_itxg[i].itxg_itxs) 2930219089Spjd zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); 2931219089Spjd mutex_destroy(&zilog->zl_itxg[i].itxg_lock); 2932219089Spjd } 2933219089Spjd 2934329485Smav mutex_destroy(&zilog->zl_issuer_lock); 2935168404Spjd mutex_destroy(&zilog->zl_lock); 2936168404Spjd 2937185029Spjd cv_destroy(&zilog->zl_cv_suspend); 2938185029Spjd 2939168404Spjd kmem_free(zilog, sizeof (zilog_t)); 2940168404Spjd} 2941168404Spjd 2942168404Spjd/* 2943168404Spjd * Open an intent log. 2944168404Spjd */ 2945168404Spjdzilog_t * 2946168404Spjdzil_open(objset_t *os, zil_get_data_t *get_data) 2947168404Spjd{ 2948168404Spjd zilog_t *zilog = dmu_objset_zil(os); 2949168404Spjd 2950325132Savg ASSERT3P(zilog->zl_get_data, ==, NULL); 2951325132Savg ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); 2952224526Smm ASSERT(list_is_empty(&zilog->zl_lwb_list)); 2953224526Smm 2954168404Spjd zilog->zl_get_data = get_data; 2955168404Spjd 2956168404Spjd return (zilog); 2957168404Spjd} 2958168404Spjd 2959168404Spjd/* 2960168404Spjd * Close an intent log. 
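 *
 * For anything other than a snapshot, this commits whatever itxs are
 * still pending, waits for the pool to sync through the last lwb's txg,
 * and then removes and frees the single lwb that may remain on
 * zl_lwb_list; by that point the caller is expected to have stopped
 * generating new itxs for this zilog.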
2961168404Spjd */ 2962168404Spjdvoid 2963168404Spjdzil_close(zilog_t *zilog) 2964168404Spjd{ 2965224526Smm lwb_t *lwb; 2966325132Savg uint64_t txg; 2967219089Spjd 2968325132Savg if (!dmu_objset_is_snapshot(zilog->zl_os)) { 2969325132Savg zil_commit(zilog, 0); 2970325132Savg } else { 2971325132Savg ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); 2972325132Savg ASSERT0(zilog->zl_dirty_max_txg); 2973325132Savg ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); 2974325132Savg } 2975219089Spjd 2976219089Spjd mutex_enter(&zilog->zl_lock); 2977224526Smm lwb = list_tail(&zilog->zl_lwb_list); 2978325132Savg if (lwb == NULL) 2979325132Savg txg = zilog->zl_dirty_max_txg; 2980325132Savg else 2981325132Savg txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); 2982219089Spjd mutex_exit(&zilog->zl_lock); 2983325132Savg 2984325132Savg /* 2985325132Savg * We need to use txg_wait_synced() to wait long enough for the 2986325132Savg * ZIL to be clean, and to wait for all pending lwbs to be 2987325132Savg * written out. 2988325132Savg */ 2989325132Savg if (txg != 0) 2990168404Spjd txg_wait_synced(zilog->zl_dmu_pool, txg); 2991168404Spjd 2992310515Savg if (zilog_is_dirty(zilog)) 2993310515Savg zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); 2994310515Savg VERIFY(!zilog_is_dirty(zilog)); 2995310515Savg 2996168404Spjd zilog->zl_get_data = NULL; 2997224526Smm 2998224526Smm /* 2999325132Savg * We should have only one lwb left on the list; remove it now. 3000224526Smm */ 3001224526Smm mutex_enter(&zilog->zl_lock); 3002224526Smm lwb = list_head(&zilog->zl_lwb_list); 3003224526Smm if (lwb != NULL) { 3004325132Savg ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); 3005325132Savg ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 3006224526Smm list_remove(&zilog->zl_lwb_list, lwb); 3007224526Smm zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 3008325132Savg zil_free_lwb(zilog, lwb); 3009224526Smm } 3010224526Smm mutex_exit(&zilog->zl_lock); 3011168404Spjd} 3012168404Spjd 3013248571Smmstatic char *suspend_tag = "zil suspending"; 3014248571Smm 3015168404Spjd/* 3016168404Spjd * Suspend an intent log. While in suspended mode, we still honor 3017168404Spjd * synchronous semantics, but we rely on txg_wait_synced() to do it. 3018248571Smm * On old version pools, we suspend the log briefly when taking a 3019248571Smm * snapshot so that it will have an empty intent log. 3020248571Smm * 3021248571Smm * Long holds are not really intended to be used the way we do here -- 3022248571Smm * held for such a short time. A concurrent caller of dsl_dataset_long_held() 3023248571Smm * could fail. Therefore we take pains to only put a long hold if it is 3024248571Smm * actually necessary. Fortunately, it will only be necessary if the 3025248571Smm * objset is currently mounted (or the ZVOL equivalent). In that case it 3026248571Smm * will already have a long hold, so we are not really making things any worse. 3027248571Smm * 3028248571Smm * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or 3029248571Smm * zvol_state_t), and use their mechanism to prevent their hold from being 3030248571Smm * dropped (e.g. VFS_HOLD()). However, that would be even more pain for 3031248571Smm * very little gain. 3032248571Smm * 3033248571Smm * if cookiep == NULL, this does both the suspend & resume. 3034248571Smm * Otherwise, it returns with the dataset "long held", and the cookie 3035248571Smm * should be passed into zil_resume(). 
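 *
 * As a purely illustrative sketch (not taken from any particular
 * caller), the cookie-based usage is roughly:
 *
 *	void *cookie;
 *	error = zil_suspend(osname, &cookie);
 *	if (error == 0) {
 *		... work that requires the ZIL to remain empty ...
 *		zil_resume(cookie);
 *	}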
3036168404Spjd */ 3037168404Spjdint 3038248571Smmzil_suspend(const char *osname, void **cookiep) 3039168404Spjd{ 3040248571Smm objset_t *os; 3041248571Smm zilog_t *zilog; 3042248571Smm const zil_header_t *zh; 3043248571Smm int error; 3044168404Spjd 3045248571Smm error = dmu_objset_hold(osname, suspend_tag, &os); 3046248571Smm if (error != 0) 3047248571Smm return (error); 3048248571Smm zilog = dmu_objset_zil(os); 3049248571Smm 3050168404Spjd mutex_enter(&zilog->zl_lock); 3051248571Smm zh = zilog->zl_header; 3052248571Smm 3053200724Sdelphij if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ 3054168404Spjd mutex_exit(&zilog->zl_lock); 3055248571Smm dmu_objset_rele(os, suspend_tag); 3056249195Smm return (SET_ERROR(EBUSY)); 3057168404Spjd } 3058248571Smm 3059248571Smm /* 3060248571Smm * Don't put a long hold in the cases where we can avoid it. This 3061248571Smm * is when there is no cookie so we are doing a suspend & resume 3062248571Smm * (i.e. called from zil_vdev_offline()), and there's nothing to do 3063248571Smm * for the suspend because it's already suspended, or there's no ZIL. 3064248571Smm */ 3065248571Smm if (cookiep == NULL && !zilog->zl_suspending && 3066248571Smm (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { 3067248571Smm mutex_exit(&zilog->zl_lock); 3068248571Smm dmu_objset_rele(os, suspend_tag); 3069248571Smm return (0); 3070248571Smm } 3071248571Smm 3072248571Smm dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); 3073248571Smm dsl_pool_rele(dmu_objset_pool(os), suspend_tag); 3074248571Smm 3075248571Smm zilog->zl_suspend++; 3076248571Smm 3077248571Smm if (zilog->zl_suspend > 1) { 3078168404Spjd /* 3079248571Smm * Someone else is already suspending it. 3080168404Spjd * Just wait for them to finish. 3081168404Spjd */ 3082248571Smm 3083168404Spjd while (zilog->zl_suspending) 3084168404Spjd cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); 3085168404Spjd mutex_exit(&zilog->zl_lock); 3086248571Smm 3087248571Smm if (cookiep == NULL) 3088248571Smm zil_resume(os); 3089248571Smm else 3090248571Smm *cookiep = os; 3091168404Spjd return (0); 3092168404Spjd } 3093248571Smm 3094248571Smm /* 3095248571Smm * If there is no pointer to an on-disk block, this ZIL must not 3096248571Smm * be active (e.g. filesystem not mounted), so there's nothing 3097248571Smm * to clean up. 3098248571Smm */ 3099248571Smm if (BP_IS_HOLE(&zh->zh_log)) { 3100248571Smm ASSERT(cookiep != NULL); /* fast path already handled */ 3101248571Smm 3102248571Smm *cookiep = os; 3103248571Smm mutex_exit(&zilog->zl_lock); 3104248571Smm return (0); 3105248571Smm } 3106248571Smm 3107168404Spjd zilog->zl_suspending = B_TRUE; 3108168404Spjd mutex_exit(&zilog->zl_lock); 3109168404Spjd 3110329486Smav /* 3111329486Smav * We need to use zil_commit_impl to ensure we wait for all 3112329486Smav * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed 3113329486Smav * to disk before proceeding. If we used zil_commit instead, it 3114329486Smav * would just call txg_wait_synced(), because zl_suspend is set. 3115329486Smav * txg_wait_synced() doesn't wait for these lwb's to be 3116329486Smav * LWB_STATE_DONE before returning. 3117329486Smav */ 3118329486Smav zil_commit_impl(zilog, 0); 3119168404Spjd 3120329486Smav /* 3121329486Smav * Now that we've ensured all lwb's are LWB_STATE_DONE, we use 3122329486Smav * txg_wait_synced() to ensure the data from the zilog has 3123329486Smav * migrated to the main pool before calling zil_destroy(). 
3124329486Smav */ 3125329486Smav txg_wait_synced(zilog->zl_dmu_pool, 0); 3126329486Smav 3127168404Spjd zil_destroy(zilog, B_FALSE); 3128168404Spjd 3129168404Spjd mutex_enter(&zilog->zl_lock); 3130168404Spjd zilog->zl_suspending = B_FALSE; 3131168404Spjd cv_broadcast(&zilog->zl_cv_suspend); 3132168404Spjd mutex_exit(&zilog->zl_lock); 3133168404Spjd 3134248571Smm if (cookiep == NULL) 3135248571Smm zil_resume(os); 3136248571Smm else 3137248571Smm *cookiep = os; 3138168404Spjd return (0); 3139168404Spjd} 3140168404Spjd 3141168404Spjdvoid 3142248571Smmzil_resume(void *cookie) 3143168404Spjd{ 3144248571Smm objset_t *os = cookie; 3145248571Smm zilog_t *zilog = dmu_objset_zil(os); 3146248571Smm 3147168404Spjd mutex_enter(&zilog->zl_lock); 3148168404Spjd ASSERT(zilog->zl_suspend != 0); 3149168404Spjd zilog->zl_suspend--; 3150168404Spjd mutex_exit(&zilog->zl_lock); 3151248571Smm dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); 3152248571Smm dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); 3153168404Spjd} 3154168404Spjd 3155219089Spjdtypedef struct zil_replay_arg { 3156219089Spjd zil_replay_func_t **zr_replay; 3157219089Spjd void *zr_arg; 3158219089Spjd boolean_t zr_byteswap; 3159219089Spjd char *zr_lr; 3160219089Spjd} zil_replay_arg_t; 3161219089Spjd 3162219089Spjdstatic int 3163219089Spjdzil_replay_error(zilog_t *zilog, lr_t *lr, int error) 3164209962Smm{ 3165307108Smav char name[ZFS_MAX_DATASET_NAME_LEN]; 3166209962Smm 3167219089Spjd zilog->zl_replaying_seq--; /* didn't actually replay this one */ 3168209962Smm 3169219089Spjd dmu_objset_name(zilog->zl_os, name); 3170209962Smm 3171219089Spjd cmn_err(CE_WARN, "ZFS replay transaction error %d, " 3172219089Spjd "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, 3173219089Spjd (u_longlong_t)lr->lrc_seq, 3174219089Spjd (u_longlong_t)(lr->lrc_txtype & ~TX_CI), 3175219089Spjd (lr->lrc_txtype & TX_CI) ? "CI" : ""); 3176219089Spjd 3177219089Spjd return (error); 3178209962Smm} 3179209962Smm 3180219089Spjdstatic int 3181168404Spjdzil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 3182168404Spjd{ 3183168404Spjd zil_replay_arg_t *zr = zra; 3184168404Spjd const zil_header_t *zh = zilog->zl_header; 3185168404Spjd uint64_t reclen = lr->lrc_reclen; 3186168404Spjd uint64_t txtype = lr->lrc_txtype; 3187219089Spjd int error = 0; 3188168404Spjd 3189219089Spjd zilog->zl_replaying_seq = lr->lrc_seq; 3190168404Spjd 3191219089Spjd if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 3192219089Spjd return (0); 3193219089Spjd 3194168404Spjd if (lr->lrc_txg < claim_txg) /* already committed */ 3195219089Spjd return (0); 3196168404Spjd 3197185029Spjd /* Strip case-insensitive bit, still present in log record */ 3198185029Spjd txtype &= ~TX_CI; 3199185029Spjd 3200219089Spjd if (txtype == 0 || txtype >= TX_MAX_TYPE) 3201219089Spjd return (zil_replay_error(zilog, lr, EINVAL)); 3202219089Spjd 3203219089Spjd /* 3204219089Spjd * If this record type can be logged out of order, the object 3205219089Spjd * (lr_foid) may no longer exist. That's legitimate, not an error. 3206219089Spjd */ 3207219089Spjd if (TX_OOO(txtype)) { 3208219089Spjd error = dmu_object_info(zilog->zl_os, 3209219089Spjd ((lr_ooo_t *)lr)->lr_foid, NULL); 3210219089Spjd if (error == ENOENT || error == EEXIST) 3211219089Spjd return (0); 3212209962Smm } 3213209962Smm 3214168404Spjd /* 3215168404Spjd * Make a copy of the data so we can revise and extend it. 
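 *
 * (The copy lives in zr->zr_lr, which is sized large enough that a
 * TX_WRITE's payload can be appended to it by zil_read_log_data()
 * below, and it may be byteswapped in place; neither would be safe to
 * do directly on the log block's own buffer.)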
3216168404Spjd */ 3217219089Spjd bcopy(lr, zr->zr_lr, reclen); 3218168404Spjd 3219168404Spjd /* 3220219089Spjd * If this is a TX_WRITE with a blkptr, suck in the data. 3221219089Spjd */ 3222219089Spjd if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 3223219089Spjd error = zil_read_log_data(zilog, (lr_write_t *)lr, 3224219089Spjd zr->zr_lr + reclen); 3225248571Smm if (error != 0) 3226219089Spjd return (zil_replay_error(zilog, lr, error)); 3227219089Spjd } 3228219089Spjd 3229219089Spjd /* 3230168404Spjd * The log block containing this lr may have been byteswapped 3231168404Spjd * so that we can easily examine common fields like lrc_txtype. 3232219089Spjd * However, the log is a mix of different record types, and only the 3233168404Spjd * replay vectors know how to byteswap their records. Therefore, if 3234168404Spjd * the lr was byteswapped, undo it before invoking the replay vector. 3235168404Spjd */ 3236168404Spjd if (zr->zr_byteswap) 3237219089Spjd byteswap_uint64_array(zr->zr_lr, reclen); 3238168404Spjd 3239168404Spjd /* 3240168404Spjd * We must now do two things atomically: replay this log record, 3241209962Smm * and update the log header sequence number to reflect the fact that 3242209962Smm * we did so. At the end of each replay function the sequence number 3243209962Smm * is updated if we are in replay mode. 3244168404Spjd */ 3245219089Spjd error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); 3246248571Smm if (error != 0) { 3247168404Spjd /* 3248168404Spjd * The DMU's dnode layer doesn't see removes until the txg 3249168404Spjd * commits, so a subsequent claim can spuriously fail with 3250209962Smm * EEXIST. So if we receive any error we try syncing out 3251219089Spjd * any removes then retry the transaction. Note that we 3252219089Spjd * specify B_FALSE for byteswap now, so we don't do it twice. 3253168404Spjd */ 3254219089Spjd txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); 3255219089Spjd error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); 3256248571Smm if (error != 0) 3257219089Spjd return (zil_replay_error(zilog, lr, error)); 3258168404Spjd } 3259219089Spjd return (0); 3260168404Spjd} 3261168404Spjd 3262168404Spjd/* ARGSUSED */ 3263219089Spjdstatic int 3264168404Spjdzil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 3265168404Spjd{ 3266168404Spjd zilog->zl_replay_blks++; 3267219089Spjd 3268219089Spjd return (0); 3269168404Spjd} 3270168404Spjd 3271168404Spjd/* 3272168404Spjd * If this dataset has a non-empty intent log, replay it and destroy it. 3273168404Spjd */ 3274168404Spjdvoid 3275209962Smmzil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) 3276168404Spjd{ 3277168404Spjd zilog_t *zilog = dmu_objset_zil(os); 3278168404Spjd const zil_header_t *zh = zilog->zl_header; 3279168404Spjd zil_replay_arg_t zr; 3280168404Spjd 3281200724Sdelphij if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { 3282168404Spjd zil_destroy(zilog, B_TRUE); 3283168404Spjd return; 3284168404Spjd } 3285168404Spjd 3286168404Spjd zr.zr_replay = replay_func; 3287168404Spjd zr.zr_arg = arg; 3288168404Spjd zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); 3289219089Spjd zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); 3290168404Spjd 3291168404Spjd /* 3292168404Spjd * Wait for in-progress removes to sync before starting replay. 
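 *
 * (This is the same issue noted in zil_replay_log_record() above: the
 * DMU doesn't see removes until their txg commits, so replaying a
 * create could otherwise collide with an object still pending removal
 * and fail with EEXIST.)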
3293168404Spjd */ 3294168404Spjd txg_wait_synced(zilog->zl_dmu_pool, 0); 3295168404Spjd 3296209962Smm zilog->zl_replay = B_TRUE; 3297219089Spjd zilog->zl_replay_time = ddi_get_lbolt(); 3298168404Spjd ASSERT(zilog->zl_replay_blks == 0); 3299168404Spjd (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, 3300168404Spjd zh->zh_claim_txg); 3301219089Spjd kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); 3302168404Spjd 3303168404Spjd zil_destroy(zilog, B_FALSE); 3304185029Spjd txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 3305209962Smm zilog->zl_replay = B_FALSE; 3306168404Spjd} 3307168404Spjd 3308219089Spjdboolean_t 3309219089Spjdzil_replaying(zilog_t *zilog, dmu_tx_t *tx) 3310168404Spjd{ 3311219089Spjd if (zilog->zl_sync == ZFS_SYNC_DISABLED) 3312219089Spjd return (B_TRUE); 3313168404Spjd 3314219089Spjd if (zilog->zl_replay) { 3315219089Spjd dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 3316219089Spjd zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = 3317219089Spjd zilog->zl_replaying_seq; 3318219089Spjd return (B_TRUE); 3319168404Spjd } 3320168404Spjd 3321219089Spjd return (B_FALSE); 3322168404Spjd} 3323213197Smm 3324213197Smm/* ARGSUSED */ 3325213197Smmint 3326332525Smavzil_reset(const char *osname, void *arg) 3327213197Smm{ 3328213197Smm int error; 3329213197Smm 3330248571Smm error = zil_suspend(osname, NULL); 3331248571Smm if (error != 0) 3332249195Smm return (SET_ERROR(EEXIST)); 3333248571Smm return (0); 3334213197Smm} 3335