/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>

/*
 * The zfs intent log (ZIL) saves transaction records of system calls
 * that change the file system in memory with enough information
 * to be able to replay them. These are stored in memory until
 * either the DMU transaction group (txg) commits them to the stable pool
 * and they can be discarded, or they are flushed to the stable log
 * (also in the pool) due to a fsync, O_DSYNC or other synchronous
 * requirement. In the event of a panic or power failure, those log
 * records (transactions) are replayed.
 *
 * There is one ZIL per file system. Its on-disk (pool) format consists
 * of 3 parts:
 *
 *	- ZIL header
 *	- ZIL blocks
 *	- ZIL records
 *
 * A log record holds a system call transaction. Log blocks can
 * hold many log records and the blocks are chained together.
 * Each ZIL block contains a block pointer (blkptr_t) to the next
 * ZIL block in the chain. The ZIL header points to the first
 * block in the chain. Note there is not a fixed place in the pool
 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available. The figure below shows the ZIL
 * structure.
 */
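
/*
 * An illustrative sketch of the layout described above (purely a visual
 * aid; block counts and sizes are arbitrary, not taken from any pool):
 *
 *	zil_header_t
 *	+--------+
 *	| zh_log |--->+--------------+     +--------------+
 *	+--------+    | ZIL block    |     | ZIL block    |
 *	              |  log records |     |  log records |
 *	              |  next blkptr |---->|  next blkptr |----> ...
 *	              +--------------+     +--------------+
 *
 * zil_parse() below walks this chain starting from zh_log.
 */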

/*
 * Disable intent logging replay.  This global ZIL switch affects all pools.
 */
int zil_replay_disable = 0;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.zil_replay_disable", &zil_replay_disable);
SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RW,
    &zil_replay_disable, 0, "Disable intent logging replay");

/*
 * Tunable parameter for debugging or performance analysis.  Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;
TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush);
SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
    &zfs_nocacheflush, 0, "Disable cache flush");
boolean_t zfs_trim_enabled = B_TRUE;
SYSCTL_DECL(_vfs_zfs_trim);
TUNABLE_INT("vfs.zfs.trim.enabled", &zfs_trim_enabled);
SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
    "Enable ZFS TRIM");

/*
 * Limit SLOG write size per commit executed with synchronous priority.
 * Any writes above that are executed with lower (asynchronous) priority
 * to limit potential SLOG device abuse by a single active ZIL writer.
 */
uint64_t zil_slog_limit = 768 * 1024;
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
    &zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
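
/*
 * Illustrative sketch of how this limit is applied when the lwb zio is
 * created; this mirrors zil_lwb_write_init() below and is not a separate
 * code path:
 *
 *	zio_priority_t prio;
 *
 *	if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
 *		prio = ZIO_PRIORITY_SYNC_WRITE;
 *	else
 *		prio = ZIO_PRIORITY_ASYNC_WRITE;
 */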

static kmem_cache_t *zil_lwb_cache;

#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))


/*
 * ziltest is by and large an ugly hack, but very useful in
 * checking replay without tedious work.
 * When running ziltest we want to keep all itxs, and so we maintain
 * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG.
 * We subtract TXG_CONCURRENT_STATES to allow for common code.
 */
#define	ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
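
/*
 * A minimal illustration of why the subtraction matters: loops of the form
 *
 *	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++)
 *
 * as used in zil_remove_async() and zil_get_commit_list() below can then
 * run with otxg == ZILTEST_TXG without overflowing, since
 * ZILTEST_TXG + TXG_CONCURRENT_STATES == UINT64_MAX.
 */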

static int
zil_bp_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
		return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
		return (1);

	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
		return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
		return (1);

	return (0);
}

static void
zil_bp_tree_init(zilog_t *zilog)
{
	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}

static void
zil_bp_tree_fini(zilog_t *zilog)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	zil_bp_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_bp_node_t));

	avl_destroy(t);
}

int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	const dva_t *dva;
	zil_bp_node_t *zn;
	avl_index_t where;

	if (BP_IS_EMBEDDED(bp))
		return (0);

	dva = BP_IDENTITY(bp);

	if (avl_find(t, dva, &where) != NULL)
		return (SET_ERROR(EEXIST));

	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);

	return (0);
}
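
/*
 * Illustrative use of the zl_bp_tree (a sketch, not a new code path): the
 * claim and free callbacks below use it to make sure each log block or
 * dmu_sync()'ed data block is processed at most once, e.g.
 *
 *	if (zil_bp_tree_add(zilog, bp) != 0)
 *		return (0);	(EEXIST means this DVA was already visited)
 */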

static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}

static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}

/*
 * Read a log block and make sure it's valid.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
    char **end)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential.  The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
			zil_chain_t *zilc = abuf->b_data;
			char *lr = (char *)(zilc + 1);
			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, len);
				*end = (char *)dst + len;
				*nbp = zilc->zc_next_blk;
			}
		} else {
			char *lr = abuf->b_data;
			uint64_t size = BP_GET_LSIZE(bp);
			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(zilc->zc_nused, <=,
				    SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, zilc->zc_nused);
				*end = (char *)dst + zilc->zc_nused;
				*nbp = zilc->zc_next_blk;
			}
		}

		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}
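
/*
 * An illustrative sketch of the two on-disk log block formats handled
 * above (field widths are not to scale):
 *
 *	ZIO_CHECKSUM_ZILOG2 (zil_chain_t at the head of the block):
 *	+--------------+---------------------------------------+
 *	| zil_chain_t  | log records ...                       |
 *	+--------------+---------------------------------------+
 *
 *	older ZIO_CHECKSUM_ZILOG (zil_chain_t at the tail):
 *	+---------------------------------------+--------------+
 *	| log records ...                       | zil_chain_t  |
 *	+---------------------------------------+--------------+
 *
 * In both cases zc_next_blk links to the next block in the chain and
 * zc_nused says how much of the block actually holds records.
 */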

/*
 * Read a TX_WRITE log data block.
 */
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	const blkptr_t *bp = &lr->lr_blkptr;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (BP_IS_HOLE(bp)) {
		if (wbuf != NULL)
			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
		return (0);
	}

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		if (wbuf != NULL)
			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}

/*
 * Parse the intent log, and call parse_func for each valid record within.
 */
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	boolean_t claimed = !!zh->zh_claim_txg;
	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
	uint64_t max_blk_seq = 0;
	uint64_t max_lr_seq = 0;
	uint64_t blk_count = 0;
	uint64_t lr_count = 0;
	blkptr_t blk, next_blk;
	char *lrbuf, *lrp;
	int error = 0;

	/*
	 * Old logs didn't record the maximum zh_claim_lr_seq.
	 */
	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		claim_lr_seq = UINT64_MAX;

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity.  We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
	 */
	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	zil_bp_tree_init(zilog);

	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
		int reclen;
		char *end;

		if (blk_seq > claim_blk_seq)
			break;
		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
			break;
		ASSERT3U(max_blk_seq, <, blk_seq);
		max_blk_seq = blk_seq;
		blk_count++;

		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
			break;

		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
		if (error != 0)
			break;

		for (lrp = lrbuf; lrp < end; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			if (lr->lrc_seq > claim_lr_seq)
				goto done;
			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
				goto done;
			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
			max_lr_seq = lr->lrc_seq;
			lr_count++;
		}
	}
done:
	zilog->zl_parse_error = error;
	zilog->zl_parse_blk_seq = max_blk_seq;
	zilog->zl_parse_lr_seq = max_lr_seq;
	zilog->zl_parse_blk_count = blk_count;
	zilog->zl_parse_lr_count = lr_count;

	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

	zil_bp_tree_fini(zilog);
	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);

	return (error);
}
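
/*
 * Example usage of zil_parse() (a sketch; the callback names below are
 * hypothetical and are not part of this file). A caller that only wants
 * to count blocks and records could pass callbacks like these:
 *
 *	static int
 *	count_blk(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg)
 *	{
 *		((uint64_t *)arg)[0]++;
 *		return (0);
 *	}
 *
 *	static int
 *	count_lr(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t txg)
 *	{
 *		((uint64_t *)arg)[1]++;
 *		return (0);
 *	}
 *
 *	uint64_t counts[2] = { 0, 0 };
 *	(void) zil_parse(zilog, count_blk, count_lr, counts, 0);
 *
 * zil_claim() and zil_destroy_sync() below use the same pattern with
 * their claim and free callbacks.
 */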

static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	/*
	 * Claim log block if not already committed and not already claimed.
	 * If tx == NULL, just verify that the block is claimable.
	 */
	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
	    zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}

static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	int error;

	if (lrc->lrc_txtype != TX_WRITE)
		return (0);

	/*
	 * If the block is not readable, don't claim it.  This can happen
	 * in normal operation when a log block is written to disk before
	 * some of the dmu_sync() blocks it points to.  In this case, the
	 * transaction cannot have been committed to anyone (we would have
	 * waited for all writes to be stable first), so it is semantically
	 * correct to declare this the end of the log.
	 */
	if (lr->lr_blkptr.blk_birth >= first_txg &&
	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
		return (error);
	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}

/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	blkptr_t *bp = &lr->lr_blkptr;

	/*
	 * If we previously claimed it, we need to free it.
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
	    !BP_IS_HOLE(bp))
		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
{
	lwb_t *lwb;

	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	lwb->lwb_zilog = zilog;
	lwb->lwb_blk = *bp;
	lwb->lwb_slog = slog;
	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
	lwb->lwb_max_txg = txg;
	lwb->lwb_zio = NULL;
	lwb->lwb_tx = NULL;
	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
		lwb->lwb_nused = sizeof (zil_chain_t);
		lwb->lwb_sz = BP_GET_LSIZE(bp);
	} else {
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
	}

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, lwb);
	mutex_exit(&zilog->zl_lock);

	return (lwb);
}

/*
 * Called when we create in-memory log transactions so that we know
 * to clean up the itxs at the end of spa_sync().
 */
void
zilog_dirty(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;
	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);

	if (ds->ds_is_snapshot)
		panic("dirtying snapshot!");

	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, zilog);
	}
}

/*
 * Determine if the zil is dirty in the specified txg. Callers wanting to
 * ensure that the dirty state does not change must hold the itxg_lock for
 * the specified txg. Holding the lock will ensure that the zil cannot be
 * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
 * state.
 */
boolean_t
zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
		return (B_TRUE);
	return (B_FALSE);
}
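
/*
 * Illustrative caller pattern implied by the comment above (a sketch,
 * not an existing code path in this file): to get a stable answer the
 * caller holds the itxg_lock for the txg being checked.
 *
 *	itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 *
 *	mutex_enter(&itxg->itxg_lock);
 *	if (zilog_is_dirty_in_txg(zilog, txg)) {
 *		(do work that relies on the zil staying dirty in txg)
 *	}
 *	mutex_exit(&itxg->itxg_lock);
 */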

/*
 * Determine if the zil is dirty. The zil is considered dirty if it has
 * any pending itx records that have not been cleaned by zil_clean().
 */
boolean_t
zilog_is_dirty(zilog_t *zilog)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Create an on-disk intent log.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb = NULL;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;
	boolean_t slog = FALSE;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * Allocate an initial log block if:
	 *    - there isn't one already
	 *    - the existing block is the wrong endianness
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_zil(zilog->zl_spa, txg, &blk);
			BP_ZERO(&blk);
		}

		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
		    ZIL_MIN_BLKSZ, &slog);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0)
		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

	return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header.
 * If keep_first is set, then we're replaying a log with no content.
 * We want to keep the first block, however, so that the first
 * synchronous transaction doesn't require a txg_wait_synced()
 * in zil_create().  We don't need to txg_wait_synced() here either
 * when keep_first is set, because both zil_create() and zil_destroy()
 * will wait for any in-progress destroys to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	zilog->zl_old_header = *zh;		/* debugging aid */

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);

	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		ASSERT(zh->zh_claim_txg == 0);
		VERIFY(!keep_first);
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			list_remove(&zilog->zl_lwb_list, lwb);
			if (lwb->lwb_buf != NULL)
				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
			kmem_cache_free(zil_lwb_cache, lwb);
		}
	} else if (!keep_first) {
		zil_destroy_sync(zilog, tx);
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}

void
zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	ASSERT(list_is_empty(&zilog->zl_lwb_list));
	(void) zil_parse(zilog, zil_free_log_block,
	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
}

int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_own_obj(dp, ds->ds_object,
	    DMU_OST_ANY, B_FALSE, FTAG, &os);
	if (error != 0) {
		/*
		 * EBUSY indicates that the objset is inconsistent, in which
		 * case it can not have a ZIL.
		 */
		if (error != EBUSY) {
			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
			    (unsigned long long)ds->ds_object, error);
		}
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);

	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
		if (!BP_IS_HOLE(&zh->zh_log))
			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
		BP_ZERO(&zh->zh_log);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
		dmu_objset_disown(os, FTAG);
		return (0);
	}

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number.  This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		(void) zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
			zh->zh_flags |= ZIL_REPLAY_NEEDED;
		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_disown(os, FTAG);
	return (0);
}

/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 */
/* ARGSUSED */
int
zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
	zilog_t *zilog;
	objset_t *os;
	blkptr_t *bp;
	int error;

	ASSERT(tx == NULL);

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0) {
		cmn_err(CE_WARN, "can't open objset %llu, error %d",
		    (unsigned long long)ds->ds_object, error);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	bp = (blkptr_t *)&zilog->zl_header->zh_log;

	/*
	 * Check the first block and determine if it's on a log device
	 * which may have been removed or faulted prior to loading this
	 * pool.  If so, there's no point in checking the rest of the log
	 * as its content should have already been synced to the pool.
	 */
	if (!BP_IS_HOLE(bp)) {
		vdev_t *vd;
		boolean_t valid = B_TRUE;

		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
		if (vd->vdev_islog && vdev_is_dead(vd))
			valid = vdev_log_state_valid(vd);
		spa_config_exit(os->os_spa, SCL_STATE, FTAG);

		if (!valid)
			return (0);
	}

	/*
	 * Because tx == NULL, zil_claim_log_block() will not actually claim
	 * any blocks, but just determine whether it is possible to do so.
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg.  See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}

static int
zil_vdev_compare(const void *x1, const void *x2)
{
	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}

void
zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	ASSERT(zilog->zl_writer);

	/*
	 * Even though we're zl_writer, we still need a lock because the
	 * zl_get_data() callbacks may have dmu_sync() done callbacks
	 * that will run concurrently.
	 */
	mutex_enter(&zilog->zl_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&zilog->zl_vdev_lock);
}

static void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio = NULL;

	ASSERT(zilog->zl_writer);

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL && !vd->vdev_nowritecache) {
			if (zio == NULL)
				zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
			zio_flush(zio, vd);
		}
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	if (zio)
		(void) zio_wait(zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}

/*
 * Function called when a log block write completes
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	dmu_tx_t *tx = lwb->lwb_tx;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(BP_GET_FILL(zio->io_bp) == 0);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg. If we have had an allocation failure and
	 * the txg is waiting to sync then we want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer(). zil_sync() will only remove
	 * the lwb if lwb_buf is null.
	 */
	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;
	mutex_exit(&zilog->zl_lock);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	dmu_tx_commit(tx);
}

/*
 * Initialize the io for a log block.
 */
static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
	zbookmark_phys_t zb;
	zio_priority_t prio;

	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

	if (zilog->zl_root_zio == NULL) {
		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}
	if (lwb->lwb_zio == NULL) {
		if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
			prio = ZIO_PRIORITY_SYNC_WRITE;
		else
			prio = ZIO_PRIORITY_ASYNC_WRITE;
		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
		    zil_lwb_write_done, lwb, prio,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
	}
}

/*
 * Define a limited set of intent log block sizes.
 *
 * These must be a multiple of 4KB. Note only the amount used (again
 * aligned to 4KB) actually gets written. However, we can't always just
 * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
 */
uint64_t zil_block_buckets[] = {
    4096,		/* non TX_WRITE */
    8192+4096,		/* data base */
    32*1024 + 4096,	/* NFS writes */
    UINT64_MAX
};
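
/*
 * A worked example of how these buckets are used in zil_lwb_write_start()
 * below (the numbers are illustrative): if the previous commit used about
 * 6KB (zl_cur_used + sizeof (zil_chain_t) == 6144), then 6144 > 4096 but
 * 6144 <= 8192+4096, so the 12KB bucket is chosen. The result is then
 * further maxed against the ZIL_PREV_BLKS most recent block sizes to
 * smooth out a stream of alternating small and large commits.
 */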

/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolean_t last)
{
	lwb_t *nlwb = NULL;
	zil_chain_t *zilc;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp;
	dmu_tx_t *tx;
	uint64_t txg;
	uint64_t zil_blksz, wsz;
	int i, error;
	boolean_t slog;

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		zilc = (zil_chain_t *)lwb->lwb_buf;
		bp = &zilc->zc_next_blk;
	} else {
		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
		bp = &zilc->zc_next_blk;
	}

	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
	 * We dirty the dataset to ensure that zil_sync() will be called
	 * to clean up in the event of allocation failure or I/O failure.
	 */
	tx = dmu_tx_create(zilog->zl_os);

	/*
	 * Since we are not going to create any new dirty data, and we
	 * can even help with clearing the existing dirty data, we
	 * should not be subject to the dirty data based delays. We
	 * use TXG_NOTHROTTLE to bypass the delay mechanism.
	 */
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));

	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	lwb->lwb_tx = tx;

	/*
	 * Log blocks are pre-allocated. Here we select the size of the next
	 * block, based on size used in the last block.
	 * - first find the smallest bucket that will fit the block from a
	 *   limited set of block sizes. This is because it's faster to write
	 *   blocks allocated from the same metaslab as they are adjacent or
	 *   close.
	 * - next find the maximum from the new suggested size and an array of
	 *   previous sizes. This lessens a picket fence effect of wrongly
	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
	 *   requests.
	 *
	 * Note we only write what is used, but we can't just allocate
	 * the maximum block size because we can exhaust the available
	 * pool log space.
	 */
	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
		continue;
	zil_blksz = zil_block_buckets[i];
	if (zil_blksz == UINT64_MAX)
		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
	for (i = 0; i < ZIL_PREV_BLKS; i++)
		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
	if (error == 0) {
		ASSERT3U(bp->blk_birth, ==, txg);
		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

		/*
		 * Allocate a new log write buffer (lwb).
		 */
		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);

		/* Record the block for later vdev flushing */
		zil_add_block(zilog, &lwb->lwb_blk);
	}

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		/* For Slim ZIL only write what is used. */
		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
		ASSERT3U(wsz, <=, lwb->lwb_sz);
		zio_shrink(lwb->lwb_zio, wsz);

	} else {
		wsz = lwb->lwb_sz;
	}

	zilc->zc_pad = 0;
	zilc->zc_nused = lwb->lwb_nused;
	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

	/*
	 * clear unused data for security
	 */
	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);

	if (last)
		lwb->lwb_zio->io_pipeline &= ~ZIO_STAGE_ISSUE_ASYNC;
	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */

	/*
	 * If there was an allocation failure then nlwb will be null which
	 * forces a txg_wait_synced().
	 */
	return (nlwb);
}

static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
	lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
	char *lr_buf;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	uint64_t dlen = 0;
	uint64_t dnow, lwb_sp;

	if (lwb == NULL)
		return (NULL);

	ASSERT(lwb->lwb_buf != NULL);

	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
		dlen = P2ROUNDUP_TYPED(
		    lrw->lr_length, sizeof (uint64_t), uint64_t);

	zilog->zl_cur_used += (reclen + dlen);

	zil_lwb_write_init(zilog, lwb);

cont:
	/*
	 * If this record won't fit in the current log block, start a new one.
	 * For WR_NEED_COPY optimize layout for minimal number of chunks, but
	 * try to keep wasted space within a reasonable range (12%).
	 */
	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
	    lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
		lwb = zil_lwb_write_start(zilog, lwb, B_FALSE);
		if (lwb == NULL)
			return (NULL);
		zil_lwb_write_init(zilog, lwb);
		ASSERT(LWB_EMPTY(lwb));
		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
		ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
	}

	dnow = MIN(dlen, lwb_sp - reclen);
	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
	bcopy(lrc, lr_buf, reclen);
	lrcb = (lr_t *)lr_buf;
	lrwb = (lr_write_t *)lrcb;

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
	 */
	if (lrc->lrc_txtype == TX_WRITE) {
		if (txg > spa_freeze_txg(zilog->zl_spa))
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		if (itx->itx_wr_state != WR_COPIED) {
			char *dbuf;
			int error;

			if (itx->itx_wr_state == WR_NEED_COPY) {
				dbuf = lr_buf + reclen;
				lrcb->lrc_reclen += dnow;
				if (lrwb->lr_length > dnow)
					lrwb->lr_length = dnow;
				lrw->lr_offset += dnow;
				lrw->lr_length -= dnow;
			} else {
				ASSERT(itx->itx_wr_state == WR_INDIRECT);
				dbuf = NULL;
			}
			error = zilog->zl_get_data(
			    itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
			if (error == EIO) {
				txg_wait_synced(zilog->zl_dmu_pool, txg);
				return (lwb);
			}
			if (error != 0) {
				ASSERT(error == ENOENT || error == EEXIST ||
				    error == EALREADY);
				return (lwb);
			}
		}
	}

	/*
	 * We're actually making an entry, so update lrc_seq to be the
	 * log record sequence number.  Note that this is generally not
	 * equal to the itx sequence number because not all transactions
	 * are synchronous, and sometimes spa_sync() gets there first.
	 */
	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
	lwb->lwb_nused += reclen + dnow;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));

	dlen -= dnow;
	if (dlen > 0) {
		zilog->zl_cur_used += reclen;
		goto cont;
	}

	return (lwb);
}

itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
	itx_t *itx;

	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);

	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
	itx->itx_lr.lrc_txtype = txtype;
	itx->itx_lr.lrc_reclen = lrsize;
	itx->itx_lr.lrc_seq = 0;	/* defensive */
	itx->itx_sync = B_TRUE;		/* default is synchronous */

	return (itx);
}
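
/*
 * Typical consumer usage (a sketch of the pattern used by callers such as
 * the zfs_log_*() routines; the record type and fields shown are purely
 * illustrative):
 *
 *	itx_t *itx = zil_itx_create(txtype, sizeof (lr_create_t));
 *	lr_create_t *lr = (lr_create_t *)&itx->itx_lr;
 *	(fill in the type-specific fields of *lr)
 *	itx->itx_sync = (whether synchronous semantics are required);
 *	zil_itx_assign(zilog, itx, tx);	(while the dmu_tx is still open)
 *
 * A later zil_commit() of the objset writes any assigned synchronous
 * itxs out to the on-disk log.
 */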
1205168404Spjd
1206219089Spjdvoid
1207219089Spjdzil_itx_destroy(itx_t *itx)
1208168404Spjd{
1209219089Spjd	kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
1210219089Spjd}
1211168404Spjd
1212219089Spjd/*
1213219089Spjd * Free up the sync and async itxs. The itxs_t has already been detached
1214219089Spjd * so no locks are needed.
1215219089Spjd */
1216219089Spjdstatic void
1217219089Spjdzil_itxg_clean(itxs_t *itxs)
1218219089Spjd{
1219219089Spjd	itx_t *itx;
1220219089Spjd	list_t *list;
1221219089Spjd	avl_tree_t *t;
1222219089Spjd	void *cookie;
1223219089Spjd	itx_async_node_t *ian;
1224168404Spjd
1225219089Spjd	list = &itxs->i_sync_list;
1226219089Spjd	while ((itx = list_head(list)) != NULL) {
1227219089Spjd		list_remove(list, itx);
1228219089Spjd		kmem_free(itx, offsetof(itx_t, itx_lr) +
1229219089Spjd		    itx->itx_lr.lrc_reclen);
1230219089Spjd	}
1231168404Spjd
1232219089Spjd	cookie = NULL;
1233219089Spjd	t = &itxs->i_async_tree;
1234219089Spjd	while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
1235219089Spjd		list = &ian->ia_list;
1236219089Spjd		while ((itx = list_head(list)) != NULL) {
1237219089Spjd			list_remove(list, itx);
1238219089Spjd			kmem_free(itx, offsetof(itx_t, itx_lr) +
1239219089Spjd			    itx->itx_lr.lrc_reclen);
1240219089Spjd		}
1241219089Spjd		list_destroy(list);
1242219089Spjd		kmem_free(ian, sizeof (itx_async_node_t));
1243219089Spjd	}
1244219089Spjd	avl_destroy(t);
1245219089Spjd
1246219089Spjd	kmem_free(itxs, sizeof (itxs_t));
1247168404Spjd}
1248168404Spjd
1249219089Spjdstatic int
1250219089Spjdzil_aitx_compare(const void *x1, const void *x2)
1251219089Spjd{
1252219089Spjd	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
1253219089Spjd	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
1254219089Spjd
1255219089Spjd	if (o1 < o2)
1256219089Spjd		return (-1);
1257219089Spjd	if (o1 > o2)
1258219089Spjd		return (1);
1259219089Spjd
1260219089Spjd	return (0);
1261219089Spjd}
1262219089Spjd
1263168404Spjd/*
1264219089Spjd * Remove all async itx with the given oid.
1265168404Spjd */
1266168404Spjdstatic void
1267219089Spjdzil_remove_async(zilog_t *zilog, uint64_t oid)
1268168404Spjd{
1269219089Spjd	uint64_t otxg, txg;
1270219089Spjd	itx_async_node_t *ian;
1271219089Spjd	avl_tree_t *t;
1272219089Spjd	avl_index_t where;
1273168404Spjd	list_t clean_list;
1274168404Spjd	itx_t *itx;
1275168404Spjd
1276219089Spjd	ASSERT(oid != 0);
1277168404Spjd	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
1278168404Spjd
1279219089Spjd	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
1280219089Spjd		otxg = ZILTEST_TXG;
1281219089Spjd	else
1282219089Spjd		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
1283219089Spjd
1284219089Spjd	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
1285219089Spjd		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
1286219089Spjd
1287219089Spjd		mutex_enter(&itxg->itxg_lock);
1288219089Spjd		if (itxg->itxg_txg != txg) {
1289219089Spjd			mutex_exit(&itxg->itxg_lock);
1290219089Spjd			continue;
1291219089Spjd		}
1292219089Spjd
1293219089Spjd		/*
1294219089Spjd		 * Locate the object node and append its list.
1295219089Spjd		 */
1296219089Spjd		t = &itxg->itxg_itxs->i_async_tree;
1297219089Spjd		ian = avl_find(t, &oid, &where);
1298219089Spjd		if (ian != NULL)
1299219089Spjd			list_move_tail(&clean_list, &ian->ia_list);
1300219089Spjd		mutex_exit(&itxg->itxg_lock);
1301168404Spjd	}
1302219089Spjd	while ((itx = list_head(&clean_list)) != NULL) {
1303219089Spjd		list_remove(&clean_list, itx);
1304219089Spjd		kmem_free(itx, offsetof(itx_t, itx_lr) +
1305219089Spjd		    itx->itx_lr.lrc_reclen);
1306219089Spjd	}
1307219089Spjd	list_destroy(&clean_list);
1308219089Spjd}
1309168404Spjd
1310219089Spjdvoid
1311219089Spjdzil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
1312219089Spjd{
1313219089Spjd	uint64_t txg;
1314219089Spjd	itxg_t *itxg;
1315219089Spjd	itxs_t *itxs, *clean = NULL;
1316219089Spjd
1317168404Spjd	/*
1318219089Spjd	 * Object ids can be re-instantiated in the next txg so
1319219089Spjd	 * remove any async transactions to avoid future leaks.
1320219089Spjd	 * This can happen if a fsync occurs on the re-instantiated
1321219089Spjd	 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
1322219089Spjd	 * the new file data and flushes a write record for the old object.
1323168404Spjd	 */
1324219089Spjd	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
1325219089Spjd		zil_remove_async(zilog, itx->itx_oid);
1326219089Spjd
1327219089Spjd	/*
1328219089Spjd	 * Ensure the data of a renamed file is committed before the rename.
1329219089Spjd	 */
1330219089Spjd	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
1331219089Spjd		zil_async_to_sync(zilog, itx->itx_oid);
1332219089Spjd
1333239620Smm	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
1334219089Spjd		txg = ZILTEST_TXG;
1335219089Spjd	else
1336219089Spjd		txg = dmu_tx_get_txg(tx);
1337219089Spjd
1338219089Spjd	itxg = &zilog->zl_itxg[txg & TXG_MASK];
1339219089Spjd	mutex_enter(&itxg->itxg_lock);
1340219089Spjd	itxs = itxg->itxg_itxs;
1341219089Spjd	if (itxg->itxg_txg != txg) {
1342219089Spjd		if (itxs != NULL) {
1343219089Spjd			/*
1344219089Spjd			 * The zil_clean callback hasn't got around to cleaning
1345219089Spjd			 * this itxg. Save the itxs for release below.
1346219089Spjd			 * This should be rare.
1347219089Spjd			 */
1348219089Spjd			clean = itxg->itxg_itxs;
1349219089Spjd		}
1350219089Spjd		itxg->itxg_txg = txg;
1351219089Spjd		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
1352219089Spjd
1353219089Spjd		list_create(&itxs->i_sync_list, sizeof (itx_t),
1354219089Spjd		    offsetof(itx_t, itx_node));
1355219089Spjd		avl_create(&itxs->i_async_tree, zil_aitx_compare,
1356219089Spjd		    sizeof (itx_async_node_t),
1357219089Spjd		    offsetof(itx_async_node_t, ia_node));
1358168404Spjd	}
1359219089Spjd	if (itx->itx_sync) {
1360219089Spjd		list_insert_tail(&itxs->i_sync_list, itx);
1361219089Spjd	} else {
1362219089Spjd		avl_tree_t *t = &itxs->i_async_tree;
1363219089Spjd		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
1364219089Spjd		itx_async_node_t *ian;
1365219089Spjd		avl_index_t where;
1366168404Spjd
1367219089Spjd		ian = avl_find(t, &foid, &where);
1368219089Spjd		if (ian == NULL) {
1369219089Spjd			ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
1370219089Spjd			list_create(&ian->ia_list, sizeof (itx_t),
1371219089Spjd			    offsetof(itx_t, itx_node));
1372219089Spjd			ian->ia_foid = foid;
1373219089Spjd			avl_insert(t, ian, where);
1374219089Spjd		}
1375219089Spjd		list_insert_tail(&ian->ia_list, itx);
1376168404Spjd	}
1377219089Spjd
1378219089Spjd	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
1379239620Smm	zilog_dirty(zilog, txg);
1380219089Spjd	mutex_exit(&itxg->itxg_lock);
1381219089Spjd
1382219089Spjd	/* Release the old itxs now we've dropped the lock */
1383219089Spjd	if (clean != NULL)
1384219089Spjd		zil_itxg_clean(clean);
1385168404Spjd}
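
/*
 * Illustrative sketch (comment only, not compiled): the typical flow by
 * which a logging routine hands a record to zil_itx_assign().  The field
 * setup is simplified and "sync" stands for a hypothetical caller-supplied
 * flag; see the zfs_log_*() routines for the authoritative callers.
 *
 *	itx_t *itx = zil_itx_create(TX_WRITE, sizeof (lr_write_t));
 *	lr_write_t *lr = (lr_write_t *)&itx->itx_lr;
 *
 *	lr->lr_foid = foid;
 *	lr->lr_offset = off;
 *	lr->lr_length = len;
 *	itx->itx_sync = sync;	(B_TRUE: i_sync_list, B_FALSE: async tree)
 *
 *	zil_itx_assign(zilog, itx, tx);	(tx is the still-open dmu_tx_t)
 */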
1386168404Spjd
1387168404Spjd/*
1388168404Spjd * If there are any in-memory intent log transactions that have now been
1389239620Smm * synced, then start up a taskq to free them.  We should only do this after
1390239620Smm * we have written out the uberblocks (i.e. the txg has been committed), so
1391239620Smm * that we don't inadvertently clean out in-memory log records that would be
1392239620Smm * required by zil_commit().
1393168404Spjd */
1394168404Spjdvoid
1395219089Spjdzil_clean(zilog_t *zilog, uint64_t synced_txg)
1396168404Spjd{
1397219089Spjd	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
1398219089Spjd	itxs_t *clean_me;
1399168404Spjd
1400219089Spjd	mutex_enter(&itxg->itxg_lock);
1401219089Spjd	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
1402219089Spjd		mutex_exit(&itxg->itxg_lock);
1403219089Spjd		return;
1404168404Spjd	}
1405219089Spjd	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
1406219089Spjd	ASSERT(itxg->itxg_txg != 0);
1407219089Spjd	ASSERT(zilog->zl_clean_taskq != NULL);
1408219089Spjd	clean_me = itxg->itxg_itxs;
1409219089Spjd	itxg->itxg_itxs = NULL;
1410219089Spjd	itxg->itxg_txg = 0;
1411219089Spjd	mutex_exit(&itxg->itxg_lock);
1412219089Spjd	/*
1413219089Spjd	 * Preferably start a task queue to free up the old itxs, but
1414219089Spjd	 * if taskq_dispatch can't allocate resources to do that, then
1415219089Spjd	 * free them in-line. This should be rare. Note that using
1416219089Spjd	 * TQ_SLEEP here caused a bad performance problem.
1417219089Spjd	 */
1418219089Spjd	if (taskq_dispatch(zilog->zl_clean_taskq,
1419219089Spjd	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
1420219089Spjd		zil_itxg_clean(clean_me);
1421168404Spjd}
1422168404Spjd
1423219089Spjd/*
1424219089Spjd * Get the list of itxs to commit into zl_itx_commit_list.
1425219089Spjd */
1426185029Spjdstatic void
1427219089Spjdzil_get_commit_list(zilog_t *zilog)
1428168404Spjd{
1429219089Spjd	uint64_t otxg, txg;
1430219089Spjd	list_t *commit_list = &zilog->zl_itx_commit_list;
1431219089Spjd
1432219089Spjd	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
1433219089Spjd		otxg = ZILTEST_TXG;
1434219089Spjd	else
1435219089Spjd		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
1436219089Spjd
1437310516Savg	/*
1438310516Savg	 * This is inherently racy, since there is nothing to prevent
1439310516Savg	 * the last synced txg from changing. That's okay since we'll
1440310516Savg	 * only commit things in the future.
1441310516Savg	 */
1442219089Spjd	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
1443219089Spjd		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
1444219089Spjd
1445219089Spjd		mutex_enter(&itxg->itxg_lock);
1446219089Spjd		if (itxg->itxg_txg != txg) {
1447219089Spjd			mutex_exit(&itxg->itxg_lock);
1448219089Spjd			continue;
1449219089Spjd		}
1450219089Spjd
1451310516Savg		/*
1452310516Savg		 * If we're adding itx records to the zl_itx_commit_list,
1453310516Savg		 * then the zil better be dirty in this "txg". We can assert
1454310516Savg		 * that here since we're holding the itxg_lock which will
1455310516Savg		 * prevent spa_sync from cleaning it. Once we add the itxs
1456310516Savg		 * to the zl_itx_commit_list we must commit it to disk even
1457310516Savg		 * if it's unnecessary (i.e. the txg was synced).
1458310516Savg		 */
1459310516Savg		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
1460310516Savg		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
1461219089Spjd		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
1462219089Spjd
1463219089Spjd		mutex_exit(&itxg->itxg_lock);
1464219089Spjd	}
1465219089Spjd}
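
/*
 * Worked example of the ring indexing above, assuming the stock values of
 * TXG_SIZE (4, so TXG_MASK == 3) and TXG_CONCURRENT_STATES (3): if
 * spa_last_synced_txg() returns 99, then otxg is 100 and the loop visits
 * txgs 100, 101 and 102, which map to zl_itxg[100 & 3] == zl_itxg[0],
 * zl_itxg[1] and zl_itxg[2].  The itxg_txg check skips slots that are
 * still empty or that belong to an older, wrapped-around txg.
 */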
1466219089Spjd
1467219089Spjd/*
1468219089Spjd * Move the async itxs for a specified object to commit into sync lists.
1469219089Spjd */
1470308596Smavvoid
1471219089Spjdzil_async_to_sync(zilog_t *zilog, uint64_t foid)
1472219089Spjd{
1473219089Spjd	uint64_t otxg, txg;
1474219089Spjd	itx_async_node_t *ian;
1475219089Spjd	avl_tree_t *t;
1476219089Spjd	avl_index_t where;
1477219089Spjd
1478219089Spjd	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
1479219089Spjd		otxg = ZILTEST_TXG;
1480219089Spjd	else
1481219089Spjd		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
1482219089Spjd
1483310516Savg	/*
1484310516Savg	 * This is inherently racy, since there is nothing to prevent
1485310516Savg	 * the last synced txg from changing.
1486310516Savg	 */
1487219089Spjd	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
1488219089Spjd		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
1489219089Spjd
1490219089Spjd		mutex_enter(&itxg->itxg_lock);
1491219089Spjd		if (itxg->itxg_txg != txg) {
1492219089Spjd			mutex_exit(&itxg->itxg_lock);
1493219089Spjd			continue;
1494219089Spjd		}
1495219089Spjd
1496219089Spjd		/*
1497219089Spjd		 * If a foid is specified then find that node and append its
1498219089Spjd		 * list. Otherwise walk the tree appending all the lists
1499219089Spjd		 * to the sync list. We add to the end rather than the
1500219089Spjd		 * beginning to ensure the create has happened.
1501219089Spjd		 */
1502219089Spjd		t = &itxg->itxg_itxs->i_async_tree;
1503219089Spjd		if (foid != 0) {
1504219089Spjd			ian = avl_find(t, &foid, &where);
1505219089Spjd			if (ian != NULL) {
1506219089Spjd				list_move_tail(&itxg->itxg_itxs->i_sync_list,
1507219089Spjd				    &ian->ia_list);
1508219089Spjd			}
1509219089Spjd		} else {
1510219089Spjd			void *cookie = NULL;
1511219089Spjd
1512219089Spjd			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
1513219089Spjd				list_move_tail(&itxg->itxg_itxs->i_sync_list,
1514219089Spjd				    &ian->ia_list);
1515219089Spjd				list_destroy(&ian->ia_list);
1516219089Spjd				kmem_free(ian, sizeof (itx_async_node_t));
1517219089Spjd			}
1518219089Spjd		}
1519219089Spjd		mutex_exit(&itxg->itxg_lock);
1520219089Spjd	}
1521219089Spjd}
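
/*
 * Calling modes, for illustration: zil_commit(zilog, foid) invokes this
 * with the same foid, so an fsync of one file moves only that file's
 * async itxs to the sync list, while zil_commit(zilog, 0) (e.g. from
 * zil_suspend() or unmount) passes foid == 0 and drains the whole
 * async tree.
 */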
1522219089Spjd
1523219089Spjdstatic void
1524219089Spjdzil_commit_writer(zilog_t *zilog)
1525219089Spjd{
1526168404Spjd	uint64_t txg;
1527219089Spjd	itx_t *itx;
1528168404Spjd	lwb_t *lwb;
1529219089Spjd	spa_t *spa = zilog->zl_spa;
1530219089Spjd	int error = 0;
1531168404Spjd
1532185029Spjd	ASSERT(zilog->zl_root_zio == NULL);
1533168404Spjd
1534219089Spjd	mutex_exit(&zilog->zl_lock);
1535219089Spjd
1536219089Spjd	zil_get_commit_list(zilog);
1537219089Spjd
1538219089Spjd	/*
1539219089Spjd	 * Return if there's nothing to commit before we dirty the fs by
1540219089Spjd	 * calling zil_create().
1541219089Spjd	 */
1542219089Spjd	if (list_head(&zilog->zl_itx_commit_list) == NULL) {
1543219089Spjd		mutex_enter(&zilog->zl_lock);
1544219089Spjd		return;
1545219089Spjd	}
1546219089Spjd
1547168404Spjd	if (zilog->zl_suspend) {
1548168404Spjd		lwb = NULL;
1549168404Spjd	} else {
1550168404Spjd		lwb = list_tail(&zilog->zl_lwb_list);
1551219089Spjd		if (lwb == NULL)
1552219089Spjd			lwb = zil_create(zilog);
1553168404Spjd	}
1554168404Spjd
1555168404Spjd	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
1556219089Spjd	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
1557168404Spjd		txg = itx->itx_lr.lrc_txg;
1558310516Savg		ASSERT3U(txg, !=, 0);
1559168404Spjd
1560310516Savg		/*
1561310516Savg		 * This is inherently racy and may result in us writing
1562310516Savg		 * out a log block for a txg that was just synced. This is
1563310516Savg	 * ok since we'll end up cleaning up that log block the next
1564310516Savg		 * time we call zil_sync().
1565310516Savg		 */
1566219089Spjd		if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
1567168404Spjd			lwb = zil_lwb_commit(zilog, itx, lwb);
1568219089Spjd		list_remove(&zilog->zl_itx_commit_list, itx);
1569168404Spjd		kmem_free(itx, offsetof(itx_t, itx_lr)
1570168404Spjd		    + itx->itx_lr.lrc_reclen);
1571168404Spjd	}
1572168404Spjd	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
1573168404Spjd
1574168404Spjd	/* write the last block out */
1575168404Spjd	if (lwb != NULL && lwb->lwb_zio != NULL)
1576315388Smav		lwb = zil_lwb_write_start(zilog, lwb, B_TRUE);
1577168404Spjd
1578168404Spjd	zilog->zl_cur_used = 0;
1579168404Spjd
1580168404Spjd	/*
1581168404Spjd	 * Wait if necessary for the log blocks to be on stable storage.
1582168404Spjd	 */
1583168404Spjd	if (zilog->zl_root_zio) {
1584219089Spjd		error = zio_wait(zilog->zl_root_zio);
1585185029Spjd		zilog->zl_root_zio = NULL;
1586185029Spjd		zil_flush_vdevs(zilog);
1587168404Spjd	}
1588168404Spjd
1589219089Spjd	if (error || lwb == NULL)
1590168404Spjd		txg_wait_synced(zilog->zl_dmu_pool, 0);
1591168404Spjd
1592168404Spjd	mutex_enter(&zilog->zl_lock);
1593168404Spjd
1594219089Spjd	/*
1595219089Spjd	 * Remember the highest committed log sequence number for ztest.
1596219089Spjd	 * We only update this value when all the log writes succeeded,
1597219089Spjd	 * because ztest wants to ASSERT that it got the whole log chain.
1598219089Spjd	 */
1599219089Spjd	if (error == 0 && lwb != NULL)
1600219089Spjd		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
1601168404Spjd}
1602168404Spjd
1603168404Spjd/*
1604219089Spjd * Commit zfs transactions to stable storage.
1605168404Spjd * If foid is 0, push out all transactions; otherwise push only those
1606219089Spjd * for that object or that might reference that object.
1607219089Spjd *
1608219089Spjd * itxs are committed in batches. In a heavily stressed zil there will be
1609219089Spjd * a commit writer thread that is writing out a bunch of itxs to the log
1610219089Spjd * for a set of committing threads (cthreads) in the same batch as the writer.
1611219089Spjd * Those cthreads are all waiting on the same cv for that batch.
1612219089Spjd *
1613219089Spjd * There will also be a different and growing batch of threads that are
1614219089Spjd * waiting to commit (qthreads). When the committing batch completes
1615219089Spjd * a transition occurs such that the cthreads exit and the qthreads become
1616219089Spjd * cthreads. One of the new cthreads becomes the writer thread for the
1617219089Spjd * batch. Any new threads arriving become new qthreads.
1618219089Spjd *
1619219089Spjd * Only two condition variables are needed, and no hand-off between
1620219089Spjd * the two cvs is required; they simply flip-flop between serving the
1621219089Spjd * qthreads and the cthreads.
1622219089Spjd *
1623219089Spjd * Using this scheme we can efficiently wake up only those threads
1624219089Spjd * whose transactions have been committed.
1625168404Spjd */
1626168404Spjdvoid
1627219089Spjdzil_commit(zilog_t *zilog, uint64_t foid)
1628168404Spjd{
1629219089Spjd	uint64_t mybatch;
1630219089Spjd
1631219089Spjd	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
1632168404Spjd		return;
1633168404Spjd
1634219089Spjd	/* move the async itxs for the foid to the sync queues */
1635219089Spjd	zil_async_to_sync(zilog, foid);
1636219089Spjd
1637168404Spjd	mutex_enter(&zilog->zl_lock);
1638219089Spjd	mybatch = zilog->zl_next_batch;
1639168404Spjd	while (zilog->zl_writer) {
1640219089Spjd		cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
1641219089Spjd		if (mybatch <= zilog->zl_com_batch) {
1642168404Spjd			mutex_exit(&zilog->zl_lock);
1643168404Spjd			return;
1644168404Spjd		}
1645168404Spjd	}
1646219089Spjd
1647219089Spjd	zilog->zl_next_batch++;
1648219089Spjd	zilog->zl_writer = B_TRUE;
1649219089Spjd	zil_commit_writer(zilog);
1650219089Spjd	zilog->zl_com_batch = mybatch;
1651219089Spjd	zilog->zl_writer = B_FALSE;
1652168404Spjd	mutex_exit(&zilog->zl_lock);
1653219089Spjd
1654219089Spjd	/* wake up one thread to become the next writer */
1655219089Spjd	cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);
1656219089Spjd
1657219089Spjd	/* wake up all threads waiting for this batch to be committed */
1658219089Spjd	cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
1659168404Spjd}
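
/*
 * Worked example of the batch scheme (illustration only): suppose
 * zl_next_batch is 7 when thread A calls zil_commit().  A records
 * mybatch = 7, finds zl_writer clear, bumps zl_next_batch to 8 and
 * becomes the writer.  Thread B then arrives, records mybatch = 8 and
 * sleeps on zl_cv_batch[8 & 1] == zl_cv_batch[0].  When A finishes it
 * sets zl_com_batch = 7, signals zl_cv_batch[0] so that one waiter (B)
 * can become the writer for batch 8, and broadcasts
 * zl_cv_batch[7 & 1] == zl_cv_batch[1] to release every thread whose
 * itxs were written out as part of batch 7.
 */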
1660168404Spjd
1661168404Spjd/*
1662168404Spjd * Called in syncing context to free committed log blocks and update log header.
1663168404Spjd */
1664168404Spjdvoid
1665168404Spjdzil_sync(zilog_t *zilog, dmu_tx_t *tx)
1666168404Spjd{
1667168404Spjd	zil_header_t *zh = zil_header_in_syncing_context(zilog);
1668168404Spjd	uint64_t txg = dmu_tx_get_txg(tx);
1669168404Spjd	spa_t *spa = zilog->zl_spa;
1670219089Spjd	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
1671168404Spjd	lwb_t *lwb;
1672168404Spjd
1673209962Smm	/*
1674209962Smm	 * We don't zero out zl_destroy_txg, so make sure we don't try
1675209962Smm	 * to destroy it twice.
1676209962Smm	 */
1677209962Smm	if (spa_sync_pass(spa) != 1)
1678209962Smm		return;
1679209962Smm
1680168404Spjd	mutex_enter(&zilog->zl_lock);
1681168404Spjd
1682168404Spjd	ASSERT(zilog->zl_stop_sync == 0);
1683168404Spjd
1684219089Spjd	if (*replayed_seq != 0) {
1685219089Spjd		ASSERT(zh->zh_replay_seq < *replayed_seq);
1686219089Spjd		zh->zh_replay_seq = *replayed_seq;
1687219089Spjd		*replayed_seq = 0;
1688219089Spjd	}
1689168404Spjd
1690168404Spjd	if (zilog->zl_destroy_txg == txg) {
1691168404Spjd		blkptr_t blk = zh->zh_log;
1692168404Spjd
1693168404Spjd		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
1694168404Spjd
1695168404Spjd		bzero(zh, sizeof (zil_header_t));
1696209962Smm		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
1697168404Spjd
1698168404Spjd		if (zilog->zl_keep_first) {
1699168404Spjd			/*
1700168404Spjd			 * If this block was part of log chain that couldn't
1701168404Spjd			 * be claimed because a device was missing during
1702168404Spjd			 * zil_claim(), but that device later returns,
1703168404Spjd			 * then this block could erroneously appear valid.
1704168404Spjd			 * To guard against this, assign a new GUID to the new
1705168404Spjd			 * log chain so it doesn't matter what blk points to.
1706168404Spjd			 */
1707168404Spjd			zil_init_log_chain(zilog, &blk);
1708168404Spjd			zh->zh_log = blk;
1709168404Spjd		}
1710168404Spjd	}
1711168404Spjd
1712213197Smm	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
1713168404Spjd		zh->zh_log = lwb->lwb_blk;
1714168404Spjd		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
1715168404Spjd			break;
1716168404Spjd		list_remove(&zilog->zl_lwb_list, lwb);
1717219089Spjd		zio_free_zil(spa, txg, &lwb->lwb_blk);
1718168404Spjd		kmem_cache_free(zil_lwb_cache, lwb);
1719168404Spjd
1720168404Spjd		/*
1721168404Spjd		 * If we don't have anything left in the lwb list then
1722168404Spjd		 * we've had an allocation failure and we need to zero
1723168404Spjd		 * out the zil_header blkptr so that we don't end
1724168404Spjd		 * up freeing the same block twice.
1725168404Spjd		 */
1726168404Spjd		if (list_head(&zilog->zl_lwb_list) == NULL)
1727168404Spjd			BP_ZERO(&zh->zh_log);
1728168404Spjd	}
1729168404Spjd	mutex_exit(&zilog->zl_lock);
1730168404Spjd}
1731168404Spjd
1732168404Spjdvoid
1733168404Spjdzil_init(void)
1734168404Spjd{
1735168404Spjd	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
1736168404Spjd	    sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
1737168404Spjd}
1738168404Spjd
1739168404Spjdvoid
1740168404Spjdzil_fini(void)
1741168404Spjd{
1742168404Spjd	kmem_cache_destroy(zil_lwb_cache);
1743168404Spjd}
1744168404Spjd
1745219089Spjdvoid
1746219089Spjdzil_set_sync(zilog_t *zilog, uint64_t sync)
1747219089Spjd{
1748219089Spjd	zilog->zl_sync = sync;
1749219089Spjd}
1750219089Spjd
1751219089Spjdvoid
1752219089Spjdzil_set_logbias(zilog_t *zilog, uint64_t logbias)
1753219089Spjd{
1754219089Spjd	zilog->zl_logbias = logbias;
1755219089Spjd}
1756219089Spjd
1757168404Spjdzilog_t *
1758168404Spjdzil_alloc(objset_t *os, zil_header_t *zh_phys)
1759168404Spjd{
1760168404Spjd	zilog_t *zilog;
1761168404Spjd
1762168404Spjd	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
1763168404Spjd
1764168404Spjd	zilog->zl_header = zh_phys;
1765168404Spjd	zilog->zl_os = os;
1766168404Spjd	zilog->zl_spa = dmu_objset_spa(os);
1767168404Spjd	zilog->zl_dmu_pool = dmu_objset_pool(os);
1768168404Spjd	zilog->zl_destroy_txg = TXG_INITIAL - 1;
1769219089Spjd	zilog->zl_logbias = dmu_objset_logbias(os);
1770219089Spjd	zilog->zl_sync = dmu_objset_syncprop(os);
1771219089Spjd	zilog->zl_next_batch = 1;
1772168404Spjd
1773168404Spjd	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
1774168404Spjd
1775219089Spjd	for (int i = 0; i < TXG_SIZE; i++) {
1776219089Spjd		mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
1777219089Spjd		    MUTEX_DEFAULT, NULL);
1778219089Spjd	}
1779168404Spjd
1780168404Spjd	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
1781168404Spjd	    offsetof(lwb_t, lwb_node));
1782168404Spjd
1783219089Spjd	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
1784219089Spjd	    offsetof(itx_t, itx_node));
1785219089Spjd
1786185029Spjd	mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
1787168404Spjd
1788185029Spjd	avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
1789185029Spjd	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
1790185029Spjd
1791185029Spjd	cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
1792185029Spjd	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
1793219089Spjd	cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
1794219089Spjd	cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);
1795185029Spjd
1796168404Spjd	return (zilog);
1797168404Spjd}
1798168404Spjd
1799168404Spjdvoid
1800168404Spjdzil_free(zilog_t *zilog)
1801168404Spjd{
1802168404Spjd	zilog->zl_stop_sync = 1;
1803168404Spjd
1804248571Smm	ASSERT0(zilog->zl_suspend);
1805248571Smm	ASSERT0(zilog->zl_suspending);
1806248571Smm
1807224526Smm	ASSERT(list_is_empty(&zilog->zl_lwb_list));
1808168404Spjd	list_destroy(&zilog->zl_lwb_list);
1809168404Spjd
1810185029Spjd	avl_destroy(&zilog->zl_vdev_tree);
1811185029Spjd	mutex_destroy(&zilog->zl_vdev_lock);
1812168404Spjd
1813219089Spjd	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
1814219089Spjd	list_destroy(&zilog->zl_itx_commit_list);
1815219089Spjd
1816219089Spjd	for (int i = 0; i < TXG_SIZE; i++) {
1817219089Spjd		/*
1818219089Spjd		 * It's possible for an itx to be generated that doesn't dirty
1819219089Spjd		 * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
1820219089Spjd		 * callback to remove the entry. We remove those here.
1821219089Spjd		 *
1822219089Spjd		 * Also free up the ziltest itxs.
1823219089Spjd		 */
1824219089Spjd		if (zilog->zl_itxg[i].itxg_itxs)
1825219089Spjd			zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
1826219089Spjd		mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
1827219089Spjd	}
1828219089Spjd
1829168404Spjd	mutex_destroy(&zilog->zl_lock);
1830168404Spjd
1831185029Spjd	cv_destroy(&zilog->zl_cv_writer);
1832185029Spjd	cv_destroy(&zilog->zl_cv_suspend);
1833219089Spjd	cv_destroy(&zilog->zl_cv_batch[0]);
1834219089Spjd	cv_destroy(&zilog->zl_cv_batch[1]);
1835185029Spjd
1836168404Spjd	kmem_free(zilog, sizeof (zilog_t));
1837168404Spjd}
1838168404Spjd
1839168404Spjd/*
1840168404Spjd * Open an intent log.
1841168404Spjd */
1842168404Spjdzilog_t *
1843168404Spjdzil_open(objset_t *os, zil_get_data_t *get_data)
1844168404Spjd{
1845168404Spjd	zilog_t *zilog = dmu_objset_zil(os);
1846168404Spjd
1847224526Smm	ASSERT(zilog->zl_clean_taskq == NULL);
1848224526Smm	ASSERT(zilog->zl_get_data == NULL);
1849224526Smm	ASSERT(list_is_empty(&zilog->zl_lwb_list));
1850224526Smm
1851168404Spjd	zilog->zl_get_data = get_data;
1852168404Spjd	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
1853168404Spjd	    2, 2, TASKQ_PREPOPULATE);
1854168404Spjd
1855168404Spjd	return (zilog);
1856168404Spjd}
1857168404Spjd
1858168404Spjd/*
1859168404Spjd * Close an intent log.
1860168404Spjd */
1861168404Spjdvoid
1862168404Spjdzil_close(zilog_t *zilog)
1863168404Spjd{
1864224526Smm	lwb_t *lwb;
1865219089Spjd	uint64_t txg = 0;
1866219089Spjd
1867219089Spjd	zil_commit(zilog, 0); /* commit all itx */
1868219089Spjd
1869168404Spjd	/*
1870219089Spjd	 * The lwb_max_txg for the stubby lwb will reflect the last activity
1871219089Spjd	 * for the zil.  After a txg_wait_synced() on that txg we know that
1872219089Spjd	 * all the callbacks that may clean the zil have occurred.  Only then
1873219089Spjd	 * can we destroy the zl_clean_taskq.
1874168404Spjd	 */
1875219089Spjd	mutex_enter(&zilog->zl_lock);
1876224526Smm	lwb = list_tail(&zilog->zl_lwb_list);
1877224526Smm	if (lwb != NULL)
1878224526Smm		txg = lwb->lwb_max_txg;
1879219089Spjd	mutex_exit(&zilog->zl_lock);
1880219089Spjd	if (txg)
1881168404Spjd		txg_wait_synced(zilog->zl_dmu_pool, txg);
1882168404Spjd
1883310516Savg	if (zilog_is_dirty(zilog))
1884310516Savg		zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
1885310516Savg	VERIFY(!zilog_is_dirty(zilog));
1886310516Savg
1887168404Spjd	taskq_destroy(zilog->zl_clean_taskq);
1888168404Spjd	zilog->zl_clean_taskq = NULL;
1889168404Spjd	zilog->zl_get_data = NULL;
1890224526Smm
1891224526Smm	/*
1892224526Smm	 * We should have only one LWB left on the list; remove it now.
1893224526Smm	 */
1894224526Smm	mutex_enter(&zilog->zl_lock);
1895224526Smm	lwb = list_head(&zilog->zl_lwb_list);
1896224526Smm	if (lwb != NULL) {
1897224526Smm		ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
1898224526Smm		list_remove(&zilog->zl_lwb_list, lwb);
1899224526Smm		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
1900224526Smm		kmem_cache_free(zil_lwb_cache, lwb);
1901224526Smm	}
1902224526Smm	mutex_exit(&zilog->zl_lock);
1903168404Spjd}
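
/*
 * Lifecycle sketch (illustrative; "my_get_data" is a hypothetical
 * zil_get_data_t callback): a dataset owner pairs these calls as
 *
 *	zilog = zil_open(os, my_get_data);	(e.g. at mount)
 *	...	the zfs_log_*() routines queue itxs as changes are made
 *	zil_commit(zilog, foid);		(fsync/O_DSYNC pushes them out)
 *	...
 *	zil_close(zilog);			(at unmount)
 *
 * zil_open() itself never touches the on-disk log; replaying an existing
 * log, if needed, is the separate zil_replay() step below.
 */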
1904168404Spjd
1905248571Smmstatic char *suspend_tag = "zil suspending";
1906248571Smm
1907168404Spjd/*
1908168404Spjd * Suspend an intent log.  While in suspended mode, we still honor
1909168404Spjd * synchronous semantics, but we rely on txg_wait_synced() to do it.
1910248571Smm * On old version pools, we suspend the log briefly when taking a
1911248571Smm * snapshot so that it will have an empty intent log.
1912248571Smm *
1913248571Smm * Long holds are not really intended to be used the way we do here --
1914248571Smm * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
1915248571Smm * could fail.  Therefore we take pains to only put a long hold if it is
1916248571Smm * actually necessary.  Fortunately, it will only be necessary if the
1917248571Smm * objset is currently mounted (or the ZVOL equivalent).  In that case it
1918248571Smm * will already have a long hold, so we are not really making things any worse.
1919248571Smm *
1920248571Smm * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
1921248571Smm * zvol_state_t), and use their mechanism to prevent their hold from being
1922248571Smm * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
1923248571Smm * very little gain.
1924248571Smm *
1925248571Smm * If cookiep == NULL, this does both the suspend and the resume.
1926248571Smm * Otherwise, it returns with the dataset "long held", and the cookie
1927248571Smm * should be passed into zil_resume().
1928168404Spjd */
1929168404Spjdint
1930248571Smmzil_suspend(const char *osname, void **cookiep)
1931168404Spjd{
1932248571Smm	objset_t *os;
1933248571Smm	zilog_t *zilog;
1934248571Smm	const zil_header_t *zh;
1935248571Smm	int error;
1936168404Spjd
1937248571Smm	error = dmu_objset_hold(osname, suspend_tag, &os);
1938248571Smm	if (error != 0)
1939248571Smm		return (error);
1940248571Smm	zilog = dmu_objset_zil(os);
1941248571Smm
1942168404Spjd	mutex_enter(&zilog->zl_lock);
1943248571Smm	zh = zilog->zl_header;
1944248571Smm
1945200724Sdelphij	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
1946168404Spjd		mutex_exit(&zilog->zl_lock);
1947248571Smm		dmu_objset_rele(os, suspend_tag);
1948249195Smm		return (SET_ERROR(EBUSY));
1949168404Spjd	}
1950248571Smm
1951248571Smm	/*
1952248571Smm	 * Don't put a long hold in the cases where we can avoid it.  This
1953248571Smm	 * is when there is no cookie so we are doing a suspend & resume
1954248571Smm	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
1955248571Smm	 * for the suspend because it's already suspended, or there's no ZIL.
1956248571Smm	 */
1957248571Smm	if (cookiep == NULL && !zilog->zl_suspending &&
1958248571Smm	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
1959248571Smm		mutex_exit(&zilog->zl_lock);
1960248571Smm		dmu_objset_rele(os, suspend_tag);
1961248571Smm		return (0);
1962248571Smm	}
1963248571Smm
1964248571Smm	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
1965248571Smm	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
1966248571Smm
1967248571Smm	zilog->zl_suspend++;
1968248571Smm
1969248571Smm	if (zilog->zl_suspend > 1) {
1970168404Spjd		/*
1971248571Smm		 * Someone else is already suspending it.
1972168404Spjd		 * Just wait for them to finish.
1973168404Spjd		 */
1974248571Smm
1975168404Spjd		while (zilog->zl_suspending)
1976168404Spjd			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
1977168404Spjd		mutex_exit(&zilog->zl_lock);
1978248571Smm
1979248571Smm		if (cookiep == NULL)
1980248571Smm			zil_resume(os);
1981248571Smm		else
1982248571Smm			*cookiep = os;
1983168404Spjd		return (0);
1984168404Spjd	}
1985248571Smm
1986248571Smm	/*
1987248571Smm	 * If there is no pointer to an on-disk block, this ZIL must not
1988248571Smm	 * be active (e.g. filesystem not mounted), so there's nothing
1989248571Smm	 * to clean up.
1990248571Smm	 */
1991248571Smm	if (BP_IS_HOLE(&zh->zh_log)) {
1992248571Smm		ASSERT(cookiep != NULL); /* fast path already handled */
1993248571Smm
1994248571Smm		*cookiep = os;
1995248571Smm		mutex_exit(&zilog->zl_lock);
1996248571Smm		return (0);
1997248571Smm	}
1998248571Smm
1999168404Spjd	zilog->zl_suspending = B_TRUE;
2000168404Spjd	mutex_exit(&zilog->zl_lock);
2001168404Spjd
2002219089Spjd	zil_commit(zilog, 0);
2003168404Spjd
2004168404Spjd	zil_destroy(zilog, B_FALSE);
2005168404Spjd
2006168404Spjd	mutex_enter(&zilog->zl_lock);
2007168404Spjd	zilog->zl_suspending = B_FALSE;
2008168404Spjd	cv_broadcast(&zilog->zl_cv_suspend);
2009168404Spjd	mutex_exit(&zilog->zl_lock);
2010168404Spjd
2011248571Smm	if (cookiep == NULL)
2012248571Smm		zil_resume(os);
2013248571Smm	else
2014248571Smm		*cookiep = os;
2015168404Spjd	return (0);
2016168404Spjd}
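
/*
 * Usage sketch for the cookie form described above (error handling
 * omitted; "cookie" is just a local variable):
 *
 *	void *cookie;
 *
 *	error = zil_suspend(osname, &cookie);
 *	if (error == 0) {
 *		...	the dataset now has an empty, suspended intent log
 *		zil_resume(cookie);
 *	}
 *
 * Passing cookiep == NULL instead, as zil_vdev_offline() does, performs
 * the suspend and the matching resume in one call.
 */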
2017168404Spjd
2018168404Spjdvoid
2019248571Smmzil_resume(void *cookie)
2020168404Spjd{
2021248571Smm	objset_t *os = cookie;
2022248571Smm	zilog_t *zilog = dmu_objset_zil(os);
2023248571Smm
2024168404Spjd	mutex_enter(&zilog->zl_lock);
2025168404Spjd	ASSERT(zilog->zl_suspend != 0);
2026168404Spjd	zilog->zl_suspend--;
2027168404Spjd	mutex_exit(&zilog->zl_lock);
2028248571Smm	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
2029248571Smm	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
2030168404Spjd}
2031168404Spjd
2032219089Spjdtypedef struct zil_replay_arg {
2033219089Spjd	zil_replay_func_t **zr_replay;
2034219089Spjd	void		*zr_arg;
2035219089Spjd	boolean_t	zr_byteswap;
2036219089Spjd	char		*zr_lr;
2037219089Spjd} zil_replay_arg_t;
2038219089Spjd
2039219089Spjdstatic int
2040219089Spjdzil_replay_error(zilog_t *zilog, lr_t *lr, int error)
2041209962Smm{
2042307122Smav	char name[ZFS_MAX_DATASET_NAME_LEN];
2043209962Smm
2044219089Spjd	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
2045209962Smm
2046219089Spjd	dmu_objset_name(zilog->zl_os, name);
2047209962Smm
2048219089Spjd	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
2049219089Spjd	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
2050219089Spjd	    (u_longlong_t)lr->lrc_seq,
2051219089Spjd	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
2052219089Spjd	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
2053219089Spjd
2054219089Spjd	return (error);
2055209962Smm}
2056209962Smm
2057219089Spjdstatic int
2058168404Spjdzil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
2059168404Spjd{
2060168404Spjd	zil_replay_arg_t *zr = zra;
2061168404Spjd	const zil_header_t *zh = zilog->zl_header;
2062168404Spjd	uint64_t reclen = lr->lrc_reclen;
2063168404Spjd	uint64_t txtype = lr->lrc_txtype;
2064219089Spjd	int error = 0;
2065168404Spjd
2066219089Spjd	zilog->zl_replaying_seq = lr->lrc_seq;
2067168404Spjd
2068219089Spjd	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
2069219089Spjd		return (0);
2070219089Spjd
2071168404Spjd	if (lr->lrc_txg < claim_txg)		/* already committed */
2072219089Spjd		return (0);
2073168404Spjd
2074185029Spjd	/* Strip case-insensitive bit, still present in log record */
2075185029Spjd	txtype &= ~TX_CI;
2076185029Spjd
2077219089Spjd	if (txtype == 0 || txtype >= TX_MAX_TYPE)
2078219089Spjd		return (zil_replay_error(zilog, lr, EINVAL));
2079219089Spjd
2080219089Spjd	/*
2081219089Spjd	 * If this record type can be logged out of order, the object
2082219089Spjd	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
2083219089Spjd	 */
2084219089Spjd	if (TX_OOO(txtype)) {
2085219089Spjd		error = dmu_object_info(zilog->zl_os,
2086219089Spjd		    ((lr_ooo_t *)lr)->lr_foid, NULL);
2087219089Spjd		if (error == ENOENT || error == EEXIST)
2088219089Spjd			return (0);
2089209962Smm	}
2090209962Smm
2091168404Spjd	/*
2092168404Spjd	 * Make a copy of the data so we can revise and extend it.
2093168404Spjd	 */
2094219089Spjd	bcopy(lr, zr->zr_lr, reclen);
2095168404Spjd
2096168404Spjd	/*
2097219089Spjd	 * If this is a TX_WRITE with a blkptr, suck in the data.
2098219089Spjd	 */
2099219089Spjd	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
2100219089Spjd		error = zil_read_log_data(zilog, (lr_write_t *)lr,
2101219089Spjd		    zr->zr_lr + reclen);
2102248571Smm		if (error != 0)
2103219089Spjd			return (zil_replay_error(zilog, lr, error));
2104219089Spjd	}
2105219089Spjd
2106219089Spjd	/*
2107168404Spjd	 * The log block containing this lr may have been byteswapped
2108168404Spjd	 * so that we can easily examine common fields like lrc_txtype.
2109219089Spjd	 * However, the log is a mix of different record types, and only the
2110168404Spjd	 * replay vectors know how to byteswap their records.  Therefore, if
2111168404Spjd	 * the lr was byteswapped, undo it before invoking the replay vector.
2112168404Spjd	 */
2113168404Spjd	if (zr->zr_byteswap)
2114219089Spjd		byteswap_uint64_array(zr->zr_lr, reclen);
2115168404Spjd
2116168404Spjd	/*
2117168404Spjd	 * We must now do two things atomically: replay this log record,
2118209962Smm	 * and update the log header sequence number to reflect the fact that
2119209962Smm	 * we did so. At the end of each replay function the sequence number
2120209962Smm	 * is updated if we are in replay mode.
2121168404Spjd	 */
2122219089Spjd	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
2123248571Smm	if (error != 0) {
2124168404Spjd		/*
2125168404Spjd		 * The DMU's dnode layer doesn't see removes until the txg
2126168404Spjd		 * commits, so a subsequent claim can spuriously fail with
2127209962Smm		 * EEXIST.  So if we receive any error, we try syncing out
2128219089Spjd		 * any removes and then retry the transaction.  Note that we
2129219089Spjd		 * specify B_FALSE for byteswap now, so we don't do it twice.
2130168404Spjd		 */
2131219089Spjd		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
2132219089Spjd		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
2133248571Smm		if (error != 0)
2134219089Spjd			return (zil_replay_error(zilog, lr, error));
2135168404Spjd	}
2136219089Spjd	return (0);
2137168404Spjd}
2138168404Spjd
2139168404Spjd/* ARGSUSED */
2140219089Spjdstatic int
2141168404Spjdzil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
2142168404Spjd{
2143168404Spjd	zilog->zl_replay_blks++;
2144219089Spjd
2145219089Spjd	return (0);
2146168404Spjd}
2147168404Spjd
2148168404Spjd/*
2149168404Spjd * If this dataset has a non-empty intent log, replay it and destroy it.
2150168404Spjd */
2151168404Spjdvoid
2152209962Smmzil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
2153168404Spjd{
2154168404Spjd	zilog_t *zilog = dmu_objset_zil(os);
2155168404Spjd	const zil_header_t *zh = zilog->zl_header;
2156168404Spjd	zil_replay_arg_t zr;
2157168404Spjd
2158200724Sdelphij	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
2159168404Spjd		zil_destroy(zilog, B_TRUE);
2160168404Spjd		return;
2161168404Spjd	}
2162168404Spjd
2163168404Spjd	zr.zr_replay = replay_func;
2164168404Spjd	zr.zr_arg = arg;
2165168404Spjd	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
2166219089Spjd	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
2167168404Spjd
2168168404Spjd	/*
2169168404Spjd	 * Wait for in-progress removes to sync before starting replay.
2170168404Spjd	 */
2171168404Spjd	txg_wait_synced(zilog->zl_dmu_pool, 0);
2172168404Spjd
2173209962Smm	zilog->zl_replay = B_TRUE;
2174219089Spjd	zilog->zl_replay_time = ddi_get_lbolt();
2175168404Spjd	ASSERT(zilog->zl_replay_blks == 0);
2176168404Spjd	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
2177168404Spjd	    zh->zh_claim_txg);
2178219089Spjd	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
2179168404Spjd
2180168404Spjd	zil_destroy(zilog, B_FALSE);
2181185029Spjd	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
2182209962Smm	zilog->zl_replay = B_FALSE;
2183168404Spjd}
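
/*
 * Caller sketch (hedged): the ZPL supplies a TX_MAX_TYPE-sized table of
 * replay functions, conventionally zfs_replay_vector[] in zfs_replay.c,
 * and replays the log while mounting, roughly:
 *
 *	zil_replay(os, zfsvfs, zfs_replay_vector);
 *
 * Each entry is called as replay(arg, lr, byteswap), exactly as
 * zil_replay_log_record() above invokes zr_replay[txtype].
 */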
2184168404Spjd
2185219089Spjdboolean_t
2186219089Spjdzil_replaying(zilog_t *zilog, dmu_tx_t *tx)
2187168404Spjd{
2188219089Spjd	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
2189219089Spjd		return (B_TRUE);
2190168404Spjd
2191219089Spjd	if (zilog->zl_replay) {
2192219089Spjd		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
2193219089Spjd		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
2194219089Spjd		    zilog->zl_replaying_seq;
2195219089Spjd		return (B_TRUE);
2196168404Spjd	}
2197168404Spjd
2198219089Spjd	return (B_FALSE);
2199168404Spjd}
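
/*
 * Typical use (illustrative): the zfs_log_*() routines bail out before
 * building an itx when the change is itself being generated by replay or
 * when sync is disabled, e.g.
 *
 *	if (zil_replaying(zilog, tx))
 *		return;
 *
 * The dsl_dataset_dirty() above ensures that the updated zl_replayed_seq
 * is written out with the txg carrying the replayed change.
 */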
2200213197Smm
2201213197Smm/* ARGSUSED */
2202213197Smmint
2203219089Spjdzil_vdev_offline(const char *osname, void *arg)
2204213197Smm{
2205213197Smm	int error;
2206213197Smm
2207248571Smm	error = zil_suspend(osname, NULL);
2208248571Smm	if (error != 0)
2209249195Smm		return (SET_ERROR(EEXIST));
2210248571Smm	return (0);
2211213197Smm}
2212