1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
24 */
25
26#include <sys/dmu_objset.h>
27#include <sys/metaslab.h>
28#include <sys/metaslab_impl.h>
29#include <sys/spa.h>
30#include <sys/spa_impl.h>
31#include <sys/spa_log_spacemap.h>
32#include <sys/vdev_impl.h>
33#include <sys/zap.h>
34
35/*
36 * Log Space Maps
37 *
38 * Log space maps are an optimization in ZFS metadata allocations for pools
39 * whose workloads are primarily random-writes. Random-write workloads are also
40 * typically random-free, meaning that they are freeing from locations scattered
41 * throughout the pool. This means that each TXG we will have to append some
42 * FREE records to almost every metaslab. With log space maps, we hold their
43 * changes in memory and log them altogether in one pool-wide space map on-disk
44 * for persistence. As more blocks are accumulated in the log space maps and
45 * more unflushed changes are accounted in memory, we flush a selected group
46 * of metaslabs every TXG to relieve memory pressure and potential overheads
47 * when loading the pool. Flushing a metaslab to disk relieves memory as we
48 * flush any unflushed changes from memory to disk (i.e. the metaslab's space
49 * map) and saves import time by making old log space maps obsolete and
50 * eventually destroying them. [A log space map is said to be obsolete when all
51 * its entries have made it to their corresponding metaslab space maps].
52 *
53 * == On disk data structures used ==
54 *
55 * - The pool has a new feature flag and a new entry in the MOS. The feature
56 *   is activated when we create the first log space map and remains active
57 *   for the lifetime of the pool. The new entry in the MOS Directory [refer
58 *   to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
59 *   pairs are of the form <key: txg, value: log space map object for that txg>.
60 *   This entry is our on-disk reference of the log space maps that exist in
61 *   the pool for each TXG and it is used during import to load all the
62 *   metaslab unflushed changes in memory. To see how this structure is first
63 *   created and later populated refer to spa_generate_syncing_log_sm(). To see
64 *   how it is used during import time refer to spa_ld_log_sm_metadata().
65 *
66 * - Each vdev has a new entry in its vdev_top_zap (see field
67 *   VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
68 *   each metaslab in this vdev. This field is the on-disk counterpart of the
 *   in-memory field ms_unflushed_txg which tells us from which TXG onwards
 *   the metaslab's changes have not been flushed. During import, we use this
71 *   to ignore any entries in the space map log that are for this metaslab but
72 *   from a TXG before msp_unflushed_txg. At that point, we also populate its
73 *   in-memory counterpart and from there both fields are updated every time
74 *   we flush that metaslab.
75 *
76 * - A space map is created every TXG and, during that TXG, it is used to log
77 *   all incoming changes (the log space map). When created, the log space map
78 *   is referenced in memory by spa_syncing_log_sm and its object ID is inserted
 *   into the space map ZAP mentioned above. The log space map is closed at the
80 *   end of the TXG and will be destroyed when it becomes fully obsolete. We
81 *   know when a log space map has become obsolete by looking at the oldest
 *   (and smallest) ms_unflushed_txg in the pool. If that value is bigger
 *   than the log space map's TXG, then no metaslab is missing the changes
 *   from that log and we can therefore destroy it.
85 *   [see spa_cleanup_old_sm_logs()].
86 *
87 * == Important in-memory structures ==
88 *
89 * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
90 *   the pool by their ms_unflushed_txg field. It is primarily used for three
91 *   reasons. First of all, it is used during flushing where we try to flush
92 *   metaslabs in-order from the oldest-flushed to the most recently flushed
93 *   every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
94 *   oldest flushed metaslab to distinguish which log space maps have become
95 *   obsolete and which ones are still relevant. Finally it tells us which
96 *   metaslabs have unflushed changes in a pool where this feature was just
97 *   enabled, as we don't immediately add all of the pool's metaslabs but we
98 *   add them over time as they go through metaslab_sync(). The reason that
99 *   we do that is to ease these pools into the behavior of the flushing
100 *   algorithm (described later on).
101 *
 * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
103 *   counterpart of the space map ZAP mentioned above. It's an AVL tree whose
104 *   nodes represent the log space maps in the pool. This in-memory
105 *   representation of log space maps in the pool sorts the log space maps by
 *   the TXG they were created in (which is also the TXG of their unflushed
107 *   changes). It also contains the following extra information for each
108 *   space map:
109 *   [1] The number of metaslabs that were last flushed on that TXG. This is
110 *       important because if that counter is zero and this is the oldest
111 *       log then it means that it is also obsolete.
112 *   [2] The number of blocks of that space map. This field is used by the
113 *       block heuristic of our flushing algorithm (described later on).
114 *       It represents how many blocks of metadata changes ZFS had to write
115 *       to disk for that TXG.
116 *
117 * - The per-spa field spa_log_summary is a list of entries that summarizes
118 *   the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
119 *   AVL tree mentioned above. The reason this exists is that our flushing
120 *   algorithm (described later) tries to estimate how many metaslabs to flush
121 *   in each TXG by iterating over all the log space maps and looking at their
 *   block counts. Summarizing that information means that we don't have to
123 *   iterate through each space map, minimizing the runtime overhead of the
124 *   flushing algorithm which would be induced in syncing context. In terms of
125 *   implementation the log summary is used as a queue:
126 *   * we modify or pop entries from its head when we flush metaslabs
127 *   * we modify or append entries to its tail when we sync changes.
128 *
129 * - Each metaslab has two new range trees that hold its unflushed changes,
130 *   ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
131 *
132 * == Flushing algorithm ==
133 *
 * The decision of how many metaslabs to flush on a given TXG is guided by
135 * two heuristics:
136 *
137 * [1] The memory heuristic -
138 * We keep track of the memory used by the unflushed trees from all the
139 * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
140 * stays below a certain threshold which is determined by an arbitrary hard
141 * limit and an arbitrary percentage of the system's memory [see
 * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
 * unflushed changes exceeds that threshold, we flush metaslabs, which
 * empties their unflushed range trees, reducing the memory used.
145 *
146 * [2] The block heuristic -
147 * We try to keep the total number of blocks in the log space maps in check
148 * so the log doesn't grow indefinitely and we don't induce a lot of overhead
149 * when loading the pool. At the same time we don't want to flush a lot of
150 * metaslabs too often as this would defeat the purpose of the log space map.
 * As a result we set a limit on the number of blocks that we think is
152 * acceptable for the log space maps to have and try not to cross it.
153 * [see sus_blocklimit from spa_unflushed_stats].
154 *
155 * In order to stay below the block limit every TXG we have to estimate how
156 * many metaslabs we need to flush based on the current rate of incoming blocks
 * and our history of log space map blocks. The main idea here is to answer
 * the question of how many metaslabs we need to flush in order to get rid
 * of at least X log space map blocks. We can answer this question by
 * iterating backwards from the oldest log space map to the newest one and
 * looking at their metaslab and block counts. At this point the log summary
 * mentioned above comes in handy as it reduces the amount of things that we
 * have to iterate over (even though it may reduce the precision of our
 * estimates due to its aggregation of data). So with that in mind, we
 * project the incoming rate of the current TXG into the future and attempt
 * to approximate how many metaslabs we would need to flush from now on in
 * order to avoid exceeding our block limit at different points in the
 * future (assuming that we would keep flushing the same number of metaslabs
 * every TXG). Then we take the maximum of all these estimates to be on the
 * safe side. For the exact implementation details of the algorithm refer to
 * spa_estimate_metaslabs_to_flush(). A simplified sketch of how the two
 * heuristics drive the flushing code follows this comment.
172 */
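
/*
 * The sketch below is illustrative only (it is not the actual code); it
 * shows how the two heuristics described above drive spa_flush_metaslabs()
 * further down in this file:
 *
 *	want = spa_estimate_metaslabs_to_flush(spa);	(block heuristic)
 *	for (ms = oldest-flushed metaslab; ms != NULL; ms = next oldest) {
 *		if (want == 0 && !spa_log_exceeds_memlimit(spa))
 *			break;				(memory heuristic)
 *		if (metaslab_flush(ms, tx))
 *			want = (want > 0 ? want - 1 : 0);
 *		else
 *			want = 0;	(skipped a loading metaslab)
 *	}
 */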
173
174/*
175 * This is used as the block size for the space maps used for the
176 * log space map feature. These space maps benefit from a bigger
177 * block size as we expect to be writing a lot of data to them at
178 * once.
179 */
180unsigned long zfs_log_sm_blksz = 1ULL << 17;
181
182/*
183 * Percentage of the overall system's memory that ZFS allows to be
 * used for unflushed changes (i.e. the sum of the sizes of all the nodes
185 * in the unflushed trees).
186 *
187 * Note that this value is calculated over 1000000 for finer granularity
188 * (thus the _ppm suffix; reads as "parts per million"). As an example,
189 * the default of 1000 allows 0.1% of memory to be used.
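 *
 * For instance (with illustrative numbers), on a system with 16GiB of
 * physical memory the default of 1000 ppm allows roughly 16GiB / 1000,
 * i.e. ~17MB of unflushed changes, which in that case is a tighter bound
 * than the 1GB hard limit of zfs_unflushed_max_mem_amt below.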
190 */
191unsigned long zfs_unflushed_max_mem_ppm = 1000;
192
193/*
194 * Specific hard-limit in memory that ZFS allows to be used for
195 * unflushed changes.
196 */
197unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
198
199/*
200 * The following tunable determines the number of blocks that can be used for
201 * the log space maps. It is expressed as a percentage of the total number of
202 * metaslabs in the pool (i.e. the default of 400 means that the number of log
203 * blocks is capped at 4 times the number of metaslabs).
204 *
205 * This value exists to tune our flushing algorithm, with higher values
 * flushing metaslabs less often (fewer I/Os) per TXG versus lower values
207 * flushing metaslabs more aggressively with the upside of saving overheads
208 * when loading the pool. Another factor in this tradeoff is that flushing
209 * less often can potentially lead to better utilization of the metaslab space
210 * map's block size as we accumulate more changes per flush.
211 *
 * This tunable indirectly controls the flush rate (metaslabs flushed per
 * txg), which is why expressing it as a percentage of the number of
 * metaslabs in the pool makes sense here.
215 *
216 * As a rule of thumb we default this tunable to 400% based on the following:
217 *
218 * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
219 *    it is reasonable to expect that the amount of obsolete entries changes
220 *    linearly from txg to txg (e.g. the oldest log should have the most
221 *    obsolete entries, and the most recent one the least). With this we could
222 *    say that, at any given time, about half of the entries in the whole space
223 *    map log are obsolete. Thus for every two entries for a metaslab in the
224 *    log space map, only one of them is valid and actually makes it to the
225 *    metaslab's space map.
226 *    [factor of 2]
227 * 2] Each entry in the log space map is guaranteed to be two words while
228 *    entries in metaslab space maps are generally single-word.
229 *    [an extra factor of 2 - 400% overall]
230 * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
231 *    account any consolidation of segments from the log space map to the
232 *    unflushed range trees nor their history (e.g. a segment being allocated,
233 *    then freed, then allocated again means 3 log space map entries but 0
234 *    metaslab space map entries). Depending on the workload, we've seen ~1.8
235 *    non-obsolete log space map entries per metaslab entry, for a total of
236 *    ~600%. Since most of these estimates though are workload dependent, we
237 *    default on 400% to be conservative.
 *
 * Thus we could say that even in the worst case of [1] and [2], the factor
 * should end up being 4.
 *
242 * That said, regardless of the number of metaslabs in the pool we need to
243 * provide upper and lower bounds for the log block limit.
244 * [see zfs_unflushed_log_block_{min,max}]
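 *
 * As a purely hypothetical example, a pool with 1000 metaslabs would get a
 * limit of (1000 * 400) / 100 = 4000 log blocks, which falls within those
 * bounds; at the default 128K log block size [see zfs_log_sm_blksz] that is
 * roughly 500MB worth of log space maps on disk.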
245 */
246unsigned long zfs_unflushed_log_block_pct = 400;
247
248/*
249 * If the number of metaslabs is small and our incoming rate is high, we could
 * get into a situation where we are flushing all our metaslabs every TXG. Thus
251 * we always allow at least this many log blocks.
252 */
253unsigned long zfs_unflushed_log_block_min = 1000;
254
255/*
256 * If the log becomes too big, the import time of the pool can take a hit in
257 * terms of performance. Thus we have a hard limit in the size of the log in
258 * terms of blocks.
259 */
260unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
261
262/*
 * Max # of rows allowed for the log_summary. The tradeoff here is between
 * the accuracy and stability of the flushing algorithm (longer summary) and
 * its runtime overhead (a smaller summary is faster to traverse).
266 */
267unsigned long zfs_max_logsm_summary_length = 10;
268
269/*
270 * Tunable that sets the lower bound on the metaslabs to flush every TXG.
271 *
272 * Setting this to 0 has no effect since if the pool is idle we won't even be
273 * creating log space maps and therefore we won't be flushing. On the other
274 * hand if the pool has any incoming workload our block heuristic will start
275 * flushing metaslabs anyway.
276 *
277 * The point of this tunable is to be used in extreme cases where we really
 * want to flush more metaslabs than our adaptive heuristic plans to flush.
279 */
280unsigned long zfs_min_metaslabs_to_flush = 1;
281
282/*
 * Tunable that specifies how far in the past we want to look when trying to
284 * estimate the incoming log blocks for the current TXG.
285 *
286 * Setting this too high may not only increase runtime but also minimize the
287 * effect of the incoming rates from the most recent TXGs as we take the
288 * average over all the blocks that we walk
289 * [see spa_estimate_incoming_log_blocks].
290 */
291unsigned long zfs_max_log_walking = 5;
292
293/*
294 * This tunable exists solely for testing purposes. It ensures that the log
295 * spacemaps are not flushed and destroyed during export in order for the
296 * relevant log spacemap import code paths to be tested (effectively simulating
297 * a crash).
298 */
299int zfs_keep_log_spacemaps_at_export = 0;
300
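/*
 * Estimate the number of log blocks that the current TXG will generate by
 * averaging the block counts of the last zfs_max_log_walking log space maps
 * (skipping the log of the syncing TXG itself as it would skew the estimate).
 */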
301static uint64_t
302spa_estimate_incoming_log_blocks(spa_t *spa)
303{
304	ASSERT3U(spa_sync_pass(spa), ==, 1);
305	uint64_t steps = 0, sum = 0;
306	for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
307	    sls != NULL && steps < zfs_max_log_walking;
308	    sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
309		if (sls->sls_txg == spa_syncing_txg(spa)) {
310			/*
311			 * skip the log created in this TXG as this would
312			 * make our estimations inaccurate.
313			 */
314			continue;
315		}
316		sum += sls->sls_nblocks;
317		steps++;
318	}
319	return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
320}
321
322uint64_t
323spa_log_sm_blocklimit(spa_t *spa)
324{
325	return (spa->spa_unflushed_stats.sus_blocklimit);
326}
327
328void
329spa_log_sm_set_blocklimit(spa_t *spa)
330{
331	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
332		ASSERT0(spa_log_sm_blocklimit(spa));
333		return;
334	}
335
336	uint64_t calculated_limit =
337	    (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
338	spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
339	    zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
340}
341
342uint64_t
343spa_log_sm_nblocks(spa_t *spa)
344{
345	return (spa->spa_unflushed_stats.sus_nblocks);
346}
347
348/*
349 * Ensure that the in-memory log space map structures and the summary
350 * have the same block and metaslab counts.
351 */
352static void
353spa_log_summary_verify_counts(spa_t *spa)
354{
355	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
356
357	if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
358		return;
359
360	uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
361
362	uint64_t ms_in_summary = 0, blk_in_summary = 0;
363	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
364	    e; e = list_next(&spa->spa_log_summary, e)) {
365		ms_in_summary += e->lse_mscount;
366		blk_in_summary += e->lse_blkcount;
367	}
368
369	uint64_t ms_in_logs = 0, blk_in_logs = 0;
370	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
371	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
372		ms_in_logs += sls->sls_mscount;
373		blk_in_logs += sls->sls_nblocks;
374	}
375
376	VERIFY3U(ms_in_logs, ==, ms_in_summary);
377	VERIFY3U(ms_in_logs, ==, ms_in_avl);
378	VERIFY3U(blk_in_logs, ==, blk_in_summary);
379	VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
380}
381
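/*
 * A summary entry is considered full once it accounts for at least
 * 1/zfs_max_logsm_summary_length of the block limit; at that point
 * summary_add_data() starts a new entry.
 */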
382static boolean_t
383summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
384{
385	uint64_t blocks_per_row = MAX(1,
386	    DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
387	    zfs_max_logsm_summary_length));
388	return (blocks_per_row <= e->lse_blkcount);
389}
390
391/*
392 * Update the log summary information to reflect the fact that a metaslab
 * was flushed or destroyed (e.g. due to device removal or pool export/destroy).
394 *
395 * We typically flush the oldest flushed metaslab so the first (and oldest)
396 * entry of the summary is updated. However if that metaslab is getting loaded
397 * we may flush the second oldest one which may be part of an entry later in
398 * the summary. Moreover, if we call into this function from metaslab_fini()
399 * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
400 * for a txg as an argument so we can locate the appropriate summary entry for
401 * the metaslab.
402 */
403void
404spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
405{
406	/*
407	 * We don't track summary data for read-only pools and this function
408	 * can be called from metaslab_fini(). In that case return immediately.
409	 */
410	if (!spa_writeable(spa))
411		return;
412
413	log_summary_entry_t *target = NULL;
414	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
415	    e != NULL; e = list_next(&spa->spa_log_summary, e)) {
416		if (e->lse_start > txg)
417			break;
418		target = e;
419	}
420
421	if (target == NULL || target->lse_mscount == 0) {
422		/*
423		 * We didn't find a summary entry for this metaslab. We must be
424		 * at the teardown of a spa_load() attempt that got an error
425		 * while reading the log space maps.
426		 */
427		VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
428		return;
429	}
430
431	target->lse_mscount--;
432}
433
434/*
435 * Update the log summary information to reflect the fact that we destroyed
436 * old log space maps. Since we can only destroy the oldest log space maps,
437 * we decrement the block count of the oldest summary entry and potentially
438 * destroy it when that count hits 0.
439 *
440 * This function is called after a metaslab is flushed and typically that
441 * metaslab is the oldest flushed, which means that this function will
442 * typically decrement the block count of the first entry of the summary and
443 * potentially free it if the block count gets to zero (its metaslab count
444 * should be zero too at that point).
445 *
446 * There are certain scenarios though that don't work exactly like that so we
447 * need to account for them:
448 *
449 * Scenario [1]: It is possible that after we flushed the oldest flushed
450 * metaslab and we destroyed the oldest log space map, more recent logs had 0
451 * metaslabs pointing to them so we got rid of them too. This can happen due
452 * to metaslabs being destroyed through device removal, or because the oldest
453 * flushed metaslab was loading but we kept flushing more recently flushed
454 * metaslabs due to the memory pressure of unflushed changes. Because of that,
455 * we always iterate from the beginning of the summary and if blocks_gone is
456 * bigger than the block_count of the current entry we free that entry (we
 * expect its metaslab count to be zero), decrement blocks_gone, and move on
 * to the next entry, repeating this procedure until blocks_gone gets
 * decremented to 0. Doing this also works for the typical case mentioned
 * above.
460 *
461 * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
462 * the first (and oldest) entry in the summary. If the first few entries of
463 * the summary were only accounting metaslabs from a device that was just
464 * removed, then the current oldest flushed metaslab could be accounted by an
465 * entry somewhere in the middle of the summary. Moreover flushing that
466 * metaslab will destroy all the log space maps older than its ms_unflushed_txg
467 * because they became obsolete after the removal. Thus, iterating as we did
468 * for scenario [1] works out for this case too.
469 *
470 * Scenario [3]: At times we decide to flush all the metaslabs in the pool
471 * in one TXG (either because we are exporting the pool or because our flushing
472 * heuristics decided to do so). When that happens all the log space maps get
473 * destroyed except the one created for the current TXG which doesn't have
474 * any log blocks yet. As log space maps get destroyed with every metaslab that
475 * we flush, entries in the summary are also destroyed. This brings a weird
476 * corner-case when we flush the last metaslab and the log space map of the
477 * current TXG is in the same summary entry with other log space maps that
478 * are older. When that happens we are eventually left with this one last
479 * summary entry whose blocks are gone (blocks_gone equals the entry's block
480 * count) but its metaslab count is non-zero (because it accounts all the
481 * metaslabs in the pool as they all got flushed). Under this scenario we can't
482 * free this last summary entry as it's referencing all the metaslabs in the
483 * pool and its block count will get incremented at the end of this sync (when
484 * we close the syncing log space map). Thus we just decrement its current
485 * block count and leave it alone. In the case that the pool gets exported,
486 * its metaslab count will be decremented over time as we call metaslab_fini()
487 * for all the metaslabs in the pool and the entry will be freed at
488 * spa_unload_log_sm_metadata().
489 */
490void
491spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
492{
493	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
494	    e != NULL; e = list_head(&spa->spa_log_summary)) {
495		if (e->lse_blkcount > blocks_gone) {
496			/*
497			 * Assert that we stopped at an entry that is not
498			 * obsolete.
499			 */
500			ASSERT(e->lse_mscount != 0);
501
502			e->lse_blkcount -= blocks_gone;
503			blocks_gone = 0;
504			break;
505		} else if (e->lse_mscount == 0) {
506			/* remove obsolete entry */
507			blocks_gone -= e->lse_blkcount;
508			list_remove(&spa->spa_log_summary, e);
509			kmem_free(e, sizeof (log_summary_entry_t));
510		} else {
511			/* Verify that this is scenario [3] mentioned above. */
512			VERIFY3U(blocks_gone, ==, e->lse_blkcount);
513
514			/*
515			 * Assert that this is scenario [3] further by ensuring
516			 * that this is the only entry in the summary.
517			 */
518			VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
519			ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
520
521			blocks_gone = e->lse_blkcount = 0;
522			break;
523		}
524	}
525
526	/*
527	 * Ensure that there is no way we are trying to remove more blocks
528	 * than the # of blocks in the summary.
529	 */
530	ASSERT0(blocks_gone);
531}
532
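/*
 * Decrement the metaslab count of the log space map that holds the unflushed
 * changes of the given TXG. Called when a metaslab is flushed or destroyed so
 * that the log can eventually be detected as obsolete.
 */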
533void
534spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
535{
536	spa_log_sm_t target = { .sls_txg = txg };
537	spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
538	    &target, NULL);
539
540	if (sls == NULL) {
541		/*
542		 * We must be at the teardown of a spa_load() attempt that
543		 * got an error while reading the log space maps.
544		 */
545		VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
546		return;
547	}
548
549	ASSERT(sls->sls_mscount > 0);
550	sls->sls_mscount--;
551}
552
553void
554spa_log_sm_increment_current_mscount(spa_t *spa)
555{
556	spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
557	ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
558	last_sls->sls_mscount++;
559}
560
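/*
 * Account metaslab flushes and/or incoming log blocks in the summary,
 * appending a new entry at the tail once the current one is full
 * [see summary_entry_is_full()].
 */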
561static void
562summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
563    uint64_t nblocks)
564{
565	log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
566
567	if (e == NULL || summary_entry_is_full(spa, e)) {
568		e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
569		e->lse_start = txg;
570		list_insert_tail(&spa->spa_log_summary, e);
571	}
572
573	ASSERT3U(e->lse_start, <=, txg);
574	e->lse_mscount += metaslabs_flushed;
575	e->lse_blkcount += nblocks;
576}
577
578static void
579spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
580{
581	summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
582}
583
584void
585spa_log_summary_add_flushed_metaslab(spa_t *spa)
586{
587	summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
588}
589
590/*
 * This function attempts to estimate how many metaslabs we
 * should flush to satisfy our block heuristic for the log
 * spacemap for the upcoming TXGs.
594 *
595 * Specifically, it first tries to estimate the number of incoming
596 * blocks in this TXG. Then by projecting that incoming rate to
597 * future TXGs and using the log summary, it figures out how many
598 * flushes we would need to do for future TXGs individually to
599 * stay below our block limit and returns the maximum number of
600 * flushes from those estimates.
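 *
 * As an illustrative example (with made-up numbers), assume a block limit
 * of 4000, 3800 blocks currently in the log, an estimated incoming rate of
 * 100 blocks per TXG, and a summary of [10 ms / 1000 blocks] followed by
 * [20 ms / 3000 blocks]. The limit is first exceeded 3 TXGs from now, by
 * which point the first entry's blocks must be gone, requiring
 * ceil(10 / 3) = 4 flushes per TXG; the second entry's blocks must be gone
 * 13 TXGs from now, requiring only ceil(30 / 13) = 3. We return the
 * maximum, 4.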
601 */
602static uint64_t
603spa_estimate_metaslabs_to_flush(spa_t *spa)
604{
605	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
606	ASSERT3U(spa_sync_pass(spa), ==, 1);
607	ASSERT(spa_log_sm_blocklimit(spa) != 0);
608
609	/*
610	 * This variable contains the incoming rate that will be projected
611	 * and used for our flushing estimates in the future.
612	 */
613	uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
614
615	/*
616	 * At any point in time this variable tells us how many
617	 * TXGs in the future we are so we can make our estimations.
618	 */
619	uint64_t txgs_in_future = 1;
620
621	/*
	 * This variable tells us how much room we have until we hit
623	 * our limit. When it goes negative, it means that we've exceeded
624	 * our limit and we need to flush.
625	 *
626	 * Note that since we start at the first TXG in the future (i.e.
627	 * txgs_in_future starts from 1) we already decrement this
628	 * variable by the incoming rate.
629	 */
630	int64_t available_blocks =
631	    spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
632
633	/*
634	 * This variable tells us the total number of flushes needed to
635	 * keep the log size within the limit when we reach txgs_in_future.
636	 */
637	uint64_t total_flushes = 0;
638
639	/* Holds the current maximum of our estimates so far. */
640	uint64_t max_flushes_pertxg =
641	    MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
642	    zfs_min_metaslabs_to_flush);
643
644	/*
645	 * For our estimations we only look as far in the future
646	 * as the summary allows us.
647	 */
648	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
649	    e; e = list_next(&spa->spa_log_summary, e)) {
650
651		/*
652		 * If there is still room before we exceed our limit
653		 * then keep skipping TXGs accumulating more blocks
654		 * based on the incoming rate until we exceed it.
655		 */
656		if (available_blocks >= 0) {
657			uint64_t skip_txgs = (available_blocks / incoming) + 1;
658			available_blocks -= (skip_txgs * incoming);
659			txgs_in_future += skip_txgs;
660			ASSERT3S(available_blocks, >=, -incoming);
661		}
662
663		/*
664		 * At this point we're far enough into the future where
665		 * the limit was just exceeded and we flush metaslabs
666		 * based on the current entry in the summary, updating
667		 * our available_blocks.
668		 */
669		ASSERT3S(available_blocks, <, 0);
670		available_blocks += e->lse_blkcount;
671		total_flushes += e->lse_mscount;
672
673		/*
674		 * Keep the running maximum of the total_flushes that
675		 * we've done so far over the number of TXGs in the
676		 * future that we are. The idea here is to estimate
677		 * the average number of flushes that we should do
678		 * every TXG so that when we are that many TXGs in the
679		 * future we stay under the limit.
680		 */
681		max_flushes_pertxg = MAX(max_flushes_pertxg,
682		    DIV_ROUND_UP(total_flushes, txgs_in_future));
683		ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
684		    max_flushes_pertxg);
685	}
686	return (max_flushes_pertxg);
687}
688
689uint64_t
690spa_log_sm_memused(spa_t *spa)
691{
692	return (spa->spa_unflushed_stats.sus_memused);
693}
694
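/*
 * Returns B_TRUE if the memory used by the unflushed trees exceeds either the
 * absolute hard limit (zfs_unflushed_max_mem_amt) or the percentage-of-memory
 * limit (zfs_unflushed_max_mem_ppm).
 */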
695static boolean_t
696spa_log_exceeds_memlimit(spa_t *spa)
697{
698	if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
699		return (B_TRUE);
700
701	uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
702	    zfs_unflushed_max_mem_ppm) / 1000000;
703	if (spa_log_sm_memused(spa) > system_mem_allowed)
704		return (B_TRUE);
705
706	return (B_FALSE);
707}
708
709boolean_t
710spa_flush_all_logs_requested(spa_t *spa)
711{
712	return (spa->spa_log_flushall_txg != 0);
713}
714
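/*
 * Flushing logic called once per TXG in syncing context: decide how many
 * metaslabs to flush based on the block heuristic (or all of them if a
 * flush-all was requested), keep flushing while the memory limit is
 * exceeded, and flush metaslabs in order of their ms_unflushed_txg,
 * oldest first.
 */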
715void
716spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
717{
718	uint64_t txg = dmu_tx_get_txg(tx);
719
720	if (spa_sync_pass(spa) != 1)
721		return;
722
723	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
724		return;
725
726	/*
727	 * If we don't have any metaslabs with unflushed changes
728	 * return immediately.
729	 */
730	if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
731		return;
732
733	/*
734	 * During SPA export we leave a few empty TXGs to go by [see
735	 * spa_final_dirty_txg() to understand why]. For this specific
736	 * case, it is important to not flush any metaslabs as that
737	 * would dirty this TXG.
738	 *
	 * That said, during one of these dirty TXGs that is less than or
	 * equal to spa_final_dirty_txg(), spa_unload() will request that
741	 * we try to flush all the metaslabs for that TXG before
742	 * exporting the pool, thus we ensure that we didn't get a
743	 * request of flushing everything before we attempt to return
744	 * immediately.
745	 */
746	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
747	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
748	    !spa_flush_all_logs_requested(spa))
749		return;
750
751	/*
752	 * We need to generate a log space map before flushing because this
753	 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
754	 * for this TXG's flushed metaslab count (aka sls_mscount which is
755	 * manipulated in many ways down the metaslab_flush() codepath).
756	 *
757	 * That is not to say that we may generate a log space map when we
758	 * don't need it. If we are flushing metaslabs, that means that we
759	 * were going to write changes to disk anyway, so even if we were
760	 * not flushing, a log space map would have been created anyway in
761	 * metaslab_sync().
762	 */
763	spa_generate_syncing_log_sm(spa, tx);
764
765	/*
766	 * This variable tells us how many metaslabs we want to flush based
767	 * on the block-heuristic of our flushing algorithm (see block comment
768	 * of log space map feature). We also decrement this as we flush
769	 * metaslabs and attempt to destroy old log space maps.
770	 */
771	uint64_t want_to_flush;
772	if (spa_flush_all_logs_requested(spa)) {
773		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
774		want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
775	} else {
776		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
777	}
778
779	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
780	    want_to_flush);
781
782	/* Used purely for verification purposes */
783	uint64_t visited = 0;
784
785	/*
	 * Ideally we would iterate through spa_metaslabs_by_flushed
787	 * using only one variable (curr). We can't do that because
788	 * metaslab_flush() mutates position of curr in the AVL when
789	 * it flushes that metaslab by moving it to the end of the tree.
790	 * Thus we always keep track of the original next node of the
791	 * current node (curr) in another variable (next).
792	 */
793	metaslab_t *next = NULL;
794	for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
795	    curr != NULL; curr = next) {
796		next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
797
798		/*
799		 * If this metaslab has been flushed this txg then we've done
800		 * a full circle over the metaslabs.
801		 */
802		if (metaslab_unflushed_txg(curr) == txg)
803			break;
804
805		/*
806		 * If we are done flushing for the block heuristic and the
807		 * unflushed changes don't exceed the memory limit just stop.
808		 */
809		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
810			break;
811
812		mutex_enter(&curr->ms_sync_lock);
813		mutex_enter(&curr->ms_lock);
814		boolean_t flushed = metaslab_flush(curr, tx);
815		mutex_exit(&curr->ms_lock);
816		mutex_exit(&curr->ms_sync_lock);
817
818		/*
819		 * If we failed to flush a metaslab (because it was loading),
820		 * then we are done with the block heuristic as it's not
821		 * possible to destroy any log space maps once you've skipped
822		 * a metaslab. In that case we just set our counter to 0 but
823		 * we continue looping in case there is still memory pressure
		 * due to unflushed changes. Note that flushing a metaslab
		 * that is not the oldest flushed in the pool will never
826		 * destroy any log space maps [see spa_cleanup_old_sm_logs()].
827		 */
828		if (!flushed) {
829			want_to_flush = 0;
830		} else if (want_to_flush > 0) {
831			want_to_flush--;
832		}
833
834		visited++;
835	}
836	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
837}
838
839/*
840 * Close the log space map for this TXG and update the block counts
841 * for the log's in-memory structure and the summary.
842 */
843void
844spa_sync_close_syncing_log_sm(spa_t *spa)
845{
846	if (spa_syncing_log_sm(spa) == NULL)
847		return;
848	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
849
850	spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
851	ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
852
853	sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
854	spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
855
856	/*
857	 * Note that we can't assert that sls_mscount is not 0,
858	 * because there is the case where the first metaslab
859	 * in spa_metaslabs_by_flushed is loading and we were
	 * not able to flush any metaslabs in the current TXG.
861	 */
862	ASSERT(sls->sls_nblocks != 0);
863
864	spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
865	spa_log_summary_verify_counts(spa);
866
867	space_map_close(spa->spa_syncing_log_sm);
868	spa->spa_syncing_log_sm = NULL;
869
870	/*
871	 * At this point we tried to flush as many metaslabs as we
872	 * can as the pool is getting exported. Reset the "flush all"
873	 * so the last few TXGs before closing the pool can be empty
874	 * (e.g. not dirty).
875	 */
876	if (spa_flush_all_logs_requested(spa)) {
877		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
878		spa->spa_log_flushall_txg = 0;
879	}
880}
881
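/*
 * Destroy every log space map that is older than the oldest ms_unflushed_txg
 * in the pool. Such logs are obsolete since all of their entries have already
 * made it to the corresponding metaslab space maps.
 */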
882void
883spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
884{
885	objset_t *mos = spa_meta_objset(spa);
886
887	uint64_t spacemap_zap;
888	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
889	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
890	if (error == ENOENT) {
891		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
892		return;
893	}
894	VERIFY0(error);
895
896	metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
897	uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
898
899	/* Free all log space maps older than the oldest_flushed_txg. */
900	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
901	    sls && sls->sls_txg < oldest_flushed_txg;
902	    sls = avl_first(&spa->spa_sm_logs_by_txg)) {
903		ASSERT0(sls->sls_mscount);
904		avl_remove(&spa->spa_sm_logs_by_txg, sls);
905		space_map_free_obj(mos, sls->sls_sm_obj, tx);
906		VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
907		spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
908		kmem_free(sls, sizeof (spa_log_sm_t));
909	}
910}
911
912static spa_log_sm_t *
913spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
914{
915	spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
916	sls->sls_sm_obj = sm_obj;
917	sls->sls_txg = txg;
918	return (sls);
919}
920
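/*
 * Create (if needed) the log space map of the syncing TXG: allocate the space
 * map object, register it in the spacemap ZAP and in spa_sm_logs_by_txg, and
 * open it as spa_syncing_log_sm. The first time this runs it also creates the
 * ZAP itself and activates the feature.
 */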
921void
922spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
923{
924	uint64_t txg = dmu_tx_get_txg(tx);
925	objset_t *mos = spa_meta_objset(spa);
926
927	if (spa_syncing_log_sm(spa) != NULL)
928		return;
929
930	if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
931		return;
932
933	uint64_t spacemap_zap;
934	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
935	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
936	if (error == ENOENT) {
937		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
938
939		error = 0;
940		spacemap_zap = zap_create(mos,
941		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
942		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
943		    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
944		    &spacemap_zap, tx));
945		spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
946	}
947	VERIFY0(error);
948
949	uint64_t sm_obj;
950	ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
951	    ==, ENOENT);
952	sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
953	VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
954	avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
955
956	/*
957	 * We pass UINT64_MAX as the space map's representation size
958	 * and SPA_MINBLOCKSHIFT as the shift, to make the space map
959	 * accept any sorts of segments since there's no real advantage
960	 * to being more restrictive (given that we're already going
961	 * to be using 2-word entries).
962	 */
963	VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
964	    0, UINT64_MAX, SPA_MINBLOCKSHIFT));
965
966	/*
967	 * If the log space map feature was just enabled, the blocklimit
968	 * has not yet been set.
969	 */
970	if (spa_log_sm_blocklimit(spa) == 0)
971		spa_log_sm_set_blocklimit(spa);
972}
973
974/*
975 * Find all the log space maps stored in the space map ZAP and sort
976 * them by their TXG in spa_sm_logs_by_txg.
977 */
978static int
979spa_ld_log_sm_metadata(spa_t *spa)
980{
981	int error;
982	uint64_t spacemap_zap;
983
984	ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
985
986	error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
987	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
988	if (error == ENOENT) {
989		/* the space map ZAP doesn't exist yet */
990		return (0);
991	} else if (error != 0) {
992		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
993		    "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
994		    error);
995		return (error);
996	}
997
998	zap_cursor_t zc;
999	zap_attribute_t za;
1000	for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
1001	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
1002	    zap_cursor_advance(&zc)) {
1003		uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
1004		spa_log_sm_t *sls =
1005		    spa_log_sm_alloc(za.za_first_integer, log_txg);
1006		avl_add(&spa->spa_sm_logs_by_txg, sls);
1007	}
1008	zap_cursor_fini(&zc);
1009	if (error != ENOENT) {
1010		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1011		    "zap_cursor_retrieve(spacemap_zap) [error %d]",
1012		    error);
1013		return (error);
1014	}
1015
1016	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1017	    m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1018		spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
1019		spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
1020		    &target, NULL);
1021
1022		/*
		 * At this point if sls is NULL it means that a bug occurred
1024		 * in ZFS the last time the pool was open or earlier in the
1025		 * import code path. In general, we would have placed a
1026		 * VERIFY() here or in this case just let the kernel panic
1027		 * with NULL pointer dereference when incrementing sls_mscount,
1028		 * but since this is the import code path we can be a bit more
1029		 * lenient. Thus, for DEBUG bits we always cause a panic, while
1030		 * in production we log the error and just fail the import.
1031		 */
1032		ASSERT(sls != NULL);
1033		if (sls == NULL) {
1034			spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
1035			    "encountered: could not find log spacemap for "
			    "TXG %llu [error %d]",
			    (u_longlong_t)metaslab_unflushed_txg(m), ENOENT);
1038			return (ENOENT);
1039		}
1040		sls->sls_mscount++;
1041	}
1042
1043	return (0);
1044}
1045
1046typedef struct spa_ld_log_sm_arg {
1047	spa_t *slls_spa;
1048	uint64_t slls_txg;
1049} spa_ld_log_sm_arg_t;
1050
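/*
 * Callback invoked for every entry of a log space map at import time; it
 * replays the entry into the unflushed range trees of the corresponding
 * metaslab, unless that metaslab has already flushed this TXG's changes.
 */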
1051static int
1052spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
1053{
1054	uint64_t offset = sme->sme_offset;
1055	uint64_t size = sme->sme_run;
1056	uint32_t vdev_id = sme->sme_vdev;
1057
1058	spa_ld_log_sm_arg_t *slls = arg;
1059	spa_t *spa = slls->slls_spa;
1060
1061	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
1062
1063	/*
1064	 * If the vdev has been removed (i.e. it is indirect or a hole)
1065	 * skip this entry. The contents of this vdev have already moved
1066	 * elsewhere.
1067	 */
1068	if (!vdev_is_concrete(vd))
1069		return (0);
1070
1071	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1072	ASSERT(!ms->ms_loaded);
1073
1074	/*
1075	 * If we have already flushed entries for this TXG to this
1076	 * metaslab's space map, then ignore it. Note that we flush
1077	 * before processing any allocations/frees for that TXG, so
1078	 * the metaslab's space map only has entries from *before*
1079	 * the unflushed TXG.
1080	 */
1081	if (slls->slls_txg < metaslab_unflushed_txg(ms))
1082		return (0);
1083
1084	switch (sme->sme_type) {
1085	case SM_ALLOC:
1086		range_tree_remove_xor_add_segment(offset, offset + size,
1087		    ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
1088		break;
1089	case SM_FREE:
1090		range_tree_remove_xor_add_segment(offset, offset + size,
1091		    ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
1092		break;
1093	default:
1094		panic("invalid maptype_t");
1095		break;
1096	}
1097	return (0);
1098}
1099
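/*
 * Read every log space map in TXG order and replay its entries into the
 * unflushed range trees of the metaslabs [see spa_ld_log_sm_cb()], then
 * update the allocated space, weight, and unflushed memory accounting of
 * each metaslab.
 */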
1100static int
1101spa_ld_log_sm_data(spa_t *spa)
1102{
1103	int error = 0;
1104
1105	/*
1106	 * If we are not going to do any writes there is no need
1107	 * to read the log space maps.
1108	 */
1109	if (!spa_writeable(spa))
1110		return (0);
1111
1112	ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
1113	ASSERT0(spa->spa_unflushed_stats.sus_memused);
1114
1115	hrtime_t read_logs_starttime = gethrtime();
1116	/* this is a no-op when we don't have space map logs */
1117	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1118	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1119		space_map_t *sm = NULL;
1120		error = space_map_open(&sm, spa_meta_objset(spa),
1121		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
1122		if (error != 0) {
1123			spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
1124			    "space_map_open(obj=%llu) [error %d]",
1125			    (u_longlong_t)sls->sls_sm_obj, error);
1126			goto out;
1127		}
1128
1129		struct spa_ld_log_sm_arg vla = {
1130			.slls_spa = spa,
1131			.slls_txg = sls->sls_txg
1132		};
1133		error = space_map_iterate(sm, space_map_length(sm),
1134		    spa_ld_log_sm_cb, &vla);
1135		if (error != 0) {
1136			space_map_close(sm);
1137			spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
1138			    "at space_map_iterate(obj=%llu) [error %d]",
1139			    (u_longlong_t)sls->sls_sm_obj, error);
1140			goto out;
1141		}
1142
1143		ASSERT0(sls->sls_nblocks);
1144		sls->sls_nblocks = space_map_nblocks(sm);
1145		spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
1146		summary_add_data(spa, sls->sls_txg,
1147		    sls->sls_mscount, sls->sls_nblocks);
1148
1149		space_map_close(sm);
1150	}
1151	hrtime_t read_logs_endtime = gethrtime();
1152	spa_load_note(spa,
1153	    "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
1154	    "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
1155	    (u_longlong_t)spa_log_sm_nblocks(spa),
1156	    (u_longlong_t)zfs_log_sm_blksz,
1157	    (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
1158
1159out:
1160	/*
1161	 * Now that the metaslabs contain their unflushed changes:
1162	 * [1] recalculate their actual allocated space
1163	 * [2] recalculate their weights
1164	 * [3] sum up the memory usage of their unflushed range trees
1165	 * [4] optionally load them, if debug_load is set
1166	 *
1167	 * Note that even in the case where we get here because of an
1168	 * error (e.g. error != 0), we still want to update the fields
1169	 * below in order to have a proper teardown in spa_unload().
1170	 */
1171	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1172	    m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1173		mutex_enter(&m->ms_lock);
1174		m->ms_allocated_space = space_map_allocated(m->ms_sm) +
1175		    range_tree_space(m->ms_unflushed_allocs) -
1176		    range_tree_space(m->ms_unflushed_frees);
1177
1178		vdev_t *vd = m->ms_group->mg_vd;
1179		metaslab_space_update(vd, m->ms_group->mg_class,
1180		    range_tree_space(m->ms_unflushed_allocs), 0, 0);
1181		metaslab_space_update(vd, m->ms_group->mg_class,
1182		    -range_tree_space(m->ms_unflushed_frees), 0, 0);
1183
1184		ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
1185		metaslab_recalculate_weight_and_sort(m);
1186
1187		spa->spa_unflushed_stats.sus_memused +=
1188		    metaslab_unflushed_changes_memused(m);
1189
1190		if (metaslab_debug_load && m->ms_sm != NULL) {
1191			VERIFY0(metaslab_load(m));
1192			metaslab_set_selected_txg(m, 0);
1193		}
1194		mutex_exit(&m->ms_lock);
1195	}
1196
1197	return (error);
1198}
1199
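/*
 * Read the on-disk ms_unflushed_txg of every metaslab in this top-level vdev
 * [see VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS] and insert the metaslabs with
 * unflushed changes into spa_metaslabs_by_flushed.
 */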
1200static int
1201spa_ld_unflushed_txgs(vdev_t *vd)
1202{
1203	spa_t *spa = vd->vdev_spa;
1204	objset_t *mos = spa_meta_objset(spa);
1205
1206	if (vd->vdev_top_zap == 0)
1207		return (0);
1208
1209	uint64_t object = 0;
1210	int error = zap_lookup(mos, vd->vdev_top_zap,
1211	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
1212	    sizeof (uint64_t), 1, &object);
1213	if (error == ENOENT)
1214		return (0);
1215	else if (error != 0) {
1216		spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
1217		    "zap_lookup(vdev_top_zap=%llu) [error %d]",
1218		    (u_longlong_t)vd->vdev_top_zap, error);
1219		return (error);
1220	}
1221
1222	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1223		metaslab_t *ms = vd->vdev_ms[m];
1224		ASSERT(ms != NULL);
1225
1226		metaslab_unflushed_phys_t entry;
1227		uint64_t entry_size = sizeof (entry);
1228		uint64_t entry_offset = ms->ms_id * entry_size;
1229
1230		error = dmu_read(mos, object,
1231		    entry_offset, entry_size, &entry, 0);
1232		if (error != 0) {
1233			spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
1234			    "failed at dmu_read(obj=%llu) [error %d]",
1235			    (u_longlong_t)object, error);
1236			return (error);
1237		}
1238
1239		ms->ms_unflushed_txg = entry.msp_unflushed_txg;
1240		if (ms->ms_unflushed_txg != 0) {
1241			mutex_enter(&spa->spa_flushed_ms_lock);
1242			avl_add(&spa->spa_metaslabs_by_flushed, ms);
1243			mutex_exit(&spa->spa_flushed_ms_lock);
1244		}
1245	}
1246	return (0);
1247}
1248
1249/*
1250 * Read all the log space map entries into their respective
1251 * metaslab unflushed trees and keep them sorted by TXG in the
1252 * SPA's metadata. In addition, setup all the metadata for the
1253 * memory and the block heuristics.
1254 */
1255int
1256spa_ld_log_spacemaps(spa_t *spa)
1257{
1258	int error;
1259
1260	spa_log_sm_set_blocklimit(spa);
1261
1262	for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1263		vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
1264		error = spa_ld_unflushed_txgs(vd);
1265		if (error != 0)
1266			return (error);
1267	}
1268
1269	error = spa_ld_log_sm_metadata(spa);
1270	if (error != 0)
1271		return (error);
1272
1273	/*
1274	 * Note: we don't actually expect anything to change at this point
1275	 * but we grab the config lock so we don't fail any assertions
1276	 * when using vdev_lookup_top().
1277	 */
1278	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1279	error = spa_ld_log_sm_data(spa);
1280	spa_config_exit(spa, SCL_CONFIG, FTAG);
1281
1282	return (error);
1283}
1284
1285/* BEGIN CSTYLED */
1286ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW,
1287    "Specific hard-limit in memory that ZFS allows to be used for "
1288    "unflushed changes");
1289
1290ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW,
1291    "Percentage of the overall system memory that ZFS allows to be "
1292    "used for unflushed changes (value is calculated over 1000000 for "
1293    "finer granularity)");
1294
1295ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW,
1296    "Hard limit (upper-bound) in the size of the space map log "
1297    "in terms of blocks.");
1298
1299ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
1300    "Lower-bound limit for the maximum amount of blocks allowed in "
1301    "log spacemap (see zfs_unflushed_log_block_max)");
1302
1303ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
1304    "Tunable used to determine the number of blocks that can be used for "
1305    "the spacemap log, expressed as a percentage of the total number of "
1306    "metaslabs in the pool (e.g. 400 means the number of log blocks is "
1307    "capped at 4 times the number of metaslabs)");
1308
1309ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW,
1310    "The number of past TXGs that the flushing algorithm of the log "
1311    "spacemap feature uses to estimate incoming log blocks");
1312
1313ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW,
1314    "Maximum number of rows allowed in the summary of the spacemap log");
1315
1316ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW,
1317    "Minimum number of metaslabs to flush per dirty TXG");
1318
1319ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
1320    "Prevent the log spacemaps from being flushed and destroyed "
1321    "during pool export/destroy");
1322/* END CSTYLED */
1323