/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/spa_boot.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
#include <sys/zfeature.h>

/*
 * SPA locking
 *
 * There are three basic locks for managing spa_t structures:
 *
 * spa_namespace_lock (global mutex)
 *
 *	This lock must be acquired to do any of the following:
 *
 *		- Lookup a spa_t by name
 *		- Add or remove a spa_t from the namespace
 *		- Increase spa_refcount from non-zero
 *		- Check if spa_refcount is zero
 *		- Rename a spa_t
 *		- add/remove/attach/detach devices
 *		- Held for the duration of create/destroy/import/export
 *
 *	It does not need to handle recursion.  A create or destroy may
 *	reference objects (files or zvols) in other pools, but by
 *	definition they must have an existing reference, and will never need
 *	to lookup a spa_t by name.
 *
 * spa_refcount (per-spa refcount_t protected by mutex)
 *
 *	This reference count keeps track of any active users of the spa_t.  The
 *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
 *	the refcount is never really 'zero' - opening a pool implicitly keeps
 *	some references in the DMU.  Internally we check against spa_minref, but
 *	present the image of a zero/non-zero value to consumers.
 *
 * spa_config_lock[] (per-spa array of rwlocks)
 *
 *	This protects the spa_t from config changes, and must be held in
 *	the following circumstances:
 *
 *		- RW_READER to perform I/O to the spa
 *		- RW_WRITER to change the vdev config
 *
 * The locking order is fairly straightforward:
 *
 *		spa_namespace_lock	->	spa_refcount
 *
 *	The namespace lock must be acquired to increase the refcount from 0
 *	or to check if it is zero.
 *
 *		spa_refcount		->	spa_config_lock[]
 *
 *	There must be at least one valid reference on the spa_t to acquire
 *	the config lock.
 *
 *		spa_namespace_lock	->	spa_config_lock[]
 *
 *	The namespace lock must always be taken before the config lock.
 *
 *
 * The spa_namespace_lock can be acquired directly and is globally visible.
 *
 * The namespace is manipulated using the following functions, all of which
 * require the spa_namespace_lock to be held.
 *
 *	spa_lookup()		Lookup a spa_t by name.
 *
 *	spa_add()		Create a new spa_t in the namespace.
 *
 *	spa_remove()		Remove a spa_t from the namespace.  This also
 *				frees up any memory associated with the spa_t.
 *
 *	spa_next()		Returns the next spa_t in the system, or the
 *				first if NULL is passed.
 *
 *	spa_evict_all()		Shutdown and remove all spa_t structures in
 *				the system.
 *
 *	spa_guid_exists()	Determine whether a pool/device guid exists.
 *
 * The spa_refcount is manipulated using the following functions:
 *
 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
 *				called with spa_namespace_lock held if the
 *				refcount is currently zero.
 *
 *	spa_close()		Remove a reference from the spa_t.  This will
 *				not free the spa_t or remove it from the
 *				namespace.  No locking is required.
 *
 *	spa_refcount_zero()	Returns true if the refcount is currently
 *				zero.  Must be called with spa_namespace_lock
 *				held.
 *
 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
 *
 * To read the configuration, it suffices to hold one of these locks as reader.
 * To modify the configuration, you must hold all locks as writer.  To modify
 * vdev state without altering the vdev tree's topology (e.g. online/offline),
 * you must hold SCL_STATE and SCL_ZIO as writer.
 *
 * We use these distinct config locks to avoid recursive lock entry.
 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
 * block allocations (SCL_ALLOC), which may require reading space maps
 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
 *
 * The spa config locks cannot be normal rwlocks because we need the
 * ability to hand off ownership.  For example, SCL_ZIO is acquired
 * by the issuing thread and later released by an interrupt thread.
 * They do, however, obey the usual write-wanted semantics to prevent
 * writer (i.e. system administrator) starvation.
 *
 * The lock acquisition rules are as follows:
 *
 * SCL_CONFIG
 *	Protects changes to the vdev tree topology, such as vdev
 *	add/remove/attach/detach.  Protects the dirty config list
 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
 *
 * SCL_STATE
 *	Protects changes to pool state and vdev state, such as vdev
 *	online/offline/fault/degrade/clear.  Protects the dirty state list
 *	(spa_state_dirty_list) and global pool state (spa_state).
 *
 * SCL_ALLOC
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_alloc() and metaslab_claim().
 *
 * SCL_ZIO
 *	Held by bp-level zios (those which have no io_vd upon entry)
 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
 *
 * SCL_FREE
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
 *	blocks in zio_done() while another i/o that holds either
 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
 *
 * SCL_VDEV
 *	Held as reader to prevent changes to the vdev tree during trivial
 *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
 *	other locks, and lower than all of them, to ensure that it's safe
 *	to acquire regardless of caller context.
 *
 * In addition, the following rules apply:
 *
 * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
 *	The lock ordering is SCL_CONFIG > spa_props_lock.
 *
 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 *	or zio_write_phys() -- the caller must ensure that the config cannot
 *	change in the interim, and that the vdev cannot be reopened.
 *	SCL_STATE as reader suffices for both.
 *
 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
 *
 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
 *				for writing.
 *
 *	spa_vdev_exit()		Release the config lock, wait for all I/O
 *				to complete, sync the updated configs to the
 *				cache, and release the namespace lock.
 *
 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
 * locking is, always, based on spa_namespace_lock and spa_config_lock[].
 *
 * spa_rename() is also implemented within this file since it requires
 * manipulation of the namespace.
 */
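
/*
 * Illustrative sketch (hypothetical caller, not taken from this file): code
 * that only reads the vdev tree takes one config lock as reader around the
 * inspection, e.g.
 *
 *	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 *	nchildren = spa->spa_root_vdev->vdev_children;
 *	spa_config_exit(spa, SCL_VDEV, FTAG);
 *
 * whereas a topology change takes all locks as writer, typically through
 * spa_vdev_enter()/spa_vdev_exit(), which also handle the namespace lock
 * and the config cache sync.
 */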

static avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
static int spa_active_count;
int spa_max_replication_override = SPA_DVAS_PER_BP;

static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl;

kmem_cache_t *spa_buffer_pool;
int spa_mode_global;

#ifdef ZFS_DEBUG
/* Everything except dprintf and spa is on by default in debug builds */
int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
#else
int zfs_flags = 0;
#endif
SYSCTL_DECL(_debug);
TUNABLE_INT("debug.zfs_flags", &zfs_flags);
SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0,
    "ZFS debug flags.");

/*
 * zfs_recover can be set to nonzero to attempt to recover from
 * otherwise-fatal errors, typically caused by on-disk corruption.  When
 * set, calls to zfs_panic_recover() will turn into warning messages.
 * This should only be used as a last resort, as it typically results
 * in leaked space, or worse.
 */
boolean_t zfs_recover = B_FALSE;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
    "Try to recover from otherwise-fatal errors.");

/*
 * If destroy encounters an EIO while reading metadata (e.g. indirect
 * blocks), space referenced by the missing metadata can not be freed.
 * Normally this causes the background destroy to become "stalled", as
 * it is unable to make forward progress.  While in this stalled state,
 * all remaining space to free from the error-encountering filesystem is
 * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
 * permanently leak the space from indirect blocks that can not be read,
 * and continue to free everything else that it can.
 *
 * The default, "stalling" behavior is useful if the storage partially
 * fails (i.e. some but not all i/os fail), and then later recovers.  In
 * this case, we will be able to continue pool operations while it is
 * partially failed, and when it recovers, we can continue to free the
 * space, with no leaks.  However, note that this case is actually
 * fairly rare.
 *
 * Typically pools either (a) fail completely (but perhaps temporarily,
 * e.g. a top-level vdev going offline), or (b) have localized,
 * permanent errors (e.g. disk returns the wrong data due to bit flip or
 * firmware bug).  In case (a), this setting does not matter because the
 * pool will be suspended and the sync thread will not be able to make
 * forward progress regardless.  In case (b), because the error is
 * permanent, the best we can do is leak the minimum amount of space,
 * which is what setting this flag will do.  Therefore, it is reasonable
 * for this flag to normally be set, but we chose the more conservative
 * approach of not setting it, so that there is no possibility of
 * leaking space in the "partial temporary" failure case.
 */
boolean_t zfs_free_leak_on_eio = B_FALSE;

/*
 * Expiration time in milliseconds. This value has two meanings. First, it is
 * used to determine when the spa_deadman() logic should fire. By default the
 * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
 * Secondly, the value determines if an I/O is considered "hung". Any I/O that
 * has not completed in zfs_deadman_synctime_ms is considered "hung", resulting
 * in a system panic.
 */
uint64_t zfs_deadman_synctime_ms = 1000000ULL;
TUNABLE_QUAD("vfs.zfs.deadman_synctime_ms", &zfs_deadman_synctime_ms);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
    &zfs_deadman_synctime_ms, 0,
    "Stalled ZFS I/O expiration time in milliseconds");
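
/*
 * Example (illustrative): to lower the hang threshold to 100 seconds, set
 * the loader tunable vfs.zfs.deadman_synctime_ms="100000" in
 * /boot/loader.conf; it is a boot-time tunable and read-only via sysctl
 * afterwards (CTLFLAG_RDTUN).
 */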

/*
 * Check time in milliseconds. This defines the frequency at which we check
 * for hung I/O.
 */
uint64_t zfs_deadman_checktime_ms = 5000ULL;
TUNABLE_QUAD("vfs.zfs.deadman_checktime_ms", &zfs_deadman_checktime_ms);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
    &zfs_deadman_checktime_ms, 0,
    "Period of checks for stalled ZFS I/O in milliseconds");

/*
 * Default value of -1 for zfs_deadman_enabled is resolved in
 * zfs_deadman_init().
 */
int zfs_deadman_enabled = -1;
TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled);
SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");

/*
 * The worst case is single-sector max-parity RAID-Z blocks, in which
 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
 * times the size; so just assume that.  Add to this the fact that
 * we can have up to 3 DVAs per bp, and one more factor of 2 because
 * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
 * the worst case is:
 *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
 */
int spa_asize_inflation = 24;
TUNABLE_INT("vfs.zfs.spa_asize_inflation", &spa_asize_inflation);
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
    &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
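
/*
 * Worked example (illustrative): with the default factor of 24, a single
 * 512-byte logical write may be charged up to 24 * 512 = 12 KB when
 * estimating worst-case allocated space.
 */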

#ifndef illumos
#ifdef _KERNEL
static void
zfs_deadman_init()
{
	/*
	 * If we are not on i386 or amd64, or are running in a virtual
	 * machine, disable the ZFS deadman thread by default.
	 */
	if (zfs_deadman_enabled == -1) {
#if defined(__amd64__) || defined(__i386__)
		zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
#else
		zfs_deadman_enabled = 0;
#endif
	}
}
#endif	/* _KERNEL */
#endif	/* !illumos */

/*
 * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
 * the pool to be consumed.  This ensures that we don't run the pool
 * completely out of space, due to unaccounted changes (e.g. to the MOS).
 * It also limits the worst-case time to allocate space.  If we have
 * less than this amount of free space, most ZPL operations (e.g. write,
 * create) will return ENOSPC.
 *
 * Certain operations (e.g. file removal, most administrative actions) can
 * use half the slop space.  They will only return ENOSPC if less than half
 * the slop space is free.  Typically, once the pool has less than the slop
 * space free, the user will use these operations to free up space in the pool.
 * These are the operations that call dsl_pool_adjustedsize() with the netfree
 * argument set to TRUE.
 *
 * A very restricted set of operations is always permitted, regardless of
 * the amount of free space.  These are the operations that call
 * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy".  If these
 * operations result in a net increase in the amount of space used,
 * it is possible to run the pool completely out of space, causing it to
 * be permanently read-only.
 *
 * See also the comments in zfs_space_check_t.
 */
int spa_slop_shift = 5;
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
    &spa_slop_shift, 0,
    "Shift value of reserved space (1/(2^spa_slop_shift)).");
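
/*
 * Worked example (illustrative): with the default spa_slop_shift of 5,
 * 1/(2^5) = 1/32, or roughly 3.2%, is held back, so a 1 TB pool reserves
 * about 32 GB of slop and "netfree" operations may dip into half of it.
 */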

/*
 * ==========================================================================
 * SPA config locking
 * ==========================================================================
 */
static void
spa_config_lock_init(spa_t *spa)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
		refcount_create_untracked(&scl->scl_count);
		scl->scl_writer = NULL;
		scl->scl_write_wanted = 0;
	}
}

static void
spa_config_lock_destroy(spa_t *spa)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		mutex_destroy(&scl->scl_lock);
		cv_destroy(&scl->scl_cv);
		refcount_destroy(&scl->scl_count);
		ASSERT(scl->scl_writer == NULL);
		ASSERT(scl->scl_write_wanted == 0);
	}
}

int
spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		if (rw == RW_READER) {
			if (scl->scl_writer || scl->scl_write_wanted) {
				mutex_exit(&scl->scl_lock);
				spa_config_exit(spa, locks ^ (1 << i), tag);
				return (0);
			}
		} else {
			ASSERT(scl->scl_writer != curthread);
			if (!refcount_is_zero(&scl->scl_count)) {
				mutex_exit(&scl->scl_lock);
				spa_config_exit(spa, locks ^ (1 << i), tag);
				return (0);
			}
			scl->scl_writer = curthread;
		}
		(void) refcount_add(&scl->scl_count, tag);
		mutex_exit(&scl->scl_lock);
	}
	return (1);
}

void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
	int wlocks_held = 0;

	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);

	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (scl->scl_writer == curthread)
			wlocks_held |= (1 << i);
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		if (rw == RW_READER) {
			while (scl->scl_writer || scl->scl_write_wanted) {
				cv_wait(&scl->scl_cv, &scl->scl_lock);
			}
		} else {
			ASSERT(scl->scl_writer != curthread);
			while (!refcount_is_zero(&scl->scl_count)) {
				scl->scl_write_wanted++;
				cv_wait(&scl->scl_cv, &scl->scl_lock);
				scl->scl_write_wanted--;
			}
			scl->scl_writer = curthread;
		}
		(void) refcount_add(&scl->scl_count, tag);
		mutex_exit(&scl->scl_lock);
	}
	ASSERT(wlocks_held <= locks);
}

void
spa_config_exit(spa_t *spa, int locks, void *tag)
{
	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		ASSERT(!refcount_is_zero(&scl->scl_count));
		if (refcount_remove(&scl->scl_count, tag) == 0) {
			ASSERT(scl->scl_writer == NULL ||
			    scl->scl_writer == curthread);
			scl->scl_writer = NULL;	/* OK in either case */
			cv_broadcast(&scl->scl_cv);
		}
		mutex_exit(&scl->scl_lock);
	}
}

int
spa_config_held(spa_t *spa, int locks, krw_t rw)
{
	int locks_held = 0;

	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
		    (rw == RW_WRITER && scl->scl_writer == curthread))
			locks_held |= 1 << i;
	}

	return (locks_held);
}

/*
 * ==========================================================================
 * SPA namespace functions
 * ==========================================================================
 */

/*
 * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
 * Returns NULL if no matching spa_t is found.
 */
spa_t *
spa_lookup(const char *name)
{
	static spa_t search;	/* spa_t is large; don't allocate on stack */
	spa_t *spa;
	avl_index_t where;
	char *cp;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));

	/*
	 * If it's a full dataset name, figure out the pool name and
	 * just use that.
	 */
	cp = strpbrk(search.spa_name, "/@#");
	if (cp != NULL)
		*cp = '\0';

	spa = avl_find(&spa_namespace_avl, &search, &where);

	return (spa);
}

/*
 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
 * If the zfs_deadman_enabled flag is set, it inspects all vdev queues
 * looking for potentially hung I/Os.
 */
void
spa_deadman(void *arg)
{
	spa_t *spa = arg;

	/*
	 * Disable the deadman timer if the pool is suspended.
	 */
	if (spa_suspended(spa)) {
#ifdef illumos
		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else
		/* Nothing; just don't schedule any future callouts. */
#endif
		return;
	}

	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
	    ++spa->spa_deadman_calls);
	if (zfs_deadman_enabled)
		vdev_deadman(spa->spa_root_vdev);
}

/*
 * Create an uninitialized spa_t with the given name.  Requires
 * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
 * exist by calling spa_lookup() first.
 */
spa_t *
spa_add(const char *name, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	spa_config_dirent_t *dp;
#ifdef illumos
	cyc_handler_t hdlr;
	cyc_time_t when;
#endif

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);

	for (int t = 0; t < TXG_SIZE; t++)
		bplist_create(&spa->spa_free_bplist[t]);

	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
	spa->spa_state = POOL_STATE_UNINITIALIZED;
	spa->spa_freeze_txg = UINT64_MAX;
	spa->spa_final_txg = UINT64_MAX;
	spa->spa_load_max_txg = UINT64_MAX;
	spa->spa_proc = &p0;
	spa->spa_proc_state = SPA_PROC_NONE;

#ifdef illumos
	hdlr.cyh_func = spa_deadman;
	hdlr.cyh_arg = spa;
	hdlr.cyh_level = CY_LOW_LEVEL;
#endif

	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);

#ifdef illumos
	/*
	 * This determines how often we need to check for hung I/Os after
	 * the cyclic has already fired. Since checking for hung I/Os is
	 * an expensive operation, we don't want to check too frequently.
	 * Instead wait for 5 seconds before checking again.
	 */
	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
	when.cyt_when = CY_INFINITY;
	mutex_enter(&cpu_lock);
	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
	mutex_exit(&cpu_lock);
#else	/* !illumos */
#ifdef _KERNEL
	callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
#endif
#endif
	refcount_create(&spa->spa_refcount);
	spa_config_lock_init(spa);

	avl_add(&spa_namespace_avl, spa);

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot) {
		spa->spa_root = spa_strdup(altroot);
		spa_active_count++;
	}

	/*
	 * Every pool starts with the default cachefile.
	 */
	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
	    offsetof(spa_config_dirent_t, scd_link));

	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
	list_insert_head(&spa->spa_config_list, dp);

	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
	    KM_SLEEP) == 0);

	if (config != NULL) {
		nvlist_t *features;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
		    &features) == 0) {
			VERIFY(nvlist_dup(features, &spa->spa_label_features,
			    0) == 0);
		}

		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
	}

	if (spa->spa_label_features == NULL) {
		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
	}

	spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);

	spa->spa_min_ashift = INT_MAX;
	spa->spa_max_ashift = 0;

	/*
	 * As a pool is being created, treat all features as disabled by
	 * setting SPA_FEATURE_DISABLED for all entries in the feature
	 * refcount cache.
	 */
	for (int i = 0; i < SPA_FEATURES; i++) {
		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
	}

	return (spa);
}

/*
 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
 * spa_namespace_lock.  This is called only after the spa_t has been closed and
 * deactivated.
 */
void
spa_remove(spa_t *spa)
{
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
	ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);

	nvlist_free(spa->spa_config_splitting);

	avl_remove(&spa_namespace_avl, spa);
	cv_broadcast(&spa_namespace_cv);

	if (spa->spa_root) {
		spa_strfree(spa->spa_root);
		spa_active_count--;
	}

	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
		list_remove(&spa->spa_config_list, dp);
		if (dp->scd_path != NULL)
			spa_strfree(dp->scd_path);
		kmem_free(dp, sizeof (spa_config_dirent_t));
	}

	list_destroy(&spa->spa_config_list);

	nvlist_free(spa->spa_label_features);
	nvlist_free(spa->spa_load_info);
	spa_config_set(spa, NULL);

#ifdef illumos
	mutex_enter(&cpu_lock);
	if (spa->spa_deadman_cycid != CYCLIC_NONE)
		cyclic_remove(spa->spa_deadman_cycid);
	mutex_exit(&cpu_lock);
	spa->spa_deadman_cycid = CYCLIC_NONE;
#else	/* !illumos */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
#endif
#endif

	refcount_destroy(&spa->spa_refcount);

	spa_config_lock_destroy(spa);

	for (int t = 0; t < TXG_SIZE; t++)
		bplist_destroy(&spa->spa_free_bplist[t]);

	zio_checksum_templates_free(spa);

	cv_destroy(&spa->spa_async_cv);
	cv_destroy(&spa->spa_evicting_os_cv);
	cv_destroy(&spa->spa_proc_cv);
	cv_destroy(&spa->spa_scrub_io_cv);
	cv_destroy(&spa->spa_suspend_cv);

	mutex_destroy(&spa->spa_async_lock);
	mutex_destroy(&spa->spa_errlist_lock);
	mutex_destroy(&spa->spa_errlog_lock);
	mutex_destroy(&spa->spa_evicting_os_lock);
	mutex_destroy(&spa->spa_history_lock);
	mutex_destroy(&spa->spa_proc_lock);
	mutex_destroy(&spa->spa_props_lock);
	mutex_destroy(&spa->spa_cksum_tmpls_lock);
	mutex_destroy(&spa->spa_scrub_lock);
	mutex_destroy(&spa->spa_suspend_lock);
	mutex_destroy(&spa->spa_vdev_top_lock);

	kmem_free(spa, sizeof (spa_t));
}

/*
 * Given a pool, return the next pool in the namespace, or NULL if there is
 * none.  If 'prev' is NULL, return the first pool.
 */
spa_t *
spa_next(spa_t *prev)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (prev)
		return (AVL_NEXT(&spa_namespace_avl, prev));
	else
		return (avl_first(&spa_namespace_avl));
}

/*
 * ==========================================================================
 * SPA refcount functions
 * ==========================================================================
 */

/*
 * Add a reference to the given spa_t.  Must have at least one reference, or
 * have the namespace lock held.
 */
void
spa_open_ref(spa_t *spa, void *tag)
{
	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
	    MUTEX_HELD(&spa_namespace_lock));
	(void) refcount_add(&spa->spa_refcount, tag);
}

/*
 * Remove a reference to the given spa_t.  Must have at least one reference, or
 * have the namespace lock held.
 */
void
spa_close(spa_t *spa, void *tag)
{
	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
	    MUTEX_HELD(&spa_namespace_lock));
	(void) refcount_remove(&spa->spa_refcount, tag);
}

/*
 * Remove a reference to the given spa_t held by a dsl dir that is
 * being asynchronously released.  Async releases occur from a taskq
 * performing eviction of dsl datasets and dirs.  The namespace lock
 * isn't held and the hold by the object being evicted may contribute to
 * spa_minref (e.g. dataset or directory released during pool export),
 * so the asserts in spa_close() do not apply.
 */
void
spa_async_close(spa_t *spa, void *tag)
{
	(void) refcount_remove(&spa->spa_refcount, tag);
}

/*
 * Check to see if the spa refcount is zero.  Must be called with
 * spa_namespace_lock held.  We really compare against spa_minref, which is the
 * number of references acquired when opening a pool.
 */
boolean_t
spa_refcount_zero(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
}

/*
 * ==========================================================================
 * SPA spare and l2cache tracking
 * ==========================================================================
 */

/*
 * Hot spares and cache devices are tracked using the same code below,
 * for 'auxiliary' devices.
 */

typedef struct spa_aux {
	uint64_t	aux_guid;
	uint64_t	aux_pool;
	avl_node_t	aux_avl;
	int		aux_count;
} spa_aux_t;

static int
spa_aux_compare(const void *a, const void *b)
{
	const spa_aux_t *sa = a;
	const spa_aux_t *sb = b;

	if (sa->aux_guid < sb->aux_guid)
		return (-1);
	else if (sa->aux_guid > sb->aux_guid)
		return (1);
	else
		return (0);
}

void
spa_aux_add(vdev_t *vd, avl_tree_t *avl)
{
	avl_index_t where;
	spa_aux_t search;
	spa_aux_t *aux;

	search.aux_guid = vd->vdev_guid;
	if ((aux = avl_find(avl, &search, &where)) != NULL) {
		aux->aux_count++;
	} else {
		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
		aux->aux_guid = vd->vdev_guid;
		aux->aux_count = 1;
		avl_insert(avl, aux, where);
	}
}

void
spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
{
	spa_aux_t search;
	spa_aux_t *aux;
	avl_index_t where;

	search.aux_guid = vd->vdev_guid;
	aux = avl_find(avl, &search, &where);

	ASSERT(aux != NULL);

	if (--aux->aux_count == 0) {
		avl_remove(avl, aux);
		kmem_free(aux, sizeof (spa_aux_t));
	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
		aux->aux_pool = 0ULL;
	}
}

boolean_t
spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
{
	spa_aux_t search, *found;

	search.aux_guid = guid;
	found = avl_find(avl, &search, NULL);

	if (pool) {
		if (found)
			*pool = found->aux_pool;
		else
			*pool = 0ULL;
	}

	if (refcnt) {
		if (found)
			*refcnt = found->aux_count;
		else
			*refcnt = 0;
	}

	return (found != NULL);
}

void
spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
{
	spa_aux_t search, *found;
	avl_index_t where;

	search.aux_guid = vd->vdev_guid;
	found = avl_find(avl, &search, &where);
	ASSERT(found != NULL);
	ASSERT(found->aux_pool == 0ULL);

	found->aux_pool = spa_guid(vd->vdev_spa);
}

/*
 * Spares are tracked globally due to the following constraints:
 *
 * 	- A spare may be part of multiple pools.
 * 	- A spare may be added to a pool even if it's actively in use within
 *	  another pool.
 * 	- A spare in use in any pool can only be the source of a replacement if
 *	  the target is a spare in the same pool.
 *
 * We keep track of all spares on the system through the use of a reference
 * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
 * spare, we bump the reference count in the AVL tree.  In addition, we set
 * the 'vdev_isspare' member to indicate that the device is a spare (active or
 * inactive).  When a spare is made active (used to replace a device in the
 * pool), we also keep track of which pool it's been made a part of.
 *
 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
 * called under the spa_namespace lock as part of vdev reconfiguration.  The
 * separate spare lock exists for the status query path, which does not need to
 * be completely consistent with respect to other vdev configuration changes.
 */

static int
spa_spare_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_spare_add(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(!vd->vdev_isspare);
	spa_aux_add(vd, &spa_spare_avl);
	vd->vdev_isspare = B_TRUE;
	mutex_exit(&spa_spare_lock);
}

void
spa_spare_remove(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_remove(vd, &spa_spare_avl);
	vd->vdev_isspare = B_FALSE;
	mutex_exit(&spa_spare_lock);
}

boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
{
	boolean_t found;

	mutex_enter(&spa_spare_lock);
	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);

	return (found);
}

void
spa_spare_activate(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_activate(vd, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);
}

/*
 * Level 2 ARC devices are tracked globally for the same reasons as spares.
 * Cache devices currently only support one pool per cache device, and so
 * for these devices the aux reference count is currently unused beyond 1.
 */

static int
spa_l2cache_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_l2cache_add(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(!vd->vdev_isl2cache);
	spa_aux_add(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_TRUE;
	mutex_exit(&spa_l2cache_lock);
}

void
spa_l2cache_remove(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_remove(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_FALSE;
	mutex_exit(&spa_l2cache_lock);
}

boolean_t
spa_l2cache_exists(uint64_t guid, uint64_t *pool)
{
	boolean_t found;

	mutex_enter(&spa_l2cache_lock);
	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);

	return (found);
}

void
spa_l2cache_activate(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_activate(vd, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);
}

/*
 * ==========================================================================
 * SPA vdev locking
 * ==========================================================================
 */

/*
 * Lock the given spa_t for the purpose of adding or removing a vdev.
 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
 * It returns the next transaction group for the spa_t.
 */
uint64_t
spa_vdev_enter(spa_t *spa)
{
	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	return (spa_vdev_config_enter(spa));
}

/*
 * Internal implementation for spa_vdev_enter().  Used when a vdev
 * operation requires multiple syncs (i.e. removing a device) while
 * keeping the spa_namespace_lock held.
 */
uint64_t
spa_vdev_config_enter(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	return (spa_last_synced_txg(spa) + 1);
}

/*
 * Used in combination with spa_vdev_config_enter() to allow the syncing
 * of multiple transactions without releasing the spa_namespace_lock.
 */
void
spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	int config_changed = B_FALSE;

	ASSERT(txg > spa_last_synced_txg(spa));

	spa->spa_pending_vdev = NULL;

	/*
	 * Reassess the DTLs.
	 */
	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);

	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	/*
	 * Verify the metaslab classes.
	 */
	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);

	spa_config_exit(spa, SCL_ALL, spa);

	/*
	 * Panic the system if the specified tag requires it.  This
	 * is useful for ensuring that configurations are updated
	 * transactionally.
	 */
	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, tag, 0);

	/*
	 * Note: this txg_wait_synced() is important because it ensures
	 * that there won't be more than one config change per txg.
	 * This allows us to use the txg as the generation number.
	 */
	if (error == 0)
		txg_wait_synced(spa->spa_dsl_pool, txg);

	if (vd != NULL) {
		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
		vdev_free(vd);
		spa_config_exit(spa, SCL_ALL, spa);
	}

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed)
		spa_config_sync(spa, B_FALSE, B_TRUE);
}

/*
 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
 * locking of spa_vdev_enter(), we also want to make sure the transactions have
 * synced to disk, and then update the global configuration cache with the new
 * information.
 */
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Lock the given spa_t for the purpose of changing vdev state.
 */
void
spa_vdev_state_enter(spa_t *spa, int oplocks)
{
	int locks = SCL_STATE_ALL | oplocks;

	/*
	 * Root pools may need to read from the underlying devfs filesystem
	 * when opening up a vdev.  Unfortunately if we're holding the
	 * SCL_ZIO lock it will result in a deadlock when we try to issue
	 * the read from the root filesystem.  Instead we "prefetch"
	 * the associated vnodes that we need prior to opening the
	 * underlying devices and cache them so that we can prevent
	 * any I/O when we are doing the actual open.
	 */
	if (spa_is_root(spa)) {
		int low = locks & ~(SCL_ZIO - 1);
		int high = locks & ~low;

		spa_config_enter(spa, high, spa, RW_WRITER);
		vdev_hold(spa->spa_root_vdev);
		spa_config_enter(spa, low, spa, RW_WRITER);
	} else {
		spa_config_enter(spa, locks, spa, RW_WRITER);
	}
	spa->spa_vdev_locks = locks;
}

int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
	boolean_t config_changed = B_FALSE;

	if (vd != NULL || error == 0)
		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
		    0, 0, B_FALSE);

	if (vd != NULL) {
		vdev_state_dirty(vd->vdev_top);
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	if (spa_is_root(spa))
		vdev_rele(spa->spa_root_vdev);

	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
	spa_config_exit(spa, spa->spa_vdev_locks, spa);

	/*
	 * If anything changed, wait for it to sync.  This ensures that,
	 * from the system administrator's perspective, zpool(1M) commands
	 * are synchronous.  This is important for things like zpool offline:
	 * when the command completes, you expect no further I/O from ZFS.
	 */
	if (vd != NULL)
		txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed) {
		mutex_enter(&spa_namespace_lock);
		spa_config_sync(spa, B_FALSE, B_TRUE);
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * ==========================================================================
 * Miscellaneous functions
 * ==========================================================================
 */
1291168404Spjd
1292236884Smmvoid
1293263397Sdelphijspa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
1294236884Smm{
1295263390Sdelphij	if (!nvlist_exists(spa->spa_label_features, feature)) {
1296263390Sdelphij		fnvlist_add_boolean(spa->spa_label_features, feature);
1297263397Sdelphij		/*
1298263397Sdelphij		 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
1299263397Sdelphij		 * dirty the vdev config because lock SCL_CONFIG is not held.
1300263397Sdelphij		 * Thankfully, in this case we don't need to dirty the config
1301263397Sdelphij		 * because it will be written out anyway when we finish
1302263397Sdelphij		 * creating the pool.
1303263397Sdelphij		 */
1304263397Sdelphij		if (tx->tx_txg != TXG_INITIAL)
1305263397Sdelphij			vdev_config_dirty(spa->spa_root_vdev);
1306263390Sdelphij	}
1307236884Smm}
1308236884Smm
1309236884Smmvoid
1310236884Smmspa_deactivate_mos_feature(spa_t *spa, const char *feature)
1311236884Smm{
1312263390Sdelphij	if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
1313263390Sdelphij		vdev_config_dirty(spa->spa_root_vdev);
1314236884Smm}
1315236884Smm
1316168404Spjd/*
1317168404Spjd * Rename a spa_t.
1318168404Spjd */
1319168404Spjdint
1320168404Spjdspa_rename(const char *name, const char *newname)
1321168404Spjd{
1322168404Spjd	spa_t *spa;
1323168404Spjd	int err;
1324168404Spjd
1325168404Spjd	/*
1326168404Spjd	 * Lookup the spa_t and grab the config lock for writing.  We need to
1327168404Spjd	 * actually open the pool so that we can sync out the necessary labels.
1328168404Spjd	 * It's OK to call spa_open() with the namespace lock held because we
1329168404Spjd	 * allow recursive calls for other reasons.
1330168404Spjd	 */
1331168404Spjd	mutex_enter(&spa_namespace_lock);
1332168404Spjd	if ((err = spa_open(name, &spa, FTAG)) != 0) {
1333168404Spjd		mutex_exit(&spa_namespace_lock);
1334168404Spjd		return (err);
1335168404Spjd	}
1336168404Spjd
1337185029Spjd	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1338168404Spjd
1339168404Spjd	avl_remove(&spa_namespace_avl, spa);
1340185029Spjd	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
1341168404Spjd	avl_add(&spa_namespace_avl, spa);
1342168404Spjd
1343168404Spjd	/*
1344168404Spjd	 * Sync all labels to disk with the new names by marking the root vdev
1345168404Spjd	 * dirty and waiting for it to sync.  It will pick up the new pool name
1346168404Spjd	 * during the sync.
1347168404Spjd	 */
1348168404Spjd	vdev_config_dirty(spa->spa_root_vdev);
1349168404Spjd
1350185029Spjd	spa_config_exit(spa, SCL_ALL, FTAG);
1351168404Spjd
1352168404Spjd	txg_wait_synced(spa->spa_dsl_pool, 0);
1353168404Spjd
1354168404Spjd	/*
1355168404Spjd	 * Sync the updated config cache.
1356168404Spjd	 */
1357185029Spjd	spa_config_sync(spa, B_FALSE, B_TRUE);
1358168404Spjd
1359168404Spjd	spa_close(spa, FTAG);
1360168404Spjd
1361168404Spjd	mutex_exit(&spa_namespace_lock);
1362168404Spjd
1363168404Spjd	return (0);
1364168404Spjd}
1365168404Spjd
1366168404Spjd/*
1367219089Spjd * Return the spa_t associated with the given pool_guid, if it exists.  If
1368219089Spjd * device_guid is non-zero, determine whether the pool exists *and* contains
1369219089Spjd * a device with the specified device_guid.
1370168404Spjd */
1371219089Spjdspa_t *
1372219089Spjdspa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1373168404Spjd{
1374168404Spjd	spa_t *spa;
1375168404Spjd	avl_tree_t *t = &spa_namespace_avl;
1376168404Spjd
1377168404Spjd	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1378168404Spjd
1379168404Spjd	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1380168404Spjd		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1381168404Spjd			continue;
1382168404Spjd		if (spa->spa_root_vdev == NULL)
1383168404Spjd			continue;
1384168404Spjd		if (spa_guid(spa) == pool_guid) {
1385168404Spjd			if (device_guid == 0)
1386168404Spjd				break;
1387168404Spjd
1388168404Spjd			if (vdev_lookup_by_guid(spa->spa_root_vdev,
1389168404Spjd			    device_guid) != NULL)
1390168404Spjd				break;
1391168404Spjd
1392168404Spjd			/*
1393185029Spjd			 * Check any devices we may be in the process of adding.
1394168404Spjd			 */
1395168404Spjd			if (spa->spa_pending_vdev) {
1396168404Spjd				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1397168404Spjd				    device_guid) != NULL)
1398168404Spjd					break;
1399168404Spjd			}
1400168404Spjd		}
1401168404Spjd	}
1402168404Spjd
1403219089Spjd	return (spa);
1404168404Spjd}
1405168404Spjd
1406219089Spjd/*
1407219089Spjd * Determine whether a pool with the given pool_guid exists.
1408219089Spjd */
1409219089Spjdboolean_t
1410219089Spjdspa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1411219089Spjd{
1412219089Spjd	return (spa_by_guid(pool_guid, device_guid) != NULL);
1413219089Spjd}
1414219089Spjd
1415168404Spjdchar *
1416168404Spjdspa_strdup(const char *s)
1417168404Spjd{
1418168404Spjd	size_t len;
1419168404Spjd	char *new;
1420168404Spjd
1421168404Spjd	len = strlen(s);
1422168404Spjd	new = kmem_alloc(len + 1, KM_SLEEP);
1423168404Spjd	bcopy(s, new, len);
1424168404Spjd	new[len] = '\0';
1425168404Spjd
1426168404Spjd	return (new);
1427168404Spjd}
1428168404Spjd
1429168404Spjdvoid
1430168404Spjdspa_strfree(char *s)
1431168404Spjd{
1432168404Spjd	kmem_free(s, strlen(s) + 1);
1433168404Spjd}
1434168404Spjd
1435168404Spjduint64_t
1436168404Spjdspa_get_random(uint64_t range)
1437168404Spjd{
1438168404Spjd	uint64_t r;
1439168404Spjd
1440168404Spjd	ASSERT(range != 0);
1441168404Spjd
1442168404Spjd	(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
1443168404Spjd
1444168404Spjd	return (r % range);
1445168404Spjd}
1446168404Spjd
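/*
 * Generate a non-zero guid.  If 'spa' is non-NULL, ensure the result does
 * not collide with any device guid already in that pool; otherwise ensure
 * it does not collide with any existing pool guid.
 */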
1447219089Spjduint64_t
1448219089Spjdspa_generate_guid(spa_t *spa)
1449168404Spjd{
1450219089Spjd	uint64_t guid = spa_get_random(-1ULL);
1451168404Spjd
1452219089Spjd	if (spa != NULL) {
1453219089Spjd		while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
1454219089Spjd			guid = spa_get_random(-1ULL);
1455219089Spjd	} else {
1456219089Spjd		while (guid == 0 || spa_guid_exists(guid, 0))
1457219089Spjd			guid = spa_get_random(-1ULL);
1458168404Spjd	}
1459168404Spjd
1460219089Spjd	return (guid);
1461219089Spjd}
1462168404Spjd
1463219089Spjdvoid
1464263397Sdelphijsnprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
1465219089Spjd{
1466236884Smm	char type[256];
1467219089Spjd	char *checksum = NULL;
1468219089Spjd	char *compress = NULL;
1469168404Spjd
1470219089Spjd	if (bp != NULL) {
1471236884Smm		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
1472236884Smm			dmu_object_byteswap_t bswap =
1473236884Smm			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
1474236884Smm			(void) snprintf(type, sizeof (type), "bswap %s %s",
1475236884Smm			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
1476236884Smm			    "metadata" : "data",
1477236884Smm			    dmu_ot_byteswap[bswap].ob_name);
1478236884Smm		} else {
1479236884Smm			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
1480236884Smm			    sizeof (type));
1481236884Smm		}
1482268649Sdelphij		if (!BP_IS_EMBEDDED(bp)) {
1483268649Sdelphij			checksum =
1484268649Sdelphij			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
1485268649Sdelphij		}
1486219089Spjd		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
1487168404Spjd	}
1488168404Spjd
1489263397Sdelphij	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
1490263397Sdelphij	    compress);
1491168404Spjd}
1492168404Spjd
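/*
 * Freeze the pool: pick a freeze txg one full TXG_SIZE beyond the last
 * synced txg and wait for it to sync.  This is a testing hook (used, for
 * example, by ztest's ZIL tests) rather than something reached in normal
 * operation.
 */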
1493168404Spjdvoid
1494168404Spjdspa_freeze(spa_t *spa)
1495168404Spjd{
1496168404Spjd	uint64_t freeze_txg = 0;
1497168404Spjd
1498185029Spjd	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1499168404Spjd	if (spa->spa_freeze_txg == UINT64_MAX) {
1500168404Spjd		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1501168404Spjd		spa->spa_freeze_txg = freeze_txg;
1502168404Spjd	}
1503185029Spjd	spa_config_exit(spa, SCL_ALL, FTAG);
1504168404Spjd	if (freeze_txg != 0)
1505168404Spjd		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1506168404Spjd}
1507168404Spjd
1508168404Spjdvoid
1509168404Spjdzfs_panic_recover(const char *fmt, ...)
1510168404Spjd{
1511168404Spjd	va_list adx;
1512168404Spjd
1513168404Spjd	va_start(adx, fmt);
1514168404Spjd	vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1515168404Spjd	va_end(adx);
1516168404Spjd}
1517168404Spjd
1518168404Spjd/*
1519209962Smm * This is a stripped-down version of strtoull, suitable only for converting
1520251631Sdelphij * lowercase hexadecimal numbers that don't overflow.
1521209962Smm */
1522209962Smmuint64_t
1523209962Smmzfs_strtonum(const char *str, char **nptr)
1524209962Smm{
1525209962Smm	uint64_t val = 0;
1526209962Smm	char c;
1527209962Smm	int digit;
1528209962Smm
1529209962Smm	while ((c = *str) != '\0') {
1530209962Smm		if (c >= '0' && c <= '9')
1531209962Smm			digit = c - '0';
1532209962Smm		else if (c >= 'a' && c <= 'f')
1533209962Smm			digit = 10 + c - 'a';
1534209962Smm		else
1535209962Smm			break;
1536209962Smm
1537209962Smm		val *= 16;
1538209962Smm		val += digit;
1539209962Smm
1540209962Smm		str++;
1541209962Smm	}
1542209962Smm
1543209962Smm	if (nptr)
1544209962Smm		*nptr = (char *)str;
1545209962Smm
1546209962Smm	return (val);
1547209962Smm}
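
/*
 * Examples (illustrative only): zfs_strtonum("1a2b", &end) returns 0x1a2b
 * with 'end' left at the terminating NUL, while zfs_strtonum("beef!", &end)
 * returns 0xbeef with 'end' left at the '!'.  Uppercase digits and "0x"
 * prefixes are not accepted and stop the conversion.
 */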
1548209962Smm
1549209962Smm/*
1550168404Spjd * ==========================================================================
1551168404Spjd * Accessor functions
1552168404Spjd * ==========================================================================
1553168404Spjd */
1554168404Spjd
1555185029Spjdboolean_t
1556208047Smmspa_shutting_down(spa_t *spa)
1557168404Spjd{
1558208047Smm	return (spa->spa_async_suspended);
1559168404Spjd}
1560168404Spjd
1561168404Spjddsl_pool_t *
1562168404Spjdspa_get_dsl(spa_t *spa)
1563168404Spjd{
1564168404Spjd	return (spa->spa_dsl_pool);
1565168404Spjd}
1566168404Spjd
1567236884Smmboolean_t
1568236884Smmspa_is_initializing(spa_t *spa)
1569236884Smm{
1570236884Smm	return (spa->spa_is_initializing);
1571236884Smm}
1572236884Smm
1573168404Spjdblkptr_t *
1574168404Spjdspa_get_rootblkptr(spa_t *spa)
1575168404Spjd{
1576168404Spjd	return (&spa->spa_ubsync.ub_rootbp);
1577168404Spjd}
1578168404Spjd
1579168404Spjdvoid
1580168404Spjdspa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1581168404Spjd{
1582168404Spjd	spa->spa_uberblock.ub_rootbp = *bp;
1583168404Spjd}
1584168404Spjd
1585168404Spjdvoid
1586168404Spjdspa_altroot(spa_t *spa, char *buf, size_t buflen)
1587168404Spjd{
1588168404Spjd	if (spa->spa_root == NULL)
1589168404Spjd		buf[0] = '\0';
1590168404Spjd	else
1591168404Spjd		(void) strncpy(buf, spa->spa_root, buflen);
1592168404Spjd}
1593168404Spjd
1594168404Spjdint
1595168404Spjdspa_sync_pass(spa_t *spa)
1596168404Spjd{
1597168404Spjd	return (spa->spa_sync_pass);
1598168404Spjd}
1599168404Spjd
1600168404Spjdchar *
1601168404Spjdspa_name(spa_t *spa)
1602168404Spjd{
1603168404Spjd	return (spa->spa_name);
1604168404Spjd}
1605168404Spjd
1606168404Spjduint64_t
1607168404Spjdspa_guid(spa_t *spa)
1608168404Spjd{
1609239620Smm	dsl_pool_t *dp = spa_get_dsl(spa);
1610239620Smm	uint64_t guid;
1611239620Smm
1612168404Spjd	/*
1613168404Spjd	 * If we fail to parse the config during spa_load(), we can go through
1614168404Spjd	 * the error path (which posts an ereport) and end up here with no root
1615228103Smm	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
1616168404Spjd	 * this case.
1617168404Spjd	 */
1618239620Smm	if (spa->spa_root_vdev == NULL)
1619239620Smm		return (spa->spa_config_guid);
1620239620Smm
1621239620Smm	guid = spa->spa_last_synced_guid != 0 ?
1622239620Smm	    spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
1623239620Smm
1624239620Smm	/*
1625239620Smm	 * Return the most recently synced out guid unless we're
1626239620Smm	 * in syncing context.
1627239620Smm	 */
1628239620Smm	if (dp && dsl_pool_sync_context(dp))
1629168404Spjd		return (spa->spa_root_vdev->vdev_guid);
1630168404Spjd	else
1631239620Smm		return (guid);
1632168404Spjd}
1633168404Spjd
1634168404Spjduint64_t
1635228103Smmspa_load_guid(spa_t *spa)
1636228103Smm{
1637228103Smm	/*
1638228103Smm	 * This is a GUID that exists solely as a reference for the
1639228103Smm	 * purposes of the ARC.  It is generated at load time, and
1640228103Smm	 * is never written to persistent storage.
1641228103Smm	 */
1642228103Smm	return (spa->spa_load_guid);
1643228103Smm}
1644228103Smm
1645228103Smmuint64_t
1646168404Spjdspa_last_synced_txg(spa_t *spa)
1647168404Spjd{
1648168404Spjd	return (spa->spa_ubsync.ub_txg);
1649168404Spjd}
1650168404Spjd
1651168404Spjduint64_t
1652168404Spjdspa_first_txg(spa_t *spa)
1653168404Spjd{
1654168404Spjd	return (spa->spa_first_txg);
1655168404Spjd}
1656168404Spjd
1657219089Spjduint64_t
1658219089Spjdspa_syncing_txg(spa_t *spa)
1659219089Spjd{
1660219089Spjd	return (spa->spa_syncing_txg);
1661219089Spjd}
1662219089Spjd
1663208047Smmpool_state_t
1664168404Spjdspa_state(spa_t *spa)
1665168404Spjd{
1666168404Spjd	return (spa->spa_state);
1667168404Spjd}
1668168404Spjd
1669219089Spjdspa_load_state_t
1670219089Spjdspa_load_state(spa_t *spa)
1671168404Spjd{
1672219089Spjd	return (spa->spa_load_state);
1673168404Spjd}
1674168404Spjd
1675168404Spjduint64_t
1676219089Spjdspa_freeze_txg(spa_t *spa)
1677168404Spjd{
1678219089Spjd	return (spa->spa_freeze_txg);
1679168404Spjd}
1680168404Spjd
1681219089Spjd/* ARGSUSED */
1682168404Spjduint64_t
1683219089Spjdspa_get_asize(spa_t *spa, uint64_t lsize)
1684168404Spjd{
1685260763Savg	return (lsize * spa_asize_inflation);
1686168404Spjd}
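
/*
 * Worked example (hedged; assumes the default spa_asize_inflation of 24):
 * a 128K logical write reserves 24 * 128K = 3M of worst-case allocated
 * space, covering up to three DVAs, maximum raidz parity expansion, and
 * reallocation across sync passes.
 */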
1687168404Spjd
1688269006Sdelphij/*
1689269006Sdelphij * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.1%),
1690269006Sdelphij * or at least 32MB.
1691269006Sdelphij *
1692269006Sdelphij * See the comment above spa_slop_shift for details.
1693269006Sdelphij */
1694168404Spjduint64_t
1695269006Sdelphijspa_get_slop_space(spa_t *spa)
{
1696269006Sdelphij	uint64_t space = spa_get_dspace(spa);
1697269006Sdelphij	return (MAX(space >> spa_slop_shift, SPA_MINDEVSIZE >> 1));
1698269006Sdelphij}
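
/*
 * Worked examples (assuming the default spa_slop_shift of 5): a pool with
 * 1T of dspace reserves 1T >> 5 = 32G of slop; a pool smaller than 1G
 * would compute less than 32M, so the MAX() with SPA_MINDEVSIZE >> 1
 * (64M / 2 = 32M) provides the floor.
 */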
1699269006Sdelphij
1700269006Sdelphijuint64_t
1701168404Spjdspa_get_dspace(spa_t *spa)
1702168404Spjd{
1703219089Spjd	return (spa->spa_dspace);
1704168404Spjd}
1705168404Spjd
1706219089Spjdvoid
1707219089Spjdspa_update_dspace(spa_t *spa)
1708168404Spjd{
1709219089Spjd	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1710219089Spjd	    ddt_get_dedup_dspace(spa);
1711168404Spjd}
1712168404Spjd
1713185029Spjd/*
1714185029Spjd * Return the failure mode that has been set for this pool.  The default
1715185029Spjd * behavior is to block all I/Os when a complete failure occurs.
1716185029Spjd */
1717185029Spjduint8_t
1718185029Spjdspa_get_failmode(spa_t *spa)
1719185029Spjd{
1720185029Spjd	return (spa->spa_failmode);
1721185029Spjd}
1722185029Spjd
1723185029Spjdboolean_t
1724185029Spjdspa_suspended(spa_t *spa)
1725185029Spjd{
1726185029Spjd	return (spa->spa_suspended);
1727185029Spjd}
1728185029Spjd
1729168404Spjduint64_t
1730168404Spjdspa_version(spa_t *spa)
1731168404Spjd{
1732168404Spjd	return (spa->spa_ubsync.ub_version);
1733168404Spjd}
1734168404Spjd
1735219089Spjdboolean_t
1736219089Spjdspa_deflate(spa_t *spa)
1737219089Spjd{
1738219089Spjd	return (spa->spa_deflate);
1739219089Spjd}
1740219089Spjd
1741219089Spjdmetaslab_class_t *
1742219089Spjdspa_normal_class(spa_t *spa)
1743219089Spjd{
1744219089Spjd	return (spa->spa_normal_class);
1745219089Spjd}
1746219089Spjd
1747219089Spjdmetaslab_class_t *
1748219089Spjdspa_log_class(spa_t *spa)
1749219089Spjd{
1750219089Spjd	return (spa->spa_log_class);
1751219089Spjd}
1752219089Spjd
1753288549Smavvoid
1754288549Smavspa_evicting_os_register(spa_t *spa, objset_t *os)
1755288549Smav{
1756288549Smav	mutex_enter(&spa->spa_evicting_os_lock);
1757288549Smav	list_insert_head(&spa->spa_evicting_os_list, os);
1758288549Smav	mutex_exit(&spa->spa_evicting_os_lock);
1759288549Smav}
1760288549Smav
1761288549Smavvoid
1762288549Smavspa_evicting_os_deregister(spa_t *spa, objset_t *os)
1763288549Smav{
1764288549Smav	mutex_enter(&spa->spa_evicting_os_lock);
1765288549Smav	list_remove(&spa->spa_evicting_os_list, os);
1766288549Smav	cv_broadcast(&spa->spa_evicting_os_cv);
1767288549Smav	mutex_exit(&spa->spa_evicting_os_lock);
1768288549Smav}
1769288549Smav
1770288549Smavvoid
1771288549Smavspa_evicting_os_wait(spa_t *spa)
1772288549Smav{
1773288549Smav	mutex_enter(&spa->spa_evicting_os_lock);
1774288549Smav	while (!list_is_empty(&spa->spa_evicting_os_list))
1775288549Smav		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
1776288549Smav	mutex_exit(&spa->spa_evicting_os_lock);
1777288549Smav
1778288549Smav	dmu_buf_user_evict_wait();
1779288549Smav}
1780288549Smav
1781168404Spjdint
1782168404Spjdspa_max_replication(spa_t *spa)
1783168404Spjd{
1784168404Spjd	/*
1785185029Spjd	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1786168404Spjd	 * handle BPs with more than one DVA allocated.  Set our max
1787168404Spjd	 * replication level accordingly.
1788168404Spjd	 */
1789185029Spjd	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1790168404Spjd		return (1);
1791168404Spjd	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1792168404Spjd}
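
/*
 * With SPA_DVAS_PER_BP (3) and the default spa_max_replication_override,
 * this works out to at most three copies of a block on pools at or beyond
 * SPA_VERSION_DITTO_BLOCKS, and a single copy on older pools.
 */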
1793168404Spjd
1794219089Spjdint
1795219089Spjdspa_prev_software_version(spa_t *spa)
1796219089Spjd{
1797219089Spjd	return (spa->spa_prev_software_version);
1798219089Spjd}
1799219089Spjd
1800168404Spjduint64_t
1801247265Smmspa_deadman_synctime(spa_t *spa)
1802247265Smm{
1803247265Smm	return (spa->spa_deadman_synctime);
1804247265Smm}
1805247265Smm
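/*
 * Convert an allocated on-disk size (asize) into the "deflated" size that
 * is charged against the pool when spa_deflate is set.  Roughly speaking,
 * the per-vdev deflate ratio (computed in vdev.c) discounts parity and
 * allocation overhead, so a raidz allocation is charged for approximately
 * its data portion only.
 */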
1806247265Smmuint64_t
1807219089Spjddva_get_dsize_sync(spa_t *spa, const dva_t *dva)
1808168404Spjd{
1809219089Spjd	uint64_t asize = DVA_GET_ASIZE(dva);
1810219089Spjd	uint64_t dsize = asize;
1811168404Spjd
1812219089Spjd	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1813168404Spjd
1814219089Spjd	if (asize != 0 && spa->spa_deflate) {
1815290713Ssmh		uint64_t vdev = DVA_GET_VDEV(dva);
1816290713Ssmh		vdev_t *vd = vdev_lookup_top(spa, vdev);
1817290713Ssmh		if (vd == NULL) {
1818290713Ssmh			panic(
1819290713Ssmh			    "dva_get_dsize_sync(): bad DVA %llu:%llu",
1820290713Ssmh			    (u_longlong_t)vdev, (u_longlong_t)asize);
1821290713Ssmh		}
1822219089Spjd		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
1823219089Spjd	}
1824219089Spjd
1825219089Spjd	return (dsize);
1826219089Spjd}
1827219089Spjd
1828219089Spjduint64_t
1829219089Spjdbp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
1830219089Spjd{
1831219089Spjd	uint64_t dsize = 0;
1832219089Spjd
1833268649Sdelphij	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
1834219089Spjd		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1835219089Spjd
1836219089Spjd	return (dsize);
1837219089Spjd}
1838219089Spjd
1839219089Spjduint64_t
1840219089Spjdbp_get_dsize(spa_t *spa, const blkptr_t *bp)
1841219089Spjd{
1842219089Spjd	uint64_t dsize = 0;
1843219089Spjd
1844185029Spjd	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1845219089Spjd
1846268649Sdelphij	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
1847219089Spjd		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1848219089Spjd
1849185029Spjd	spa_config_exit(spa, SCL_VDEV, FTAG);
1850219089Spjd
1851219089Spjd	return (dsize);
1852168404Spjd}
1853168404Spjd
1854168404Spjd/*
1855168404Spjd * ==========================================================================
1856168404Spjd * Initialization and Termination
1857168404Spjd * ==========================================================================
1858168404Spjd */
1859168404Spjd
1860168404Spjdstatic int
1861168404Spjdspa_name_compare(const void *a1, const void *a2)
1862168404Spjd{
1863168404Spjd	const spa_t *s1 = a1;
1864168404Spjd	const spa_t *s2 = a2;
1865168404Spjd	int s;
1866168404Spjd
1867168404Spjd	s = strcmp(s1->spa_name, s2->spa_name);
1868168404Spjd	if (s > 0)
1869168404Spjd		return (1);
1870168404Spjd	if (s < 0)
1871168404Spjd		return (-1);
1872168404Spjd	return (0);
1873168404Spjd}
1874168404Spjd
1875168404Spjdint
1876168404Spjdspa_busy(void)
1877168404Spjd{
1878168404Spjd	return (spa_active_count);
1879168404Spjd}
1880168404Spjd
1881168404Spjdvoid
1882185029Spjdspa_boot_init()
1883185029Spjd{
1884185029Spjd	spa_config_load();
1885185029Spjd}
1886185029Spjd
1887253070Savg#ifdef _KERNEL
1888253070SavgEVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
1889253070Savg#endif
1890253070Savg
1891185029Spjdvoid
1892168404Spjdspa_init(int mode)
1893168404Spjd{
1894168404Spjd	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
1895185029Spjd	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
1896185029Spjd	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
1897168404Spjd	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
1898168404Spjd
1899168404Spjd	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
1900168404Spjd	    offsetof(spa_t, spa_avl));
1901168404Spjd
1902185029Spjd	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
1903185029Spjd	    offsetof(spa_aux_t, aux_avl));
1904168404Spjd
1905185029Spjd	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
1906185029Spjd	    offsetof(spa_aux_t, aux_avl));
1907168404Spjd
1908209962Smm	spa_mode_global = mode;
1909168404Spjd
1910240133Smm#ifdef illumos
1911247265Smm#ifdef _KERNEL
1912247265Smm	spa_arch_init();
1913247265Smm#else
1914240133Smm	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
1915240133Smm		arc_procfd = open("/proc/self/ctl", O_WRONLY);
1916240133Smm		if (arc_procfd == -1) {
1917240133Smm			perror("could not enable watchpoints: "
1918240133Smm			    "opening /proc/self/ctl failed: ");
1919240133Smm		} else {
1920240133Smm			arc_watch = B_TRUE;
1921240133Smm		}
1922240133Smm	}
1923240133Smm#endif
1924240133Smm#endif /* illumos */
1925179310Spjd	refcount_sysinit();
1926168404Spjd	unique_init();
1927262093Savg	range_tree_init();
1928168404Spjd	zio_init();
1929260337Smav	lz4_init();
1930168404Spjd	dmu_init();
1931168404Spjd	zil_init();
1932185029Spjd	vdev_cache_stat_init();
1933185029Spjd	zfs_prop_init();
1934185029Spjd	zpool_prop_init();
1935236884Smm	zpool_feature_init();
1936168404Spjd	spa_config_load();
1937185029Spjd	l2arc_start();
1938247265Smm#ifndef illumos
1939247265Smm#ifdef _KERNEL
1940247265Smm	zfs_deadman_init();
1941247265Smm#endif
1942247265Smm#endif	/* !illumos */
1943168404Spjd}
1944168404Spjd
1945168404Spjdvoid
1946168404Spjdspa_fini(void)
1947168404Spjd{
1948185029Spjd	l2arc_stop();
1949185029Spjd
1950168404Spjd	spa_evict_all();
1951168404Spjd
1952185029Spjd	vdev_cache_stat_fini();
1953168404Spjd	zil_fini();
1954168404Spjd	dmu_fini();
1955260337Smav	lz4_fini();
1956168404Spjd	zio_fini();
1957262093Savg	range_tree_fini();
1958185029Spjd	unique_fini();
1959168404Spjd	refcount_fini();
1960168404Spjd
1961168404Spjd	avl_destroy(&spa_namespace_avl);
1962168404Spjd	avl_destroy(&spa_spare_avl);
1963185029Spjd	avl_destroy(&spa_l2cache_avl);
1964168404Spjd
1965168404Spjd	cv_destroy(&spa_namespace_cv);
1966168404Spjd	mutex_destroy(&spa_namespace_lock);
1967168404Spjd	mutex_destroy(&spa_spare_lock);
1968185029Spjd	mutex_destroy(&spa_l2cache_lock);
1969168404Spjd}
1970185029Spjd
1971185029Spjd/*
1972185029Spjd * Return whether this pool has slogs. No locking needed.
1973185029Spjd * It's not a problem if the wrong answer is returned as it's only for
1974185029Spjd * performance and not correctness.
1975185029Spjd */
1976185029Spjdboolean_t
1977185029Spjdspa_has_slogs(spa_t *spa)
1978185029Spjd{
1979185029Spjd	return (spa->spa_log_class->mc_rotor != NULL);
1980185029Spjd}
1981185029Spjd
1982219089Spjdspa_log_state_t
1983219089Spjdspa_get_log_state(spa_t *spa)
1984219089Spjd{
1985219089Spjd	return (spa->spa_log_state);
1986219089Spjd}
1987219089Spjd
1988219089Spjdvoid
1989219089Spjdspa_set_log_state(spa_t *spa, spa_log_state_t state)
1990219089Spjd{
1991219089Spjd	spa->spa_log_state = state;
1992219089Spjd}
1993219089Spjd
1994185029Spjdboolean_t
1995185029Spjdspa_is_root(spa_t *spa)
1996185029Spjd{
1997185029Spjd	return (spa->spa_is_root);
1998185029Spjd}
1999209962Smm
2000209962Smmboolean_t
2001209962Smmspa_writeable(spa_t *spa)
2002209962Smm{
2003209962Smm	return (!!(spa->spa_mode & FWRITE));
2004209962Smm}
2005209962Smm
2006269418Sdelphij/*
2007269418Sdelphij * Returns true if there is a pending sync task in any of the current
2008269418Sdelphij * syncing txg, the current quiescing txg, or the current open txg.
2009269418Sdelphij */
2010269418Sdelphijboolean_t
2011269418Sdelphijspa_has_pending_synctask(spa_t *spa)
2012269418Sdelphij{
2013269418Sdelphij	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
2014269418Sdelphij}
2015269418Sdelphij
2016209962Smmint
2017209962Smmspa_mode(spa_t *spa)
2018209962Smm{
2019209962Smm	return (spa->spa_mode);
2020209962Smm}
2021219089Spjd
2022219089Spjduint64_t
2023219089Spjdspa_bootfs(spa_t *spa)
2024219089Spjd{
2025219089Spjd	return (spa->spa_bootfs);
2026219089Spjd}
2027219089Spjd
2028219089Spjduint64_t
2029219089Spjdspa_delegation(spa_t *spa)
2030219089Spjd{
2031219089Spjd	return (spa->spa_delegation);
2032219089Spjd}
2033219089Spjd
2034219089Spjdobjset_t *
2035219089Spjdspa_meta_objset(spa_t *spa)
2036219089Spjd{
2037219089Spjd	return (spa->spa_meta_objset);
2038219089Spjd}
2039219089Spjd
2040219089Spjdenum zio_checksum
2041219089Spjdspa_dedup_checksum(spa_t *spa)
2042219089Spjd{
2043219089Spjd	return (spa->spa_dedup_checksum);
2044219089Spjd}
2045219089Spjd
2046219089Spjd/*
2047219089Spjd * Reset pool scan stats per scan pass (or reboot).
2048219089Spjd */
2049219089Spjdvoid
2050219089Spjdspa_scan_stat_init(spa_t *spa)
2051219089Spjd{
2052219089Spjd	/* data not stored on disk */
2053219089Spjd	spa->spa_scan_pass_start = gethrestime_sec();
2054219089Spjd	spa->spa_scan_pass_exam = 0;
2055219089Spjd	vdev_scan_stat_init(spa->spa_root_vdev);
2056219089Spjd}
2057219089Spjd
2058219089Spjd/*
2059219089Spjd * Get scan stats for zpool status reports.
2060219089Spjd */
2061219089Spjdint
2062219089Spjdspa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
2063219089Spjd{
2064219089Spjd	dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
2065219089Spjd
2066219089Spjd	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
2067249195Smm		return (SET_ERROR(ENOENT));
2068219089Spjd	bzero(ps, sizeof (pool_scan_stat_t));
2069219089Spjd
2070219089Spjd	/* data stored on disk */
2071219089Spjd	ps->pss_func = scn->scn_phys.scn_func;
2072219089Spjd	ps->pss_start_time = scn->scn_phys.scn_start_time;
2073219089Spjd	ps->pss_end_time = scn->scn_phys.scn_end_time;
2074219089Spjd	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
2075219089Spjd	ps->pss_examined = scn->scn_phys.scn_examined;
2076219089Spjd	ps->pss_to_process = scn->scn_phys.scn_to_process;
2077219089Spjd	ps->pss_processed = scn->scn_phys.scn_processed;
2078219089Spjd	ps->pss_errors = scn->scn_phys.scn_errors;
2079219089Spjd	ps->pss_state = scn->scn_phys.scn_state;
2080219089Spjd
2081219089Spjd	/* data not stored on disk */
2082219089Spjd	ps->pss_pass_start = spa->spa_scan_pass_start;
2083219089Spjd	ps->pss_pass_exam = spa->spa_scan_pass_exam;
2084219089Spjd
2085219089Spjd	return (0);
2086219089Spjd}
2087224177Smm
2088224177Smmboolean_t
2089224177Smmspa_debug_enabled(spa_t *spa)
2090224177Smm{
2091224177Smm	return (spa->spa_debug);
2092224177Smm}
2093276081Sdelphij
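/*
 * Largest block size usable in this pool: SPA_MAXBLOCKSIZE (16M) once the
 * large_blocks feature is enabled, otherwise the historical
 * SPA_OLD_MAXBLOCKSIZE (128K) limit.
 */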
2094276081Sdelphijint
2095276081Sdelphijspa_maxblocksize(spa_t *spa)
2096276081Sdelphij{
2097276081Sdelphij	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
2098276081Sdelphij		return (SPA_MAXBLOCKSIZE);
2099276081Sdelphij	else
2100276081Sdelphij		return (SPA_OLD_MAXBLOCKSIZE);
2101276081Sdelphij}
2102