spa_misc.c revision 339149
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/spa_boot.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
#include <sys/zfeature.h>

#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

/*
 * SPA locking
 *
 * There are four basic locks for managing spa_t structures:
 *
 * spa_namespace_lock (global mutex)
 *
 *	This lock must be acquired to do any of the following:
 *
 *		- Lookup a spa_t by name
 *		- Add or remove a spa_t from the namespace
 *		- Increase spa_refcount from non-zero
 *		- Check if spa_refcount is zero
 *		- Rename a spa_t
 *		- add/remove/attach/detach devices
 *		- Held for the duration of create/destroy/import/export
 *
 *	It does not need to handle recursion.  A create or destroy may
 *	reference objects (files or zvols) in other pools, but by
 *	definition they must have an existing reference, and will never need
 *	to look up a spa_t by name.
 *
 * spa_refcount (per-spa refcount_t protected by mutex)
 *
 *	This reference count keeps track of any active users of the spa_t.  The
 *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
 *	the refcount is never really 'zero' - opening a pool implicitly keeps
 *	some references in the DMU.  Internally we check against spa_minref, but
 *	present the image of a zero/non-zero value to consumers.
 *
 * spa_config_lock[] (per-spa array of rwlocks)
 *
 *	This protects the spa_t from config changes, and must be held in
 *	the following circumstances:
 *
 *		- RW_READER to perform I/O to the spa
 *		- RW_WRITER to change the vdev config
 *
 * The locking order is fairly straightforward:
 *
 *		spa_namespace_lock	->	spa_refcount
 *
 *	The namespace lock must be acquired to increase the refcount from 0
 *	or to check if it is zero.
 *
 *		spa_refcount		->	spa_config_lock[]
 *
 *	There must be at least one valid reference on the spa_t to acquire
 *	the config lock.
 *
 *		spa_namespace_lock	->	spa_config_lock[]
 *
 *	The namespace lock must always be taken before the config lock.
 *
 *
 * The spa_namespace_lock can be acquired directly and is globally visible.
 *
 * The namespace is manipulated using the following functions, all of which
 * require the spa_namespace_lock to be held.
 *
 *	spa_lookup()		Lookup a spa_t by name.
 *
 *	spa_add()		Create a new spa_t in the namespace.
 *
 *	spa_remove()		Remove a spa_t from the namespace.  This also
 *				frees up any memory associated with the spa_t.
 *
 *	spa_next()		Returns the next spa_t in the system, or the
 *				first if NULL is passed.
 *
 *	spa_evict_all()		Shutdown and remove all spa_t structures in
 *				the system.
 *
 *	spa_guid_exists()	Determine whether a pool/device guid exists.
 *
 * The spa_refcount is manipulated using the following functions:
 *
 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
 *				called with spa_namespace_lock held if the
 *				refcount is currently zero.
 *
 *	spa_close()		Remove a reference from the spa_t.  This will
 *				not free the spa_t or remove it from the
 *				namespace.  No locking is required.
 *
 *	spa_refcount_zero()	Returns true if the refcount is currently
 *				zero.  Must be called with spa_namespace_lock
 *				held.
 *
 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
 *
 * To read the configuration, it suffices to hold one of these locks as reader.
 * To modify the configuration, you must hold all locks as writer.  To modify
 * vdev state without altering the vdev tree's topology (e.g. online/offline),
 * you must hold SCL_STATE and SCL_ZIO as writer.
 *
 * We use these distinct config locks to avoid recursive lock entry.
 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
 * block allocations (SCL_ALLOC), which may require reading space maps
 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
 *
 * The spa config locks cannot be normal rwlocks because we need the
 * ability to hand off ownership.  For example, SCL_ZIO is acquired
 * by the issuing thread and later released by an interrupt thread.
 * They do, however, obey the usual write-wanted semantics to prevent
 * writer (i.e. system administrator) starvation.
 *
 * The lock acquisition rules are as follows:
 *
 * SCL_CONFIG
 *	Protects changes to the vdev tree topology, such as vdev
 *	add/remove/attach/detach.  Protects the dirty config list
 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
 *
 * SCL_STATE
 *	Protects changes to pool state and vdev state, such as vdev
 *	online/offline/fault/degrade/clear.  Protects the dirty state list
 *	(spa_state_dirty_list) and global pool state (spa_state).
 *
 * SCL_ALLOC
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_alloc() and metaslab_claim().
 *
 * SCL_ZIO
 *	Held by bp-level zios (those which have no io_vd upon entry)
 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
 *
 * SCL_FREE
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
 *	blocks in zio_done() while another i/o that holds either
 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
 *
 * SCL_VDEV
 *	Held as reader to prevent changes to the vdev tree during trivial
 *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
 *	other locks, and lower than all of them, to ensure that it's safe
 *	to acquire regardless of caller context.
 *
 * In addition, the following rules apply:
 *
 * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
 *	The lock ordering is SCL_CONFIG > spa_props_lock.
 *
 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 *	or zio_write_phys() -- the caller must ensure that the config cannot
 *	change in the interim, and that the vdev cannot be reopened.
 *	SCL_STATE as reader suffices for both.
 *
 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
 *
 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
 *				for writing.
 *
 *	spa_vdev_exit()		Release the config lock, wait for all I/O
 *				to complete, sync the updated configs to the
 *				cache, and release the namespace lock.
 *
 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
 * locking is, always, based on spa_namespace_lock and spa_config_lock[].
 *
 * spa_rename() is also implemented within this file since it requires
 * manipulation of the namespace.
 */

static avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
static int spa_active_count;
int spa_max_replication_override = SPA_DVAS_PER_BP;

static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl;

kmem_cache_t *spa_buffer_pool;
int spa_mode_global;

#ifdef ZFS_DEBUG
/*
 * Everything except dprintf, spa, and indirect_remap is on by default
 * in debug builds.
 */
int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA | ZFS_DEBUG_INDIRECT_REMAP);
#else
int zfs_flags = 0;
#endif

/*
 * zfs_recover can be set to nonzero to attempt to recover from
 * otherwise-fatal errors, typically caused by on-disk corruption.  When
 * set, calls to zfs_panic_recover() will turn into warning messages.
 * This should only be used as a last resort, as it typically results
 * in leaked space, or worse.
 */
boolean_t zfs_recover = B_FALSE;

/*
 * If destroy encounters an EIO while reading metadata (e.g. indirect
 * blocks), space referenced by the missing metadata can not be freed.
 * Normally this causes the background destroy to become "stalled", as
 * it is unable to make forward progress.  While in this stalled state,
 * all remaining space to free from the error-encountering filesystem is
 * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
 * permanently leak the space from indirect blocks that can not be read,
 * and continue to free everything else that it can.
 *
 * The default, "stalling" behavior is useful if the storage partially
 * fails (i.e. some but not all i/os fail), and then later recovers.  In
 * this case, we will be able to continue pool operations while it is
 * partially failed, and when it recovers, we can continue to free the
 * space, with no leaks.  However, note that this case is actually
 * fairly rare.
 *
 * Typically pools either (a) fail completely (but perhaps temporarily,
 * e.g. a top-level vdev going offline), or (b) have localized,
 * permanent errors (e.g. disk returns the wrong data due to bit flip or
 * firmware bug).  In case (a), this setting does not matter because the
 * pool will be suspended and the sync thread will not be able to make
 * forward progress regardless.  In case (b), because the error is
 * permanent, the best we can do is leak the minimum amount of space,
 * which is what setting this flag will do.  Therefore, it is reasonable
 * for this flag to normally be set, but we chose the more conservative
 * approach of not setting it, so that there is no possibility of
 * leaking space in the "partial temporary" failure case.
 */
boolean_t zfs_free_leak_on_eio = B_FALSE;

/*
 * Expiration time in milliseconds. This value has two meanings. First it is
 * used to determine when the spa_deadman() logic should fire. By default the
 * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
 * Secondly, the value determines if an I/O is considered "hung". Any I/O that
 * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
 * in a system panic.
 */
uint64_t zfs_deadman_synctime_ms = 1000000ULL;

/*
 * Check time in milliseconds. This defines the frequency at which we check
 * for hung I/O.
 */
uint64_t zfs_deadman_checktime_ms = 5000ULL;

/*
 * Default value of -1 for zfs_deadman_enabled is resolved in
 * zfs_deadman_init()
 */
int zfs_deadman_enabled = -1;

/*
 * The worst case is single-sector max-parity RAID-Z blocks, in which
 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
 * times the size; so just assume that.  Add to this the fact that
 * we can have up to 3 DVAs per bp, and one more factor of 2 because
 * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
 * the worst case is:
 *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
 */
int spa_asize_inflation = 24;

#if defined(__FreeBSD__) && defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
    "Try to recover from otherwise-fatal errors.");

static int
sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
{
	int err, val;

	val = zfs_flags;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	/*
	 * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
	 * arc buffers in the system have the necessary additional
	 * checksum data.  However, it is safe to disable at any
	 * time.
	 */
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		val &= ~ZFS_DEBUG_MODIFY;
	zfs_flags = val;

	return (0);
}

SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
    sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
    sysctl_vfs_zfs_debug_flags, "IU",
    "Debug flags for ZFS testing (deprecated, see vfs.zfs.debugflags).");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
    &zfs_deadman_synctime_ms, 0,
    "Stalled ZFS I/O expiration time in milliseconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
    &zfs_deadman_checktime_ms, 0,
    "Period of checks for stalled ZFS I/O in milliseconds");
SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
    &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
#endif

#ifndef illumos
#ifdef _KERNEL
static void
zfs_deadman_init()
{
	/*
	 * If we are not i386 or amd64 or in a virtual machine,
	 * disable ZFS deadman thread by default
	 */
	if (zfs_deadman_enabled == -1) {
#if defined(__amd64__) || defined(__i386__)
		zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
#else
		zfs_deadman_enabled = 0;
#endif
	}
}
#endif	/* _KERNEL */
#endif	/* !illumos */

/*
 * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
 * the pool to be consumed.  This ensures that we don't run the pool
 * completely out of space, due to unaccounted changes (e.g. to the MOS).
 * It also limits the worst-case time to allocate space.  If we have
 * less than this amount of free space, most ZPL operations (e.g. write,
 * create) will return ENOSPC.
 *
 * Certain operations (e.g. file removal, most administrative actions) can
 * use half the slop space.  They will only return ENOSPC if less than half
 * the slop space is free.  Typically, once the pool has less than the slop
 * space free, the user will use these operations to free up space in the pool.
 * These are the operations that call dsl_pool_adjustedsize() with the netfree
 * argument set to TRUE.
 *
 * Operations that are almost guaranteed to free up space in the absence of
 * a pool checkpoint can use up to three quarters of the slop space
 * (e.g. zfs destroy).
 *
 * A very restricted set of operations are always permitted, regardless of
 * the amount of free space.  These are the operations that call
 * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
 * increase in the amount of space used, it is possible to run the pool
 * completely out of space, causing it to be permanently read-only.
 *
 * Note that on very small pools, the slop space will be larger than
 * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
 * but we never allow it to be more than half the pool size.
 *
 * See also the comments in zfs_space_check_t.
 */
int spa_slop_shift = 5;
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
    &spa_slop_shift, 0,
    "Shift value of reserved space (1/(2^spa_slop_shift)).");
uint64_t spa_min_slop = 128 * 1024 * 1024;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
    &spa_min_slop, 0,
    "Minimal value of reserved space");
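
/*
 * Worked example (illustrative only): with the default spa_slop_shift of 5
 * the slop is 1/32 of the pool size, so a 10TiB pool reserves 320GiB.  A
 * 2GiB pool would compute only 64MiB and is therefore raised to
 * spa_min_slop (128MiB), while a pool smaller than 256MiB is capped at
 * half its size.
 */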

int spa_allocators = 4;

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_allocators, CTLFLAG_RWTUN,
    &spa_allocators, 0,
    "Number of allocators per metaslab group");

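/*
 * spa_load_failed() and spa_load_note() record pool load/import progress
 * via zfs_dbgmsg(), tagged with whether the configuration in use is trusted.
 */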
/*PRINTFLIKE2*/
void
spa_load_failed(spa_t *spa, const char *fmt, ...)
{
	va_list adx;
	char buf[256];

	va_start(adx, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
	va_end(adx);

	zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
	    spa->spa_trust_config ? "trusted" : "untrusted", buf);
}

/*PRINTFLIKE2*/
void
spa_load_note(spa_t *spa, const char *fmt, ...)
{
	va_list adx;
	char buf[256];

	va_start(adx, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
	va_end(adx);

	zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
	    spa->spa_trust_config ? "trusted" : "untrusted", buf);
}

/*
 * ==========================================================================
 * SPA config locking
 * ==========================================================================
 */
static void
spa_config_lock_init(spa_t *spa)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
		refcount_create_untracked(&scl->scl_count);
		scl->scl_writer = NULL;
		scl->scl_write_wanted = 0;
	}
}

static void
spa_config_lock_destroy(spa_t *spa)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		mutex_destroy(&scl->scl_lock);
		cv_destroy(&scl->scl_cv);
		refcount_destroy(&scl->scl_count);
		ASSERT(scl->scl_writer == NULL);
		ASSERT(scl->scl_write_wanted == 0);
	}
}

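/*
 * Non-blocking variant of spa_config_enter(): returns nonzero if all of the
 * requested locks could be taken, otherwise drops whatever was acquired so
 * far and returns zero.
 */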
int
spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		if (rw == RW_READER) {
			if (scl->scl_writer || scl->scl_write_wanted) {
				mutex_exit(&scl->scl_lock);
				spa_config_exit(spa, locks & ((1 << i) - 1),
				    tag);
				return (0);
			}
		} else {
			ASSERT(scl->scl_writer != curthread);
			if (!refcount_is_zero(&scl->scl_count)) {
				mutex_exit(&scl->scl_lock);
				spa_config_exit(spa, locks & ((1 << i) - 1),
				    tag);
				return (0);
			}
			scl->scl_writer = curthread;
		}
		(void) refcount_add(&scl->scl_count, tag);
		mutex_exit(&scl->scl_lock);
	}
	return (1);
}

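/*
 * Acquire the requested config locks, blocking as needed.  Readers wait
 * while a writer holds or has requested a lock; a writer waits for the hold
 * count to drain to zero before claiming scl_writer.
 */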
void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
	int wlocks_held = 0;

	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);

	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (scl->scl_writer == curthread)
			wlocks_held |= (1 << i);
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		if (rw == RW_READER) {
			while (scl->scl_writer || scl->scl_write_wanted) {
				cv_wait(&scl->scl_cv, &scl->scl_lock);
			}
		} else {
			ASSERT(scl->scl_writer != curthread);
			while (!refcount_is_zero(&scl->scl_count)) {
				scl->scl_write_wanted++;
				cv_wait(&scl->scl_cv, &scl->scl_lock);
				scl->scl_write_wanted--;
			}
			scl->scl_writer = curthread;
		}
		(void) refcount_add(&scl->scl_count, tag);
		mutex_exit(&scl->scl_lock);
	}
	ASSERT3U(wlocks_held, <=, locks);
}

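/*
 * Release the given config locks.  Dropping the last hold clears scl_writer
 * (a no-op for readers) and wakes up any waiters.
 */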
void
spa_config_exit(spa_t *spa, int locks, void *tag)
{
	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		ASSERT(!refcount_is_zero(&scl->scl_count));
		if (refcount_remove(&scl->scl_count, tag) == 0) {
			ASSERT(scl->scl_writer == NULL ||
			    scl->scl_writer == curthread);
			scl->scl_writer = NULL;	/* OK in either case */
			cv_broadcast(&scl->scl_cv);
		}
		mutex_exit(&scl->scl_lock);
	}
}

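/*
 * Report which of the requested locks are held: as writer by the calling
 * thread, or as reader by anyone (reader holds are not tracked per thread).
 * Used primarily in ASSERTs.
 */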
int
spa_config_held(spa_t *spa, int locks, krw_t rw)
{
	int locks_held = 0;

	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
		    (rw == RW_WRITER && scl->scl_writer == curthread))
			locks_held |= 1 << i;
	}

	return (locks_held);
}

/*
 * ==========================================================================
 * SPA namespace functions
 * ==========================================================================
 */

/*
 * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
 * Returns NULL if no matching spa_t is found.
 */
spa_t *
spa_lookup(const char *name)
{
	static spa_t search;	/* spa_t is large; don't allocate on stack */
	spa_t *spa;
	avl_index_t where;
	char *cp;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));

	/*
	 * If it's a full dataset name, figure out the pool name and
	 * just use that.
	 */
	cp = strpbrk(search.spa_name, "/@#");
	if (cp != NULL)
		*cp = '\0';

	spa = avl_find(&spa_namespace_avl, &search, &where);

	return (spa);
}

/*
 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
 * looking for potentially hung I/Os.
 */
static void
spa_deadman(void *arg, int pending)
{
	spa_t *spa = arg;

	/*
	 * Disable the deadman timer if the pool is suspended.
	 */
	if (spa_suspended(spa)) {
#ifdef illumos
		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else
		/* Nothing.  just don't schedule any future callouts. */
#endif
		return;
	}

	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
	    ++spa->spa_deadman_calls);
	if (zfs_deadman_enabled)
		vdev_deadman(spa->spa_root_vdev);
#ifdef __FreeBSD__
#ifdef _KERNEL
	callout_schedule(&spa->spa_deadman_cycid,
	    hz * zfs_deadman_checktime_ms / MILLISEC);
#endif
#endif
}

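/*
 * Callout handler that merely queues spa_deadman() on a taskqueue, since the
 * deadman itself needs to sleep (see the callout(9) discussion in spa_add()).
 */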
#if defined(__FreeBSD__) && defined(_KERNEL)
static void
spa_deadman_timeout(void *arg)
{
	spa_t *spa = arg;

	taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task);
}
#endif

/*
 * Create an uninitialized spa_t with the given name.  Requires
 * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
 * exist by calling spa_lookup() first.
 */
spa_t *
spa_add(const char *name, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	spa_config_dirent_t *dp;
#ifdef illumos
	cyc_handler_t hdlr;
	cyc_time_t when;
#endif

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);

	for (int t = 0; t < TXG_SIZE; t++)
		bplist_create(&spa->spa_free_bplist[t]);

	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
	spa->spa_state = POOL_STATE_UNINITIALIZED;
	spa->spa_freeze_txg = UINT64_MAX;
	spa->spa_final_txg = UINT64_MAX;
	spa->spa_load_max_txg = UINT64_MAX;
	spa->spa_proc = &p0;
	spa->spa_proc_state = SPA_PROC_NONE;
	spa->spa_trust_config = B_TRUE;

#ifdef illumos
	hdlr.cyh_func = spa_deadman;
	hdlr.cyh_arg = spa;
	hdlr.cyh_level = CY_LOW_LEVEL;
#endif

	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);

#ifdef illumos
	/*
	 * This determines how often we need to check for hung I/Os after
	 * the cyclic has already fired. Since checking for hung I/Os is
	 * an expensive operation we don't want to check too frequently.
	 * Instead wait for 5 seconds before checking again.
	 */
	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
	when.cyt_when = CY_INFINITY;
	mutex_enter(&cpu_lock);
	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
	mutex_exit(&cpu_lock);
#else	/* !illumos */
#ifdef _KERNEL
	/*
	 * callout(9) does not provide a way to initialize a callout with
	 * a function and an argument, so we use callout_reset() to schedule
	 * the callout in the very distant future.  Even if that event ever
	 * fires, it should be okay as we won't have any active zio-s.
	 * But normally spa_sync() will reschedule the callout with a proper
	 * timeout.
	 * callout(9) does not allow the callback function to sleep but
	 * vdev_deadman() needs to acquire vq_lock and illumos mutexes are
	 * emulated using sx(9).  For this reason spa_deadman_timeout()
	 * will schedule spa_deadman() as a task on a taskqueue that allows
	 * sleeping.
	 */
	TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
	callout_init(&spa->spa_deadman_cycid, 1);
	callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
	    spa_deadman_timeout, spa, 0);
#endif
#endif
	refcount_create(&spa->spa_refcount);
	spa_config_lock_init(spa);

	avl_add(&spa_namespace_avl, spa);

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot) {
		spa->spa_root = spa_strdup(altroot);
		spa_active_count++;
	}

	spa->spa_alloc_count = spa_allocators;
	spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (kmutex_t), KM_SLEEP);
	spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (avl_tree_t), KM_SLEEP);
	for (int i = 0; i < spa->spa_alloc_count; i++) {
		mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
		avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
	}

	/*
	 * Every pool starts with the default cachefile
	 */
	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
	    offsetof(spa_config_dirent_t, scd_link));

	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
	list_insert_head(&spa->spa_config_list, dp);

	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
	    KM_SLEEP) == 0);

	if (config != NULL) {
		nvlist_t *features;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
		    &features) == 0) {
			VERIFY(nvlist_dup(features, &spa->spa_label_features,
			    0) == 0);
		}

		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
	}

	if (spa->spa_label_features == NULL) {
		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
	}

	spa->spa_min_ashift = INT_MAX;
	spa->spa_max_ashift = 0;

	/*
	 * As a pool is being created, treat all features as disabled by
	 * setting SPA_FEATURE_DISABLED for all entries in the feature
	 * refcount cache.
	 */
	for (int i = 0; i < SPA_FEATURES; i++) {
		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
	}

	return (spa);
}

/*
 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
 * spa_namespace_lock.  This is called only after the spa_t has been closed and
 * deactivated.
 */
void
spa_remove(spa_t *spa)
{
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
	ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);

	nvlist_free(spa->spa_config_splitting);

	avl_remove(&spa_namespace_avl, spa);
	cv_broadcast(&spa_namespace_cv);

	if (spa->spa_root) {
		spa_strfree(spa->spa_root);
		spa_active_count--;
	}

	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
		list_remove(&spa->spa_config_list, dp);
		if (dp->scd_path != NULL)
			spa_strfree(dp->scd_path);
		kmem_free(dp, sizeof (spa_config_dirent_t));
	}

	for (int i = 0; i < spa->spa_alloc_count; i++) {
		avl_destroy(&spa->spa_alloc_trees[i]);
		mutex_destroy(&spa->spa_alloc_locks[i]);
	}
	kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
	    sizeof (kmutex_t));
	kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
	    sizeof (avl_tree_t));

	list_destroy(&spa->spa_config_list);

	nvlist_free(spa->spa_label_features);
	nvlist_free(spa->spa_load_info);
	spa_config_set(spa, NULL);

#ifdef illumos
	mutex_enter(&cpu_lock);
	if (spa->spa_deadman_cycid != CYCLIC_NONE)
		cyclic_remove(spa->spa_deadman_cycid);
	mutex_exit(&cpu_lock);
	spa->spa_deadman_cycid = CYCLIC_NONE;
#else	/* !illumos */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
	taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
#endif
#endif

	refcount_destroy(&spa->spa_refcount);

	spa_config_lock_destroy(spa);

	for (int t = 0; t < TXG_SIZE; t++)
		bplist_destroy(&spa->spa_free_bplist[t]);

	zio_checksum_templates_free(spa);

	cv_destroy(&spa->spa_async_cv);
	cv_destroy(&spa->spa_evicting_os_cv);
	cv_destroy(&spa->spa_proc_cv);
	cv_destroy(&spa->spa_scrub_io_cv);
	cv_destroy(&spa->spa_suspend_cv);

	mutex_destroy(&spa->spa_async_lock);
	mutex_destroy(&spa->spa_errlist_lock);
	mutex_destroy(&spa->spa_errlog_lock);
	mutex_destroy(&spa->spa_evicting_os_lock);
	mutex_destroy(&spa->spa_history_lock);
	mutex_destroy(&spa->spa_proc_lock);
	mutex_destroy(&spa->spa_props_lock);
	mutex_destroy(&spa->spa_cksum_tmpls_lock);
	mutex_destroy(&spa->spa_scrub_lock);
	mutex_destroy(&spa->spa_suspend_lock);
	mutex_destroy(&spa->spa_vdev_top_lock);

	kmem_free(spa, sizeof (spa_t));
}

/*
 * Given a pool, return the next pool in the namespace, or NULL if there is
 * none.  If 'prev' is NULL, return the first pool.
 */
spa_t *
spa_next(spa_t *prev)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (prev)
		return (AVL_NEXT(&spa_namespace_avl, prev));
	else
		return (avl_first(&spa_namespace_avl));
}

/*
 * ==========================================================================
 * SPA refcount functions
 * ==========================================================================
 */

/*
 * Add a reference to the given spa_t.  Must have at least one reference, or
 * have the namespace lock held.
 */
void
spa_open_ref(spa_t *spa, void *tag)
{
	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
	    MUTEX_HELD(&spa_namespace_lock));
	(void) refcount_add(&spa->spa_refcount, tag);
}

/*
 * Remove a reference to the given spa_t.  Must have at least one reference, or
 * have the namespace lock held.
 */
void
spa_close(spa_t *spa, void *tag)
{
	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
	    MUTEX_HELD(&spa_namespace_lock));
	(void) refcount_remove(&spa->spa_refcount, tag);
}

/*
 * Remove a reference to the given spa_t held by a dsl dir that is
 * being asynchronously released.  Async releases occur from a taskq
 * performing eviction of dsl datasets and dirs.  The namespace lock
 * isn't held and the hold by the object being evicted may contribute to
 * spa_minref (e.g. dataset or directory released during pool export),
 * so the asserts in spa_close() do not apply.
 */
void
spa_async_close(spa_t *spa, void *tag)
{
	(void) refcount_remove(&spa->spa_refcount, tag);
}

/*
 * Check to see if the spa refcount is zero.  Must be called with
 * spa_namespace_lock held.  We really compare against spa_minref, which is the
 * number of references acquired when opening a pool.
 */
boolean_t
spa_refcount_zero(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
}

/*
 * ==========================================================================
 * SPA spare and l2cache tracking
 * ==========================================================================
 */

/*
 * Hot spares and cache devices are tracked using the same code below,
 * for 'auxiliary' devices.
 */

typedef struct spa_aux {
	uint64_t	aux_guid;
	uint64_t	aux_pool;
	avl_node_t	aux_avl;
	int		aux_count;
} spa_aux_t;

static int
spa_aux_compare(const void *a, const void *b)
{
	const spa_aux_t *sa = a;
	const spa_aux_t *sb = b;

	if (sa->aux_guid < sb->aux_guid)
		return (-1);
	else if (sa->aux_guid > sb->aux_guid)
		return (1);
	else
		return (0);
}

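/*
 * Record one more use of the vdev's guid in the aux AVL tree, allocating a
 * new node on first use.
 */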
void
spa_aux_add(vdev_t *vd, avl_tree_t *avl)
{
	avl_index_t where;
	spa_aux_t search;
	spa_aux_t *aux;

	search.aux_guid = vd->vdev_guid;
	if ((aux = avl_find(avl, &search, &where)) != NULL) {
		aux->aux_count++;
	} else {
		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
		aux->aux_guid = vd->vdev_guid;
		aux->aux_count = 1;
		avl_insert(avl, aux, where);
	}
}

void
spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
{
	spa_aux_t search;
	spa_aux_t *aux;
	avl_index_t where;

	search.aux_guid = vd->vdev_guid;
	aux = avl_find(avl, &search, &where);

	ASSERT(aux != NULL);

	if (--aux->aux_count == 0) {
		avl_remove(avl, aux);
		kmem_free(aux, sizeof (spa_aux_t));
	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
		aux->aux_pool = 0ULL;
	}
}

boolean_t
spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
{
	spa_aux_t search, *found;

	search.aux_guid = guid;
	found = avl_find(avl, &search, NULL);

	if (pool) {
		if (found)
			*pool = found->aux_pool;
		else
			*pool = 0ULL;
	}

	if (refcnt) {
		if (found)
			*refcnt = found->aux_count;
		else
			*refcnt = 0;
	}

	return (found != NULL);
}

void
spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
{
	spa_aux_t search, *found;
	avl_index_t where;

	search.aux_guid = vd->vdev_guid;
	found = avl_find(avl, &search, &where);
	ASSERT(found != NULL);
	ASSERT(found->aux_pool == 0ULL);

	found->aux_pool = spa_guid(vd->vdev_spa);
}

/*
 * Spares are tracked globally due to the following constraints:
 *
 * 	- A spare may be part of multiple pools.
 * 	- A spare may be added to a pool even if it's actively in use within
 *	  another pool.
 * 	- A spare in use in any pool can only be the source of a replacement if
 *	  the target is a spare in the same pool.
 *
 * We keep track of all spares on the system through the use of a reference
 * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
 * spare, then we bump the reference count in the AVL tree.  In addition, we set
 * the 'vdev_isspare' member to indicate that the device is a spare (active or
 * inactive).  When a spare is made active (used to replace a device in the
 * pool), we also keep track of which pool it's been made a part of.
 *
 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
 * called under the spa_namespace lock as part of vdev reconfiguration.  The
 * separate spare lock exists for the status query path, which does not need to
 * be completely consistent with respect to other vdev configuration changes.
 */

static int
spa_spare_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_spare_add(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(!vd->vdev_isspare);
	spa_aux_add(vd, &spa_spare_avl);
	vd->vdev_isspare = B_TRUE;
	mutex_exit(&spa_spare_lock);
}

void
spa_spare_remove(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_remove(vd, &spa_spare_avl);
	vd->vdev_isspare = B_FALSE;
	mutex_exit(&spa_spare_lock);
}

boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
{
	boolean_t found;

	mutex_enter(&spa_spare_lock);
	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);

	return (found);
}

void
spa_spare_activate(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_activate(vd, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);
}

/*
 * Level 2 ARC devices are tracked globally for the same reasons as spares.
 * Cache devices currently only support one pool per cache device, and so
 * for these devices the aux reference count is currently unused beyond 1.
 */

static int
spa_l2cache_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_l2cache_add(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(!vd->vdev_isl2cache);
	spa_aux_add(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_TRUE;
	mutex_exit(&spa_l2cache_lock);
}

void
spa_l2cache_remove(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_remove(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_FALSE;
	mutex_exit(&spa_l2cache_lock);
}

boolean_t
spa_l2cache_exists(uint64_t guid, uint64_t *pool)
{
	boolean_t found;

	mutex_enter(&spa_l2cache_lock);
	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);

	return (found);
}

void
spa_l2cache_activate(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_activate(vd, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);
}

/*
 * ==========================================================================
 * SPA vdev locking
 * ==========================================================================
 */

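/*
 * Typical usage sketch (illustrative only):
 *
 *	uint64_t txg = spa_vdev_enter(spa);
 *	... add, remove, attach or detach vdevs ...
 *	return (spa_vdev_exit(spa, vd_to_free_or_NULL, txg, error));
 */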
/*
 * Lock the given spa_t for the purpose of adding or removing a vdev.
 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
 * It returns the next transaction group for the spa_t.
 */
uint64_t
spa_vdev_enter(spa_t *spa)
{
	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	return (spa_vdev_config_enter(spa));
}

/*
 * Internal implementation for spa_vdev_enter().  Used when a vdev
 * operation requires multiple syncs (i.e. removing a device) while
 * keeping the spa_namespace_lock held.
 */
uint64_t
spa_vdev_config_enter(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	return (spa_last_synced_txg(spa) + 1);
}

/*
 * Used in combination with spa_vdev_config_enter() to allow the syncing
 * of multiple transactions without releasing the spa_namespace_lock.
 */
127822522Sdavidnvoid
127936845Sdfrspa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
12801541Srgrimes{
12811541Srgrimes	ASSERT(MUTEX_HELD(&spa_namespace_lock));
128222522Sdavidn
128322522Sdavidn	boolean_t config_changed = B_FALSE;
128423330Sache
12851541Srgrimes	ASSERT(txg > spa_last_synced_txg(spa));
12861541Srgrimes
128731891Ssef	spa->spa_pending_vdev = NULL;
128831891Ssef
128931891Ssef	/*
129055338Sphk	 * Reassess the DTLs.
129131891Ssef	 */
129231891Ssef	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
129355707Ssef
129431891Ssef	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
129531891Ssef		config_changed = B_TRUE;
129665495Struckman		spa->spa_config_generation++;
129765495Struckman	}
129865495Struckman
129965495Struckman	/*
130065495Struckman	 * Verify the metaslab classes.
130165495Struckman	 */
130265495Struckman	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
130365495Struckman	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
130465495Struckman
130565495Struckman	spa_config_exit(spa, SCL_ALL, spa);
130665495Struckman
130765495Struckman	/*
130865495Struckman	 * Panic the system if the specified tag requires it.  This
130965495Struckman	 * is useful for ensuring that configurations are updated
131065495Struckman	 * transactionally.
131165495Struckman	 */
131265495Struckman	if (zio_injection_enabled)
131365495Struckman		zio_handle_panic_injection(spa, tag, 0);
131465495Struckman
131565495Struckman	/*
131665495Struckman	 * Note: this txg_wait_synced() is important because it ensures
131765495Struckman	 * that there won't be more than one config change per txg.
131865495Struckman	 * This allows us to use the txg as the generation number.
131965495Struckman	 */
132065495Struckman	if (error == 0)
132165495Struckman		txg_wait_synced(spa->spa_dsl_pool, txg);
132265495Struckman
132365495Struckman	if (vd != NULL) {
132465495Struckman		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
132565495Struckman		if (vd->vdev_ops->vdev_op_leaf) {
132667629Sgallatin			mutex_enter(&vd->vdev_initialize_lock);
132765495Struckman			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
132865495Struckman			mutex_exit(&vd->vdev_initialize_lock);
132965495Struckman		}
133065495Struckman
133165495Struckman		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
133265495Struckman		vdev_free(vd);
133365495Struckman		spa_config_exit(spa, SCL_ALL, spa);
133465495Struckman	}
133565495Struckman
133665495Struckman	/*
133765495Struckman	 * If the config changed, update the config cache.
133865495Struckman	 */
133965495Struckman	if (config_changed)
134065495Struckman		spa_write_cachefile(spa, B_FALSE, B_TRUE);
134165495Struckman}
134265495Struckman
1343/*
1344 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
1345 * locking of spa_vdev_enter(), we also want to make sure the transactions have
1346 * synced to disk, and then update the global configuration cache with the new
1347 * information.
1348 */
1349int
1350spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
1351{
1352	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
1353	mutex_exit(&spa_namespace_lock);
1354	mutex_exit(&spa->spa_vdev_top_lock);
1355
1356	return (error);
1357}
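
/*
 * Illustrative sketch (not part of the original file): callers that add or
 * remove vdevs typically bracket the whole operation with the pair above.
 * The operation in the middle is hypothetical; only the pattern matters.
 *
 *	uint64_t txg = spa_vdev_enter(spa);
 *
 *	error = ... modify the vdev tree ...;
 *
 *	return (spa_vdev_exit(spa, vd_to_free, txg, error));
 *
 * where vd_to_free may be NULL; when error == 0, spa_vdev_exit() waits for
 * the txg to sync, so the change is durable by the time it returns.
 */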
1358
1359/*
1360 * Lock the given spa_t for the purpose of changing vdev state.
1361 */
1362void
1363spa_vdev_state_enter(spa_t *spa, int oplocks)
1364{
1365	int locks = SCL_STATE_ALL | oplocks;
1366
1367	/*
1368	 * Root pools may need to read from the underlying devfs filesystem
1369	 * when opening up a vdev.  Unfortunately if we're holding the
1370	 * SCL_ZIO lock it will result in a deadlock when we try to issue
1371	 * the read from the root filesystem.  Instead we "prefetch"
1372	 * the associated vnodes that we need prior to opening the
1373	 * underlying devices and cache them so that we can prevent
1374	 * any I/O when we are doing the actual open.
1375	 */
1376	if (spa_is_root(spa)) {
1377		int low = locks & ~(SCL_ZIO - 1);
1378		int high = locks & ~low;
1379
1380		spa_config_enter(spa, high, spa, RW_WRITER);
1381		vdev_hold(spa->spa_root_vdev);
1382		spa_config_enter(spa, low, spa, RW_WRITER);
1383	} else {
1384		spa_config_enter(spa, locks, spa, RW_WRITER);
1385	}
1386	spa->spa_vdev_locks = locks;
1387}
1388
1389int
1390spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
1391{
1392	boolean_t config_changed = B_FALSE;
1393
1394	if (vd != NULL || error == 0)
1395		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
1396		    0, 0, B_FALSE);
1397
1398	if (vd != NULL) {
1399		vdev_state_dirty(vd->vdev_top);
1400		config_changed = B_TRUE;
1401		spa->spa_config_generation++;
1402	}
1403
1404	if (spa_is_root(spa))
1405		vdev_rele(spa->spa_root_vdev);
1406
1407	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1408	spa_config_exit(spa, spa->spa_vdev_locks, spa);
1409
1410	/*
1411	 * If anything changed, wait for it to sync.  This ensures that,
1412	 * from the system administrator's perspective, zpool(1M) commands
1413	 * are synchronous.  This is important for things like zpool offline:
1414	 * when the command completes, you expect no further I/O from ZFS.
1415	 */
1416	if (vd != NULL)
1417		txg_wait_synced(spa->spa_dsl_pool, 0);
1418
1419	/*
1420	 * If the config changed, update the config cache.
1421	 */
1422	if (config_changed) {
1423		mutex_enter(&spa_namespace_lock);
1424		spa_write_cachefile(spa, B_FALSE, B_TRUE);
1425		mutex_exit(&spa_namespace_lock);
1426	}
1427
1428	return (error);
1429}
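
/*
 * Illustrative sketch (not part of the original file): per-vdev state
 * changes (e.g. online/offline) follow the pattern below; the guid and
 * the error handling shown here are hypothetical.
 *
 *	spa_vdev_state_enter(spa, SCL_NONE);
 *
 *	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 *		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 *
 *	... change vd's state ...
 *
 *	return (spa_vdev_state_exit(spa, vd, 0));
 */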
1430
1431/*
1432 * ==========================================================================
1433 * Miscellaneous functions
1434 * ==========================================================================
1435 */
1436
1437void
1438spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
1439{
1440	if (!nvlist_exists(spa->spa_label_features, feature)) {
1441		fnvlist_add_boolean(spa->spa_label_features, feature);
1442		/*
1443		 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
1444		 * dirty the vdev config because the SCL_CONFIG lock is not held.
1445		 * Thankfully, in this case we don't need to dirty the config
1446		 * because it will be written out anyway when we finish
1447		 * creating the pool.
1448		 */
1449		if (tx->tx_txg != TXG_INITIAL)
1450			vdev_config_dirty(spa->spa_root_vdev);
1451	}
1452}
1453
1454void
1455spa_deactivate_mos_feature(spa_t *spa, const char *feature)
1456{
1457	if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
1458		vdev_config_dirty(spa->spa_root_vdev);
1459}
1460
1461/*
1462 * Rename a spa_t.
1463 */
1464int
1465spa_rename(const char *name, const char *newname)
1466{
1467	spa_t *spa;
1468	int err;
1469
1470	/*
1471	 * Lookup the spa_t and grab the config lock for writing.  We need to
1472	 * actually open the pool so that we can sync out the necessary labels.
1473	 * It's OK to call spa_open() with the namespace lock held because we
1474	 * allow recursive calls for other reasons.
1475	 */
1476	mutex_enter(&spa_namespace_lock);
1477	if ((err = spa_open(name, &spa, FTAG)) != 0) {
1478		mutex_exit(&spa_namespace_lock);
1479		return (err);
1480	}
1481
1482	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1483
1484	avl_remove(&spa_namespace_avl, spa);
1485	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
1486	avl_add(&spa_namespace_avl, spa);
1487
1488	/*
1489	 * Sync all labels to disk with the new names by marking the root vdev
1490	 * dirty and waiting for it to sync.  It will pick up the new pool name
1491	 * during the sync.
1492	 */
1493	vdev_config_dirty(spa->spa_root_vdev);
1494
1495	spa_config_exit(spa, SCL_ALL, FTAG);
1496
1497	txg_wait_synced(spa->spa_dsl_pool, 0);
1498
1499	/*
1500	 * Sync the updated config cache.
1501	 */
1502	spa_write_cachefile(spa, B_FALSE, B_TRUE);
1503
1504	spa_close(spa, FTAG);
1505
1506	mutex_exit(&spa_namespace_lock);
1507
1508	return (0);
1509}
1510
1511/*
1512 * Return the spa_t associated with the given pool_guid, if it exists.  If
1513 * device_guid is non-zero, determine whether the pool exists *and* contains
1514 * a device with the specified device_guid.
1515 */
1516spa_t *
1517spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1518{
1519	spa_t *spa;
1520	avl_tree_t *t = &spa_namespace_avl;
1521
1522	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1523
1524	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1525		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1526			continue;
1527		if (spa->spa_root_vdev == NULL)
1528			continue;
1529		if (spa_guid(spa) == pool_guid) {
1530			if (device_guid == 0)
1531				break;
1532
1533			if (vdev_lookup_by_guid(spa->spa_root_vdev,
1534			    device_guid) != NULL)
1535				break;
1536
1537			/*
1538			 * Check any devices we may be in the process of adding.
1539			 */
1540			if (spa->spa_pending_vdev) {
1541				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1542				    device_guid) != NULL)
1543					break;
1544			}
1545		}
1546	}
1547
1548	return (spa);
1549}
1550
1551/*
1552 * Determine whether a pool with the given pool_guid exists.
1553 */
1554boolean_t
1555spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1556{
1557	return (spa_by_guid(pool_guid, device_guid) != NULL);
1558}
1559
1560char *
1561spa_strdup(const char *s)
1562{
1563	size_t len;
1564	char *new;
1565
1566	len = strlen(s);
1567	new = kmem_alloc(len + 1, KM_SLEEP);
1568	bcopy(s, new, len);
1569	new[len] = '\0';
1570
1571	return (new);
1572}
1573
1574void
1575spa_strfree(char *s)
1576{
1577	kmem_free(s, strlen(s) + 1);
1578}
1579
1580uint64_t
1581spa_get_random(uint64_t range)
1582{
1583	uint64_t r;
1584
1585	ASSERT(range != 0);
1586
1587	(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
1588
1589	return (r % range);
1590}
1591
1592uint64_t
1593spa_generate_guid(spa_t *spa)
1594{
1595	uint64_t guid = spa_get_random(-1ULL);
1596
1597	if (spa != NULL) {
1598		while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
1599			guid = spa_get_random(-1ULL);
1600	} else {
1601		while (guid == 0 || spa_guid_exists(guid, 0))
1602			guid = spa_get_random(-1ULL);
1603	}
1604
1605	return (guid);
1606}
1607
1608void
1609snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
1610{
1611	char type[256];
1612	char *checksum = NULL;
1613	char *compress = NULL;
1614
1615	if (bp != NULL) {
1616		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
1617			dmu_object_byteswap_t bswap =
1618			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
1619			(void) snprintf(type, sizeof (type), "bswap %s %s",
1620			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
1621			    "metadata" : "data",
1622			    dmu_ot_byteswap[bswap].ob_name);
1623		} else {
1624			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
1625			    sizeof (type));
1626		}
1627		if (!BP_IS_EMBEDDED(bp)) {
1628			checksum =
1629			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
1630		}
1631		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
1632	}
1633
1634	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
1635	    compress);
1636}
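
/*
 * Illustrative sketch (not part of the original file): callers typically
 * format a block pointer into a stack buffer for debug output; the
 * zfs_dbgmsg() call below is a hypothetical example.
 *
 *	char blkbuf[BP_SPRINTF_LEN];
 *
 *	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 *	zfs_dbgmsg("examining bp %s", blkbuf);
 */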
1637
1638void
1639spa_freeze(spa_t *spa)
1640{
1641	uint64_t freeze_txg = 0;
1642
1643	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1644	if (spa->spa_freeze_txg == UINT64_MAX) {
1645		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1646		spa->spa_freeze_txg = freeze_txg;
1647	}
1648	spa_config_exit(spa, SCL_ALL, FTAG);
1649	if (freeze_txg != 0)
1650		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1651}
1652
1653void
1654zfs_panic_recover(const char *fmt, ...)
1655{
1656	va_list adx;
1657
1658	va_start(adx, fmt);
1659	vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1660	va_end(adx);
1661}
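
/*
 * Illustrative sketch (not part of the original file): zfs_panic_recover()
 * is called where on-disk inconsistencies are detected, so that setting the
 * zfs_recover tunable turns the panic into a warning.  The check below is
 * hypothetical.
 *
 *	if (birth > spa_syncing_txg(spa))
 *		zfs_panic_recover("blkptr at %p has invalid BIRTH %llu",
 *		    bp, (u_longlong_t)birth);
 */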
1662
1663/*
1664 * This is a stripped-down version of strtoull, suitable only for converting
1665 * lowercase hexadecimal numbers that don't overflow.
1666 */
1667uint64_t
1668zfs_strtonum(const char *str, char **nptr)
1669{
1670	uint64_t val = 0;
1671	char c;
1672	int digit;
1673
1674	while ((c = *str) != '\0') {
1675		if (c >= '0' && c <= '9')
1676			digit = c - '0';
1677		else if (c >= 'a' && c <= 'f')
1678			digit = 10 + c - 'a';
1679		else
1680			break;
1681
1682		val *= 16;
1683		val += digit;
1684
1685		str++;
1686	}
1687
1688	if (nptr)
1689		*nptr = (char *)str;
1690
1691	return (val);
1692}
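
/*
 * Illustrative example (not part of the original file): parsing a value
 * printed as lowercase hex, with *nptr left at the first non-hex character.
 *
 *	char *end;
 *	uint64_t guid = zfs_strtonum("1a2b3c4d", &end);
 *
 * yields guid == 0x1a2b3c4d and *end == '\0'.
 */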
1693
1694/*
1695 * ==========================================================================
1696 * Accessor functions
1697 * ==========================================================================
1698 */
1699
1700boolean_t
1701spa_shutting_down(spa_t *spa)
1702{
1703	return (spa->spa_async_suspended);
1704}
1705
1706dsl_pool_t *
1707spa_get_dsl(spa_t *spa)
1708{
1709	return (spa->spa_dsl_pool);
1710}
1711
1712boolean_t
1713spa_is_initializing(spa_t *spa)
1714{
1715	return (spa->spa_is_initializing);
1716}
1717
1718boolean_t
1719spa_indirect_vdevs_loaded(spa_t *spa)
1720{
1721	return (spa->spa_indirect_vdevs_loaded);
1722}
1723
1724blkptr_t *
1725spa_get_rootblkptr(spa_t *spa)
1726{
1727	return (&spa->spa_ubsync.ub_rootbp);
1728}
1729
1730void
1731spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1732{
1733	spa->spa_uberblock.ub_rootbp = *bp;
1734}
1735
1736void
1737spa_altroot(spa_t *spa, char *buf, size_t buflen)
1738{
1739	if (spa->spa_root == NULL)
1740		buf[0] = '\0';
1741	else
1742		(void) strncpy(buf, spa->spa_root, buflen);
1743}
1744
1745int
1746spa_sync_pass(spa_t *spa)
1747{
1748	return (spa->spa_sync_pass);
1749}
1750
1751char *
1752spa_name(spa_t *spa)
1753{
1754	return (spa->spa_name);
1755}
1756
1757uint64_t
1758spa_guid(spa_t *spa)
1759{
1760	dsl_pool_t *dp = spa_get_dsl(spa);
1761	uint64_t guid;
1762
1763	/*
1764	 * If we fail to parse the config during spa_load(), we can go through
1765	 * the error path (which posts an ereport) and end up here with no root
1766	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
1767	 * this case.
1768	 */
1769	if (spa->spa_root_vdev == NULL)
1770		return (spa->spa_config_guid);
1771
1772	guid = spa->spa_last_synced_guid != 0 ?
1773	    spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
1774
1775	/*
1776	 * Return the most recently synced out guid unless we're
1777	 * in syncing context.
1778	 */
1779	if (dp && dsl_pool_sync_context(dp))
1780		return (spa->spa_root_vdev->vdev_guid);
1781	else
1782		return (guid);
1783}
1784
1785uint64_t
1786spa_load_guid(spa_t *spa)
1787{
1788	/*
1789	 * This is a GUID that exists solely as a reference for the
1790 * purposes of the ARC.  It is generated at load time, and
1791	 * is never written to persistent storage.
1792	 */
1793	return (spa->spa_load_guid);
1794}
1795
1796uint64_t
1797spa_last_synced_txg(spa_t *spa)
1798{
1799	return (spa->spa_ubsync.ub_txg);
1800}
1801
1802uint64_t
1803spa_first_txg(spa_t *spa)
1804{
1805	return (spa->spa_first_txg);
1806}
1807
1808uint64_t
1809spa_syncing_txg(spa_t *spa)
1810{
1811	return (spa->spa_syncing_txg);
1812}
1813
1814/*
1815 * Return the last txg where data can be dirtied. The final txgs
1816 * are used only to clear out any deferred frees that remain.
1817 */
1818uint64_t
1819spa_final_dirty_txg(spa_t *spa)
1820{
1821	return (spa->spa_final_txg - TXG_DEFER_SIZE);
1822}
1823
1824pool_state_t
1825spa_state(spa_t *spa)
1826{
1827	return (spa->spa_state);
1828}
1829
1830spa_load_state_t
1831spa_load_state(spa_t *spa)
1832{
1833	return (spa->spa_load_state);
1834}
1835
1836uint64_t
1837spa_freeze_txg(spa_t *spa)
1838{
1839	return (spa->spa_freeze_txg);
1840}
1841
1842/* ARGSUSED */
1843uint64_t
1844spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
1845{
1846	return (lsize * spa_asize_inflation);
1847}
1848
1849/*
1850 * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.2%),
1851 * or at least 128MB, unless that would cause it to be more than half the
1852 * pool size.
1853 *
1854 * See the comment above spa_slop_shift for details.
1855 */
1856uint64_t
1857spa_get_slop_space(spa_t *spa)
1858{
1859	uint64_t space = spa_get_dspace(spa);
1860	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
1861}
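
/*
 * Worked example (illustrative, assuming the default spa_slop_shift of 5
 * and spa_min_slop of 128MB):
 *
 *	1TB pool:    MAX(1TB >> 5,  MIN(512GB, 128MB)) = 32GB of slop
 *	2GB pool:    MAX(64MB,      MIN(1GB,   128MB)) = 128MB of slop
 *	100MB pool:  MAX(~3MB,      MIN(50MB,  128MB)) = 50MB (half the pool)
 */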
1862
1863uint64_t
1864spa_get_dspace(spa_t *spa)
1865{
1866	return (spa->spa_dspace);
1867}
1868
1869uint64_t
1870spa_get_checkpoint_space(spa_t *spa)
1871{
1872	return (spa->spa_checkpoint_info.sci_dspace);
1873}
1874
1875void
1876spa_update_dspace(spa_t *spa)
1877{
1878	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1879	    ddt_get_dedup_dspace(spa);
1880	if (spa->spa_vdev_removal != NULL) {
1881		/*
1882		 * We can't allocate from the removing device, so
1883		 * subtract its size.  This prevents the DMU/DSL from
1884		 * filling up the (now smaller) pool while we are in the
1885		 * middle of removing the device.
1886		 *
1887		 * Note that the DMU/DSL doesn't actually know or care
1888		 * how much space is allocated (it does its own tracking
1889		 * of how much space has been logically used).  So it
1890		 * doesn't matter that the data we are moving may be
1891		 * allocated twice (on the old device and the new
1892		 * device).
1893		 */
1894		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1895		vdev_t *vd =
1896		    vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
1897		spa->spa_dspace -= spa_deflate(spa) ?
1898		    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
1899		spa_config_exit(spa, SCL_VDEV, FTAG);
1900	}
1901}
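
/*
 * Illustrative example (hypothetical numbers): with 10TB of normal-class
 * dspace, 1TB of dspace credited for dedup savings, and a 2TB top-level
 * vdev in the middle of being removed, spa_dspace becomes
 * 10TB + 1TB - 2TB = 9TB, so the DMU/DSL throttles allocations as if the
 * pool had already shrunk.
 */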
1902
1903/*
1904 * Return the failure mode that has been set for this pool. The default
1905 * behavior will be to block all I/Os when a complete failure occurs.
1906 */
1907uint8_t
1908spa_get_failmode(spa_t *spa)
1909{
1910	return (spa->spa_failmode);
1911}
1912
1913boolean_t
1914spa_suspended(spa_t *spa)
1915{
1916	return (spa->spa_suspended);
1917}
1918
1919uint64_t
1920spa_version(spa_t *spa)
1921{
1922	return (spa->spa_ubsync.ub_version);
1923}
1924
1925boolean_t
1926spa_deflate(spa_t *spa)
1927{
1928	return (spa->spa_deflate);
1929}
1930
1931metaslab_class_t *
1932spa_normal_class(spa_t *spa)
1933{
1934	return (spa->spa_normal_class);
1935}
1936
1937metaslab_class_t *
1938spa_log_class(spa_t *spa)
1939{
1940	return (spa->spa_log_class);
1941}
1942
1943void
1944spa_evicting_os_register(spa_t *spa, objset_t *os)
1945{
1946	mutex_enter(&spa->spa_evicting_os_lock);
1947	list_insert_head(&spa->spa_evicting_os_list, os);
1948	mutex_exit(&spa->spa_evicting_os_lock);
1949}
1950
1951void
1952spa_evicting_os_deregister(spa_t *spa, objset_t *os)
1953{
1954	mutex_enter(&spa->spa_evicting_os_lock);
1955	list_remove(&spa->spa_evicting_os_list, os);
1956	cv_broadcast(&spa->spa_evicting_os_cv);
1957	mutex_exit(&spa->spa_evicting_os_lock);
1958}
1959
1960void
1961spa_evicting_os_wait(spa_t *spa)
1962{
1963	mutex_enter(&spa->spa_evicting_os_lock);
1964	while (!list_is_empty(&spa->spa_evicting_os_list))
1965		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
1966	mutex_exit(&spa->spa_evicting_os_lock);
1967
1968	dmu_buf_user_evict_wait();
1969}
1970
1971int
1972spa_max_replication(spa_t *spa)
1973{
1974	/*
1975	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1976	 * handle BPs with more than one DVA allocated.  Set our max
1977	 * replication level accordingly.
1978	 */
1979	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1980		return (1);
1981	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1982}
1983
1984int
1985spa_prev_software_version(spa_t *spa)
1986{
1987	return (spa->spa_prev_software_version);
1988}
1989
1990uint64_t
1991spa_deadman_synctime(spa_t *spa)
1992{
1993	return (spa->spa_deadman_synctime);
1994}
1995
1996uint64_t
1997dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
1998{
1999	uint64_t asize = DVA_GET_ASIZE(dva);
2000	uint64_t dsize = asize;
2001
2002	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2003
2004	if (asize != 0 && spa->spa_deflate) {
2005		uint64_t vdev = DVA_GET_VDEV(dva);
2006		vdev_t *vd = vdev_lookup_top(spa, vdev);
2007		if (vd == NULL) {
2008			panic(
2009			    "dva_get_dsize_sync(): bad DVA %llu:%llu",
2010			    (u_longlong_t)vdev, (u_longlong_t)asize);
2011		}
2012		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
2013	}
2014
2015	return (dsize);
2016}
2017
2018uint64_t
2019bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
2020{
2021	uint64_t dsize = 0;
2022
2023	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
2024		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
2025
2026	return (dsize);
2027}
2028
2029uint64_t
2030bp_get_dsize(spa_t *spa, const blkptr_t *bp)
2031{
2032	uint64_t dsize = 0;
2033
2034	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2035
2036	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
2037		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
2038
2039	spa_config_exit(spa, SCL_VDEV, FTAG);
2040
2041	return (dsize);
2042}
2043
2044uint64_t
2045spa_dirty_data(spa_t *spa)
2046{
2047	return (spa->spa_dsl_pool->dp_dirty_total);
2048}
2049
2050/*
2051 * ==========================================================================
2052 * Initialization and Termination
2053 * ==========================================================================
2054 */
2055
2056static int
2057spa_name_compare(const void *a1, const void *a2)
2058{
2059	const spa_t *s1 = a1;
2060	const spa_t *s2 = a2;
2061	int s;
2062
2063	s = strcmp(s1->spa_name, s2->spa_name);
2064	if (s > 0)
2065		return (1);
2066	if (s < 0)
2067		return (-1);
2068	return (0);
2069}
2070
2071int
2072spa_busy(void)
2073{
2074	return (spa_active_count);
2075}
2076
2077void
2078spa_boot_init()
2079{
2080	spa_config_load();
2081}
2082
2083#ifdef _KERNEL
2084EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
2085#endif
2086
2087void
2088spa_init(int mode)
2089{
2090	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
2091	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
2092	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
2093	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
2094
2095	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
2096	    offsetof(spa_t, spa_avl));
2097
2098	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
2099	    offsetof(spa_aux_t, aux_avl));
2100
2101	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
2102	    offsetof(spa_aux_t, aux_avl));
2103
2104	spa_mode_global = mode;
2105
2106#ifdef illumos
2107#ifdef _KERNEL
2108	spa_arch_init();
2109#else
2110	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
2111		arc_procfd = open("/proc/self/ctl", O_WRONLY);
2112		if (arc_procfd == -1) {
2113			perror("could not enable watchpoints: "
2114			    "opening /proc/self/ctl failed: ");
2115		} else {
2116			arc_watch = B_TRUE;
2117		}
2118	}
2119#endif
2120#endif /* illumos */
2121	refcount_sysinit();
2122	unique_init();
2123	range_tree_init();
2124	metaslab_alloc_trace_init();
2125	zio_init();
2126	lz4_init();
2127	dmu_init();
2128	zil_init();
2129	vdev_cache_stat_init();
2130	vdev_file_init();
2131	zfs_prop_init();
2132	zpool_prop_init();
2133	zpool_feature_init();
2134	spa_config_load();
2135	l2arc_start();
2136	scan_init();
2137	dsl_scan_global_init();
2138#ifndef illumos
2139#ifdef _KERNEL
2140	zfs_deadman_init();
2141#endif
2142#endif	/* !illumos */
2143}
2144
2145void
2146spa_fini(void)
2147{
2148	l2arc_stop();
2149
2150	spa_evict_all();
2151
2152	vdev_file_fini();
2153	vdev_cache_stat_fini();
2154	zil_fini();
2155	dmu_fini();
2156	lz4_fini();
2157	zio_fini();
2158	metaslab_alloc_trace_fini();
2159	range_tree_fini();
2160	unique_fini();
2161	refcount_fini();
2162	scan_fini();
2163
2164	avl_destroy(&spa_namespace_avl);
2165	avl_destroy(&spa_spare_avl);
2166	avl_destroy(&spa_l2cache_avl);
2167
2168	cv_destroy(&spa_namespace_cv);
2169	mutex_destroy(&spa_namespace_lock);
2170	mutex_destroy(&spa_spare_lock);
2171	mutex_destroy(&spa_l2cache_lock);
2172}
2173
2174/*
2175 * Return whether this pool has slogs. No locking needed.
2176 * It's not a problem if the wrong answer is returned, as it's only used for
2177 * performance and not correctness.
2178 */
2179boolean_t
2180spa_has_slogs(spa_t *spa)
2181{
2182	return (spa->spa_log_class->mc_rotor != NULL);
2183}
2184
2185spa_log_state_t
2186spa_get_log_state(spa_t *spa)
2187{
2188	return (spa->spa_log_state);
2189}
2190
2191void
2192spa_set_log_state(spa_t *spa, spa_log_state_t state)
2193{
2194	spa->spa_log_state = state;
2195}
2196
2197boolean_t
2198spa_is_root(spa_t *spa)
2199{
2200	return (spa->spa_is_root);
2201}
2202
2203boolean_t
2204spa_writeable(spa_t *spa)
2205{
2206	return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config);
2207}
2208
2209/*
2210 * Returns true if there is a pending sync task in any of the current
2211 * syncing txg, the current quiescing txg, or the current open txg.
2212 */
2213boolean_t
2214spa_has_pending_synctask(spa_t *spa)
2215{
2216	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
2217	    !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
2218}
2219
2220int
2221spa_mode(spa_t *spa)
2222{
2223	return (spa->spa_mode);
2224}
2225
2226uint64_t
2227spa_bootfs(spa_t *spa)
2228{
2229	return (spa->spa_bootfs);
2230}
2231
2232uint64_t
2233spa_delegation(spa_t *spa)
2234{
2235	return (spa->spa_delegation);
2236}
2237
2238objset_t *
2239spa_meta_objset(spa_t *spa)
2240{
2241	return (spa->spa_meta_objset);
2242}
2243
2244enum zio_checksum
2245spa_dedup_checksum(spa_t *spa)
2246{
2247	return (spa->spa_dedup_checksum);
2248}
2249
2250/*
2251 * Reset pool scan stat per scan pass (or reboot).
2252 */
2253void
2254spa_scan_stat_init(spa_t *spa)
2255{
2256	/* data not stored on disk */
2257	spa->spa_scan_pass_start = gethrestime_sec();
2258	if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
2259		spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
2260	else
2261		spa->spa_scan_pass_scrub_pause = 0;
2262	spa->spa_scan_pass_scrub_spent_paused = 0;
2263	spa->spa_scan_pass_exam = 0;
2264	spa->spa_scan_pass_issued = 0;
2265	vdev_scan_stat_init(spa->spa_root_vdev);
2266}
2267
2268/*
2269 * Get scan stats for zpool status reports
2270 */
2271int
2272spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
2273{
2274	dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
2275
2276	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
2277		return (SET_ERROR(ENOENT));
2278	bzero(ps, sizeof (pool_scan_stat_t));
2279
2280	/* data stored on disk */
2281	ps->pss_func = scn->scn_phys.scn_func;
2282	ps->pss_state = scn->scn_phys.scn_state;
2283	ps->pss_start_time = scn->scn_phys.scn_start_time;
2284	ps->pss_end_time = scn->scn_phys.scn_end_time;
2285	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
2286	ps->pss_to_process = scn->scn_phys.scn_to_process;
2287	ps->pss_processed = scn->scn_phys.scn_processed;
2288	ps->pss_errors = scn->scn_phys.scn_errors;
2289	ps->pss_examined = scn->scn_phys.scn_examined;
2290	ps->pss_issued =
2291	    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
2292	/* data not stored on disk */
2293	ps->pss_pass_start = spa->spa_scan_pass_start;
2294	ps->pss_pass_exam = spa->spa_scan_pass_exam;
2295	ps->pss_pass_issued = spa->spa_scan_pass_issued;
2296	ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
2297	ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
2298
2299	return (0);
2300}
2301
2302int
2303spa_maxblocksize(spa_t *spa)
2304{
2305	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
2306		return (SPA_MAXBLOCKSIZE);
2307	else
2308		return (SPA_OLD_MAXBLOCKSIZE);
2309}
2310
2311/*
2312 * Returns the txg in which the last device removal completed.  No indirect
2313 * mappings have been added since this txg.
2314 */
2315uint64_t
2316spa_get_last_removal_txg(spa_t *spa)
2317{
2318	uint64_t vdevid;
2319	uint64_t ret = -1ULL;
2320
2321	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2322	/*
2323	 * sr_prev_indirect_vdev is only modified while holding all the
2324	 * config locks, so it is sufficient to hold SCL_VDEV as reader when
2325	 * examining it.
2326	 */
2327	vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
2328
2329	while (vdevid != -1ULL) {
2330		vdev_t *vd = vdev_lookup_top(spa, vdevid);
2331		vdev_indirect_births_t *vib = vd->vdev_indirect_births;
2332
2333		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
2334
2335		/*
2336		 * If the removal did not remap any data, we don't care.
2337		 */
2338		if (vdev_indirect_births_count(vib) != 0) {
2339			ret = vdev_indirect_births_last_entry_txg(vib);
2340			break;
2341		}
2342
2343		vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
2344	}
2345	spa_config_exit(spa, SCL_VDEV, FTAG);
2346
2347	IMPLY(ret != -1ULL,
2348	    spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
2349
2350	return (ret);
2351}
2352
2353boolean_t
2354spa_trust_config(spa_t *spa)
2355{
2356	return (spa->spa_trust_config);
2357}
2358
2359uint64_t
2360spa_missing_tvds_allowed(spa_t *spa)
2361{
2362	return (spa->spa_missing_tvds_allowed);
2363}
2364
2365void
2366spa_set_missing_tvds(spa_t *spa, uint64_t missing)
2367{
2368	spa->spa_missing_tvds = missing;
2369}
2370
2371boolean_t
2372spa_top_vdevs_spacemap_addressable(spa_t *spa)
2373{
2374	vdev_t *rvd = spa->spa_root_vdev;
2375	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2376		if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
2377			return (B_FALSE);
2378	}
2379	return (B_TRUE);
2380}
2381
2382boolean_t
2383spa_has_checkpoint(spa_t *spa)
2384{
2385	return (spa->spa_checkpoint_txg != 0);
2386}
2387
2388boolean_t
2389spa_importing_readonly_checkpoint(spa_t *spa)
2390{
2391	return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
2392	    spa->spa_mode == FREAD);
2393}
2394
2395uint64_t
2396spa_min_claim_txg(spa_t *spa)
2397{
2398	uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
2399
2400	if (checkpoint_txg != 0)
2401		return (checkpoint_txg + 1);
2402
2403	return (spa->spa_first_txg);
2404}
2405
2406/*
2407 * If there is a checkpoint, async destroys may consume more space from
2408 * the pool instead of freeing it. In an attempt to save the pool from
2409 * getting suspended when it is about to run out of space, we stop
2410 * processing async destroys.
2411 */
2412boolean_t
2413spa_suspend_async_destroy(spa_t *spa)
2414{
2415	dsl_pool_t *dp = spa_get_dsl(spa);
2416
2417	uint64_t unreserved = dsl_pool_unreserved_space(dp,
2418	    ZFS_SPACE_CHECK_EXTRA_RESERVED);
2419	uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
2420	uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
2421
2422	if (spa_has_checkpoint(spa) && avail == 0)
2423		return (B_TRUE);
2424
2425	return (B_FALSE);
2426}
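
/*
 * Illustrative sketch (not part of the original file): the async-destroy
 * processing code can use this as a gate, roughly:
 *
 *	if (spa_suspend_async_destroy(spa))
 *		return (0);	(skip freeing until the checkpoint is gone)
 */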
2427