spa_misc.c revision 339149
11541Srgrimes/* 21541Srgrimes * CDDL HEADER START 31541Srgrimes * 41541Srgrimes * The contents of this file are subject to the terms of the 51541Srgrimes * Common Development and Distribution License (the "License"). 61541Srgrimes * You may not use this file except in compliance with the License. 71541Srgrimes * 81541Srgrimes * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 91541Srgrimes * or http://www.opensolaris.org/os/licensing. 101541Srgrimes * See the License for the specific language governing permissions 111541Srgrimes * and limitations under the License. 121541Srgrimes * 131541Srgrimes * When distributing Covered Code, include this CDDL HEADER in each 141541Srgrimes * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 151541Srgrimes * If applicable, add the following below this CDDL HEADER, with the 161541Srgrimes * fields enclosed by brackets "[]" replaced with your own identifying 171541Srgrimes * information: Portions Copyright [yyyy] [name of copyright owner] 181541Srgrimes * 191541Srgrimes * CDDL HEADER END 201541Srgrimes */ 211541Srgrimes/* 221541Srgrimes * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 231541Srgrimes * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 241541Srgrimes * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 251541Srgrimes * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 261541Srgrimes * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 271541Srgrimes * Copyright 2013 Saso Kiselkov. All rights reserved. 281541Srgrimes * Copyright (c) 2014 Integros [integros.com] 291541Srgrimes * Copyright (c) 2017 Datto Inc. 301541Srgrimes */ 311541Srgrimes 321541Srgrimes#include <sys/zfs_context.h> 331541Srgrimes#include <sys/spa_impl.h> 341541Srgrimes#include <sys/spa_boot.h> 351541Srgrimes#include <sys/zio.h> 361541Srgrimes#include <sys/zio_checksum.h> 371541Srgrimes#include <sys/zio_compress.h> 381541Srgrimes#include <sys/dmu.h> 3950477Speter#include <sys/dmu_tx.h> 401541Srgrimes#include <sys/zap.h> 411541Srgrimes#include <sys/zil.h> 421541Srgrimes#include <sys/vdev_impl.h> 431541Srgrimes#include <sys/vdev_file.h> 441541Srgrimes#include <sys/vdev_initialize.h> 451541Srgrimes#include <sys/metaslab.h> 4631778Seivind#include <sys/uberblock_impl.h> 4731778Seivind#include <sys/txg.h> 481541Srgrimes#include <sys/avl.h> 491541Srgrimes#include <sys/unique.h> 501541Srgrimes#include <sys/dsl_pool.h> 5112221Sbde#include <sys/dsl_dir.h> 5241059Speter#include <sys/dsl_prop.h> 5370317Sjake#include <sys/dsl_scan.h> 541541Srgrimes#include <sys/fs/zfs.h> 551541Srgrimes#include <sys/metaslab_impl.h> 5631891Ssef#include <sys/arc.h> 5765495Struckman#include <sys/ddt.h> 5861287Srwatson#include "zfs_prop.h" 5972786Srwatson#include <sys/zfeature.h> 601541Srgrimes 6130354Sphk#if defined(__FreeBSD__) && defined(_KERNEL) 6230354Sphk#include <sys/types.h> 6312221Sbde#include <sys/sysctl.h> 6411332Sswallace#endif 651541Srgrimes 661541Srgrimes/* 6712221Sbde * SPA locking 681541Srgrimes * 6958717Sdillon * There are four basic locks for managing spa_t structures: 7070317Sjake * 7158717Sdillon * spa_namespace_lock (global mutex) 7270317Sjake * 731541Srgrimes * This lock must be acquired to do any of the following: 741549Srgrimes * 7530994Sphk * - Lookup a spa_t by name 761541Srgrimes * - Add or remove a spa_t from the namespace 7711332Sswallace * - Increase spa_refcount from non-zero 781541Srgrimes * - Check if spa_refcount is zero 791541Srgrimes * - Rename a spa_t 8030994Sphk * - add/remove/attach/detach devices 811541Srgrimes * - Held for the duration of create/destroy/import/export 8274728Sjhb * 8330994Sphk * It does not need to handle recursion. A create or destroy may 8474728Sjhb * reference objects (files or zvols) in other pools, but by 851541Srgrimes * definition they must have an existing reference, and will never need 861541Srgrimes * to lookup a spa_t by name. 871541Srgrimes * 881541Srgrimes * spa_refcount (per-spa refcount_t protected by mutex) 8970317Sjake * 9070317Sjake * This reference count keep track of any active users of the spa_t. The 9170317Sjake * spa_t cannot be destroyed or freed while this is non-zero. Internally, 9270317Sjake * the refcount is never really 'zero' - opening a pool implicitly keeps 9312221Sbde * some references in the DMU. Internally we check against spa_minref, but 9411332Sswallace * present the image of a zero/non-zero value to consumers. 9511332Sswallace * 9611332Sswallace * spa_config_lock[] (per-spa array of rwlocks) 9712221Sbde * 981541Srgrimes * This protects the spa_t from config changes, and must be held in 991549Srgrimes * the following circumstances: 10030994Sphk * 1011541Srgrimes * - RW_READER to perform I/O to the spa 10211332Sswallace * - RW_WRITER to change the vdev config 1031541Srgrimes * 1041541Srgrimes * The locking order is fairly straightforward: 10574728Sjhb * 10630994Sphk * spa_namespace_lock -> spa_refcount 10774728Sjhb * 1081541Srgrimes * The namespace lock must be acquired to increase the refcount from 0 1091541Srgrimes * or to check if it is zero. 1101541Srgrimes * 11158717Sdillon * spa_refcount -> spa_config_lock[] 11258717Sdillon * 11358717Sdillon * There must be at least one valid reference on the spa_t to acquire 11458717Sdillon * the config lock. 11558717Sdillon * 11612221Sbde * spa_namespace_lock -> spa_config_lock[] 11711332Sswallace * 11811332Sswallace * The namespace lock must always be taken before the config lock. 11911332Sswallace * 12012221Sbde * 12111332Sswallace * The spa_namespace_lock can be acquired directly and is globally visible. 1221549Srgrimes * 12330994Sphk * The namespace is manipulated using the following functions, all of which 1241541Srgrimes * require the spa_namespace_lock to be held. 12511332Sswallace * 1261541Srgrimes * spa_lookup() Lookup a spa_t by name. 1271541Srgrimes * 12830994Sphk * spa_add() Create a new spa_t in the namespace. 1291541Srgrimes * 1301541Srgrimes * spa_remove() Remove a spa_t from the namespace. This also 1311541Srgrimes * frees up any memory associated with the spa_t. 13228401Speter * 13312221Sbde * spa_next() Returns the next spa_t in the system, or the 13428401Speter * first if NULL is passed. 13528401Speter * 13628401Speter * spa_evict_all() Shutdown and remove all spa_t structures in 13728401Speter * the system. 13828401Speter * 13928401Speter * spa_guid_exists() Determine whether a pool/device guid exists. 14030994Sphk * 14128401Speter * The spa_refcount is manipulated using the following functions: 14228401Speter * 14328401Speter * spa_open_ref() Adds a reference to the given spa_t. Must be 14441726Struckman * called with spa_namespace_lock held if the 14541726Struckman * refcount is currently zero. 14641726Struckman * 14728401Speter * spa_close() Remove a reference from the spa_t. This will 14828401Speter * not free the spa_t or remove it from the 14928401Speter * namespace. No locking is required. 15041726Struckman * 15128401Speter * spa_refcount_zero() Returns true if the refcount is currently 15228401Speter * zero. Must be called with spa_namespace_lock 15341726Struckman * held. 15428401Speter * 15528401Speter * The spa_config_lock[] is an array of rwlocks, ordered as follows: 15628401Speter * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. 15728401Speter * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). 15828401Speter * 15928401Speter * To read the configuration, it suffices to hold one of these locks as reader. 16028401Speter * To modify the configuration, you must hold all locks as writer. To modify 16128401Speter * vdev state without altering the vdev tree's topology (e.g. online/offline), 16228401Speter * you must hold SCL_STATE and SCL_ZIO as writer. 16328401Speter * 16428401Speter * We use these distinct config locks to avoid recursive lock entry. 16528401Speter * For example, spa_sync() (which holds SCL_CONFIG as reader) induces 16628401Speter * block allocations (SCL_ALLOC), which may require reading space maps 16730994Sphk * from disk (dmu_read() -> zio_read() -> SCL_ZIO). 16828401Speter * 16928401Speter * The spa config locks cannot be normal rwlocks because we need the 17028401Speter * ability to hand off ownership. For example, SCL_ZIO is acquired 17141726Struckman * by the issuing thread and later released by an interrupt thread. 17241726Struckman * They do, however, obey the usual write-wanted semantics to prevent 17341726Struckman * writer (i.e. system administrator) starvation. 17428401Speter * 17528401Speter * The lock acquisition rules are as follows: 17628401Speter * 17771002Sben * SCL_CONFIG 17828401Speter * Protects changes to the vdev tree topology, such as vdev 17928401Speter * add/remove/attach/detach. Protects the dirty config list 18041726Struckman * (spa_config_dirty_list) and the set of spares and l2arc devices. 18128401Speter * 18228401Speter * SCL_STATE 18328401Speter * Protects changes to pool state and vdev state, such as vdev 18428401Speter * online/offline/fault/degrade/clear. Protects the dirty state list 18558941Sdillon * (spa_state_dirty_list) and global pool state (spa_state). 18658941Sdillon * 18758941Sdillon * SCL_ALLOC 18828401Speter * Protects changes to metaslab groups and classes. 18911332Sswallace * Held as reader by metaslab_alloc() and metaslab_claim(). 19011332Sswallace * 19111332Sswallace * SCL_ZIO 19212221Sbde * Held by bp-level zios (those which have no io_vd upon entry) 19311332Sswallace * to prevent changes to the vdev tree. The bp-level zio implicitly 1941541Srgrimes * protects all of its vdev child zios, which do not hold SCL_ZIO. 1951549Srgrimes * 19630994Sphk * SCL_FREE 1971541Srgrimes * Protects changes to metaslab groups and classes. 19811332Sswallace * Held as reader by metaslab_free(). SCL_FREE is distinct from 1991541Srgrimes * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free 2001541Srgrimes * blocks in zio_done() while another i/o that holds either 20130994Sphk * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. 2021541Srgrimes * 20330994Sphk * SCL_VDEV 2041541Srgrimes * Held as reader to prevent changes to the vdev tree during trivial 2051541Srgrimes * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the 2061541Srgrimes * other locks, and lower than all of them, to ensure that it's safe 2071541Srgrimes * to acquire regardless of caller context. 20858941Sdillon * 20958941Sdillon * In addition, the following rules apply: 21058941Sdillon * 21112221Sbde * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. 21211332Sswallace * The lock ordering is SCL_CONFIG > spa_props_lock. 21311332Sswallace * 21411332Sswallace * (b) I/O operations on leaf vdevs. For any zio operation that takes 21512221Sbde * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), 21611332Sswallace * or zio_write_phys() -- the caller must ensure that the config cannot 2171541Srgrimes * cannot change in the interim, and that the vdev cannot be reopened. 2181549Srgrimes * SCL_STATE as reader suffices for both. 21930994Sphk * 2201541Srgrimes * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). 22111332Sswallace * 2221541Srgrimes * spa_vdev_enter() Acquire the namespace lock and the config lock 2231541Srgrimes * for writing. 22430994Sphk * 2251541Srgrimes * spa_vdev_exit() Release the config lock, wait for all I/O 2261541Srgrimes * to complete, sync the updated configs to the 2271541Srgrimes * cache, and release the namespace lock. 22858941Sdillon * 22958941Sdillon * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). 23058941Sdillon * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual 23112221Sbde * locking is, always, based on spa_namespace_lock and spa_config_lock[]. 23211332Sswallace * 23311332Sswallace * spa_rename() is also implemented within this file since it requires 23411332Sswallace * manipulation of the namespace. 23512221Sbde */ 23611332Sswallace 2371541Srgrimesstatic avl_tree_t spa_namespace_avl; 2381549Srgrimeskmutex_t spa_namespace_lock; 23930994Sphkstatic kcondvar_t spa_namespace_cv; 2401541Srgrimesstatic int spa_active_count; 24111332Sswallaceint spa_max_replication_override = SPA_DVAS_PER_BP; 2421541Srgrimes 2431541Srgrimesstatic kmutex_t spa_spare_lock; 24430994Sphkstatic avl_tree_t spa_spare_avl; 2451541Srgrimesstatic kmutex_t spa_l2cache_lock; 24630994Sphkstatic avl_tree_t spa_l2cache_avl; 2471541Srgrimes 2481541Srgrimeskmem_cache_t *spa_buffer_pool; 2491541Srgrimesint spa_mode_global; 2501541Srgrimes 2511541Srgrimes#ifdef ZFS_DEBUG 2521541Srgrimes/* 2531541Srgrimes * Everything except dprintf, spa, and indirect_remap is on by default 2541541Srgrimes * in debug builds. 2551541Srgrimes */ 25612221Sbdeint zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP); 25711332Sswallace#else 25811332Sswallaceint zfs_flags = 0; 25911332Sswallace#endif 26012221Sbde 26111332Sswallace/* 2621541Srgrimes * zfs_recover can be set to nonzero to attempt to recover from 2631549Srgrimes * otherwise-fatal errors, typically caused by on-disk corruption. When 26430994Sphk * set, calls to zfs_panic_recover() will turn into warning messages. 2651541Srgrimes * This should only be used as a last resort, as it typically results 26611332Sswallace * in leaked space, or worse. 2671541Srgrimes */ 2681541Srgrimesboolean_t zfs_recover = B_FALSE; 26930994Sphk 2701541Srgrimes/* 2711541Srgrimes * If destroy encounters an EIO while reading metadata (e.g. indirect 2721541Srgrimes * blocks), space referenced by the missing metadata can not be freed. 27312221Sbde * Normally this causes the background destroy to become "stalled", as 2741541Srgrimes * it is unable to make forward progress. While in this stalled state, 2751541Srgrimes * all remaining space to free from the error-encountering filesystem is 2761541Srgrimes * "temporarily leaked". Set this flag to cause it to ignore the EIO, 2771541Srgrimes * permanently leak the space from indirect blocks that can not be read, 27812221Sbde * and continue to free everything else that it can. 2791549Srgrimes * 28030994Sphk * The default, "stalling" behavior is useful if the storage partially 2811541Srgrimes * fails (i.e. some but not all i/os fail), and then later recovers. In 2821541Srgrimes * this case, we will be able to continue pool operations while it is 2831541Srgrimes * partially failed, and when it recovers, we can continue to free the 2841541Srgrimes * space, with no leaks. However, note that this case is actually 2851541Srgrimes * fairly rare. 2861541Srgrimes * 2871541Srgrimes * Typically pools either (a) fail completely (but perhaps temporarily, 2881541Srgrimes * e.g. a top-level vdev going offline), or (b) have localized, 28930994Sphk * permanent errors (e.g. disk returns the wrong data due to bit flip or 2901541Srgrimes * firmware bug). In case (a), this setting does not matter because the 2911541Srgrimes * pool will be suspended and the sync thread will not be able to make 2921541Srgrimes * forward progress regardless. In case (b), because the error is 2931541Srgrimes * permanent, the best we can do is leak the minimum amount of space, 2941541Srgrimes * which is what setting this flag will do. Therefore, it is reasonable 2953098Sphk * for this flag to normally be set, but we chose the more conservative 2963098Sphk * approach of not setting it, so that there is no possibility of 2971541Srgrimes * leaking space in the "partial temporary" failure case. 29830994Sphk */ 2991541Srgrimesboolean_t zfs_free_leak_on_eio = B_FALSE; 3001541Srgrimes 3011541Srgrimes/* 30212221Sbde * Expiration time in milliseconds. This value has two meanings. First it is 30312207Sbde * used to determine when the spa_deadman() logic should fire. By default the 30411332Sswallace * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. 30511332Sswallace * Secondly, the value determines if an I/O is considered "hung". Any I/O that 30612221Sbde * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting 30711332Sswallace * in a system panic. 3081541Srgrimes */ 3091549Srgrimesuint64_t zfs_deadman_synctime_ms = 1000000ULL; 31030994Sphk 3111541Srgrimes/* 31212207Sbde * Check time in milliseconds. This defines the frequency at which we check 3131541Srgrimes * for hung I/O. 3141541Srgrimes */ 3151541Srgrimesuint64_t zfs_deadman_checktime_ms = 5000ULL; 3161541Srgrimes 3171541Srgrimes/* 3181541Srgrimes * Default value of -1 for zfs_deadman_enabled is resolved in 31930994Sphk * zfs_deadman_init() 3201541Srgrimes */ 3211541Srgrimesint zfs_deadman_enabled = -1; 3221541Srgrimes 3231541Srgrimes/* 3241541Srgrimes * The worst case is single-sector max-parity RAID-Z blocks, in which 3251541Srgrimes * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) 3261541Srgrimes * times the size; so just assume that. Add to this the fact that 3271541Srgrimes * we can have up to 3 DVAs per bp, and one more factor of 2 because 3281541Srgrimes * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, 3291541Srgrimes * the worst case is: 3301541Srgrimes * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 3311541Srgrimes */ 3321541Srgrimesint spa_asize_inflation = 24; 3331541Srgrimes 3341541Srgrimes#if defined(__FreeBSD__) && defined(_KERNEL) 3351541SrgrimesSYSCTL_DECL(_vfs_zfs); 3361541SrgrimesSYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0, 33712221Sbde "Try to recover from otherwise-fatal errors."); 3381541Srgrimes 3391541Srgrimesstatic int 3401541Srgrimessysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) 3411541Srgrimes{ 34212221Sbde int err, val; 3431541Srgrimes 3441549Srgrimes val = zfs_flags; 34530994Sphk err = sysctl_handle_int(oidp, &val, 0, req); 3461541Srgrimes if (err != 0 || req->newptr == NULL) 3471541Srgrimes return (err); 3481541Srgrimes 3491541Srgrimes /* 3501541Srgrimes * ZFS_DEBUG_MODIFY must be enabled prior to boot so all 3511541Srgrimes * arc buffers in the system have the necessary additional 35220677Sbde * checksum data. However, it is safe to disable at any 35320677Sbde * time. 3541541Srgrimes */ 3551541Srgrimes if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 3561541Srgrimes val &= ~ZFS_DEBUG_MODIFY; 35715985Sdg zfs_flags = val; 3581541Srgrimes 3591541Srgrimes return (0); 3601541Srgrimes} 3611541Srgrimes 3621541SrgrimesSYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, 3631541Srgrimes CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), 3641541Srgrimes sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); 3651541SrgrimesSYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags, 3661541Srgrimes CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int), 3671541Srgrimes sysctl_vfs_zfs_debug_flags, "IU", 3681541Srgrimes "Debug flags for ZFS testing (deprecated, see vfs.zfs.debugflags)."); 3691541Srgrimes 3701541SrgrimesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN, 3711541Srgrimes &zfs_deadman_synctime_ms, 0, 3721541Srgrimes "Stalled ZFS I/O expiration time in milliseconds"); 3731541SrgrimesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN, 37424448Speter &zfs_deadman_checktime_ms, 0, 37524448Speter "Period of checks for stalled ZFS I/O in milliseconds"); 37672093SasmodaiSYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN, 37724448Speter &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); 37824448SpeterSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN, 37924448Speter &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes"); 38024448Speter#endif 38124448Speter 38224448Speter#ifndef illumos 38324448Speter#ifdef _KERNEL 38424448Speterstatic void 38524448Speterzfs_deadman_init() 38612221Sbde{ 3871541Srgrimes /* 3881541Srgrimes * If we are not i386 or amd64 or in a virtual machine, 3891541Srgrimes * disable ZFS deadman thread by default 39012221Sbde */ 3911541Srgrimes if (zfs_deadman_enabled == -1) { 3921549Srgrimes#if defined(__amd64__) || defined(__i386__) 39330994Sphk zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0; 3941541Srgrimes#else 3951541Srgrimes zfs_deadman_enabled = 0; 3961541Srgrimes#endif 3971541Srgrimes } 3981541Srgrimes} 3991541Srgrimes#endif /* _KERNEL */ 4001541Srgrimes#endif /* !illumos */ 40124448Speter 40224448Speter/* 40324448Speter * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in 40424448Speter * the pool to be consumed. This ensures that we don't run the pool 40524448Speter * completely out of space, due to unaccounted changes (e.g. to the MOS). 40672093Sasmodai * It also limits the worst-case time to allocate space. If we have 40724448Speter * less than this amount of free space, most ZPL operations (e.g. write, 40824448Speter * create) will return ENOSPC. 40924448Speter * 41024448Speter * Certain operations (e.g. file removal, most administrative actions) can 41124448Speter * use half the slop space. They will only return ENOSPC if less than half 41224448Speter * the slop space is free. Typically, once the pool has less than the slop 41324448Speter * space free, the user will use these operations to free up space in the pool. 41424448Speter * These are the operations that call dsl_pool_adjustedsize() with the netfree 41524448Speter * argument set to TRUE. 41624448Speter * 41724448Speter * Operations that are almost guaranteed to free up space in the absence of 4181541Srgrimes * a pool checkpoint can use up to three quarters of the slop space 41924448Speter * (e.g zfs destroy). 42017994Sache * 42124448Speter * A very restricted set of operations are always permitted, regardless of 42217994Sache * the amount of free space. These are the operations that call 42324448Speter * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net 42424448Speter * increase in the amount of space used, it is possible to run the pool 42524448Speter * completely out of space, causing it to be permanently read-only. 42646155Sphk * 4271541Srgrimes * Note that on very small pools, the slop space will be larger than 42824448Speter * 3.2%, in an effort to have it be at least spa_min_slop (128MB), 42924448Speter * but we never allow it to be more than half the pool size. 4301541Srgrimes * 43124448Speter * See also the comments in zfs_space_check_t. 43224448Speter */ 4331541Srgrimesint spa_slop_shift = 5; 43417994SacheSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN, 43524448Speter &spa_slop_shift, 0, 43624448Speter "Shift value of reserved space (1/(2^spa_slop_shift))."); 43717994Sacheuint64_t spa_min_slop = 128 * 1024 * 1024; 43846155SphkSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN, 43917994Sache &spa_min_slop, 0, 44024448Speter "Minimal value of reserved space"); 44124448Speter 44265495Struckmanint spa_allocators = 4; 44324448Speter 44424448SpeterSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_allocators, CTLFLAG_RWTUN, 44565495Struckman &spa_allocators, 0, 44665495Struckman "Number of allocators per metaslab group"); 44724448Speter 44824448Speter/*PRINTFLIKE2*/ 44924448Spetervoid 45024448Speterspa_load_failed(spa_t *spa, const char *fmt, ...) 45124448Speter{ 45224448Speter va_list adx; 45324448Speter char buf[256]; 45424448Speter 45524448Speter va_start(adx, fmt); 45624448Speter (void) vsnprintf(buf, sizeof (buf), fmt, adx); 45731891Ssef va_end(adx); 45824448Speter 4598141Sache zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, 46024448Speter spa->spa_trust_config ? "trusted" : "untrusted", buf); 46124448Speter} 46224448Speter 46324448Speter/*PRINTFLIKE2*/ 46424448Spetervoid 46524448Speterspa_load_note(spa_t *spa, const char *fmt, ...) 46665495Struckman{ 46731891Ssef va_list adx; 46824448Speter char buf[256]; 4691541Srgrimes 4701541Srgrimes va_start(adx, fmt); 4711541Srgrimes (void) vsnprintf(buf, sizeof (buf), fmt, adx); 47212221Sbde va_end(adx); 4731541Srgrimes 4741541Srgrimes zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, 4751541Srgrimes spa->spa_trust_config ? "trusted" : "untrusted", buf); 47612221Sbde} 4771541Srgrimes 4781549Srgrimes/* 47930994Sphk * ========================================================================== 4801541Srgrimes * SPA config locking 4811541Srgrimes * ========================================================================== 4821541Srgrimes */ 4831541Srgrimesstatic void 4841541Srgrimesspa_config_lock_init(spa_t *spa) 4851541Srgrimes{ 4861541Srgrimes for (int i = 0; i < SCL_LOCKS; i++) { 4871541Srgrimes spa_config_lock_t *scl = &spa->spa_config_lock[i]; 48824449Speter mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); 48924449Speter cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); 49046155Sphk refcount_create_untracked(&scl->scl_count); 4911541Srgrimes scl->scl_writer = NULL; 4921541Srgrimes scl->scl_write_wanted = 0; 4931541Srgrimes } 4941541Srgrimes} 4951541Srgrimes 49624449Speterstatic void 49765495Struckmanspa_config_lock_destroy(spa_t *spa) 49831891Ssef{ 49924449Speter for (int i = 0; i < SCL_LOCKS; i++) { 5001541Srgrimes spa_config_lock_t *scl = &spa->spa_config_lock[i]; 5011541Srgrimes mutex_destroy(&scl->scl_lock); 5021541Srgrimes cv_destroy(&scl->scl_cv); 50312221Sbde refcount_destroy(&scl->scl_count); 5041541Srgrimes ASSERT(scl->scl_writer == NULL); 5051541Srgrimes ASSERT(scl->scl_write_wanted == 0); 5061541Srgrimes } 50712221Sbde} 5081541Srgrimes 5091549Srgrimesint 51030994Sphkspa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) 5111541Srgrimes{ 5121541Srgrimes for (int i = 0; i < SCL_LOCKS; i++) { 5131541Srgrimes spa_config_lock_t *scl = &spa->spa_config_lock[i]; 5141541Srgrimes if (!(locks & (1 << i))) 5151541Srgrimes continue; 5161541Srgrimes mutex_enter(&scl->scl_lock); 5171541Srgrimes if (rw == RW_READER) { 51824448Speter if (scl->scl_writer || scl->scl_write_wanted) { 51924448Speter mutex_exit(&scl->scl_lock); 52024448Speter spa_config_exit(spa, locks & ((1 << i) - 1), 52124448Speter tag); 52224448Speter return (0); 52372093Sasmodai } 52424448Speter } else { 52524448Speter ASSERT(scl->scl_writer != curthread); 52624448Speter if (!refcount_is_zero(&scl->scl_count)) { 52724448Speter mutex_exit(&scl->scl_lock); 52824448Speter spa_config_exit(spa, locks & ((1 << i) - 1), 5291541Srgrimes tag); 53024448Speter return (0); 53117994Sache } 53224448Speter scl->scl_writer = curthread; 53317994Sache } 53424448Speter (void) refcount_add(&scl->scl_count, tag); 53524448Speter mutex_exit(&scl->scl_lock); 53624448Speter } 53746155Sphk return (1); 5381541Srgrimes} 53924448Speter 54017994Sachevoid 54124448Speterspa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) 54224448Speter{ 54324448Speter int wlocks_held = 0; 54424448Speter 54524448Speter ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); 54624448Speter 54724448Speter for (int i = 0; i < SCL_LOCKS; i++) { 54817994Sache spa_config_lock_t *scl = &spa->spa_config_lock[i]; 54946155Sphk if (scl->scl_writer == curthread) 55024448Speter wlocks_held |= (1 << i); 55124448Speter if (!(locks & (1 << i))) 55224448Speter continue; 55324448Speter mutex_enter(&scl->scl_lock); 55424448Speter if (rw == RW_READER) { 55524448Speter while (scl->scl_writer || scl->scl_write_wanted) { 55624448Speter cv_wait(&scl->scl_cv, &scl->scl_lock); 55731891Ssef } 55824448Speter } else { 55924448Speter ASSERT(scl->scl_writer != curthread); 56024448Speter while (!refcount_is_zero(&scl->scl_count)) { 56124448Speter scl->scl_write_wanted++; 56224448Speter cv_wait(&scl->scl_cv, &scl->scl_lock); 56324448Speter scl->scl_write_wanted--; 56424448Speter } 56524448Speter scl->scl_writer = curthread; 56624448Speter } 56724448Speter (void) refcount_add(&scl->scl_count, tag); 56831891Ssef mutex_exit(&scl->scl_lock); 56924448Speter } 5708141Sache ASSERT3U(wlocks_held, <=, locks); 57124448Speter} 57224448Speter 57324448Spetervoid 57424448Speterspa_config_exit(spa_t *spa, int locks, void *tag) 57524448Speter{ 57624448Speter for (int i = SCL_LOCKS - 1; i >= 0; i--) { 57724448Speter spa_config_lock_t *scl = &spa->spa_config_lock[i]; 57831891Ssef if (!(locks & (1 << i))) 57924448Speter continue; 5801541Srgrimes mutex_enter(&scl->scl_lock); 5811541Srgrimes ASSERT(!refcount_is_zero(&scl->scl_count)); 5821541Srgrimes if (refcount_remove(&scl->scl_count, tag) == 0) { 58312221Sbde ASSERT(scl->scl_writer == NULL || 5841541Srgrimes scl->scl_writer == curthread); 5851541Srgrimes scl->scl_writer = NULL; /* OK in either case */ 5861541Srgrimes cv_broadcast(&scl->scl_cv); 58712221Sbde } 5881541Srgrimes mutex_exit(&scl->scl_lock); 5891549Srgrimes } 59030994Sphk} 5911541Srgrimes 5921541Srgrimesint 5931541Srgrimesspa_config_held(spa_t *spa, int locks, krw_t rw) 5941541Srgrimes{ 5951541Srgrimes int locks_held = 0; 5961541Srgrimes 5971541Srgrimes for (int i = 0; i < SCL_LOCKS; i++) { 5981541Srgrimes spa_config_lock_t *scl = &spa->spa_config_lock[i]; 59924449Speter if (!(locks & (1 << i))) 60024449Speter continue; 60146155Sphk if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || 6021541Srgrimes (rw == RW_WRITER && scl->scl_writer == curthread)) 60324449Speter locks_held |= 1 << i; 60424449Speter } 60524449Speter 60631891Ssef return (locks_held); 60724449Speter} 6081541Srgrimes 6091541Srgrimes/* 6101541Srgrimes * ========================================================================== 61112221Sbde * SPA namespace functions 6121541Srgrimes * ========================================================================== 6131541Srgrimes */ 6141541Srgrimes 6151541Srgrimes/* 61612221Sbde * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. 6171541Srgrimes * Returns NULL if no matching spa_t is found. 6181549Srgrimes */ 61930994Sphkspa_t * 6201541Srgrimesspa_lookup(const char *name) 6211541Srgrimes{ 6221541Srgrimes static spa_t search; /* spa_t is large; don't allocate on stack */ 6231541Srgrimes spa_t *spa; 6241541Srgrimes avl_index_t where; 6251541Srgrimes char *cp; 6261541Srgrimes 62746155Sphk ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6281541Srgrimes 62912063Sdg (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); 63024447Speter 6311541Srgrimes /* 63224447Speter * If it's a full dataset name, figure out the pool name and 63324447Speter * just use that. 63424447Speter */ 63524447Speter cp = strpbrk(search.spa_name, "/@#"); 6361541Srgrimes if (cp != NULL) 63724447Speter *cp = '\0'; 63824447Speter 63924447Speter spa = avl_find(&spa_namespace_avl, &search, &where); 64024447Speter 64124447Speter return (spa); 64224447Speter} 64324447Speter 64424447Speter/* 64524447Speter * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. 64624447Speter * If the zfs_deadman_enabled flag is set then it inspects all vdev queues 64724447Speter * looking for potentially hung I/Os. 64824447Speter */ 64924447Speterstatic void 65024447Speterspa_deadman(void *arg, int pending) 65131891Ssef{ 6521541Srgrimes spa_t *spa = arg; 6531541Srgrimes 6541541Srgrimes /* 65512221Sbde * Disable the deadman timer if the pool is suspended. 6561541Srgrimes */ 6579238Sache if (spa_suspended(spa)) { 6589238Sache#ifdef illumos 6591541Srgrimes VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 66012221Sbde#else 6611541Srgrimes /* Nothing. just don't schedule any future callouts. */ 6621549Srgrimes#endif 66330994Sphk return; 6641541Srgrimes } 6651541Srgrimes 6661541Srgrimes zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", 6671541Srgrimes (gethrtime() - spa->spa_sync_starttime) / NANOSEC, 6689238Sache ++spa->spa_deadman_calls); 6698135Sache if (zfs_deadman_enabled) 6701541Srgrimes vdev_deadman(spa->spa_root_vdev); 6719238Sache#ifdef __FreeBSD__ 6729238Sache#ifdef _KERNEL 67343311Sdillon callout_schedule(&spa->spa_deadman_cycid, 67443311Sdillon hz * zfs_deadman_checktime_ms / MILLISEC); 67543311Sdillon#endif 67646155Sphk#endif 6778135Sache} 6789238Sache 67924450Speter#if defined(__FreeBSD__) && defined(_KERNEL) 68065495Struckmanstatic void 68131891Ssefspa_deadman_timeout(void *arg) 68224450Speter{ 68324450Speter spa_t *spa = arg; 68465495Struckman 68531891Ssef taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task); 6868135Sache} 68724559Speter#endif 68824559Speter 6898111Sache/* 69031891Ssef * Create an uninitialized spa_t with the given name. Requires 69124450Speter * spa_namespace_lock. The caller must ensure that the spa_t doesn't already 6928135Sache * exist by calling spa_lookup() first. 6931541Srgrimes */ 6941541Srgrimesspa_t * 69512221Sbdespa_add(const char *name, nvlist_t *config, const char *altroot) 6961541Srgrimes{ 6979238Sache spa_t *spa; 6989238Sache spa_config_dirent_t *dp; 6991541Srgrimes#ifdef illumos 70012221Sbde cyc_handler_t hdlr; 7011541Srgrimes cyc_time_t when; 7021549Srgrimes#endif 70330994Sphk 7041541Srgrimes ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7051541Srgrimes 7061541Srgrimes spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); 7071541Srgrimes 7089238Sache mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 7098135Sache mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 7101541Srgrimes mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 7119238Sache mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); 7129238Sache mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 71343311Sdillon mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); 71443311Sdillon mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 71543311Sdillon mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); 71646155Sphk mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 7178135Sache mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); 7189238Sache mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); 71924450Speter 72024450Speter cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); 7219238Sache cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); 72231891Ssef cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); 72324450Speter cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); 72424450Speter cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); 7259238Sache 72631891Ssef for (int t = 0; t < TXG_SIZE; t++) 72724450Speter bplist_create(&spa->spa_free_bplist[t]); 72824559Speter 72924559Speter (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); 7308111Sache spa->spa_state = POOL_STATE_UNINITIALIZED; 73131891Ssef spa->spa_freeze_txg = UINT64_MAX; 73224450Speter spa->spa_final_txg = UINT64_MAX; 7338135Sache spa->spa_load_max_txg = UINT64_MAX; 7341541Srgrimes spa->spa_proc = &p0; 7351541Srgrimes spa->spa_proc_state = SPA_PROC_NONE; 73656115Speter spa->spa_trust_config = B_TRUE; 73756115Speter 73856115Speter#ifdef illumos 73956115Speter hdlr.cyh_func = spa_deadman; 74056115Speter hdlr.cyh_arg = spa; 74124453Speter hdlr.cyh_level = CY_LOW_LEVEL; 74256115Speter#endif 74356115Speter 74456115Speter spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); 74556115Speter 74656115Speter#ifdef illumos 74756115Speter /* 74856115Speter * This determines how often we need to check for hung I/Os after 74956115Speter * the cyclic has already fired. Since checking for hung I/Os is 75056115Speter * an expensive operation we don't want to check too frequently. 75156115Speter * Instead wait for 5 seconds before checking again. 75256115Speter */ 75356115Speter when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms); 75456115Speter when.cyt_when = CY_INFINITY; 75556115Speter mutex_enter(&cpu_lock); 75656115Speter spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); 75756115Speter mutex_exit(&cpu_lock); 75856115Speter#else /* !illumos */ 75956115Speter#ifdef _KERNEL 76056115Speter /* 76156115Speter * callout(9) does not provide a way to initialize a callout with 76256115Speter * a function and an argument, so we use callout_reset() to schedule 76356115Speter * the callout in the very distant future. Even if that event ever 76456115Speter * fires, it should be okayas we won't have any active zio-s. 76556115Speter * But normally spa_sync() will reschedule the callout with a proper 76656115Speter * timeout. 76756115Speter * callout(9) does not allow the callback function to sleep but 76856115Speter * vdev_deadman() needs to acquire vq_lock and illumos mutexes are 76956115Speter * emulated using sx(9). For this reason spa_deadman_timeout() 77065495Struckman * will schedule spa_deadman() as task on a taskqueue that allows 77156115Speter * sleeping. 77256115Speter */ 77356115Speter TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa); 77465495Struckman callout_init(&spa->spa_deadman_cycid, 1); 77556115Speter callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0, 77656115Speter spa_deadman_timeout, spa, 0); 77756115Speter#endif 77856115Speter#endif 77956115Speter refcount_create(&spa->spa_refcount); 78056115Speter spa_config_lock_init(spa); 78156115Speter 78256115Speter avl_add(&spa_namespace_avl, spa); 78356115Speter 78456115Speter /* 78556115Speter * Set the alternate root, if there is one. 78656115Speter */ 78756115Speter if (altroot) { 78856115Speter spa->spa_root = spa_strdup(altroot); 78956115Speter spa_active_count++; 79056115Speter } 79156115Speter 79256115Speter spa->spa_alloc_count = spa_allocators; 79356115Speter spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * 79456115Speter sizeof (kmutex_t), KM_SLEEP); 79556115Speter spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * 79656115Speter sizeof (avl_tree_t), KM_SLEEP); 79756115Speter for (int i = 0; i < spa->spa_alloc_count; i++) { 79856115Speter mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); 79956115Speter avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, 80056115Speter sizeof (zio_t), offsetof(zio_t, io_alloc_node)); 80156115Speter } 80256115Speter 80356115Speter /* 80456115Speter * Every pool starts with the default cachefile 80556115Speter */ 80656115Speter list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), 80756115Speter offsetof(spa_config_dirent_t, scd_link)); 80856115Speter 80956115Speter dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); 81056115Speter dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); 81156115Speter list_insert_head(&spa->spa_config_list, dp); 81256115Speter 81356115Speter VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, 81456115Speter KM_SLEEP) == 0); 81556115Speter 81656115Speter if (config != NULL) { 81756115Speter nvlist_t *features; 81856115Speter 81956115Speter if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, 82056115Speter &features) == 0) { 82156115Speter VERIFY(nvlist_dup(features, &spa->spa_label_features, 82256115Speter 0) == 0); 82356115Speter } 82456115Speter 82556115Speter VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 82656115Speter } 82756115Speter 82856115Speter if (spa->spa_label_features == NULL) { 82956115Speter VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, 83056115Speter KM_SLEEP) == 0); 83156115Speter } 83256115Speter 83356115Speter spa->spa_min_ashift = INT_MAX; 83456115Speter spa->spa_max_ashift = 0; 83556115Speter 83656115Speter /* 83756115Speter * As a pool is being created, treat all features as disabled by 83856115Speter * setting SPA_FEATURE_DISABLED for all entries in the feature 83956115Speter * refcount cache. 84056115Speter */ 84156115Speter for (int i = 0; i < SPA_FEATURES; i++) { 84256115Speter spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; 84356115Speter } 84456115Speter 84556115Speter return (spa); 84656115Speter} 84756115Speter 84856115Speter/* 84956115Speter * Removes a spa_t from the namespace, freeing up any memory used. Requires 85056115Speter * spa_namespace_lock. This is called only after the spa_t has been closed and 85156115Speter * deactivated. 85256115Speter */ 85356115Spetervoid 85456115Speterspa_remove(spa_t *spa) 85556115Speter{ 85656115Speter spa_config_dirent_t *dp; 85756115Speter 85856115Speter ASSERT(MUTEX_HELD(&spa_namespace_lock)); 85956115Speter ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 86056115Speter ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); 86156115Speter 86256115Speter nvlist_free(spa->spa_config_splitting); 86356115Speter 86456115Speter avl_remove(&spa_namespace_avl, spa); 86556115Speter cv_broadcast(&spa_namespace_cv); 86656115Speter 86756115Speter if (spa->spa_root) { 86856115Speter spa_strfree(spa->spa_root); 86956115Speter spa_active_count--; 87056115Speter } 87156115Speter 87256115Speter while ((dp = list_head(&spa->spa_config_list)) != NULL) { 87356115Speter list_remove(&spa->spa_config_list, dp); 87456115Speter if (dp->scd_path != NULL) 87556115Speter spa_strfree(dp->scd_path); 87656115Speter kmem_free(dp, sizeof (spa_config_dirent_t)); 87756115Speter } 87856115Speter 87956115Speter for (int i = 0; i < spa->spa_alloc_count; i++) { 88056115Speter avl_destroy(&spa->spa_alloc_trees[i]); 88156115Speter mutex_destroy(&spa->spa_alloc_locks[i]); 88256115Speter } 88356115Speter kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * 88456115Speter sizeof (kmutex_t)); 88556115Speter kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * 88656115Speter sizeof (avl_tree_t)); 88756115Speter 88856115Speter list_destroy(&spa->spa_config_list); 88956115Speter 89056115Speter nvlist_free(spa->spa_label_features); 89156115Speter nvlist_free(spa->spa_load_info); 89224453Speter spa_config_set(spa, NULL); 89324453Speter 89424453Speter#ifdef illumos 89524453Speter mutex_enter(&cpu_lock); 89624453Speter if (spa->spa_deadman_cycid != CYCLIC_NONE) 89724453Speter cyclic_remove(spa->spa_deadman_cycid); 89830994Sphk mutex_exit(&cpu_lock); 89924453Speter spa->spa_deadman_cycid = CYCLIC_NONE; 90024453Speter#else /* !illumos */ 90124453Speter#ifdef _KERNEL 90224453Speter callout_drain(&spa->spa_deadman_cycid); 90324453Speter taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task); 90424453Speter#endif 90524453Speter#endif 90624453Speter 90724453Speter refcount_destroy(&spa->spa_refcount); 90824453Speter 90924453Speter spa_config_lock_destroy(spa); 91060216Speter 91124453Speter for (int t = 0; t < TXG_SIZE; t++) 91224453Speter bplist_destroy(&spa->spa_free_bplist[t]); 91324453Speter 9141541Srgrimes zio_checksum_templates_free(spa); 9151541Srgrimes 9161541Srgrimes cv_destroy(&spa->spa_async_cv); 9171549Srgrimes cv_destroy(&spa->spa_evicting_os_cv); 9181541Srgrimes cv_destroy(&spa->spa_proc_cv); 9191541Srgrimes cv_destroy(&spa->spa_scrub_io_cv); 9201541Srgrimes cv_destroy(&spa->spa_suspend_cv); 9211541Srgrimes 9221541Srgrimes mutex_destroy(&spa->spa_async_lock); 9231541Srgrimes mutex_destroy(&spa->spa_errlist_lock); 9241541Srgrimes mutex_destroy(&spa->spa_errlog_lock); 9251541Srgrimes mutex_destroy(&spa->spa_evicting_os_lock); 9261541Srgrimes mutex_destroy(&spa->spa_history_lock); 9271541Srgrimes mutex_destroy(&spa->spa_proc_lock); 9281541Srgrimes mutex_destroy(&spa->spa_props_lock); 9291541Srgrimes mutex_destroy(&spa->spa_cksum_tmpls_lock); 9301541Srgrimes mutex_destroy(&spa->spa_scrub_lock); 9311541Srgrimes mutex_destroy(&spa->spa_suspend_lock); 93261287Srwatson mutex_destroy(&spa->spa_vdev_top_lock); 93361287Srwatson 93461287Srwatson kmem_free(spa, sizeof (spa_t)); 93561287Srwatson} 93661287Srwatson 9371541Srgrimes/* 9381541Srgrimes * Given a pool, return the next pool in the namespace, or NULL if there is 9391541Srgrimes * none. If 'prev' is NULL, return the first pool. 9401541Srgrimes */ 9411541Srgrimesspa_t * 9421541Srgrimesspa_next(spa_t *prev) 9431549Srgrimes{ 94446112Sphk ASSERT(MUTEX_HELD(&spa_namespace_lock)); 94572786Srwatson 94646112Sphk if (prev) 94746155Sphk return (AVL_NEXT(&spa_namespace_avl, prev)); 94846112Sphk else 94946112Sphk return (avl_first(&spa_namespace_avl)); 95046112Sphk} 95146155Sphk 95272786Srwatson/* 95372786Srwatson * ========================================================================== 95446155Sphk * SPA refcount functions 9551541Srgrimes * ========================================================================== 95661282Srwatson */ 95761282Srwatson 95846155Sphk/* 95946155Sphk * Add a reference to the given spa_t. Must have at least one reference, or 96046155Sphk * have the namespace lock held. 9611541Srgrimes */ 96246155Sphkvoid 96346155Sphkspa_open_ref(spa_t *spa, void *tag) 96446155Sphk{ 96546155Sphk ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || 96672786Srwatson MUTEX_HELD(&spa_namespace_lock)); 96746155Sphk (void) refcount_add(&spa->spa_refcount, tag); 96846155Sphk} 9691541Srgrimes 9701541Srgrimes/* 97165237Srwatson * Remove a reference to the given spa_t. Must have at least one reference, or 97272786Srwatson * have the namespace lock held. 97365237Srwatson */ 97472786Srwatsonvoid 97553518Sphkspa_close(spa_t *spa, void *tag) 97665237Srwatson{ 97765237Srwatson ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || 97865237Srwatson MUTEX_HELD(&spa_namespace_lock)); 97972786Srwatson (void) refcount_remove(&spa->spa_refcount, tag); 98072786Srwatson} 98165237Srwatson 98265293Srwatson/* 98365293Srwatson * Remove a reference to the given spa_t held by a dsl dir that is 98465293Srwatson * being asynchronously released. Async releases occur from a taskq 98565293Srwatson * performing eviction of dsl datasets and dirs. The namespace lock 98665293Srwatson * isn't held and the hold by the object being evicted may contribute to 98765293Srwatson * spa_minref (e.g. dataset or directory released during pool export), 98865237Srwatson * so the asserts in spa_close() do not apply. 98965293Srwatson */ 99065237Srwatsonvoid 99165237Srwatsonspa_async_close(spa_t *spa, void *tag) 99265237Srwatson{ 99365237Srwatson (void) refcount_remove(&spa->spa_refcount, tag); 99465237Srwatson} 99572786Srwatson 99653518Sphk/* 99772786Srwatson * Check to see if the spa refcount is zero. Must be called with 99853518Sphk * spa_namespace_lock held. We really compare against spa_minref, which is the 99965237Srwatson * number of references acquired when opening a pool 100065237Srwatson */ 100165237Srwatsonboolean_t 100253518Sphkspa_refcount_zero(spa_t *spa) 100353518Sphk{ 100465237Srwatson ASSERT(MUTEX_HELD(&spa_namespace_lock)); 100572786Srwatson 100672786Srwatson return (refcount_count(&spa->spa_refcount) == spa->spa_minref); 100765237Srwatson} 100853518Sphk 100953518Sphk/* 101053518Sphk * ========================================================================== 101153518Sphk * SPA spare and l2cache tracking 101265237Srwatson * ========================================================================== 101365237Srwatson */ 101465237Srwatson 101565237Srwatson/* 101653518Sphk * Hot spares and cache devices are tracked using the same code below, 101753518Sphk * for 'auxiliary' devices. 101853518Sphk */ 101953518Sphk 102065237Srwatsontypedef struct spa_aux { 102165237Srwatson uint64_t aux_guid; 102265237Srwatson uint64_t aux_pool; 102365237Srwatson avl_node_t aux_avl; 102453518Sphk int aux_count; 102565237Srwatson} spa_aux_t; 102665237Srwatson 102765237Srwatsonstatic int 102865237Srwatsonspa_aux_compare(const void *a, const void *b) 102965237Srwatson{ 103065237Srwatson const spa_aux_t *sa = a; 103165237Srwatson const spa_aux_t *sb = b; 103265237Srwatson 103365237Srwatson if (sa->aux_guid < sb->aux_guid) 103465237Srwatson return (-1); 103553518Sphk else if (sa->aux_guid > sb->aux_guid) 103653518Sphk return (1); 103753518Sphk else 103865237Srwatson return (0); 103972786Srwatson} 104065237Srwatson 104172786Srwatsonvoid 104265237Srwatsonspa_aux_add(vdev_t *vd, avl_tree_t *avl) 104365237Srwatson{ 104465237Srwatson avl_index_t where; 104565237Srwatson spa_aux_t search; 104665237Srwatson spa_aux_t *aux; 104765237Srwatson 104865237Srwatson search.aux_guid = vd->vdev_guid; 104972786Srwatson if ((aux = avl_find(avl, &search, &where)) != NULL) { 105072786Srwatson aux->aux_count++; 105165237Srwatson } else { 105265237Srwatson aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); 105365237Srwatson aux->aux_guid = vd->vdev_guid; 105465237Srwatson aux->aux_count = 1; 105565237Srwatson avl_insert(avl, aux, where); 105665237Srwatson } 105765237Srwatson} 105865237Srwatson 105965237Srwatsonvoid 106065237Srwatsonspa_aux_remove(vdev_t *vd, avl_tree_t *avl) 106165237Srwatson{ 106265237Srwatson spa_aux_t search; 106365237Srwatson spa_aux_t *aux; 106465237Srwatson avl_index_t where; 106565237Srwatson 106665237Srwatson search.aux_guid = vd->vdev_guid; 106765237Srwatson aux = avl_find(avl, &search, &where); 106865237Srwatson 106965237Srwatson ASSERT(aux != NULL); 107065237Srwatson 107165237Srwatson if (--aux->aux_count == 0) { 107265237Srwatson avl_remove(avl, aux); 107365237Srwatson kmem_free(aux, sizeof (spa_aux_t)); 107465237Srwatson } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { 107565237Srwatson aux->aux_pool = 0ULL; 107665237Srwatson } 107765237Srwatson} 107865237Srwatson 107965237Srwatsonboolean_t 108065237Srwatsonspa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) 108165237Srwatson{ 108265237Srwatson spa_aux_t search, *found; 108372786Srwatson 108465237Srwatson search.aux_guid = guid; 108572786Srwatson found = avl_find(avl, &search, NULL); 108665237Srwatson 108765237Srwatson if (pool) { 108865237Srwatson if (found) 108965237Srwatson *pool = found->aux_pool; 109065237Srwatson else 109165237Srwatson *pool = 0ULL; 109265237Srwatson } 109365237Srwatson 109472786Srwatson if (refcnt) { 109572786Srwatson if (found) 109665237Srwatson *refcnt = found->aux_count; 109765237Srwatson else 109865237Srwatson *refcnt = 0; 109967999Srwatson } 110067999Srwatson 110168591Srwatson return (found != NULL); 110267999Srwatson} 110365237Srwatson 110465237Srwatsonvoid 110565237Srwatsonspa_aux_activate(vdev_t *vd, avl_tree_t *avl) 110665237Srwatson{ 110765237Srwatson spa_aux_t search, *found; 110865237Srwatson avl_index_t where; 110965237Srwatson 111065237Srwatson search.aux_guid = vd->vdev_guid; 111165237Srwatson found = avl_find(avl, &search, &where); 111265237Srwatson ASSERT(found != NULL); 111365237Srwatson ASSERT(found->aux_pool == 0ULL); 111465237Srwatson 111565237Srwatson found->aux_pool = spa_guid(vd->vdev_spa); 111665237Srwatson} 111772786Srwatson 111865237Srwatson/* 111965237Srwatson * Spares are tracked globally due to the following constraints: 112065237Srwatson * 112165237Srwatson * - A spare may be part of multiple pools. 112265237Srwatson * - A spare may be added to a pool even if it's actively in use within 112365237Srwatson * another pool. 112465237Srwatson * - A spare in use in any pool can only be the source of a replacement if 112565237Srwatson * the target is a spare in the same pool. 112665237Srwatson * 112765237Srwatson * We keep track of all spares on the system through the use of a reference 112865237Srwatson * counted AVL tree. When a vdev is added as a spare, or used as a replacement 112965237Srwatson * spare, then we bump the reference count in the AVL tree. In addition, we set 113065237Srwatson * the 'vdev_isspare' member to indicate that the device is a spare (active or 113165237Srwatson * inactive). When a spare is made active (used to replace a device in the 113265237Srwatson * pool), we also keep track of which pool its been made a part of. 113365237Srwatson * 113465237Srwatson * The 'spa_spare_lock' protects the AVL tree. These functions are normally 113565237Srwatson * called under the spa_namespace lock as part of vdev reconfiguration. The 113665237Srwatson * separate spare lock exists for the status query path, which does not need to 113765237Srwatson * be completely consistent with respect to other vdev configuration changes. 113865237Srwatson */ 113965237Srwatson 114053518Sphkstatic int 11411541Srgrimesspa_spare_compare(const void *a, const void *b) 11421541Srgrimes{ 11431541Srgrimes return (spa_aux_compare(a, b)); 11441541Srgrimes} 11451541Srgrimes 11461541Srgrimesvoid 11471541Srgrimesspa_spare_add(vdev_t *vd) 114869239Salfred{ 11491541Srgrimes mutex_enter(&spa_spare_lock); 115069239Salfred ASSERT(!vd->vdev_isspare); 11511541Srgrimes spa_aux_add(vd, &spa_spare_avl); 11521541Srgrimes vd->vdev_isspare = B_TRUE; 11531541Srgrimes mutex_exit(&spa_spare_lock); 11541541Srgrimes} 115572474Srwatson 115669401Salfredvoid 115769401Salfredspa_spare_remove(vdev_t *vd) 115869401Salfred{ 115969401Salfred mutex_enter(&spa_spare_lock); 116069401Salfred ASSERT(vd->vdev_isspare); 116169401Salfred spa_aux_remove(vd, &spa_spare_avl); 116272200Sbmilekic vd->vdev_isspare = B_FALSE; 116369401Salfred mutex_exit(&spa_spare_lock); 116472200Sbmilekic} 116569401Salfred 116669401Salfredboolean_t 116769401Salfredspa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) 116869401Salfred{ 11691541Srgrimes boolean_t found; 11701541Srgrimes 11711541Srgrimes mutex_enter(&spa_spare_lock); 11721549Srgrimes found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); 11731541Srgrimes mutex_exit(&spa_spare_lock); 11741541Srgrimes 11751541Srgrimes return (found); 117669239Salfred} 117772200Sbmilekic 117865495Struckmanvoid 117969239Salfredspa_spare_activate(vdev_t *vd) 118065495Struckman{ 118165495Struckman mutex_enter(&spa_spare_lock); 118265495Struckman ASSERT(vd->vdev_isspare); 118365495Struckman spa_aux_activate(vd, &spa_spare_avl); 118465495Struckman mutex_exit(&spa_spare_lock); 118565495Struckman} 118665495Struckman 118772786Srwatson/* 118872786Srwatson * Level 2 ARC devices are tracked globally for the same reasons as spares. 118972786Srwatson * Cache devices currently only support one pool per cache device, and so 119072786Srwatson * for these devices the aux reference count is currently unused beyond 1. 119172786Srwatson */ 11921541Srgrimes 119369239Salfredstatic int 119472200Sbmilekicspa_l2cache_compare(const void *a, const void *b) 119565495Struckman{ 11961541Srgrimes return (spa_aux_compare(a, b)); 11971541Srgrimes} 11981541Srgrimes 11991541Srgrimesvoid 12001541Srgrimesspa_l2cache_add(vdev_t *vd) 12011541Srgrimes{ 12021541Srgrimes mutex_enter(&spa_l2cache_lock); 12031541Srgrimes ASSERT(!vd->vdev_isl2cache); 12041541Srgrimes spa_aux_add(vd, &spa_l2cache_avl); 12051541Srgrimes vd->vdev_isl2cache = B_TRUE; 12061541Srgrimes mutex_exit(&spa_l2cache_lock); 120772200Sbmilekic} 120869239Salfred 120972200Sbmilekicvoid 12101541Srgrimesspa_l2cache_remove(vdev_t *vd) 121169239Salfred{ 121272200Sbmilekic mutex_enter(&spa_l2cache_lock); 121369239Salfred ASSERT(vd->vdev_isl2cache); 12141541Srgrimes spa_aux_remove(vd, &spa_l2cache_avl); 12151541Srgrimes vd->vdev_isl2cache = B_FALSE; 12161541Srgrimes mutex_exit(&spa_l2cache_lock); 12171541Srgrimes} 12181541Srgrimes 12191541Srgrimesboolean_t 12201541Srgrimesspa_l2cache_exists(uint64_t guid, uint64_t *pool) 12211541Srgrimes{ 12221541Srgrimes boolean_t found; 12231541Srgrimes 12241541Srgrimes mutex_enter(&spa_l2cache_lock); 12251541Srgrimes found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); 12261541Srgrimes mutex_exit(&spa_l2cache_lock); 122769239Salfred 12281541Srgrimes return (found); 122969239Salfred} 123065495Struckman 123172786Srwatsonvoid 123272786Srwatsonspa_l2cache_activate(vdev_t *vd) 12331541Srgrimes{ 12341541Srgrimes mutex_enter(&spa_l2cache_lock); 12351541Srgrimes ASSERT(vd->vdev_isl2cache); 12361541Srgrimes spa_aux_activate(vd, &spa_l2cache_avl); 12371541Srgrimes mutex_exit(&spa_l2cache_lock); 12381541Srgrimes} 12391541Srgrimes 124012221Sbde/* 12411541Srgrimes * ========================================================================== 12421541Srgrimes * SPA vdev locking 12431541Srgrimes * ========================================================================== 12441541Srgrimes */ 124512221Sbde 12461541Srgrimes/* 12471549Srgrimes * Lock the given spa_t for the purpose of adding or removing a vdev. 124830994Sphk * Grabs the global spa_namespace_lock plus the spa config lock for writing. 12491541Srgrimes * It returns the next transaction group for the spa_t. 12501541Srgrimes */ 12511541Srgrimesuint64_t 12521541Srgrimesspa_vdev_enter(spa_t *spa) 125323358Sache{ 125423359Sache mutex_enter(&spa->spa_vdev_top_lock); 12551541Srgrimes mutex_enter(&spa_namespace_lock); 12561541Srgrimes return (spa_vdev_config_enter(spa)); 12571541Srgrimes} 12581541Srgrimes 12591541Srgrimes/* 12601541Srgrimes * Internal implementation for spa_vdev_enter(). Used when a vdev 12611541Srgrimes * operation requires multiple syncs (i.e. removing a device) while 126212221Sbde * keeping the spa_namespace_lock held. 12631541Srgrimes */ 12641541Srgrimesuint64_t 12651541Srgrimesspa_vdev_config_enter(spa_t *spa) 126612221Sbde{ 12671541Srgrimes ASSERT(MUTEX_HELD(&spa_namespace_lock)); 12681549Srgrimes 126930994Sphk spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 12701541Srgrimes 12711541Srgrimes return (spa_last_synced_txg(spa) + 1); 12721541Srgrimes} 12731541Srgrimes 127423330Sache/* 12751541Srgrimes * Used in combination with spa_vdev_config_enter() to allow the syncing 127646155Sphk * of multiple transactions without releasing the spa_namespace_lock. 12771541Srgrimes */ 127822522Sdavidnvoid 127936845Sdfrspa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) 12801541Srgrimes{ 12811541Srgrimes ASSERT(MUTEX_HELD(&spa_namespace_lock)); 128222522Sdavidn 128322522Sdavidn int config_changed = B_FALSE; 128423330Sache 12851541Srgrimes ASSERT(txg > spa_last_synced_txg(spa)); 12861541Srgrimes 128731891Ssef spa->spa_pending_vdev = NULL; 128831891Ssef 128931891Ssef /* 129055338Sphk * Reassess the DTLs. 129131891Ssef */ 129231891Ssef vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); 129355707Ssef 129431891Ssef if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { 129531891Ssef config_changed = B_TRUE; 129665495Struckman spa->spa_config_generation++; 129765495Struckman } 129865495Struckman 129965495Struckman /* 130065495Struckman * Verify the metaslab classes. 130165495Struckman */ 130265495Struckman ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); 130365495Struckman ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); 130465495Struckman 130565495Struckman spa_config_exit(spa, SCL_ALL, spa); 130665495Struckman 130765495Struckman /* 130865495Struckman * Panic the system if the specified tag requires it. This 130965495Struckman * is useful for ensuring that configurations are updated 131065495Struckman * transactionally. 131165495Struckman */ 131265495Struckman if (zio_injection_enabled) 131365495Struckman zio_handle_panic_injection(spa, tag, 0); 131465495Struckman 131565495Struckman /* 131665495Struckman * Note: this txg_wait_synced() is important because it ensures 131765495Struckman * that there won't be more than one config change per txg. 131865495Struckman * This allows us to use the txg as the generation number. 131965495Struckman */ 132065495Struckman if (error == 0) 132165495Struckman txg_wait_synced(spa->spa_dsl_pool, txg); 132265495Struckman 132365495Struckman if (vd != NULL) { 132465495Struckman ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); 132565495Struckman if (vd->vdev_ops->vdev_op_leaf) { 132667629Sgallatin mutex_enter(&vd->vdev_initialize_lock); 132765495Struckman vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); 132865495Struckman mutex_exit(&vd->vdev_initialize_lock); 132965495Struckman } 133065495Struckman 133165495Struckman spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 133265495Struckman vdev_free(vd); 133365495Struckman spa_config_exit(spa, SCL_ALL, spa); 133465495Struckman } 133565495Struckman 133665495Struckman /* 133765495Struckman * If the config changed, update the config cache. 133865495Struckman */ 133965495Struckman if (config_changed) 134065495Struckman spa_write_cachefile(spa, B_FALSE, B_TRUE); 134165495Struckman} 134265495Struckman 1343/* 1344 * Unlock the spa_t after adding or removing a vdev. Besides undoing the 1345 * locking of spa_vdev_enter(), we also want make sure the transactions have 1346 * synced to disk, and then update the global configuration cache with the new 1347 * information. 1348 */ 1349int 1350spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) 1351{ 1352 spa_vdev_config_exit(spa, vd, txg, error, FTAG); 1353 mutex_exit(&spa_namespace_lock); 1354 mutex_exit(&spa->spa_vdev_top_lock); 1355 1356 return (error); 1357} 1358 1359/* 1360 * Lock the given spa_t for the purpose of changing vdev state. 1361 */ 1362void 1363spa_vdev_state_enter(spa_t *spa, int oplocks) 1364{ 1365 int locks = SCL_STATE_ALL | oplocks; 1366 1367 /* 1368 * Root pools may need to read of the underlying devfs filesystem 1369 * when opening up a vdev. Unfortunately if we're holding the 1370 * SCL_ZIO lock it will result in a deadlock when we try to issue 1371 * the read from the root filesystem. Instead we "prefetch" 1372 * the associated vnodes that we need prior to opening the 1373 * underlying devices and cache them so that we can prevent 1374 * any I/O when we are doing the actual open. 1375 */ 1376 if (spa_is_root(spa)) { 1377 int low = locks & ~(SCL_ZIO - 1); 1378 int high = locks & ~low; 1379 1380 spa_config_enter(spa, high, spa, RW_WRITER); 1381 vdev_hold(spa->spa_root_vdev); 1382 spa_config_enter(spa, low, spa, RW_WRITER); 1383 } else { 1384 spa_config_enter(spa, locks, spa, RW_WRITER); 1385 } 1386 spa->spa_vdev_locks = locks; 1387} 1388 1389int 1390spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) 1391{ 1392 boolean_t config_changed = B_FALSE; 1393 1394 if (vd != NULL || error == 0) 1395 vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, 1396 0, 0, B_FALSE); 1397 1398 if (vd != NULL) { 1399 vdev_state_dirty(vd->vdev_top); 1400 config_changed = B_TRUE; 1401 spa->spa_config_generation++; 1402 } 1403 1404 if (spa_is_root(spa)) 1405 vdev_rele(spa->spa_root_vdev); 1406 1407 ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); 1408 spa_config_exit(spa, spa->spa_vdev_locks, spa); 1409 1410 /* 1411 * If anything changed, wait for it to sync. This ensures that, 1412 * from the system administrator's perspective, zpool(1M) commands 1413 * are synchronous. This is important for things like zpool offline: 1414 * when the command completes, you expect no further I/O from ZFS. 1415 */ 1416 if (vd != NULL) 1417 txg_wait_synced(spa->spa_dsl_pool, 0); 1418 1419 /* 1420 * If the config changed, update the config cache. 1421 */ 1422 if (config_changed) { 1423 mutex_enter(&spa_namespace_lock); 1424 spa_write_cachefile(spa, B_FALSE, B_TRUE); 1425 mutex_exit(&spa_namespace_lock); 1426 } 1427 1428 return (error); 1429} 1430 1431/* 1432 * ========================================================================== 1433 * Miscellaneous functions 1434 * ========================================================================== 1435 */ 1436 1437void 1438spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) 1439{ 1440 if (!nvlist_exists(spa->spa_label_features, feature)) { 1441 fnvlist_add_boolean(spa->spa_label_features, feature); 1442 /* 1443 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't 1444 * dirty the vdev config because lock SCL_CONFIG is not held. 1445 * Thankfully, in this case we don't need to dirty the config 1446 * because it will be written out anyway when we finish 1447 * creating the pool. 1448 */ 1449 if (tx->tx_txg != TXG_INITIAL) 1450 vdev_config_dirty(spa->spa_root_vdev); 1451 } 1452} 1453 1454void 1455spa_deactivate_mos_feature(spa_t *spa, const char *feature) 1456{ 1457 if (nvlist_remove_all(spa->spa_label_features, feature) == 0) 1458 vdev_config_dirty(spa->spa_root_vdev); 1459} 1460 1461/* 1462 * Rename a spa_t. 1463 */ 1464int 1465spa_rename(const char *name, const char *newname) 1466{ 1467 spa_t *spa; 1468 int err; 1469 1470 /* 1471 * Lookup the spa_t and grab the config lock for writing. We need to 1472 * actually open the pool so that we can sync out the necessary labels. 1473 * It's OK to call spa_open() with the namespace lock held because we 1474 * allow recursive calls for other reasons. 1475 */ 1476 mutex_enter(&spa_namespace_lock); 1477 if ((err = spa_open(name, &spa, FTAG)) != 0) { 1478 mutex_exit(&spa_namespace_lock); 1479 return (err); 1480 } 1481 1482 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1483 1484 avl_remove(&spa_namespace_avl, spa); 1485 (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); 1486 avl_add(&spa_namespace_avl, spa); 1487 1488 /* 1489 * Sync all labels to disk with the new names by marking the root vdev 1490 * dirty and waiting for it to sync. It will pick up the new pool name 1491 * during the sync. 1492 */ 1493 vdev_config_dirty(spa->spa_root_vdev); 1494 1495 spa_config_exit(spa, SCL_ALL, FTAG); 1496 1497 txg_wait_synced(spa->spa_dsl_pool, 0); 1498 1499 /* 1500 * Sync the updated config cache. 1501 */ 1502 spa_write_cachefile(spa, B_FALSE, B_TRUE); 1503 1504 spa_close(spa, FTAG); 1505 1506 mutex_exit(&spa_namespace_lock); 1507 1508 return (0); 1509} 1510 1511/* 1512 * Return the spa_t associated with given pool_guid, if it exists. If 1513 * device_guid is non-zero, determine whether the pool exists *and* contains 1514 * a device with the specified device_guid. 1515 */ 1516spa_t * 1517spa_by_guid(uint64_t pool_guid, uint64_t device_guid) 1518{ 1519 spa_t *spa; 1520 avl_tree_t *t = &spa_namespace_avl; 1521 1522 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1523 1524 for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { 1525 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1526 continue; 1527 if (spa->spa_root_vdev == NULL) 1528 continue; 1529 if (spa_guid(spa) == pool_guid) { 1530 if (device_guid == 0) 1531 break; 1532 1533 if (vdev_lookup_by_guid(spa->spa_root_vdev, 1534 device_guid) != NULL) 1535 break; 1536 1537 /* 1538 * Check any devices we may be in the process of adding. 1539 */ 1540 if (spa->spa_pending_vdev) { 1541 if (vdev_lookup_by_guid(spa->spa_pending_vdev, 1542 device_guid) != NULL) 1543 break; 1544 } 1545 } 1546 } 1547 1548 return (spa); 1549} 1550 1551/* 1552 * Determine whether a pool with the given pool_guid exists. 1553 */ 1554boolean_t 1555spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) 1556{ 1557 return (spa_by_guid(pool_guid, device_guid) != NULL); 1558} 1559 1560char * 1561spa_strdup(const char *s) 1562{ 1563 size_t len; 1564 char *new; 1565 1566 len = strlen(s); 1567 new = kmem_alloc(len + 1, KM_SLEEP); 1568 bcopy(s, new, len); 1569 new[len] = '\0'; 1570 1571 return (new); 1572} 1573 1574void 1575spa_strfree(char *s) 1576{ 1577 kmem_free(s, strlen(s) + 1); 1578} 1579 1580uint64_t 1581spa_get_random(uint64_t range) 1582{ 1583 uint64_t r; 1584 1585 ASSERT(range != 0); 1586 1587 (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); 1588 1589 return (r % range); 1590} 1591 1592uint64_t 1593spa_generate_guid(spa_t *spa) 1594{ 1595 uint64_t guid = spa_get_random(-1ULL); 1596 1597 if (spa != NULL) { 1598 while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) 1599 guid = spa_get_random(-1ULL); 1600 } else { 1601 while (guid == 0 || spa_guid_exists(guid, 0)) 1602 guid = spa_get_random(-1ULL); 1603 } 1604 1605 return (guid); 1606} 1607 1608void 1609snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) 1610{ 1611 char type[256]; 1612 char *checksum = NULL; 1613 char *compress = NULL; 1614 1615 if (bp != NULL) { 1616 if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { 1617 dmu_object_byteswap_t bswap = 1618 DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); 1619 (void) snprintf(type, sizeof (type), "bswap %s %s", 1620 DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? 1621 "metadata" : "data", 1622 dmu_ot_byteswap[bswap].ob_name); 1623 } else { 1624 (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, 1625 sizeof (type)); 1626 } 1627 if (!BP_IS_EMBEDDED(bp)) { 1628 checksum = 1629 zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; 1630 } 1631 compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; 1632 } 1633 1634 SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, 1635 compress); 1636} 1637 1638void 1639spa_freeze(spa_t *spa) 1640{ 1641 uint64_t freeze_txg = 0; 1642 1643 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1644 if (spa->spa_freeze_txg == UINT64_MAX) { 1645 freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; 1646 spa->spa_freeze_txg = freeze_txg; 1647 } 1648 spa_config_exit(spa, SCL_ALL, FTAG); 1649 if (freeze_txg != 0) 1650 txg_wait_synced(spa_get_dsl(spa), freeze_txg); 1651} 1652 1653void 1654zfs_panic_recover(const char *fmt, ...) 1655{ 1656 va_list adx; 1657 1658 va_start(adx, fmt); 1659 vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); 1660 va_end(adx); 1661} 1662 1663/* 1664 * This is a stripped-down version of strtoull, suitable only for converting 1665 * lowercase hexadecimal numbers that don't overflow. 1666 */ 1667uint64_t 1668zfs_strtonum(const char *str, char **nptr) 1669{ 1670 uint64_t val = 0; 1671 char c; 1672 int digit; 1673 1674 while ((c = *str) != '\0') { 1675 if (c >= '0' && c <= '9') 1676 digit = c - '0'; 1677 else if (c >= 'a' && c <= 'f') 1678 digit = 10 + c - 'a'; 1679 else 1680 break; 1681 1682 val *= 16; 1683 val += digit; 1684 1685 str++; 1686 } 1687 1688 if (nptr) 1689 *nptr = (char *)str; 1690 1691 return (val); 1692} 1693 1694/* 1695 * ========================================================================== 1696 * Accessor functions 1697 * ========================================================================== 1698 */ 1699 1700boolean_t 1701spa_shutting_down(spa_t *spa) 1702{ 1703 return (spa->spa_async_suspended); 1704} 1705 1706dsl_pool_t * 1707spa_get_dsl(spa_t *spa) 1708{ 1709 return (spa->spa_dsl_pool); 1710} 1711 1712boolean_t 1713spa_is_initializing(spa_t *spa) 1714{ 1715 return (spa->spa_is_initializing); 1716} 1717 1718boolean_t 1719spa_indirect_vdevs_loaded(spa_t *spa) 1720{ 1721 return (spa->spa_indirect_vdevs_loaded); 1722} 1723 1724blkptr_t * 1725spa_get_rootblkptr(spa_t *spa) 1726{ 1727 return (&spa->spa_ubsync.ub_rootbp); 1728} 1729 1730void 1731spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) 1732{ 1733 spa->spa_uberblock.ub_rootbp = *bp; 1734} 1735 1736void 1737spa_altroot(spa_t *spa, char *buf, size_t buflen) 1738{ 1739 if (spa->spa_root == NULL) 1740 buf[0] = '\0'; 1741 else 1742 (void) strncpy(buf, spa->spa_root, buflen); 1743} 1744 1745int 1746spa_sync_pass(spa_t *spa) 1747{ 1748 return (spa->spa_sync_pass); 1749} 1750 1751char * 1752spa_name(spa_t *spa) 1753{ 1754 return (spa->spa_name); 1755} 1756 1757uint64_t 1758spa_guid(spa_t *spa) 1759{ 1760 dsl_pool_t *dp = spa_get_dsl(spa); 1761 uint64_t guid; 1762 1763 /* 1764 * If we fail to parse the config during spa_load(), we can go through 1765 * the error path (which posts an ereport) and end up here with no root 1766 * vdev. We stash the original pool guid in 'spa_config_guid' to handle 1767 * this case. 1768 */ 1769 if (spa->spa_root_vdev == NULL) 1770 return (spa->spa_config_guid); 1771 1772 guid = spa->spa_last_synced_guid != 0 ? 1773 spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; 1774 1775 /* 1776 * Return the most recently synced out guid unless we're 1777 * in syncing context. 1778 */ 1779 if (dp && dsl_pool_sync_context(dp)) 1780 return (spa->spa_root_vdev->vdev_guid); 1781 else 1782 return (guid); 1783} 1784 1785uint64_t 1786spa_load_guid(spa_t *spa) 1787{ 1788 /* 1789 * This is a GUID that exists solely as a reference for the 1790 * purposes of the arc. It is generated at load time, and 1791 * is never written to persistent storage. 1792 */ 1793 return (spa->spa_load_guid); 1794} 1795 1796uint64_t 1797spa_last_synced_txg(spa_t *spa) 1798{ 1799 return (spa->spa_ubsync.ub_txg); 1800} 1801 1802uint64_t 1803spa_first_txg(spa_t *spa) 1804{ 1805 return (spa->spa_first_txg); 1806} 1807 1808uint64_t 1809spa_syncing_txg(spa_t *spa) 1810{ 1811 return (spa->spa_syncing_txg); 1812} 1813 1814/* 1815 * Return the last txg where data can be dirtied. The final txgs 1816 * will be used to just clear out any deferred frees that remain. 1817 */ 1818uint64_t 1819spa_final_dirty_txg(spa_t *spa) 1820{ 1821 return (spa->spa_final_txg - TXG_DEFER_SIZE); 1822} 1823 1824pool_state_t 1825spa_state(spa_t *spa) 1826{ 1827 return (spa->spa_state); 1828} 1829 1830spa_load_state_t 1831spa_load_state(spa_t *spa) 1832{ 1833 return (spa->spa_load_state); 1834} 1835 1836uint64_t 1837spa_freeze_txg(spa_t *spa) 1838{ 1839 return (spa->spa_freeze_txg); 1840} 1841 1842/* ARGSUSED */ 1843uint64_t 1844spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) 1845{ 1846 return (lsize * spa_asize_inflation); 1847} 1848 1849/* 1850 * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), 1851 * or at least 128MB, unless that would cause it to be more than half the 1852 * pool size. 1853 * 1854 * See the comment above spa_slop_shift for details. 1855 */ 1856uint64_t 1857spa_get_slop_space(spa_t *spa) 1858{ 1859 uint64_t space = spa_get_dspace(spa); 1860 return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); 1861} 1862 1863uint64_t 1864spa_get_dspace(spa_t *spa) 1865{ 1866 return (spa->spa_dspace); 1867} 1868 1869uint64_t 1870spa_get_checkpoint_space(spa_t *spa) 1871{ 1872 return (spa->spa_checkpoint_info.sci_dspace); 1873} 1874 1875void 1876spa_update_dspace(spa_t *spa) 1877{ 1878 spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + 1879 ddt_get_dedup_dspace(spa); 1880 if (spa->spa_vdev_removal != NULL) { 1881 /* 1882 * We can't allocate from the removing device, so 1883 * subtract its size. This prevents the DMU/DSL from 1884 * filling up the (now smaller) pool while we are in the 1885 * middle of removing the device. 1886 * 1887 * Note that the DMU/DSL doesn't actually know or care 1888 * how much space is allocated (it does its own tracking 1889 * of how much space has been logically used). So it 1890 * doesn't matter that the data we are moving may be 1891 * allocated twice (on the old device and the new 1892 * device). 1893 */ 1894 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 1895 vdev_t *vd = 1896 vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); 1897 spa->spa_dspace -= spa_deflate(spa) ? 1898 vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; 1899 spa_config_exit(spa, SCL_VDEV, FTAG); 1900 } 1901} 1902 1903/* 1904 * Return the failure mode that has been set to this pool. The default 1905 * behavior will be to block all I/Os when a complete failure occurs. 1906 */ 1907uint8_t 1908spa_get_failmode(spa_t *spa) 1909{ 1910 return (spa->spa_failmode); 1911} 1912 1913boolean_t 1914spa_suspended(spa_t *spa) 1915{ 1916 return (spa->spa_suspended); 1917} 1918 1919uint64_t 1920spa_version(spa_t *spa) 1921{ 1922 return (spa->spa_ubsync.ub_version); 1923} 1924 1925boolean_t 1926spa_deflate(spa_t *spa) 1927{ 1928 return (spa->spa_deflate); 1929} 1930 1931metaslab_class_t * 1932spa_normal_class(spa_t *spa) 1933{ 1934 return (spa->spa_normal_class); 1935} 1936 1937metaslab_class_t * 1938spa_log_class(spa_t *spa) 1939{ 1940 return (spa->spa_log_class); 1941} 1942 1943void 1944spa_evicting_os_register(spa_t *spa, objset_t *os) 1945{ 1946 mutex_enter(&spa->spa_evicting_os_lock); 1947 list_insert_head(&spa->spa_evicting_os_list, os); 1948 mutex_exit(&spa->spa_evicting_os_lock); 1949} 1950 1951void 1952spa_evicting_os_deregister(spa_t *spa, objset_t *os) 1953{ 1954 mutex_enter(&spa->spa_evicting_os_lock); 1955 list_remove(&spa->spa_evicting_os_list, os); 1956 cv_broadcast(&spa->spa_evicting_os_cv); 1957 mutex_exit(&spa->spa_evicting_os_lock); 1958} 1959 1960void 1961spa_evicting_os_wait(spa_t *spa) 1962{ 1963 mutex_enter(&spa->spa_evicting_os_lock); 1964 while (!list_is_empty(&spa->spa_evicting_os_list)) 1965 cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); 1966 mutex_exit(&spa->spa_evicting_os_lock); 1967 1968 dmu_buf_user_evict_wait(); 1969} 1970 1971int 1972spa_max_replication(spa_t *spa) 1973{ 1974 /* 1975 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to 1976 * handle BPs with more than one DVA allocated. Set our max 1977 * replication level accordingly. 1978 */ 1979 if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) 1980 return (1); 1981 return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); 1982} 1983 1984int 1985spa_prev_software_version(spa_t *spa) 1986{ 1987 return (spa->spa_prev_software_version); 1988} 1989 1990uint64_t 1991spa_deadman_synctime(spa_t *spa) 1992{ 1993 return (spa->spa_deadman_synctime); 1994} 1995 1996uint64_t 1997dva_get_dsize_sync(spa_t *spa, const dva_t *dva) 1998{ 1999 uint64_t asize = DVA_GET_ASIZE(dva); 2000 uint64_t dsize = asize; 2001 2002 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2003 2004 if (asize != 0 && spa->spa_deflate) { 2005 uint64_t vdev = DVA_GET_VDEV(dva); 2006 vdev_t *vd = vdev_lookup_top(spa, vdev); 2007 if (vd == NULL) { 2008 panic( 2009 "dva_get_dsize_sync(): bad DVA %llu:%llu", 2010 (u_longlong_t)vdev, (u_longlong_t)asize); 2011 } 2012 dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; 2013 } 2014 2015 return (dsize); 2016} 2017 2018uint64_t 2019bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) 2020{ 2021 uint64_t dsize = 0; 2022 2023 for (int d = 0; d < BP_GET_NDVAS(bp); d++) 2024 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); 2025 2026 return (dsize); 2027} 2028 2029uint64_t 2030bp_get_dsize(spa_t *spa, const blkptr_t *bp) 2031{ 2032 uint64_t dsize = 0; 2033 2034 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2035 2036 for (int d = 0; d < BP_GET_NDVAS(bp); d++) 2037 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); 2038 2039 spa_config_exit(spa, SCL_VDEV, FTAG); 2040 2041 return (dsize); 2042} 2043 2044uint64_t 2045spa_dirty_data(spa_t *spa) 2046{ 2047 return (spa->spa_dsl_pool->dp_dirty_total); 2048} 2049 2050/* 2051 * ========================================================================== 2052 * Initialization and Termination 2053 * ========================================================================== 2054 */ 2055 2056static int 2057spa_name_compare(const void *a1, const void *a2) 2058{ 2059 const spa_t *s1 = a1; 2060 const spa_t *s2 = a2; 2061 int s; 2062 2063 s = strcmp(s1->spa_name, s2->spa_name); 2064 if (s > 0) 2065 return (1); 2066 if (s < 0) 2067 return (-1); 2068 return (0); 2069} 2070 2071int 2072spa_busy(void) 2073{ 2074 return (spa_active_count); 2075} 2076 2077void 2078spa_boot_init() 2079{ 2080 spa_config_load(); 2081} 2082 2083#ifdef _KERNEL 2084EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); 2085#endif 2086 2087void 2088spa_init(int mode) 2089{ 2090 mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); 2091 mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); 2092 mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); 2093 cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); 2094 2095 avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), 2096 offsetof(spa_t, spa_avl)); 2097 2098 avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), 2099 offsetof(spa_aux_t, aux_avl)); 2100 2101 avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), 2102 offsetof(spa_aux_t, aux_avl)); 2103 2104 spa_mode_global = mode; 2105 2106#ifdef illumos 2107#ifdef _KERNEL 2108 spa_arch_init(); 2109#else 2110 if (spa_mode_global != FREAD && dprintf_find_string("watch")) { 2111 arc_procfd = open("/proc/self/ctl", O_WRONLY); 2112 if (arc_procfd == -1) { 2113 perror("could not enable watchpoints: " 2114 "opening /proc/self/ctl failed: "); 2115 } else { 2116 arc_watch = B_TRUE; 2117 } 2118 } 2119#endif 2120#endif /* illumos */ 2121 refcount_sysinit(); 2122 unique_init(); 2123 range_tree_init(); 2124 metaslab_alloc_trace_init(); 2125 zio_init(); 2126 lz4_init(); 2127 dmu_init(); 2128 zil_init(); 2129 vdev_cache_stat_init(); 2130 vdev_file_init(); 2131 zfs_prop_init(); 2132 zpool_prop_init(); 2133 zpool_feature_init(); 2134 spa_config_load(); 2135 l2arc_start(); 2136 scan_init(); 2137 dsl_scan_global_init(); 2138#ifndef illumos 2139#ifdef _KERNEL 2140 zfs_deadman_init(); 2141#endif 2142#endif /* !illumos */ 2143} 2144 2145void 2146spa_fini(void) 2147{ 2148 l2arc_stop(); 2149 2150 spa_evict_all(); 2151 2152 vdev_file_fini(); 2153 vdev_cache_stat_fini(); 2154 zil_fini(); 2155 dmu_fini(); 2156 lz4_fini(); 2157 zio_fini(); 2158 metaslab_alloc_trace_fini(); 2159 range_tree_fini(); 2160 unique_fini(); 2161 refcount_fini(); 2162 scan_fini(); 2163 2164 avl_destroy(&spa_namespace_avl); 2165 avl_destroy(&spa_spare_avl); 2166 avl_destroy(&spa_l2cache_avl); 2167 2168 cv_destroy(&spa_namespace_cv); 2169 mutex_destroy(&spa_namespace_lock); 2170 mutex_destroy(&spa_spare_lock); 2171 mutex_destroy(&spa_l2cache_lock); 2172} 2173 2174/* 2175 * Return whether this pool has slogs. No locking needed. 2176 * It's not a problem if the wrong answer is returned as it's only for 2177 * performance and not correctness 2178 */ 2179boolean_t 2180spa_has_slogs(spa_t *spa) 2181{ 2182 return (spa->spa_log_class->mc_rotor != NULL); 2183} 2184 2185spa_log_state_t 2186spa_get_log_state(spa_t *spa) 2187{ 2188 return (spa->spa_log_state); 2189} 2190 2191void 2192spa_set_log_state(spa_t *spa, spa_log_state_t state) 2193{ 2194 spa->spa_log_state = state; 2195} 2196 2197boolean_t 2198spa_is_root(spa_t *spa) 2199{ 2200 return (spa->spa_is_root); 2201} 2202 2203boolean_t 2204spa_writeable(spa_t *spa) 2205{ 2206 return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config); 2207} 2208 2209/* 2210 * Returns true if there is a pending sync task in any of the current 2211 * syncing txg, the current quiescing txg, or the current open txg. 2212 */ 2213boolean_t 2214spa_has_pending_synctask(spa_t *spa) 2215{ 2216 return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || 2217 !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); 2218} 2219 2220int 2221spa_mode(spa_t *spa) 2222{ 2223 return (spa->spa_mode); 2224} 2225 2226uint64_t 2227spa_bootfs(spa_t *spa) 2228{ 2229 return (spa->spa_bootfs); 2230} 2231 2232uint64_t 2233spa_delegation(spa_t *spa) 2234{ 2235 return (spa->spa_delegation); 2236} 2237 2238objset_t * 2239spa_meta_objset(spa_t *spa) 2240{ 2241 return (spa->spa_meta_objset); 2242} 2243 2244enum zio_checksum 2245spa_dedup_checksum(spa_t *spa) 2246{ 2247 return (spa->spa_dedup_checksum); 2248} 2249 2250/* 2251 * Reset pool scan stat per scan pass (or reboot). 2252 */ 2253void 2254spa_scan_stat_init(spa_t *spa) 2255{ 2256 /* data not stored on disk */ 2257 spa->spa_scan_pass_start = gethrestime_sec(); 2258 if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) 2259 spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; 2260 else 2261 spa->spa_scan_pass_scrub_pause = 0; 2262 spa->spa_scan_pass_scrub_spent_paused = 0; 2263 spa->spa_scan_pass_exam = 0; 2264 spa->spa_scan_pass_issued = 0; 2265 vdev_scan_stat_init(spa->spa_root_vdev); 2266} 2267 2268/* 2269 * Get scan stats for zpool status reports 2270 */ 2271int 2272spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) 2273{ 2274 dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; 2275 2276 if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) 2277 return (SET_ERROR(ENOENT)); 2278 bzero(ps, sizeof (pool_scan_stat_t)); 2279 2280 /* data stored on disk */ 2281 ps->pss_func = scn->scn_phys.scn_func; 2282 ps->pss_state = scn->scn_phys.scn_state; 2283 ps->pss_start_time = scn->scn_phys.scn_start_time; 2284 ps->pss_end_time = scn->scn_phys.scn_end_time; 2285 ps->pss_to_examine = scn->scn_phys.scn_to_examine; 2286 ps->pss_to_process = scn->scn_phys.scn_to_process; 2287 ps->pss_processed = scn->scn_phys.scn_processed; 2288 ps->pss_errors = scn->scn_phys.scn_errors; 2289 ps->pss_examined = scn->scn_phys.scn_examined; 2290 ps->pss_issued = 2291 scn->scn_issued_before_pass + spa->spa_scan_pass_issued; 2292 /* data not stored on disk */ 2293 ps->pss_pass_start = spa->spa_scan_pass_start; 2294 ps->pss_pass_exam = spa->spa_scan_pass_exam; 2295 ps->pss_pass_issued = spa->spa_scan_pass_issued; 2296 ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; 2297 ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; 2298 2299 return (0); 2300} 2301 2302int 2303spa_maxblocksize(spa_t *spa) 2304{ 2305 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) 2306 return (SPA_MAXBLOCKSIZE); 2307 else 2308 return (SPA_OLD_MAXBLOCKSIZE); 2309} 2310 2311/* 2312 * Returns the txg that the last device removal completed. No indirect mappings 2313 * have been added since this txg. 2314 */ 2315uint64_t 2316spa_get_last_removal_txg(spa_t *spa) 2317{ 2318 uint64_t vdevid; 2319 uint64_t ret = -1ULL; 2320 2321 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2322 /* 2323 * sr_prev_indirect_vdev is only modified while holding all the 2324 * config locks, so it is sufficient to hold SCL_VDEV as reader when 2325 * examining it. 2326 */ 2327 vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; 2328 2329 while (vdevid != -1ULL) { 2330 vdev_t *vd = vdev_lookup_top(spa, vdevid); 2331 vdev_indirect_births_t *vib = vd->vdev_indirect_births; 2332 2333 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 2334 2335 /* 2336 * If the removal did not remap any data, we don't care. 2337 */ 2338 if (vdev_indirect_births_count(vib) != 0) { 2339 ret = vdev_indirect_births_last_entry_txg(vib); 2340 break; 2341 } 2342 2343 vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; 2344 } 2345 spa_config_exit(spa, SCL_VDEV, FTAG); 2346 2347 IMPLY(ret != -1ULL, 2348 spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); 2349 2350 return (ret); 2351} 2352 2353boolean_t 2354spa_trust_config(spa_t *spa) 2355{ 2356 return (spa->spa_trust_config); 2357} 2358 2359uint64_t 2360spa_missing_tvds_allowed(spa_t *spa) 2361{ 2362 return (spa->spa_missing_tvds_allowed); 2363} 2364 2365void 2366spa_set_missing_tvds(spa_t *spa, uint64_t missing) 2367{ 2368 spa->spa_missing_tvds = missing; 2369} 2370 2371boolean_t 2372spa_top_vdevs_spacemap_addressable(spa_t *spa) 2373{ 2374 vdev_t *rvd = spa->spa_root_vdev; 2375 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2376 if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) 2377 return (B_FALSE); 2378 } 2379 return (B_TRUE); 2380} 2381 2382boolean_t 2383spa_has_checkpoint(spa_t *spa) 2384{ 2385 return (spa->spa_checkpoint_txg != 0); 2386} 2387 2388boolean_t 2389spa_importing_readonly_checkpoint(spa_t *spa) 2390{ 2391 return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && 2392 spa->spa_mode == FREAD); 2393} 2394 2395uint64_t 2396spa_min_claim_txg(spa_t *spa) 2397{ 2398 uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; 2399 2400 if (checkpoint_txg != 0) 2401 return (checkpoint_txg + 1); 2402 2403 return (spa->spa_first_txg); 2404} 2405 2406/* 2407 * If there is a checkpoint, async destroys may consume more space from 2408 * the pool instead of freeing it. In an attempt to save the pool from 2409 * getting suspended when it is about to run out of space, we stop 2410 * processing async destroys. 2411 */ 2412boolean_t 2413spa_suspend_async_destroy(spa_t *spa) 2414{ 2415 dsl_pool_t *dp = spa_get_dsl(spa); 2416 2417 uint64_t unreserved = dsl_pool_unreserved_space(dp, 2418 ZFS_SPACE_CHECK_EXTRA_RESERVED); 2419 uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; 2420 uint64_t avail = (unreserved > used) ? (unreserved - used) : 0; 2421 2422 if (spa_has_checkpoint(spa) && avail == 0) 2423 return (B_TRUE); 2424 2425 return (B_FALSE); 2426} 2427