		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
		    &nvl) == 0) {
			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
			    KM_SLEEP) == 0);
		}

		gethrestime(&spa->spa_loaded_ts);
		error = spa_load_impl(spa, pool_guid, config, state, type,
		    mosconfig, &ereport);
	}

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error) {
		if (error != EEXIST) {
			spa->spa_loaded_ts.tv_sec = 0;
			spa->spa_loaded_ts.tv_nsec = 0;
		}
		if (error != EBADF) {
			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
		}
	}
	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t children, config_cache_txg = spa->spa_config_txg;
	int orig_mode = spa->spa_mode;
	int parse;
	uint64_t obj;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
		return (EINVAL);

	parse = (type == SPA_IMPORT_EXISTING ?
	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		return (error);

	ASSERT(spa->spa_root_vdev == rvd);

	if (type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_guid(spa) == pool_guid);
	}

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		return (error);

	/*
	 * We need to validate the vdev labels against the configuration that
	 * we have in hand, which is dependent on the setting of mosconfig.  If
	 * mosconfig is true then we're validating the vdev labels based on
	 * that config.  Otherwise, we're validating against the cached config
	 * (zpool.cache) that was read when we loaded the zfs module, and then
	 * later we will recursively call spa_load() and validate against
	 * the vdev config.
	 *
	 * If we're assembling a new pool that's been split off from an
	 * existing pool, the labels haven't yet been updated so we skip
	 * validation for now.
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = vdev_validate(rvd);
		spa_config_exit(spa, SCL_ALL, FTAG);

		if (error != 0)
			return (error);

		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
			return (ENXIO);
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(NULL, rvd, ub);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION)
		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.  We first check to see if the pool
	 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
	 * If it is, defer the vdev_guid_sum check till later so we
	 * can handle missing vdevs.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
	    rvd->vdev_guid_sum != ub->ub_guid_sum)
		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));

	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_try_repair(spa, config);
		spa_config_exit(spa, SCL_ALL, FTAG);
		nvlist_free(spa->spa_config_splitting);
		spa->spa_config_splitting = NULL;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
	spa->spa_claim_max_txg = spa->spa_first_txg;
	spa->spa_prev_software_version = ub->ub_software_version;
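
	/*
	 * To make the txg arithmetic above concrete, an illustrative example
	 * (numbers invented for this note, assuming TXG_DEFER_SIZE is 2):
	 * with a last synced txg of 1000 and no extreme rewind, verification
	 * starts at txg 997 and the first new txg will be 1001.  Under
	 * extreme rewind we instead verify everything from TXG_INITIAL - 1.
	 */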
" 1976 "See: http://www.sun.com/msg/ZFS-8000-EY", 1977 spa_name(spa), hostname, 1978 (unsigned long)hostid); 1979 return (EBADF); 1980 } 1981 } 1982 if (nvlist_lookup_nvlist(spa->spa_config, 1983 ZPOOL_REWIND_POLICY, &policy) == 0) 1984 VERIFY(nvlist_add_nvlist(nvconfig, 1985 ZPOOL_REWIND_POLICY, policy) == 0); 1986 1987 spa_config_set(spa, nvconfig); 1988 spa_unload(spa); 1989 spa_deactivate(spa); 1990 spa_activate(spa, orig_mode); 1991 1992 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 1993 } 1994 1995 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 1996 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1997 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 1998 if (error != 0) 1999 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2000 2001 /* 2002 * Load the bit that tells us to use the new accounting function 2003 * (raid-z deflation). If we have an older pool, this will not 2004 * be present. 2005 */ 2006 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2007 if (error != 0 && error != ENOENT) 2008 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2009 2010 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2011 &spa->spa_creation_version); 2012 if (error != 0 && error != ENOENT) 2013 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2014 2015 /* 2016 * Load the persistent error log. If we have an older pool, this will 2017 * not be present. 2018 */ 2019 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2020 if (error != 0 && error != ENOENT) 2021 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2022 2023 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2024 &spa->spa_errlog_scrub); 2025 if (error != 0 && error != ENOENT) 2026 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2027 2028 /* 2029 * Load the history object. If we have an older pool, this 2030 * will not be present. 2031 */ 2032 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2033 if (error != 0 && error != ENOENT) 2034 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2035 2036 /* 2037 * If we're assembling the pool from the split-off vdevs of 2038 * an existing pool, we don't want to attach the spares & cache 2039 * devices. 2040 */ 2041 2042 /* 2043 * Load any hot spares for this pool. 2044 */ 2045 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2046 if (error != 0 && error != ENOENT) 2047 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2048 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2049 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2050 if (load_nvlist(spa, spa->spa_spares.sav_object, 2051 &spa->spa_spares.sav_config) != 0) 2052 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2053 2054 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2055 spa_load_spares(spa); 2056 spa_config_exit(spa, SCL_ALL, FTAG); 2057 } else if (error == 0) { 2058 spa->spa_spares.sav_sync = B_TRUE; 2059 } 2060 2061 /* 2062 * Load any level 2 ARC devices for this pool. 

	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
	    &spa->spa_creation_version);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
	    &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * If we're assembling the pool from the split-off vdevs of
	 * an existing pool, we don't want to attach the spares & cache
	 * devices.
	 */

	/*
	 * Load any hot spares for this pool.
	 */
	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
	if (error && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (error == 0) {
		uint64_t autoreplace;

		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
		    &spa->spa_dedup_ditto);

		spa->spa_autoreplace = (autoreplace != 0);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
		spa_check_removed(spa->spa_root_vdev);
		/*
		 * For the import case, this is done in spa_import(), because
		 * at this point we're using the spare definitions from
		 * the MOS config, not necessarily from the userland config.
		 */
		if (state != SPA_LOAD_IMPORT) {
			spa_aux_check_removed(&spa->spa_spares);
			spa_aux_check_removed(&spa->spa_l2cache);
		}
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Load the DDTs (dedup tables).
	 */
	error = ddt_load(spa);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	spa_update_dspace(spa);

	/*
	 * Validate the config, using the MOS config to fill in any
	 * information which might be missing.  If we fail to validate
	 * the config then declare the pool unfit for use.  If we're
	 * assembling a pool from a split, the log is not transferred
	 * over.
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		nvlist_t *nvconfig;

		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (!spa_config_valid(spa, nvconfig)) {
			nvlist_free(nvconfig);
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
			    ENXIO));
		}
		nvlist_free(nvconfig);

		/*
		 * Now that we've validated the config, check the state of the
		 * root vdev.  If it can't be opened, it indicates one or
		 * more toplevel vdevs are faulted.
		 */
		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
			return (ENXIO);

		if (spa_check_logs(spa)) {
			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
		}
	}

	/*
	 * We've successfully opened the pool, verify that we're ready
	 * to start pushing transactions.
	 */
	if (state != SPA_LOAD_TRYIMPORT) {
		if (error = spa_load_verify(spa))
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    error));
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
		 * invoked from zil_claim_log_block()'s i/o done callback.
		 * Price of rollback is that we abandon the log.
		 */
		spa->spa_claiming = B_TRUE;

		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_claiming = B_FALSE;

		spa_set_log_state(spa, SPA_LOG_GOOD);
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.  We sync up to the highest
		 * claimed log block birth time so that claimed log blocks
		 * don't appear to be from the future.  spa_claim_max_txg
		 * will have been set for us by either zil_check_log_chain()
		 * (invoked from spa_check_logs()) or zil_claim() above.
		 */
		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If this is a verbatim import, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT ||
		    state == SPA_LOAD_RECOVER ||
		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
		    vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	}

	return (0);
}

static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	int mode = spa->spa_mode;

	spa_unload(spa);
	spa_deactivate(spa);

	spa->spa_load_max_txg--;

	spa_activate(spa, mode);
	spa_async_suspend(spa);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
}

static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, int rewind_flags)
{
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
	}

	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
	    mosconfig);
	if (load_error == 0)
		return (0);

	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		return (load_error);
	}

	/* Price of rolling back is discarding txgs, including log */
	if (state == SPA_LOAD_RECOVER)
		spa_set_log_state(spa, SPA_LOG_CLEAR);

	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
	    TXG_INITIAL : safe_rewind_txg;

	/*
	 * Continue as long as we're finding errors, we're still within
	 * the acceptable rewind range, and we're still finding uberblocks
	 */
	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
		if (spa->spa_load_max_txg < safe_rewind_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state, mosconfig);
	}

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);

	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}
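
/*
 * A worked example of the rewind window above, with invented numbers and
 * assuming TXG_DEFER_SIZE is 2: if the failed load left ub_txg at 1000,
 * safe_rewind_txg is 998, so an ordinary rewind retries txgs 999 and 998
 * before giving up.  Only with ZPOOL_EXTREME_REWIND does min_txg drop to
 * TXG_INITIAL, and spa_extreme_rewind is set once the retries walk below
 * safe_rewind_txg.
 */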

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
    nvlist_t **config)
{
	spa_t *spa;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;
	int firstopen = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		zpool_rewind_policy_t policy;

		firstopen = B_TRUE;

		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
		    &policy);
		if (policy.zrp_request & ZPOOL_DO_REWIND)
			state = SPA_LOAD_RECOVER;

		spa_activate(spa, spa_mode_global);

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
		    policy.zrp_request);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_config) {
				VERIFY(nvlist_dup(spa->spa_config, config,
				    KM_SLEEP) == 0);
				VERIFY(nvlist_add_nvlist(*config,
				    ZPOOL_CONFIG_LOAD_INFO,
				    spa->spa_load_info) == 0);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = error;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}
	}

	spa_open_ref(spa, tag);

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/*
	 * If we've recovered the pool, pass back any information we
	 * gathered while doing the load.
	 */
	if (state == SPA_LOAD_RECOVER) {
		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info) == 0);
	}

	if (locked) {
		spa->spa_last_open_failed = 0;
		spa->spa_last_ubsync_txg = 0;
		spa->spa_load_txg = 0;
		mutex_exit(&spa_namespace_lock);
#ifdef __FreeBSD__
#ifdef _KERNEL
		if (firstopen)
			zvol_create_minors(pool);
#endif
#endif
	}

	*spapp = spa;

	return (0);
}

int
spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
    nvlist_t **config)
{
	return (spa_open_common(name, spapp, tag, policy, config));
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL, NULL));
}
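
/*
 * A minimal, illustrative caller of spa_open() (hypothetical; error
 * handling trimmed):
 *
 *	spa_t *spa;
 *
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		... use the pool while the reference is held ...
 *		spa_close(spa, FTAG);
 *	}
 *
 * The tag passed to spa_open() must be paired with the same tag in
 * spa_close(), which is how the spa_refcount bookkeeping matches opens
 * with closes.
 */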

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}
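
/*
 * An illustrative pairing for the inject hold above (hypothetical caller,
 * along the lines of the zinject ioctl path):
 *
 *	spa_t *spa;
 *
 *	if ((spa = spa_inject_addref(name)) != NULL) {
 *		... install or list an injection handler ...
 *		spa_inject_delref(spa);
 *	}
 *
 * While spa_inject_ref is nonzero, spa_export_common() below refuses to
 * export or destroy the pool (the EBUSY check), though a pool reset is
 * still allowed.
 */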

/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */

		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
			    == 0);
			vdev_get_stats(vd, vs);
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, NULL, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			uint64_t loadtimes[2];

			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
			VERIFY(nvlist_add_uint64_array(*config,
			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);

			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}

/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}
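
/*
 * For reference, the callers of spa_validate_aux() in this file:
 * spa_create() and spa_vdev_add() validate with VDEV_ALLOC_ADD, while
 * spa_import() uses VDEV_ALLOC_SPARE and VDEV_ALLOC_L2CACHE so that
 * corrupted but well-formed aux devices do not block an import.
 */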

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
		if (vd->vdev_isl2cache)
			spa_l2cache_remove(vd);
		vdev_clear_stats(vd);
		(void) vdev_close(vd);
	}
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version, obj;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);

	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_metaslab_set_size(rvd->vdev_child[c]);
			vdev_expand(rvd->vdev_child[c], txg);
		}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	/*
	 * Create DDTs (dedup tables).
	 */
	ddt_create(spa);

	spa_update_dspace(spa);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	    sizeof (uint64_t), 1, &version, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bpobj.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, obj,
	    ZIO_COMPRESS_OFF, tx);
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
	    sizeof (uint64_t), 1, &obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bpobj");
	}
	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
	    spa->spa_meta_objset, obj));

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(spa, props, tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
	spa_history_log_version(spa, LOG_POOL_CREATE);

	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
}
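
/*
 * The nvroot consumed by spa_create() mirrors what "zpool create" builds
 * in userland: a root nvlist whose ZPOOL_CONFIG_CHILDREN array carries one
 * nvlist per top-level vdev.  A minimal sketch (hypothetical device path,
 * error checks omitted):
 *
 *	nvlist_t *disk, *root;
 *
 *	VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
 *	    "/dev/dsk/c0t0d0s0") == 0);
 *	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 *	    &disk, 1) == 0);
 */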

#if defined(sun)
#ifdef _KERNEL
/*
 * Get the root pool information from the root disk, then import the root pool
 * during the system boot up time.
 */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}

/*
 * Walk the vdev tree and see if we can find a device with "better"
 * configuration.  A configuration is "better" if the label on that
 * device has a more recent txg.
 */
static void
spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

	if (vd->vdev_ops->vdev_op_leaf) {
		nvlist_t *label;
		uint64_t label_txg;

		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
		    &label) != 0)
			return;

		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &label_txg) == 0);

		/*
		 * Do we have a better boot device?
		 */
		if (label_txg > *txg) {
			*txg = label_txg;
			*avd = vd;
		}
		nvlist_free(label);
	}
}

/*
 * Import a root pool.
 *
 * For x86, devpath_list will consist of devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For SPARC, devpath_list consists of the physpath name of the booting
 * device, no matter whether the rootpool is a single-device pool or a
 * mirrored pool, e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
	if (config == NULL) {
		if (strstr(devpath, "/iscsi/ssd") != NULL) {
			/* iscsi boot */
			get_iscsi_bootpath_phy(devpath);
			config = spa_generate_rootconf(devpath, devid, &guid);
		}
	}
#endif
	if (config == NULL) {
		cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
		    devpath);
		return (EIO);
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	spa = spa_add(pname, config, NULL);
	spa->spa_is_root = B_TRUE;
	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = ENOENT;
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = EINVAL;
		goto out;
	}

	/*
	 * If the boot device is part of a spare vdev then ensure that
	 * we're booting off the active spare.
	 */
	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    !bvd->vdev_isspare) {
		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
		    "try booting from '%s'",
		    bvd->vdev_parent->
		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
		error = EINVAL;
		goto out;
	}

	error = 0;
	spa_history_log_version(spa, LOG_POOL_IMPORT);
out:
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_free(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	mutex_exit(&spa_namespace_lock);

	nvlist_free(config);
	return (error);
}

#endif
#endif	/* sun */

/*
 * Import a non-root pool into the system.
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
	spa_t *spa;
	char *altroot = NULL;
	spa_load_state_t state = SPA_LOAD_IMPORT;
	zpool_rewind_policy_t policy;
	uint64_t mode = spa_mode_global;
	uint64_t readonly = B_FALSE;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	(void) nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
	if (readonly)
		mode = FREAD;
	spa = spa_add(pool, config, altroot);
	spa->spa_import_flags = flags;

	/*
	 * Verbatim import - Take a pool and insert it into the namespace
	 * as if it had been loaded at boot.
	 */
	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
		if (props != NULL)
			spa_configfile_set(spa, props, B_FALSE);

		spa_config_sync(spa, B_FALSE, B_TRUE);

		mutex_exit(&spa_namespace_lock);
		spa_history_log_version(spa, LOG_POOL_IMPORT);

		return (0);
	}

	spa_activate(spa, mode);

	/*
	 * Don't start async tasks until we know everything is healthy.
	 */
	spa_async_suspend(spa);

	zpool_get_rewind_policy(config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;

	/*
	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
	 * because the user-supplied config is actually the one to trust when
	 * doing an import.
	 */
	if (state != SPA_LOAD_RECOVER)
		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
	    policy.zrp_request);

	/*
	 * Propagate anything learned while loading the pool and pass it
	 * back to the caller (i.e. rewind info, missing devices, etc).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
	    spa->spa_load_info) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
		spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
		spa_load_l2cache(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	if (error != 0 || (props && spa_writeable(spa) &&
	    (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa_async_resume(spa);

	/*
	 * Override any spares and level 2 cache devices as specified by
	 * the user, as these may have correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_spares.sav_config)
			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		if (spa->spa_l2cache.sav_config)
			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * Check for any removed devices.
	 */
	if (spa->spa_autoreplace) {
		spa_aux_check_removed(&spa->spa_spares);
		spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
		/*
		 * Update the config cache to include the newly-imported pool.
		 */
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	spa_history_log_version(spa, LOG_POOL_IMPORT);

#ifdef __FreeBSD__
#ifdef _KERNEL
	zvol_create_minors(pool);
#endif
#endif
	return (0);
}

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}
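
/*
 * Illustrative use of spa_tryimport() (hypothetical caller, in the spirit
 * of the pool-tryimport ioctl): hand it a config assembled by userland
 * device discovery and, if the returned nvlist is non-NULL, present that
 * config to the user before committing to a real spa_import().  The
 * transient TRYIMPORT_NAME pool never lingers in the namespace, since it
 * is unloaded and removed before returning.
 */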

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.  If the 'hardforce' flag is
 * set, then we don't sync the labels or remove the configuration cache.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool.  At the user's own discretion, such
		 * a pool can be forcibly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EXDEV);
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
3704 */ 3705int 3706spa_reset(char *pool) 3707{ 3708 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3709 B_FALSE, B_FALSE)); 3710} 3711 3712/* 3713 * ========================================================================== 3714 * Device manipulation 3715 * ========================================================================== 3716 */ 3717 3718/* 3719 * Add a device to a storage pool. 3720 */ 3721int 3722spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3723{ 3724 uint64_t txg, id; 3725 int error; 3726 vdev_t *rvd = spa->spa_root_vdev; 3727 vdev_t *vd, *tvd; 3728 nvlist_t **spares, **l2cache; 3729 uint_t nspares, nl2cache; 3730 3731 ASSERT(spa_writeable(spa)); 3732 3733 txg = spa_vdev_enter(spa); 3734 3735 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3736 VDEV_ALLOC_ADD)) != 0) 3737 return (spa_vdev_exit(spa, NULL, txg, error)); 3738 3739 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3740 3741 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3742 &nspares) != 0) 3743 nspares = 0; 3744 3745 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3746 &nl2cache) != 0) 3747 nl2cache = 0; 3748 3749 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3750 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3751 3752 if (vd->vdev_children != 0 && 3753 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3754 return (spa_vdev_exit(spa, vd, txg, error)); 3755 3756 /* 3757 * We must validate the spares and l2cache devices after checking the 3758 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3759 */ 3760 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3761 return (spa_vdev_exit(spa, vd, txg, error)); 3762 3763 /* 3764 * Transfer each new top-level vdev from vd to rvd. 3765 */ 3766 for (int c = 0; c < vd->vdev_children; c++) { 3767 3768 /* 3769 * Set the vdev id to the first hole, if one exists. 3770 */ 3771 for (id = 0; id < rvd->vdev_children; id++) { 3772 if (rvd->vdev_child[id]->vdev_ishole) { 3773 vdev_free(rvd->vdev_child[id]); 3774 break; 3775 } 3776 } 3777 tvd = vd->vdev_child[c]; 3778 vdev_remove_child(vd, tvd); 3779 tvd->vdev_id = id; 3780 vdev_add_child(rvd, tvd); 3781 vdev_config_dirty(tvd); 3782 } 3783 3784 if (nspares != 0) { 3785 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3786 ZPOOL_CONFIG_SPARES); 3787 spa_load_spares(spa); 3788 spa->spa_spares.sav_sync = B_TRUE; 3789 } 3790 3791 if (nl2cache != 0) { 3792 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3793 ZPOOL_CONFIG_L2CACHE); 3794 spa_load_l2cache(spa); 3795 spa->spa_l2cache.sav_sync = B_TRUE; 3796 } 3797 3798 /* 3799 * We have to be careful when adding new vdevs to an existing pool. 3800 * If other threads start allocating from these vdevs before we 3801 * sync the config cache, and we lose power, then upon reboot we may 3802 * fail to open the pool because there are DVAs that the config cache 3803 * can't translate. Therefore, we first add the vdevs without 3804 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3805 * and then let spa_config_update() initialize the new metaslabs. 3806 * 3807 * spa_load() checks for added-but-not-initialized vdevs, so that 3808 * if we lose power at any point in this sequence, the remaining 3809 * steps will be completed the next time we load the pool. 
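 *
 * Concretely, the order below is: spa_vdev_exit() syncs the config
 * with the new (still metaslab-less) vdevs; only then does
 * spa_config_update(spa, SPA_CONFIG_UPDATE_POOL) initialize the
 * metaslabs and sync again.  If we crash between the two syncs,
 * spa_load() finishes the job on the next import.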
3810 */ 3811 (void) spa_vdev_exit(spa, vd, txg, 0); 3812 3813 mutex_enter(&spa_namespace_lock); 3814 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3815 mutex_exit(&spa_namespace_lock); 3816 3817 return (0); 3818} 3819 3820/* 3821 * Attach a device to a mirror. The arguments are the path to any device 3822 * in the mirror, and the nvroot for the new device. If the path specifies 3823 * a device that is not mirrored, we automatically insert the mirror vdev. 3824 * 3825 * If 'replacing' is specified, the new device is intended to replace the 3826 * existing device; in this case the two devices are made into their own 3827 * mirror using the 'replacing' vdev, which is functionally identical to 3828 * the mirror vdev (it actually reuses all the same ops) but has a few 3829 * extra rules: you can't attach to it after it's been created, and upon 3830 * completion of resilvering, the first disk (the one being replaced) 3831 * is automatically detached. 3832 */ 3833int 3834spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3835{ 3836 uint64_t txg, dtl_max_txg; 3837 vdev_t *rvd = spa->spa_root_vdev; 3838 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3839 vdev_ops_t *pvops; 3840 char *oldvdpath, *newvdpath; 3841 int newvd_isspare; 3842 int error; 3843 3844 ASSERT(spa_writeable(spa)); 3845 3846 txg = spa_vdev_enter(spa); 3847 3848 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3849 3850 if (oldvd == NULL) 3851 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3852 3853 if (!oldvd->vdev_ops->vdev_op_leaf) 3854 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3855 3856 pvd = oldvd->vdev_parent; 3857 3858 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3859 VDEV_ALLOC_ADD)) != 0) 3860 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3861 3862 if (newrootvd->vdev_children != 1) 3863 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3864 3865 newvd = newrootvd->vdev_child[0]; 3866 3867 if (!newvd->vdev_ops->vdev_op_leaf) 3868 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3869 3870 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3871 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3872 3873 /* 3874 * Spares can't replace logs 3875 */ 3876 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3877 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3878 3879 if (!replacing) { 3880 /* 3881 * For attach, the only allowable parent is a mirror or the root 3882 * vdev. 3883 */ 3884 if (pvd->vdev_ops != &vdev_mirror_ops && 3885 pvd->vdev_ops != &vdev_root_ops) 3886 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3887 3888 pvops = &vdev_mirror_ops; 3889 } else { 3890 /* 3891 * Active hot spares can only be replaced by inactive hot 3892 * spares. 3893 */ 3894 if (pvd->vdev_ops == &vdev_spare_ops && 3895 oldvd->vdev_isspare && 3896 !spa_has_spare(spa, newvd->vdev_guid)) 3897 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3898 3899 /* 3900 * If the source is a hot spare, and the parent isn't already a 3901 * spare, then we want to create a new hot spare. Otherwise, we 3902 * want to create a replacing vdev. The user is not allowed to 3903 * attach to a spared vdev child unless the 'isspare' state is 3904 * the same (spare replaces spare, non-spare replaces 3905 * non-spare). 
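 *
 * For example, using the M(...) notation of the detach comment below:
 * replacing disk A in M(A,B) with an inactive hot spare s0 yields
 * M(S(A,s0),B) under vdev_spare_ops, while replacing A with an
 * ordinary disk C yields M(R(A,C),B) under vdev_replacing_ops.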
3906 */ 3907 if (pvd->vdev_ops == &vdev_replacing_ops && 3908 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 3909 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3910 } else if (pvd->vdev_ops == &vdev_spare_ops && 3911 newvd->vdev_isspare != oldvd->vdev_isspare) { 3912 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3913 } 3914 3915 if (newvd->vdev_isspare) 3916 pvops = &vdev_spare_ops; 3917 else 3918 pvops = &vdev_replacing_ops; 3919 } 3920 3921 /* 3922 * Make sure the new device is big enough. 3923 */ 3924 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3925 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3926 3927 /* 3928 * The new device cannot have a higher alignment requirement 3929 * than the top-level vdev. 3930 */ 3931 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3932 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3933 3934 /* 3935 * If this is an in-place replacement, update oldvd's path and devid 3936 * to make it distinguishable from newvd, and unopenable from now on. 3937 */ 3938 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3939 spa_strfree(oldvd->vdev_path); 3940 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3941 KM_SLEEP); 3942 (void) sprintf(oldvd->vdev_path, "%s/%s", 3943 newvd->vdev_path, "old"); 3944 if (oldvd->vdev_devid != NULL) { 3945 spa_strfree(oldvd->vdev_devid); 3946 oldvd->vdev_devid = NULL; 3947 } 3948 } 3949 3950 /* mark the device being resilvered */ 3951 newvd->vdev_resilvering = B_TRUE; 3952 3953 /* 3954 * If the parent is not a mirror, or if we're replacing, insert the new 3955 * mirror/replacing/spare vdev above oldvd. 3956 */ 3957 if (pvd->vdev_ops != pvops) 3958 pvd = vdev_add_parent(oldvd, pvops); 3959 3960 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3961 ASSERT(pvd->vdev_ops == pvops); 3962 ASSERT(oldvd->vdev_parent == pvd); 3963 3964 /* 3965 * Extract the new device from its root and add it to pvd. 3966 */ 3967 vdev_remove_child(newrootvd, newvd); 3968 newvd->vdev_id = pvd->vdev_children; 3969 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3970 vdev_add_child(pvd, newvd); 3971 3972 tvd = newvd->vdev_top; 3973 ASSERT(pvd->vdev_top == tvd); 3974 ASSERT(tvd->vdev_parent == rvd); 3975 3976 vdev_config_dirty(tvd); 3977 3978 /* 3979 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 3980 * for any dmu_sync-ed blocks. It will propagate upward when 3981 * spa_vdev_exit() calls vdev_dtl_reassess(). 3982 */ 3983 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 3984 3985 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 3986 dtl_max_txg - TXG_INITIAL); 3987 3988 if (newvd->vdev_isspare) { 3989 spa_spare_activate(newvd); 3990 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3991 } 3992 3993 oldvdpath = spa_strdup(oldvd->vdev_path); 3994 newvdpath = spa_strdup(newvd->vdev_path); 3995 newvd_isspare = newvd->vdev_isspare; 3996 3997 /* 3998 * Mark newvd's DTL dirty in this txg. 3999 */ 4000 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4001 4002 /* 4003 * Restart the resilver 4004 */ 4005 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4006 4007 /* 4008 * Commit the config 4009 */ 4010 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4011 4012 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 4013 "%s vdev=%s %s vdev=%s", 4014 replacing && newvd_isspare ? "spare in" : 4015 replacing ? "replace" : "attach", newvdpath, 4016 replacing ? 
"for" : "to", oldvdpath); 4017 4018 spa_strfree(oldvdpath); 4019 spa_strfree(newvdpath); 4020 4021 if (spa->spa_bootfs) 4022 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4023 4024 return (0); 4025} 4026 4027/* 4028 * Detach a device from a mirror or replacing vdev. 4029 * If 'replace_done' is specified, only detach if the parent 4030 * is a replacing vdev. 4031 */ 4032int 4033spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4034{ 4035 uint64_t txg; 4036 int error; 4037 vdev_t *rvd = spa->spa_root_vdev; 4038 vdev_t *vd, *pvd, *cvd, *tvd; 4039 boolean_t unspare = B_FALSE; 4040 uint64_t unspare_guid; 4041 char *vdpath; 4042 4043 ASSERT(spa_writeable(spa)); 4044 4045 txg = spa_vdev_enter(spa); 4046 4047 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4048 4049 if (vd == NULL) 4050 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4051 4052 if (!vd->vdev_ops->vdev_op_leaf) 4053 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4054 4055 pvd = vd->vdev_parent; 4056 4057 /* 4058 * If the parent/child relationship is not as expected, don't do it. 4059 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4060 * vdev that's replacing B with C. The user's intent in replacing 4061 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4062 * the replace by detaching C, the expected behavior is to end up 4063 * M(A,B). But suppose that right after deciding to detach C, 4064 * the replacement of B completes. We would have M(A,C), and then 4065 * ask to detach C, which would leave us with just A -- not what 4066 * the user wanted. To prevent this, we make sure that the 4067 * parent/child relationship hasn't changed -- in this example, 4068 * that C's parent is still the replacing vdev R. 4069 */ 4070 if (pvd->vdev_guid != pguid && pguid != 0) 4071 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4072 4073 /* 4074 * Only 'replacing' or 'spare' vdevs can be replaced. 4075 */ 4076 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4077 pvd->vdev_ops != &vdev_spare_ops) 4078 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4079 4080 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4081 spa_version(spa) >= SPA_VERSION_SPARES); 4082 4083 /* 4084 * Only mirror, replacing, and spare vdevs support detach. 4085 */ 4086 if (pvd->vdev_ops != &vdev_replacing_ops && 4087 pvd->vdev_ops != &vdev_mirror_ops && 4088 pvd->vdev_ops != &vdev_spare_ops) 4089 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4090 4091 /* 4092 * If this device has the only valid copy of some data, 4093 * we cannot safely detach it. 4094 */ 4095 if (vdev_dtl_required(vd)) 4096 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4097 4098 ASSERT(pvd->vdev_children >= 2); 4099 4100 /* 4101 * If we are detaching the second disk from a replacing vdev, then 4102 * check to see if we changed the original vdev's path to have "/old" 4103 * at the end in spa_vdev_attach(). If so, undo that change now. 
4104 */ 4105 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4106 vd->vdev_path != NULL) { 4107 size_t len = strlen(vd->vdev_path); 4108 4109 for (int c = 0; c < pvd->vdev_children; c++) { 4110 cvd = pvd->vdev_child[c]; 4111 4112 if (cvd == vd || cvd->vdev_path == NULL) 4113 continue; 4114 4115 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4116 strcmp(cvd->vdev_path + len, "/old") == 0) { 4117 spa_strfree(cvd->vdev_path); 4118 cvd->vdev_path = spa_strdup(vd->vdev_path); 4119 break; 4120 } 4121 } 4122 } 4123 4124 /* 4125 * If we are detaching the original disk from a spare, then it implies 4126 * that the spare should become a real disk, and be removed from the 4127 * active spare list for the pool. 4128 */ 4129 if (pvd->vdev_ops == &vdev_spare_ops && 4130 vd->vdev_id == 0 && 4131 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4132 unspare = B_TRUE; 4133 4134 /* 4135 * Erase the disk labels so the disk can be used for other things. 4136 * This must be done after all other error cases are handled, 4137 * but before we disembowel vd (so we can still do I/O to it). 4138 * But if we can't do it, don't treat the error as fatal -- 4139 * it may be that the unwritability of the disk is the reason 4140 * it's being detached! 4141 */ 4142 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4143 4144 /* 4145 * Remove vd from its parent and compact the parent's children. 4146 */ 4147 vdev_remove_child(pvd, vd); 4148 vdev_compact_children(pvd); 4149 4150 /* 4151 * Remember one of the remaining children so we can get tvd below. 4152 */ 4153 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4154 4155 /* 4156 * If we need to remove the remaining child from the list of hot spares, 4157 * do it now, marking the vdev as no longer a spare in the process. 4158 * We must do this before vdev_remove_parent(), because that can 4159 * change the GUID if it creates a new toplevel GUID. For a similar 4160 * reason, we must remove the spare now, in the same txg as the detach; 4161 * otherwise someone could attach a new sibling, change the GUID, and 4162 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4163 */ 4164 if (unspare) { 4165 ASSERT(cvd->vdev_isspare); 4166 spa_spare_remove(cvd); 4167 unspare_guid = cvd->vdev_guid; 4168 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4169 cvd->vdev_unspare = B_TRUE; 4170 } 4171 4172 /* 4173 * If the parent mirror/replacing vdev only has one child, 4174 * the parent is no longer needed. Remove it from the tree. 4175 */ 4176 if (pvd->vdev_children == 1) { 4177 if (pvd->vdev_ops == &vdev_spare_ops) 4178 cvd->vdev_unspare = B_FALSE; 4179 vdev_remove_parent(cvd); 4180 cvd->vdev_resilvering = B_FALSE; 4181 } 4182 4183 4184 /* 4185 * We don't set tvd until now because the parent we just removed 4186 * may have been the previous top-level vdev. 4187 */ 4188 tvd = cvd->vdev_top; 4189 ASSERT(tvd->vdev_parent == rvd); 4190 4191 /* 4192 * Reevaluate the parent vdev state. 4193 */ 4194 vdev_propagate_state(cvd); 4195 4196 /* 4197 * If the 'autoexpand' property is set on the pool then automatically 4198 * try to expand the size of the pool. For example if the device we 4199 * just detached was smaller than the others, it may be possible to 4200 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4201 * first so that we can obtain the updated sizes of the leaf vdevs. 
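 *
 * For example, detaching a 1TB disk from a mirror whose remaining
 * disk is 2TB can allow the top-level vdev to grow, provided the
 * 'autoexpand' property is on.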
4202 */ 4203 if (spa->spa_autoexpand) { 4204 vdev_reopen(tvd); 4205 vdev_expand(tvd, txg); 4206 } 4207 4208 vdev_config_dirty(tvd); 4209 4210 /* 4211 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4212 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4213 * But first make sure we're not on any *other* txg's DTL list, to 4214 * prevent vd from being accessed after it's freed. 4215 */ 4216 vdpath = spa_strdup(vd->vdev_path); 4217 for (int t = 0; t < TXG_SIZE; t++) 4218 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4219 vd->vdev_detached = B_TRUE; 4220 vdev_dirty(tvd, VDD_DTL, vd, txg); 4221 4222 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4223 4224 /* hang on to the spa before we release the lock */ 4225 spa_open_ref(spa, FTAG); 4226 4227 error = spa_vdev_exit(spa, vd, txg, 0); 4228 4229 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 4230 "vdev=%s", vdpath); 4231 spa_strfree(vdpath); 4232 4233 /* 4234 * If this was the removal of the original device in a hot spare vdev, 4235 * then we want to go through and remove the device from the hot spare 4236 * list of every other pool. 4237 */ 4238 if (unspare) { 4239 spa_t *altspa = NULL; 4240 4241 mutex_enter(&spa_namespace_lock); 4242 while ((altspa = spa_next(altspa)) != NULL) { 4243 if (altspa->spa_state != POOL_STATE_ACTIVE || 4244 altspa == spa) 4245 continue; 4246 4247 spa_open_ref(altspa, FTAG); 4248 mutex_exit(&spa_namespace_lock); 4249 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4250 mutex_enter(&spa_namespace_lock); 4251 spa_close(altspa, FTAG); 4252 } 4253 mutex_exit(&spa_namespace_lock); 4254 4255 /* search the rest of the vdevs for spares to remove */ 4256 spa_vdev_resilver_done(spa); 4257 } 4258 4259 /* all done with the spa; OK to release */ 4260 mutex_enter(&spa_namespace_lock); 4261 spa_close(spa, FTAG); 4262 mutex_exit(&spa_namespace_lock); 4263 4264 return (error); 4265} 4266 4267/* 4268 * Split a set of devices from their mirrors, and create a new pool from them. 
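 *
 * In outline: exactly one healthy, writeable leaf is taken from each
 * top-level mirror (logs and holes are skipped, and spares or cache
 * devices may not be part of the split), the chosen leaves are marked
 * offline, and a new pool is assembled from them via
 * spa_load(..., SPA_IMPORT_ASSEMBLE, ...).  This is the in-kernel
 * half of 'zpool split', e.g.:
 *
 *	# zpool split tank newtank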
4269 */ 4270int 4271spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4272 nvlist_t *props, boolean_t exp) 4273{ 4274 int error = 0; 4275 uint64_t txg, *glist; 4276 spa_t *newspa; 4277 uint_t c, children, lastlog; 4278 nvlist_t **child, *nvl, *tmp; 4279 dmu_tx_t *tx; 4280 char *altroot = NULL; 4281 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4282 boolean_t activate_slog; 4283 4284 ASSERT(spa_writeable(spa)); 4285 4286 txg = spa_vdev_enter(spa); 4287 4288 /* clear the log and flush everything up to now */ 4289 activate_slog = spa_passivate_log(spa); 4290 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4291 error = spa_offline_log(spa); 4292 txg = spa_vdev_config_enter(spa); 4293 4294 if (activate_slog) 4295 spa_activate_log(spa); 4296 4297 if (error != 0) 4298 return (spa_vdev_exit(spa, NULL, txg, error)); 4299 4300 /* check new spa name before going any further */ 4301 if (spa_lookup(newname) != NULL) 4302 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4303 4304 /* 4305 * scan through all the children to ensure they're all mirrors 4306 */ 4307 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4308 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4309 &children) != 0) 4310 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4311 4312 /* first, check to ensure we've got the right child count */ 4313 rvd = spa->spa_root_vdev; 4314 lastlog = 0; 4315 for (c = 0; c < rvd->vdev_children; c++) { 4316 vdev_t *vd = rvd->vdev_child[c]; 4317 4318 /* don't count the holes & logs as children */ 4319 if (vd->vdev_islog || vd->vdev_ishole) { 4320 if (lastlog == 0) 4321 lastlog = c; 4322 continue; 4323 } 4324 4325 lastlog = 0; 4326 } 4327 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4328 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4329 4330 /* next, ensure no spare or cache devices are part of the split */ 4331 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4332 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4333 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4334 4335 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4336 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4337 4338 /* then, loop over each vdev and validate it */ 4339 for (c = 0; c < children; c++) { 4340 uint64_t is_hole = 0; 4341 4342 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4343 &is_hole); 4344 4345 if (is_hole != 0) { 4346 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4347 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4348 continue; 4349 } else { 4350 error = EINVAL; 4351 break; 4352 } 4353 } 4354 4355 /* which disk is going to be split? 
*/ 4356 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4357 &glist[c]) != 0) { 4358 error = EINVAL; 4359 break; 4360 } 4361 4362 /* look it up in the spa */ 4363 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4364 if (vml[c] == NULL) { 4365 error = ENODEV; 4366 break; 4367 } 4368 4369 /* make sure there's nothing stopping the split */ 4370 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4371 vml[c]->vdev_islog || 4372 vml[c]->vdev_ishole || 4373 vml[c]->vdev_isspare || 4374 vml[c]->vdev_isl2cache || 4375 !vdev_writeable(vml[c]) || 4376 vml[c]->vdev_children != 0 || 4377 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4378 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4379 error = EINVAL; 4380 break; 4381 } 4382 4383 if (vdev_dtl_required(vml[c])) { 4384 error = EBUSY; 4385 break; 4386 } 4387 4388 /* we need certain info from the top level */ 4389 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4390 vml[c]->vdev_top->vdev_ms_array) == 0); 4391 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4392 vml[c]->vdev_top->vdev_ms_shift) == 0); 4393 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4394 vml[c]->vdev_top->vdev_asize) == 0); 4395 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4396 vml[c]->vdev_top->vdev_ashift) == 0); 4397 } 4398 4399 if (error != 0) { 4400 kmem_free(vml, children * sizeof (vdev_t *)); 4401 kmem_free(glist, children * sizeof (uint64_t)); 4402 return (spa_vdev_exit(spa, NULL, txg, error)); 4403 } 4404 4405 /* stop writers from using the disks */ 4406 for (c = 0; c < children; c++) { 4407 if (vml[c] != NULL) 4408 vml[c]->vdev_offline = B_TRUE; 4409 } 4410 vdev_reopen(spa->spa_root_vdev); 4411 4412 /* 4413 * Temporarily record the splitting vdevs in the spa config. This 4414 * will disappear once the config is regenerated. 4415 */ 4416 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4417 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4418 glist, children) == 0); 4419 kmem_free(glist, children * sizeof (uint64_t)); 4420 4421 mutex_enter(&spa->spa_props_lock); 4422 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4423 nvl) == 0); 4424 mutex_exit(&spa->spa_props_lock); 4425 spa->spa_config_splitting = nvl; 4426 vdev_config_dirty(spa->spa_root_vdev); 4427 4428 /* configure and create the new pool */ 4429 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4430 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4431 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4432 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4433 spa_version(spa)) == 0); 4434 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4435 spa->spa_config_txg) == 0); 4436 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4437 spa_generate_guid(NULL)) == 0); 4438 (void) nvlist_lookup_string(props, 4439 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4440 4441 /* add the new pool to the namespace */ 4442 newspa = spa_add(newname, config, altroot); 4443 newspa->spa_config_txg = spa->spa_config_txg; 4444 spa_set_log_state(newspa, SPA_LOG_CLEAR); 4445 4446 /* release the spa config lock, retaining the namespace lock */ 4447 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4448 4449 if (zio_injection_enabled) 4450 zio_handle_panic_injection(spa, FTAG, 1); 4451 4452 spa_activate(newspa, spa_mode_global); 4453 spa_async_suspend(newspa); 4454 4455#ifndef sun 4456 /* mark that we are creating new spa by splitting */ 4457 newspa->spa_splitting_newspa = B_TRUE; 4458#endif 4459 /* create the new pool from the disks of the original pool */ 4460 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4461#ifndef sun 4462 newspa->spa_splitting_newspa = B_FALSE; 4463#endif 4464 if (error) 4465 goto out; 4466 4467 /* if that worked, generate a real config for the new pool */ 4468 if (newspa->spa_root_vdev != NULL) { 4469 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4470 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4471 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4472 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4473 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4474 B_TRUE)); 4475 } 4476 4477 /* set the props */ 4478 if (props != NULL) { 4479 spa_configfile_set(newspa, props, B_FALSE); 4480 error = spa_prop_set(newspa, props); 4481 if (error) 4482 goto out; 4483 } 4484 4485 /* flush everything */ 4486 txg = spa_vdev_config_enter(newspa); 4487 vdev_config_dirty(newspa->spa_root_vdev); 4488 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4489 4490 if (zio_injection_enabled) 4491 zio_handle_panic_injection(spa, FTAG, 2); 4492 4493 spa_async_resume(newspa); 4494 4495 /* finally, update the original pool's config */ 4496 txg = spa_vdev_config_enter(spa); 4497 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4498 error = dmu_tx_assign(tx, TXG_WAIT); 4499 if (error != 0) 4500 dmu_tx_abort(tx); 4501 for (c = 0; c < children; c++) { 4502 if (vml[c] != NULL) { 4503 vdev_split(vml[c]); 4504 if (error == 0) 4505 spa_history_log_internal(LOG_POOL_VDEV_DETACH, 4506 spa, tx, "vdev=%s", 4507 vml[c]->vdev_path); 4508 vdev_free(vml[c]); 4509 } 4510 } 4511 vdev_config_dirty(spa->spa_root_vdev); 4512 spa->spa_config_splitting = NULL; 4513 nvlist_free(nvl); 4514 if (error == 0) 4515 dmu_tx_commit(tx); 4516 (void) spa_vdev_exit(spa, NULL, txg, 0); 4517 4518 if (zio_injection_enabled) 4519 zio_handle_panic_injection(spa, FTAG, 3); 4520 4521 /* split is complete; log a history record */ 4522 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 4523 "split new pool %s from pool %s", newname, spa_name(spa)); 4524 4525 kmem_free(vml, children * sizeof (vdev_t *)); 4526 4527 /* if we're not going to mount the filesystems in userland, export */ 4528 if (exp) 4529 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4530 B_FALSE, B_FALSE); 4531 4532 return (error); 4533 4534out: 4535 spa_unload(newspa); 4536 spa_deactivate(newspa); 4537 spa_remove(newspa); 4538 4539 txg = spa_vdev_config_enter(spa); 
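	/*
	 * Unwind path: the half-created newspa was unloaded and removed
	 * above; what follows restores the original pool's disks and
	 * discards the temporary split bookkeeping.
	 */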
4540 4541 /* re-online all offlined disks */ 4542 for (c = 0; c < children; c++) { 4543 if (vml[c] != NULL) 4544 vml[c]->vdev_offline = B_FALSE; 4545 } 4546 vdev_reopen(spa->spa_root_vdev); 4547 4548 nvlist_free(spa->spa_config_splitting); 4549 spa->spa_config_splitting = NULL; 4550 (void) spa_vdev_exit(spa, NULL, txg, error); 4551 4552 kmem_free(vml, children * sizeof (vdev_t *)); 4553 return (error); 4554} 4555 4556static nvlist_t * 4557spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4558{ 4559 for (int i = 0; i < count; i++) { 4560 uint64_t guid; 4561 4562 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4563 &guid) == 0); 4564 4565 if (guid == target_guid) 4566 return (nvpp[i]); 4567 } 4568 4569 return (NULL); 4570} 4571 4572static void 4573spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4574 nvlist_t *dev_to_remove) 4575{ 4576 nvlist_t **newdev = NULL; 4577 4578 if (count > 1) 4579 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4580 4581 for (int i = 0, j = 0; i < count; i++) { 4582 if (dev[i] == dev_to_remove) 4583 continue; 4584 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4585 } 4586 4587 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4588 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4589 4590 for (int i = 0; i < count - 1; i++) 4591 nvlist_free(newdev[i]); 4592 4593 if (count > 1) 4594 kmem_free(newdev, (count - 1) * sizeof (void *)); 4595} 4596 4597/* 4598 * Evacuate the device. 4599 */ 4600static int 4601spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 4602{ 4603 uint64_t txg; 4604 int error = 0; 4605 4606 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4607 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4608 ASSERT(vd == vd->vdev_top); 4609 4610 /* 4611 * Evacuate the device. We don't hold the config lock as writer 4612 * since we need to do I/O but we do keep the 4613 * spa_namespace_lock held. Once this completes the device 4614 * should no longer have any blocks allocated on it. 4615 */ 4616 if (vd->vdev_islog) { 4617 if (vd->vdev_stat.vs_alloc != 0) 4618 error = spa_offline_log(spa); 4619 } else { 4620 error = ENOTSUP; 4621 } 4622 4623 if (error) 4624 return (error); 4625 4626 /* 4627 * The evacuation succeeded. Remove any remaining MOS metadata 4628 * associated with this vdev, and wait for these changes to sync. 4629 */ 4630 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 4631 txg = spa_vdev_config_enter(spa); 4632 vd->vdev_removing = B_TRUE; 4633 vdev_dirty(vd, 0, NULL, txg); 4634 vdev_config_dirty(vd); 4635 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4636 4637 return (0); 4638} 4639 4640/* 4641 * Complete the removal by cleaning up the namespace. 4642 */ 4643static void 4644spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 4645{ 4646 vdev_t *rvd = spa->spa_root_vdev; 4647 uint64_t id = vd->vdev_id; 4648 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 4649 4650 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4651 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4652 ASSERT(vd == vd->vdev_top); 4653 4654 /* 4655 * Only remove any devices which are empty. 
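 * Unless this was the last top-level vdev, the vacated slot is
 * backfilled below with a hole vdev so that the ids of the remaining
 * top-level vdevs (and any DVAs that encode them) stay stable.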
4656 */ 4657 if (vd->vdev_stat.vs_alloc != 0) 4658 return; 4659 4660 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4661 4662 if (list_link_active(&vd->vdev_state_dirty_node)) 4663 vdev_state_clean(vd); 4664 if (list_link_active(&vd->vdev_config_dirty_node)) 4665 vdev_config_clean(vd); 4666 4667 vdev_free(vd); 4668 4669 if (last_vdev) { 4670 vdev_compact_children(rvd); 4671 } else { 4672 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4673 vdev_add_child(rvd, vd); 4674 } 4675 vdev_config_dirty(rvd); 4676 4677 /* 4678 * Reassess the health of our root vdev. 4679 */ 4680 vdev_reopen(rvd); 4681} 4682 4683/* 4684 * Remove a device from the pool - 4685 * 4686 * Removing a device from the vdev namespace requires several steps 4687 * and can take a significant amount of time. As a result we use 4688 * the spa_vdev_config_[enter/exit] functions which allow us to 4689 * grab and release the spa_config_lock while still holding the namespace 4690 * lock. During each step the configuration is synced out. 4691 */ 4692 4693/* 4694 * Remove a device from the pool. Currently, this supports removing only hot 4695 * spares, slogs, and level 2 ARC devices. 4696 */ 4697int 4698spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4699{ 4700 vdev_t *vd; 4701 metaslab_group_t *mg; 4702 nvlist_t **spares, **l2cache, *nv; 4703 uint64_t txg = 0; 4704 uint_t nspares, nl2cache; 4705 int error = 0; 4706 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4707 4708 ASSERT(spa_writeable(spa)); 4709 4710 if (!locked) 4711 txg = spa_vdev_enter(spa); 4712 4713 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4714 4715 if (spa->spa_spares.sav_vdevs != NULL && 4716 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4717 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4718 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4719 /* 4720 * Only remove the hot spare if it's not currently in use 4721 * in this pool. 4722 */ 4723 if (vd == NULL || unspare) { 4724 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4725 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4726 spa_load_spares(spa); 4727 spa->spa_spares.sav_sync = B_TRUE; 4728 } else { 4729 error = EBUSY; 4730 } 4731 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4732 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4733 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4734 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4735 /* 4736 * Cache devices can always be removed. 4737 */ 4738 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4739 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4740 spa_load_l2cache(spa); 4741 spa->spa_l2cache.sav_sync = B_TRUE; 4742 } else if (vd != NULL && vd->vdev_islog) { 4743 ASSERT(!locked); 4744 ASSERT(vd == vd->vdev_top); 4745 4746 /* 4747 * XXX - Once we have bp-rewrite this should 4748 * become the common case. 4749 */ 4750 4751 mg = vd->vdev_mg; 4752 4753 /* 4754 * Stop allocating from this vdev. 4755 */ 4756 metaslab_group_passivate(mg); 4757 4758 /* 4759 * Wait for the youngest allocations and frees to sync, 4760 * and then wait for the deferral of those frees to finish. 4761 */ 4762 spa_vdev_config_exit(spa, NULL, 4763 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4764 4765 /* 4766 * Attempt to evacuate the vdev. 4767 */ 4768 error = spa_vdev_remove_evacuate(spa, vd); 4769 4770 txg = spa_vdev_config_enter(spa); 4771 4772 /* 4773 * If we couldn't evacuate the vdev, unwind. 
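 * Reactivating the metaslab group below makes the log device
 * eligible for allocations again, as though the removal had never
 * been attempted.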
4774 */
4775 if (error) {
4776 metaslab_group_activate(mg);
4777 return (spa_vdev_exit(spa, NULL, txg, error));
4778 }
4779
4780 /*
4781 * Clean up the vdev namespace.
4782 */
4783 spa_vdev_remove_from_namespace(spa, vd);
4784
4785 } else if (vd != NULL) {
4786 /*
4787 * Normal vdevs cannot be removed (yet).
4788 */
4789 error = ENOTSUP;
4790 } else {
4791 /*
4792 * There is no vdev of any kind with the specified guid.
4793 */
4794 error = ENOENT;
4795 }
4796
4797 if (!locked)
4798 return (spa_vdev_exit(spa, NULL, txg, error));
4799
4800 return (error);
4801}
4802
4803/*
4804 * Find any device that's done replacing, or a vdev marked 'unspare' that's
4805 * currently spared, so we can detach it.
4806 */
4807static vdev_t *
4808spa_vdev_resilver_done_hunt(vdev_t *vd)
4809{
4810 vdev_t *newvd, *oldvd;
4811
4812 for (int c = 0; c < vd->vdev_children; c++) {
4813 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4814 if (oldvd != NULL)
4815 return (oldvd);
4816 }
4817
4818 /*
4819 * Check for a completed replacement. We always consider the first
4820 * vdev in the list to be the oldest vdev, and the last one to be
4821 * the newest (see spa_vdev_attach() for how that works). In
4822 * the case where the newest vdev is faulted, we will not automatically
4823 * remove it after a resilver completes. This is OK as it will require
4824 * user intervention to determine which disk the admin wishes to keep.
4825 */
4826 if (vd->vdev_ops == &vdev_replacing_ops) {
4827 ASSERT(vd->vdev_children > 1);
4828
4829 newvd = vd->vdev_child[vd->vdev_children - 1];
4830 oldvd = vd->vdev_child[0];
4831
4832 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4833 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4834 !vdev_dtl_required(oldvd))
4835 return (oldvd);
4836 }
4837
4838 /*
4839 * Check for a completed resilver with the 'unspare' flag set.
4840 */
4841 if (vd->vdev_ops == &vdev_spare_ops) {
4842 vdev_t *first = vd->vdev_child[0];
4843 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
4844
4845 if (last->vdev_unspare) {
4846 oldvd = first;
4847 newvd = last;
4848 } else if (first->vdev_unspare) {
4849 oldvd = last;
4850 newvd = first;
4851 } else {
4852 oldvd = NULL;
4853 }
4854
4855 if (oldvd != NULL &&
4856 vdev_dtl_empty(newvd, DTL_MISSING) &&
4857 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4858 !vdev_dtl_required(oldvd))
4859 return (oldvd);
4860
4861 /*
4862 * If there are more than two spares attached to a disk,
4863 * and those spares are not required, then we want to
4864 * attempt to free them up now so that they can be used
4865 * by other pools. Once we're back down to a single
4866 * disk+spare, we stop removing them.
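 *
 * For example, given spare(A,S1,S2) with both spares fully resilvered
 * and no longer required, child 1 (S1) is offered up for detach first,
 * leaving spare(A,S2), which is then left alone.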
4867 */ 4868 if (vd->vdev_children > 2) { 4869 newvd = vd->vdev_child[1]; 4870 4871 if (newvd->vdev_isspare && last->vdev_isspare && 4872 vdev_dtl_empty(last, DTL_MISSING) && 4873 vdev_dtl_empty(last, DTL_OUTAGE) && 4874 !vdev_dtl_required(newvd)) 4875 return (newvd); 4876 } 4877 } 4878 4879 return (NULL); 4880} 4881 4882static void 4883spa_vdev_resilver_done(spa_t *spa) 4884{ 4885 vdev_t *vd, *pvd, *ppvd; 4886 uint64_t guid, sguid, pguid, ppguid; 4887 4888 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4889 4890 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 4891 pvd = vd->vdev_parent; 4892 ppvd = pvd->vdev_parent; 4893 guid = vd->vdev_guid; 4894 pguid = pvd->vdev_guid; 4895 ppguid = ppvd->vdev_guid; 4896 sguid = 0; 4897 /* 4898 * If we have just finished replacing a hot spared device, then 4899 * we need to detach the parent's first child (the original hot 4900 * spare) as well. 4901 */ 4902 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 4903 ppvd->vdev_children == 2) { 4904 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 4905 sguid = ppvd->vdev_child[1]->vdev_guid; 4906 } 4907 spa_config_exit(spa, SCL_ALL, FTAG); 4908 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 4909 return; 4910 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 4911 return; 4912 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4913 } 4914 4915 spa_config_exit(spa, SCL_ALL, FTAG); 4916} 4917 4918/* 4919 * Update the stored path or FRU for this vdev. 4920 */ 4921int 4922spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4923 boolean_t ispath) 4924{ 4925 vdev_t *vd; 4926 boolean_t sync = B_FALSE; 4927 4928 ASSERT(spa_writeable(spa)); 4929 4930 spa_vdev_state_enter(spa, SCL_ALL); 4931 4932 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4933 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 4934 4935 if (!vd->vdev_ops->vdev_op_leaf) 4936 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 4937 4938 if (ispath) { 4939 if (strcmp(value, vd->vdev_path) != 0) { 4940 spa_strfree(vd->vdev_path); 4941 vd->vdev_path = spa_strdup(value); 4942 sync = B_TRUE; 4943 } 4944 } else { 4945 if (vd->vdev_fru == NULL) { 4946 vd->vdev_fru = spa_strdup(value); 4947 sync = B_TRUE; 4948 } else if (strcmp(value, vd->vdev_fru) != 0) { 4949 spa_strfree(vd->vdev_fru); 4950 vd->vdev_fru = spa_strdup(value); 4951 sync = B_TRUE; 4952 } 4953 } 4954 4955 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 4956} 4957 4958int 4959spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 4960{ 4961 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 4962} 4963 4964int 4965spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 4966{ 4967 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 4968} 4969 4970/* 4971 * ========================================================================== 4972 * SPA Scanning 4973 * ========================================================================== 4974 */ 4975 4976int 4977spa_scan_stop(spa_t *spa) 4978{ 4979 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4980 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 4981 return (EBUSY); 4982 return (dsl_scan_cancel(spa->spa_dsl_pool)); 4983} 4984 4985int 4986spa_scan(spa_t *spa, pool_scan_func_t func) 4987{ 4988 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4989 4990 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 4991 return (ENOTSUP); 4992 4993 /* 4994 * If a resilver was requested, but there is no DTL on a 4995 * writeable leaf device, we have nothing to do. 4996 */ 4997 if (func == POOL_SCAN_RESILVER && 4998 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4999 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5000 return (0); 5001 } 5002 5003 return (dsl_scan(spa->spa_dsl_pool, func)); 5004} 5005 5006/* 5007 * ========================================================================== 5008 * SPA async task processing 5009 * ========================================================================== 5010 */ 5011 5012static void 5013spa_async_remove(spa_t *spa, vdev_t *vd) 5014{ 5015 if (vd->vdev_remove_wanted) { 5016 vd->vdev_remove_wanted = B_FALSE; 5017 vd->vdev_delayed_close = B_FALSE; 5018 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5019 5020 /* 5021 * We want to clear the stats, but we don't want to do a full 5022 * vdev_clear() as that will cause us to throw away 5023 * degraded/faulted state as well as attempt to reopen the 5024 * device, all of which is a waste. 
5025 */ 5026 vd->vdev_stat.vs_read_errors = 0; 5027 vd->vdev_stat.vs_write_errors = 0; 5028 vd->vdev_stat.vs_checksum_errors = 0; 5029 5030 vdev_state_dirty(vd->vdev_top); 5031 } 5032 5033 for (int c = 0; c < vd->vdev_children; c++) 5034 spa_async_remove(spa, vd->vdev_child[c]); 5035} 5036 5037static void 5038spa_async_probe(spa_t *spa, vdev_t *vd) 5039{ 5040 if (vd->vdev_probe_wanted) { 5041 vd->vdev_probe_wanted = B_FALSE; 5042 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5043 } 5044 5045 for (int c = 0; c < vd->vdev_children; c++) 5046 spa_async_probe(spa, vd->vdev_child[c]); 5047} 5048 5049static void 5050spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5051{ 5052 sysevent_id_t eid; 5053 nvlist_t *attr; 5054 char *physpath; 5055 5056 if (!spa->spa_autoexpand) 5057 return; 5058 5059 for (int c = 0; c < vd->vdev_children; c++) { 5060 vdev_t *cvd = vd->vdev_child[c]; 5061 spa_async_autoexpand(spa, cvd); 5062 } 5063 5064 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5065 return; 5066 5067 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5068 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5069 5070 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5071 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5072 5073 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5074 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5075 5076 nvlist_free(attr); 5077 kmem_free(physpath, MAXPATHLEN); 5078} 5079 5080static void 5081spa_async_thread(void *arg) 5082{ 5083 spa_t *spa = arg; 5084 int tasks; 5085 5086 ASSERT(spa->spa_sync_on); 5087 5088 mutex_enter(&spa->spa_async_lock); 5089 tasks = spa->spa_async_tasks; 5090 spa->spa_async_tasks = 0; 5091 mutex_exit(&spa->spa_async_lock); 5092 5093 /* 5094 * See if the config needs to be updated. 5095 */ 5096 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5097 uint64_t old_space, new_space; 5098 5099 mutex_enter(&spa_namespace_lock); 5100 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5101 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5102 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5103 mutex_exit(&spa_namespace_lock); 5104 5105 /* 5106 * If the pool grew as a result of the config update, 5107 * then log an internal history event. 5108 */ 5109 if (new_space != old_space) { 5110 spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5111 spa, NULL, 5112 "pool '%s' size: %llu(+%llu)", 5113 spa_name(spa), new_space, new_space - old_space); 5114 } 5115 } 5116 5117 /* 5118 * See if any devices need to be marked REMOVED. 5119 */ 5120 if (tasks & SPA_ASYNC_REMOVE) { 5121 spa_vdev_state_enter(spa, SCL_NONE); 5122 spa_async_remove(spa, spa->spa_root_vdev); 5123 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5124 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5125 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5126 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5127 (void) spa_vdev_state_exit(spa, NULL, 0); 5128 } 5129 5130 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5131 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5132 spa_async_autoexpand(spa, spa->spa_root_vdev); 5133 spa_config_exit(spa, SCL_CONFIG, FTAG); 5134 } 5135 5136 /* 5137 * See if any devices need to be probed. 5138 */ 5139 if (tasks & SPA_ASYNC_PROBE) { 5140 spa_vdev_state_enter(spa, SCL_NONE); 5141 spa_async_probe(spa, spa->spa_root_vdev); 5142 (void) spa_vdev_state_exit(spa, NULL, 0); 5143 } 5144 5145 /* 5146 * If any devices are done replacing, detach them. 
5147 */ 5148 if (tasks & SPA_ASYNC_RESILVER_DONE) 5149 spa_vdev_resilver_done(spa); 5150 5151 /* 5152 * Kick off a resilver. 5153 */ 5154 if (tasks & SPA_ASYNC_RESILVER) 5155 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5156 5157 /* 5158 * Let the world know that we're done. 5159 */ 5160 mutex_enter(&spa->spa_async_lock); 5161 spa->spa_async_thread = NULL; 5162 cv_broadcast(&spa->spa_async_cv); 5163 mutex_exit(&spa->spa_async_lock); 5164 thread_exit(); 5165} 5166 5167void 5168spa_async_suspend(spa_t *spa) 5169{ 5170 mutex_enter(&spa->spa_async_lock); 5171 spa->spa_async_suspended++; 5172 while (spa->spa_async_thread != NULL) 5173 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5174 mutex_exit(&spa->spa_async_lock); 5175} 5176 5177void 5178spa_async_resume(spa_t *spa) 5179{ 5180 mutex_enter(&spa->spa_async_lock); 5181 ASSERT(spa->spa_async_suspended != 0); 5182 spa->spa_async_suspended--; 5183 mutex_exit(&spa->spa_async_lock); 5184} 5185 5186static void 5187spa_async_dispatch(spa_t *spa) 5188{ 5189 mutex_enter(&spa->spa_async_lock); 5190 if (spa->spa_async_tasks && !spa->spa_async_suspended && 5191 spa->spa_async_thread == NULL && 5192 rootdir != NULL && !vn_is_readonly(rootdir)) 5193 spa->spa_async_thread = thread_create(NULL, 0, 5194 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5195 mutex_exit(&spa->spa_async_lock); 5196} 5197 5198void 5199spa_async_request(spa_t *spa, int task) 5200{ 5201 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5202 mutex_enter(&spa->spa_async_lock); 5203 spa->spa_async_tasks |= task; 5204 mutex_exit(&spa->spa_async_lock); 5205} 5206 5207/* 5208 * ========================================================================== 5209 * SPA syncing routines 5210 * ========================================================================== 5211 */ 5212 5213static int 5214bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5215{ 5216 bpobj_t *bpo = arg; 5217 bpobj_enqueue(bpo, bp, tx); 5218 return (0); 5219} 5220 5221static int 5222spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5223{ 5224 zio_t *zio = arg; 5225 5226 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5227 zio->io_flags)); 5228 return (0); 5229} 5230 5231static void 5232spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5233{ 5234 char *packed = NULL; 5235 size_t bufsize; 5236 size_t nvsize = 0; 5237 dmu_buf_t *db; 5238 5239 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5240 5241 /* 5242 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5243 * information. This avoids the dbuf_will_dirty() path and 5244 * saves us a pre-read to get data we don't actually care about. 
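 *
 * For example, assuming a 16K SPA_CONFIG_BLOCKSIZE, a 5000-byte
 * packed nvlist is written as a single 16384-byte block with the
 * tail zeroed; the true nvlist size (5000) is recorded separately
 * in the object's bonus buffer below.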
5245 */ 5246 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 5247 packed = kmem_alloc(bufsize, KM_SLEEP); 5248 5249 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5250 KM_SLEEP) == 0); 5251 bzero(packed + nvsize, bufsize - nvsize); 5252 5253 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5254 5255 kmem_free(packed, bufsize); 5256 5257 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5258 dmu_buf_will_dirty(db, tx); 5259 *(uint64_t *)db->db_data = nvsize; 5260 dmu_buf_rele(db, FTAG); 5261} 5262 5263static void 5264spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5265 const char *config, const char *entry) 5266{ 5267 nvlist_t *nvroot; 5268 nvlist_t **list; 5269 int i; 5270 5271 if (!sav->sav_sync) 5272 return; 5273 5274 /* 5275 * Update the MOS nvlist describing the list of available devices. 5276 * spa_validate_aux() will have already made sure this nvlist is 5277 * valid and the vdevs are labeled appropriately. 5278 */ 5279 if (sav->sav_object == 0) { 5280 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5281 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5282 sizeof (uint64_t), tx); 5283 VERIFY(zap_update(spa->spa_meta_objset, 5284 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5285 &sav->sav_object, tx) == 0); 5286 } 5287 5288 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5289 if (sav->sav_count == 0) { 5290 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5291 } else { 5292 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5293 for (i = 0; i < sav->sav_count; i++) 5294 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5295 B_FALSE, VDEV_CONFIG_L2CACHE); 5296 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5297 sav->sav_count) == 0); 5298 for (i = 0; i < sav->sav_count; i++) 5299 nvlist_free(list[i]); 5300 kmem_free(list, sav->sav_count * sizeof (void *)); 5301 } 5302 5303 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5304 nvlist_free(nvroot); 5305 5306 sav->sav_sync = B_FALSE; 5307} 5308 5309static void 5310spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5311{ 5312 nvlist_t *config; 5313 5314 if (list_is_empty(&spa->spa_config_dirty_list)) 5315 return; 5316 5317 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5318 5319 config = spa_config_generate(spa, spa->spa_root_vdev, 5320 dmu_tx_get_txg(tx), B_FALSE); 5321 5322 spa_config_exit(spa, SCL_STATE, FTAG); 5323 5324 if (spa->spa_config_syncing) 5325 nvlist_free(spa->spa_config_syncing); 5326 spa->spa_config_syncing = config; 5327 5328 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5329} 5330 5331/* 5332 * Set zpool properties. 5333 */ 5334static void 5335spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 5336{ 5337 spa_t *spa = arg1; 5338 objset_t *mos = spa->spa_meta_objset; 5339 nvlist_t *nvp = arg2; 5340 nvpair_t *elem; 5341 uint64_t intval; 5342 char *strval; 5343 zpool_prop_t prop; 5344 const char *propname; 5345 zprop_type_t proptype; 5346 5347 mutex_enter(&spa->spa_props_lock); 5348 5349 elem = NULL; 5350 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5351 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5352 case ZPOOL_PROP_VERSION: 5353 /* 5354 * Only set version for non-zpool-creation cases 5355 * (set/import). spa_create() needs special care 5356 * for version setting. 
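 *
 * For example, 'zpool set version=N tank' on an imported pool
 * reaches this case with tx_txg != TXG_INITIAL and, after the
 * range asserts, simply bumps ub_version and dirties the config.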
5357 */
5358 if (tx->tx_txg != TXG_INITIAL) {
5359 VERIFY(nvpair_value_uint64(elem,
5360 &intval) == 0);
5361 ASSERT(intval <= SPA_VERSION);
5362 ASSERT(intval >= spa_version(spa));
5363 spa->spa_uberblock.ub_version = intval;
5364 vdev_config_dirty(spa->spa_root_vdev);
5365 }
5366 break;
5367
5368 case ZPOOL_PROP_ALTROOT:
5369 /*
5370 * 'altroot' is a non-persistent property. It should
5371 * have been set temporarily at creation or import time.
5372 */
5373 ASSERT(spa->spa_root != NULL);
5374 break;
5375
5376 case ZPOOL_PROP_READONLY:
5377 case ZPOOL_PROP_CACHEFILE:
5378 /*
5379 * 'readonly' and 'cachefile' are also non-persistent
5380 * properties.
5381 */
5382 break;
| 1841 1842 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1843 &nvl) == 0) { 1844 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1845 KM_SLEEP) == 0); 1846 } 1847 1848 gethrestime(&spa->spa_loaded_ts); 1849 error = spa_load_impl(spa, pool_guid, config, state, type, 1850 mosconfig, &ereport); 1851 } 1852 1853 spa->spa_minref = refcount_count(&spa->spa_refcount); 1854 if (error) { 1855 if (error != EEXIST) { 1856 spa->spa_loaded_ts.tv_sec = 0; 1857 spa->spa_loaded_ts.tv_nsec = 0; 1858 } 1859 if (error != EBADF) { 1860 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1861 } 1862 } 1863 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 1864 spa->spa_ena = 0; 1865 1866 return (error); 1867} 1868 1869/* 1870 * Load an existing storage pool, using the pool's builtin spa_config as a 1871 * source of configuration information. 1872 */ 1873static int 1874spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 1875 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 1876 char **ereport) 1877{ 1878 int error = 0; 1879 nvlist_t *nvroot = NULL; 1880 vdev_t *rvd; 1881 uberblock_t *ub = &spa->spa_uberblock; 1882 uint64_t children, config_cache_txg = spa->spa_config_txg; 1883 int orig_mode = spa->spa_mode; 1884 int parse; 1885 uint64_t obj; 1886 1887 /* 1888 * If this is an untrusted config, access the pool in read-only mode. 1889 * This prevents things like resilvering recently removed devices. 1890 */ 1891 if (!mosconfig) 1892 spa->spa_mode = FREAD; 1893 1894 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1895 1896 spa->spa_load_state = state; 1897 1898 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 1899 return (EINVAL); 1900 1901 parse = (type == SPA_IMPORT_EXISTING ? 1902 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 1903 1904 /* 1905 * Create "The Godfather" zio to hold all async IOs 1906 */ 1907 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1908 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1909 1910 /* 1911 * Parse the configuration into a vdev tree. We explicitly set the 1912 * value that will be returned by spa_version() since parsing the 1913 * configuration requires knowing the version number. 1914 */ 1915 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1916 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 1917 spa_config_exit(spa, SCL_ALL, FTAG); 1918 1919 if (error != 0) 1920 return (error); 1921 1922 ASSERT(spa->spa_root_vdev == rvd); 1923 1924 if (type != SPA_IMPORT_ASSEMBLE) { 1925 ASSERT(spa_guid(spa) == pool_guid); 1926 } 1927 1928 /* 1929 * Try to open all vdevs, loading each label in the process. 1930 */ 1931 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1932 error = vdev_open(rvd); 1933 spa_config_exit(spa, SCL_ALL, FTAG); 1934 if (error != 0) 1935 return (error); 1936 1937 /* 1938 * We need to validate the vdev labels against the configuration that 1939 * we have in hand, which is dependent on the setting of mosconfig. If 1940 * mosconfig is true then we're validating the vdev labels based on 1941 * that config. Otherwise, we're validating against the cached config 1942 * (zpool.cache) that was read when we loaded the zfs module, and then 1943 * later we will recursively call spa_load() and validate against 1944 * the vdev config. 1945 * 1946 * If we're assembling a new pool that's been split off from an 1947 * existing pool, the labels haven't yet been updated so we skip 1948 * validation for now. 
1949 */ 1950 if (type != SPA_IMPORT_ASSEMBLE) { 1951 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1952 error = vdev_validate(rvd); 1953 spa_config_exit(spa, SCL_ALL, FTAG); 1954 1955 if (error != 0) 1956 return (error); 1957 1958 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1959 return (ENXIO); 1960 } 1961 1962 /* 1963 * Find the best uberblock. 1964 */ 1965 vdev_uberblock_load(NULL, rvd, ub); 1966 1967 /* 1968 * If we weren't able to find a single valid uberblock, return failure. 1969 */ 1970 if (ub->ub_txg == 0) 1971 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 1972 1973 /* 1974 * If the pool is newer than the code, we can't open it. 1975 */ 1976 if (ub->ub_version > SPA_VERSION) 1977 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1978 1979 /* 1980 * If the vdev guid sum doesn't match the uberblock, we have an 1981 * incomplete configuration. We first check to see if the pool 1982 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 1983 * If it is, defer the vdev_guid_sum check till later so we 1984 * can handle missing vdevs. 1985 */ 1986 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 1987 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 1988 rvd->vdev_guid_sum != ub->ub_guid_sum) 1989 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 1990 1991 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 1992 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1993 spa_try_repair(spa, config); 1994 spa_config_exit(spa, SCL_ALL, FTAG); 1995 nvlist_free(spa->spa_config_splitting); 1996 spa->spa_config_splitting = NULL; 1997 } 1998 1999 /* 2000 * Initialize internal SPA structures. 2001 */ 2002 spa->spa_state = POOL_STATE_ACTIVE; 2003 spa->spa_ubsync = spa->spa_uberblock; 2004 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2005 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2006 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 2007 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2008 spa->spa_claim_max_txg = spa->spa_first_txg; 2009 spa->spa_prev_software_version = ub->ub_software_version; 2010 2011 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2012 if (error) 2013 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2014 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2015 2016 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2017 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2018 2019 if (!mosconfig) { 2020 uint64_t hostid; 2021 nvlist_t *policy = NULL, *nvconfig; 2022 2023 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2024 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2025 2026 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2027 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2028 char *hostname; 2029 unsigned long myhostid = 0; 2030 2031 VERIFY(nvlist_lookup_string(nvconfig, 2032 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2033 2034#ifdef _KERNEL 2035 myhostid = zone_get_hostid(NULL); 2036#else /* _KERNEL */ 2037 /* 2038 * We're emulating the system's hostid in userland, so 2039 * we can't use zone_get_hostid(). 2040 */ 2041 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2042#endif /* _KERNEL */ 2043 if (check_hostid && hostid != 0 && myhostid != 0 && 2044 hostid != myhostid) { 2045 nvlist_free(nvconfig); 2046 cmn_err(CE_WARN, "pool '%s' could not be " 2047 "loaded as it was last accessed by " 2048 "another system (host: %s hostid: 0x%lx). 
" 2049 "See: http://www.sun.com/msg/ZFS-8000-EY", 2050 spa_name(spa), hostname, 2051 (unsigned long)hostid); 2052 return (EBADF); 2053 } 2054 } 2055 if (nvlist_lookup_nvlist(spa->spa_config, 2056 ZPOOL_REWIND_POLICY, &policy) == 0) 2057 VERIFY(nvlist_add_nvlist(nvconfig, 2058 ZPOOL_REWIND_POLICY, policy) == 0); 2059 2060 spa_config_set(spa, nvconfig); 2061 spa_unload(spa); 2062 spa_deactivate(spa); 2063 spa_activate(spa, orig_mode); 2064 2065 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2066 } 2067 2068 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2069 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2070 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2071 if (error != 0) 2072 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2073 2074 /* 2075 * Load the bit that tells us to use the new accounting function 2076 * (raid-z deflation). If we have an older pool, this will not 2077 * be present. 2078 */ 2079 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2080 if (error != 0 && error != ENOENT) 2081 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2082 2083 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2084 &spa->spa_creation_version); 2085 if (error != 0 && error != ENOENT) 2086 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2087 2088 /* 2089 * Load the persistent error log. If we have an older pool, this will 2090 * not be present. 2091 */ 2092 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2093 if (error != 0 && error != ENOENT) 2094 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2095 2096 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2097 &spa->spa_errlog_scrub); 2098 if (error != 0 && error != ENOENT) 2099 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2100 2101 /* 2102 * Load the history object. If we have an older pool, this 2103 * will not be present. 2104 */ 2105 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2106 if (error != 0 && error != ENOENT) 2107 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2108 2109 /* 2110 * If we're assembling the pool from the split-off vdevs of 2111 * an existing pool, we don't want to attach the spares & cache 2112 * devices. 2113 */ 2114 2115 /* 2116 * Load any hot spares for this pool. 2117 */ 2118 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2119 if (error != 0 && error != ENOENT) 2120 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2121 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2122 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2123 if (load_nvlist(spa, spa->spa_spares.sav_object, 2124 &spa->spa_spares.sav_config) != 0) 2125 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2126 2127 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2128 spa_load_spares(spa); 2129 spa_config_exit(spa, SCL_ALL, FTAG); 2130 } else if (error == 0) { 2131 spa->spa_spares.sav_sync = B_TRUE; 2132 } 2133 2134 /* 2135 * Load any level 2 ARC devices for this pool. 
2136 */ 2137 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2138 &spa->spa_l2cache.sav_object); 2139 if (error != 0 && error != ENOENT) 2140 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2141 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2142 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2143 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2144 &spa->spa_l2cache.sav_config) != 0) 2145 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2146 2147 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2148 spa_load_l2cache(spa); 2149 spa_config_exit(spa, SCL_ALL, FTAG); 2150 } else if (error == 0) { 2151 spa->spa_l2cache.sav_sync = B_TRUE; 2152 } 2153 2154 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2155 2156 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2157 if (error && error != ENOENT) 2158 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2159 2160 if (error == 0) { 2161 uint64_t autoreplace; 2162 2163 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2164 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2165 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2166 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2167 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2168 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2169 &spa->spa_dedup_ditto); 2170 2171 spa->spa_autoreplace = (autoreplace != 0); 2172 } 2173 2174 /* 2175 * If the 'autoreplace' property is set, then post a resource notifying 2176 * the ZFS DE that it should not issue any faults for unopenable 2177 * devices. We also iterate over the vdevs, and post a sysevent for any 2178 * unopenable vdevs so that the normal autoreplace handler can take 2179 * over. 2180 */ 2181 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2182 spa_check_removed(spa->spa_root_vdev); 2183 /* 2184 * For the import case, this is done in spa_import(), because 2185 * at this point we're using the spare definitions from 2186 * the MOS config, not necessarily from the userland config. 2187 */ 2188 if (state != SPA_LOAD_IMPORT) { 2189 spa_aux_check_removed(&spa->spa_spares); 2190 spa_aux_check_removed(&spa->spa_l2cache); 2191 } 2192 } 2193 2194 /* 2195 * Load the vdev state for all toplevel vdevs. 2196 */ 2197 vdev_load(rvd); 2198 2199 /* 2200 * Propagate the leaf DTLs we just loaded all the way up the tree. 2201 */ 2202 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2203 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2204 spa_config_exit(spa, SCL_ALL, FTAG); 2205 2206 /* 2207 * Load the DDTs (dedup tables). 2208 */ 2209 error = ddt_load(spa); 2210 if (error != 0) 2211 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2212 2213 spa_update_dspace(spa); 2214 2215 /* 2216 * Validate the config, using the MOS config to fill in any 2217 * information which might be missing. If we fail to validate 2218 * the config then declare the pool unfit for use. If we're 2219 * assembling a pool from a split, the log is not transferred 2220 * over. 2221 */ 2222 if (type != SPA_IMPORT_ASSEMBLE) { 2223 nvlist_t *nvconfig; 2224 2225 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2226 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2227 2228 if (!spa_config_valid(spa, nvconfig)) { 2229 nvlist_free(nvconfig); 2230 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2231 ENXIO)); 2232 } 2233 nvlist_free(nvconfig); 2234 2235 /* 2236 * Now that we've validate the config, check the state of the 2237 * root vdev. 
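	/*
	 * Load pool properties from the MOS.  Older pools may have no props
	 * object at all, so ENOENT is tolerated below and built-in defaults
	 * (such as the delegation setting initialized here) apply instead.
	 */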
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
	if (error && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (error == 0) {
		uint64_t autoreplace;

		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
		    &spa->spa_dedup_ditto);

		spa->spa_autoreplace = (autoreplace != 0);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource
	 * notifying the ZFS DE that it should not issue any faults for
	 * unopenable devices.  We also iterate over the vdevs, and post a
	 * sysevent for any unopenable vdevs so that the normal autoreplace
	 * handler can take over.
	 */
	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
		spa_check_removed(spa->spa_root_vdev);
		/*
		 * For the import case, this is done in spa_import(), because
		 * at this point we're using the spare definitions from
		 * the MOS config, not necessarily from the userland config.
		 */
		if (state != SPA_LOAD_IMPORT) {
			spa_aux_check_removed(&spa->spa_spares);
			spa_aux_check_removed(&spa->spa_l2cache);
		}
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Load the DDTs (dedup tables).
	 */
	error = ddt_load(spa);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	spa_update_dspace(spa);

	/*
	 * Validate the config, using the MOS config to fill in any
	 * information which might be missing.  If we fail to validate
	 * the config then declare the pool unfit for use.  If we're
	 * assembling a pool from a split, the log is not transferred
	 * over.
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		nvlist_t *nvconfig;

		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (!spa_config_valid(spa, nvconfig)) {
			nvlist_free(nvconfig);
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
			    ENXIO));
		}
		nvlist_free(nvconfig);

		/*
		 * Now that we've validated the config, check the state of the
		 * root vdev.  If it can't be opened, it indicates one or
		 * more toplevel vdevs are faulted.
		 */
		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
			return (ENXIO);

		if (spa_check_logs(spa)) {
			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
		}
	}

	/*
	 * We've successfully opened the pool, verify that we're ready
	 * to start pushing transactions.
	 */
	if (state != SPA_LOAD_TRYIMPORT) {
		if ((error = spa_load_verify(spa)) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    error));
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
		 * invoked from zil_claim_log_block()'s i/o done callback.
		 * Price of rollback is that we abandon the log.
		 */
		spa->spa_claiming = B_TRUE;

		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_claiming = B_FALSE;

		spa_set_log_state(spa, SPA_LOG_GOOD);
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.  We sync up to the highest
		 * claimed log block birth time so that claimed log blocks
		 * don't appear to be from the future.  spa_claim_max_txg
		 * will have been set for us by either zil_check_log_chain()
		 * (invoked from spa_check_logs()) or zil_claim() above.
		 */
		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If this is a verbatim import, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT ||
		    state == SPA_LOAD_RECOVER ||
		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable
		 * yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
		    vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	}

	return (0);
}
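/*
 * Rewind support (editorial note): spa_load_best() first attempts the
 * requested (or newest) txg.  On failure, spa_load_retry() walks the
 * uberblocks backward one txg at a time.  Illustrative example: if the
 * last synced txg is 100 and TXG_DEFER_SIZE is 2, a normal rewind will
 * not go below txg 98; only ZPOOL_EXTREME_REWIND permits walking all
 * the way back toward TXG_INITIAL.
 */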
static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	int mode = spa->spa_mode;

	spa_unload(spa);
	spa_deactivate(spa);

	spa->spa_load_max_txg--;

	spa_activate(spa, mode);
	spa_async_suspend(spa);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
}

static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, int rewind_flags)
{
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
	}

	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
	    mosconfig);
	if (load_error == 0)
		return (0);

	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		return (load_error);
	}

	/* Price of rolling back is discarding txgs, including log */
	if (state == SPA_LOAD_RECOVER)
		spa_set_log_state(spa, SPA_LOG_CLEAR);

	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
	    TXG_INITIAL : safe_rewind_txg;

	/*
	 * Continue as long as we're finding errors, we're still within
	 * the acceptable rewind range, and we're still finding uberblocks.
	 */
	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
		if (spa->spa_load_max_txg < safe_rewind_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state, mosconfig);
	}

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);

	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
    nvlist_t **config)
{
	spa_t *spa;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;
	int firstopen = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to
	 * this function because dsl_dir_open() is called during spa_load(),
	 * and ends up calling spa_open() again.  The real fix is to figure
	 * out how to avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		zpool_rewind_policy_t policy;

		firstopen = B_TRUE;

		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
		    &policy);
		if (policy.zrp_request & ZPOOL_DO_REWIND)
			state = SPA_LOAD_RECOVER;

		spa_activate(spa, spa_mode_global);

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
		    policy.zrp_request);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync
			 * and we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_config) {
				VERIFY(nvlist_dup(spa->spa_config, config,
				    KM_SLEEP) == 0);
				VERIFY(nvlist_add_nvlist(*config,
				    ZPOOL_CONFIG_LOAD_INFO,
				    spa->spa_load_info) == 0);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = error;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}
	}

	spa_open_ref(spa, tag);

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/*
	 * If we've recovered the pool, pass back any information we
	 * gathered while doing the load.
	 */
	if (state == SPA_LOAD_RECOVER) {
		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info) == 0);
	}

	if (locked) {
		spa->spa_last_open_failed = 0;
		spa->spa_last_ubsync_txg = 0;
		spa->spa_load_txg = 0;
		mutex_exit(&spa_namespace_lock);
#ifdef __FreeBSD__
#ifdef _KERNEL
		if (firstopen)
			zvol_create_minors(pool);
#endif
#endif
	}

	*spapp = spa;

	return (0);
}

int
spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
    nvlist_t **config)
{
	return (spa_open_common(name, spapp, tag, policy, config));
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL, NULL));
}
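/*
 * Illustrative usage sketch (editorial addition, not from the original
 * source): a typical in-kernel consumer pairs spa_open() with spa_close():
 *
 *	spa_t *spa;
 *	int error;
 *
 *	if ((error = spa_open("tank", &spa, FTAG)) == 0) {
 *		(operate on the pool under the open reference)
 *		spa_close(spa, FTAG);
 *	}
 */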
/*
 * Look up the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}
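/*
 * Note on the lookup-after-add pattern used above and in spa_add_l2cache()
 * below: nvlist_add_nvlist_array() stores a copy of the array, so the copy
 * is immediately looked back up in order to update the state of the nvlists
 * that 'config' actually holds.
 */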
/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
			    == 0);
			vdev_get_stats(vd, vs);
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, NULL, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the
		 * spares or l2cache devices could change and the config
		 * would be self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			uint64_t loadtimes[2];

			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
			VERIFY(nvlist_add_uint64_array(*config,
			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);

			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we
	 * cheat and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}
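/*
 * Editorial note: spa_get_stats() backs the ZFS_IOC_POOL_STATS ioctl;
 * consumers such as 'zpool status' obtain the pool's config and vdev
 * statistics through this path.
 */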
/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices.
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
		if (vd->vdev_isl2cache)
			spa_l2cache_remove(vd);
		vdev_clear_stats(vd);
		(void) vdev_close(vd);
	}
}
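/*
 * Illustrative call path (editorial sketch): a pool-creation request
 * reaches spa_create() from userland roughly as
 *
 *	zpool create tank mirror c0t0d0 c0t1d0
 *	    -> ioctl(ZFS_IOC_POOL_CREATE)
 *	        -> zfs_ioc_pool_create()
 *	            -> spa_create("tank", nvroot, props, ...)
 *
 * where 'nvroot' is the nvlist-encoded vdev tree assembled by libzfs.
 */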
/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version, obj;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);

	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create "The Godfather" zio to hold all async IOs.
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_metaslab_set_size(rvd->vdev_child[c]);
			vdev_expand(rvd->vdev_child[c], txg);
		}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}
	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	/*
	 * Create DDTs (dedup tables).
	 */
	ddt_create(spa);

	spa_update_dspace(spa);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	    sizeof (uint64_t), 1, &version, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bpobj.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, obj,
	    ZIO_COMPRESS_OFF, tx);
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
	    sizeof (uint64_t), 1, &obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bpobj");
	}
	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
	    spa->spa_meta_objset, obj));

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(spa, props, tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
	spa_history_log_version(spa, LOG_POOL_CREATE);

	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

#if defined(sun)
#ifdef _KERNEL
/*
 * Get the root pool information from the root disk, then import the root
 * pool during system boot.
 */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}
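/*
 * Sketch of the config generated above (editorial note): the label's
 * top-level vdev is wrapped in a new root vdev, roughly
 *
 *	name=... pool_guid=...
 *	vdev_tree:
 *		type='root', id=0, guid=<pool guid>
 *		children[0] = <top-level vdev from the boot device's label>
 */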
/*
 * Walk the vdev tree and see if we can find a device with "better"
 * configuration.  A configuration is "better" if the label on that
 * device has a more recent txg.
 */
static void
spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

	if (vd->vdev_ops->vdev_op_leaf) {
		nvlist_t *label;
		uint64_t label_txg;

		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
		    &label) != 0)
			return;

		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &label_txg) == 0);

		/*
		 * Do we have a better boot device?
		 */
		if (label_txg > *txg) {
			*txg = label_txg;
			*avd = vd;
		}
		nvlist_free(label);
	}
}

/*
 * Import a root pool.
 *
 * For x86, devpath_list will consist of devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For SPARC, devpath_list consists of the physpath name of the booting
 * device, no matter whether the root pool is a single-device pool or a
 * mirrored pool, e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
	if (config == NULL) {
		if (strstr(devpath, "/iscsi/ssd") != NULL) {
			/* iscsi boot */
			get_iscsi_bootpath_phy(devpath);
			config = spa_generate_rootconf(devpath, devid, &guid);
		}
	}
#endif
	if (config == NULL) {
		cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
		    devpath);
		return (EIO);
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	spa = spa_add(pname, config, NULL);
	spa->spa_is_root = B_TRUE;
	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = ENOENT;
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = EINVAL;
		goto out;
	}
Please " 3342 "try booting from '%s'", 3343 bvd->vdev_parent-> 3344 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3345 error = EINVAL; 3346 goto out; 3347 } 3348 3349 error = 0; 3350 spa_history_log_version(spa, LOG_POOL_IMPORT); 3351out: 3352 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3353 vdev_free(rvd); 3354 spa_config_exit(spa, SCL_ALL, FTAG); 3355 mutex_exit(&spa_namespace_lock); 3356 3357 nvlist_free(config); 3358 return (error); 3359} 3360 3361#endif 3362#endif /* sun */ 3363 3364/* 3365 * Import a non-root pool into the system. 3366 */ 3367int 3368spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3369{ 3370 spa_t *spa; 3371 char *altroot = NULL; 3372 spa_load_state_t state = SPA_LOAD_IMPORT; 3373 zpool_rewind_policy_t policy; 3374 uint64_t mode = spa_mode_global; 3375 uint64_t readonly = B_FALSE; 3376 int error; 3377 nvlist_t *nvroot; 3378 nvlist_t **spares, **l2cache; 3379 uint_t nspares, nl2cache; 3380 3381 /* 3382 * If a pool with this name exists, return failure. 3383 */ 3384 mutex_enter(&spa_namespace_lock); 3385 if (spa_lookup(pool) != NULL) { 3386 mutex_exit(&spa_namespace_lock); 3387 return (EEXIST); 3388 } 3389 3390 /* 3391 * Create and initialize the spa structure. 3392 */ 3393 (void) nvlist_lookup_string(props, 3394 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3395 (void) nvlist_lookup_uint64(props, 3396 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3397 if (readonly) 3398 mode = FREAD; 3399 spa = spa_add(pool, config, altroot); 3400 spa->spa_import_flags = flags; 3401 3402 /* 3403 * Verbatim import - Take a pool and insert it into the namespace 3404 * as if it had been loaded at boot. 3405 */ 3406 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3407 if (props != NULL) 3408 spa_configfile_set(spa, props, B_FALSE); 3409 3410 spa_config_sync(spa, B_FALSE, B_TRUE); 3411 3412 mutex_exit(&spa_namespace_lock); 3413 spa_history_log_version(spa, LOG_POOL_IMPORT); 3414 3415 return (0); 3416 } 3417 3418 spa_activate(spa, mode); 3419 3420 /* 3421 * Don't start async tasks until we know everything is healthy. 3422 */ 3423 spa_async_suspend(spa); 3424 3425 zpool_get_rewind_policy(config, &policy); 3426 if (policy.zrp_request & ZPOOL_DO_REWIND) 3427 state = SPA_LOAD_RECOVER; 3428 3429 /* 3430 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3431 * because the user-supplied config is actually the one to trust when 3432 * doing an import. 3433 */ 3434 if (state != SPA_LOAD_RECOVER) 3435 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3436 3437 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3438 policy.zrp_request); 3439 3440 /* 3441 * Propagate anything learned while loading the pool and pass it 3442 * back to caller (i.e. rewind info, missing devices, etc). 3443 */ 3444 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 3445 spa->spa_load_info) == 0); 3446 3447 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3448 /* 3449 * Toss any existing sparelist, as it doesn't have any validity 3450 * anymore, and conflicts with spa_has_spare(). 
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
		spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
		spa_load_l2cache(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	if (error != 0 || (props && spa_writeable(spa) &&
	    (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa_async_resume(spa);

	/*
	 * Override any spares and level 2 cache devices as specified by
	 * the user, as these may have correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_spares.sav_config)
			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		if (spa->spa_l2cache.sav_config)
			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * Check for any removed devices.
	 */
	if (spa->spa_autoreplace) {
		spa_aux_check_removed(&spa->spa_spares);
		spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
		/*
		 * Update the config cache to include the newly-imported pool.
		 */
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	spa_history_log_version(spa, LOG_POOL_IMPORT);

#ifdef __FreeBSD__
#ifdef _KERNEL
	zvol_create_minors(pool);
#endif
#endif
	return (0);
}
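/*
 * Probe the pool described by 'tryconfig' without actually importing it:
 * the pool is loaded read-only under the reserved TRYIMPORT_NAME, its
 * current config is generated for the caller, and the spa_t is torn down
 * again before returning.
 */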
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.  If the 'hardforce' flag is
 * set, then we don't sync the labels or remove the configuration cache.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool.  At the user's discretion, such a
		 * pool can be forcibly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EXDEV);
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing
 * it from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
	    B_FALSE, B_FALSE));
}
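/*
 * Summary of the spa_export_common() entry points (editorial note):
 *
 *	spa_destroy()	POOL_STATE_DESTROYED
 *	spa_export()	POOL_STATE_EXPORTED
 *	spa_reset()	POOL_STATE_UNINITIALIZED (unload only; the pool
 *			stays in the namespace)
 */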
/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists.
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, dtl_max_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs.
	 */
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the
		 * root vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    oldvd->vdev_isspare &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise,
		 * we want to create a replacing vdev.  The user is not allowed
		 * to attach to a spared vdev child unless the 'isspare' state
		 * is the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
    /*
     * Make sure the new device is big enough.
     */
    if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
        return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

    /*
     * The new device cannot have a higher alignment requirement
     * than the top-level vdev.
     */
    if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
        return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

    /*
     * If this is an in-place replacement, update oldvd's path and devid
     * to make it distinguishable from newvd, and unopenable from now on.
     */
    if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
        spa_strfree(oldvd->vdev_path);
        oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
            KM_SLEEP);
        (void) sprintf(oldvd->vdev_path, "%s/%s",
            newvd->vdev_path, "old");
        if (oldvd->vdev_devid != NULL) {
            spa_strfree(oldvd->vdev_devid);
            oldvd->vdev_devid = NULL;
        }
    }

    /* mark the device being resilvered */
    newvd->vdev_resilvering = B_TRUE;

    /*
     * If the parent is not a mirror, or if we're replacing, insert the new
     * mirror/replacing/spare vdev above oldvd.
     */
    if (pvd->vdev_ops != pvops)
        pvd = vdev_add_parent(oldvd, pvops);

    ASSERT(pvd->vdev_top->vdev_parent == rvd);
    ASSERT(pvd->vdev_ops == pvops);
    ASSERT(oldvd->vdev_parent == pvd);

    /*
     * Extract the new device from its root and add it to pvd.
     */
    vdev_remove_child(newrootvd, newvd);
    newvd->vdev_id = pvd->vdev_children;
    newvd->vdev_crtxg = oldvd->vdev_crtxg;
    vdev_add_child(pvd, newvd);

    tvd = newvd->vdev_top;
    ASSERT(pvd->vdev_top == tvd);
    ASSERT(tvd->vdev_parent == rvd);

    vdev_config_dirty(tvd);

    /*
     * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
     * for any dmu_sync-ed blocks.  It will propagate upward when
     * spa_vdev_exit() calls vdev_dtl_reassess().
     */
    dtl_max_txg = txg + TXG_CONCURRENT_STATES;

    vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
        dtl_max_txg - TXG_INITIAL);

    if (newvd->vdev_isspare) {
        spa_spare_activate(newvd);
        spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
    }

    oldvdpath = spa_strdup(oldvd->vdev_path);
    newvdpath = spa_strdup(newvd->vdev_path);
    newvd_isspare = newvd->vdev_isspare;

    /*
     * Mark newvd's DTL dirty in this txg.
     */
    vdev_dirty(tvd, VDD_DTL, newvd, txg);

    /*
     * Restart the resilver
     */
    dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

    /*
     * Commit the config
     */
    (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

    spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
        "%s vdev=%s %s vdev=%s",
        replacing && newvd_isspare ? "spare in" :
        replacing ? "replace" : "attach", newvdpath,
        replacing ? "for" : "to", oldvdpath);

    spa_strfree(oldvdpath);
    spa_strfree(newvdpath);

    if (spa->spa_bootfs)
        spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);

    return (0);
}

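/*
 * A minimal caller sketch (hypothetical; the real ioctl path also
 * handles config unpacking and error reporting):
 *
 *	nvroot = ... nvlist describing the one new leaf vdev ...
 *	error = spa_vdev_attach(spa, oldvd_guid, nvroot, B_TRUE);
 *
 * replacing = B_TRUE corresponds to 'zpool replace', B_FALSE to
 * 'zpool attach'.
 */
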
"for" : "to", oldvdpath); 4090 4091 spa_strfree(oldvdpath); 4092 spa_strfree(newvdpath); 4093 4094 if (spa->spa_bootfs) 4095 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4096 4097 return (0); 4098} 4099 4100/* 4101 * Detach a device from a mirror or replacing vdev. 4102 * If 'replace_done' is specified, only detach if the parent 4103 * is a replacing vdev. 4104 */ 4105int 4106spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4107{ 4108 uint64_t txg; 4109 int error; 4110 vdev_t *rvd = spa->spa_root_vdev; 4111 vdev_t *vd, *pvd, *cvd, *tvd; 4112 boolean_t unspare = B_FALSE; 4113 uint64_t unspare_guid; 4114 char *vdpath; 4115 4116 ASSERT(spa_writeable(spa)); 4117 4118 txg = spa_vdev_enter(spa); 4119 4120 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4121 4122 if (vd == NULL) 4123 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4124 4125 if (!vd->vdev_ops->vdev_op_leaf) 4126 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4127 4128 pvd = vd->vdev_parent; 4129 4130 /* 4131 * If the parent/child relationship is not as expected, don't do it. 4132 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4133 * vdev that's replacing B with C. The user's intent in replacing 4134 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4135 * the replace by detaching C, the expected behavior is to end up 4136 * M(A,B). But suppose that right after deciding to detach C, 4137 * the replacement of B completes. We would have M(A,C), and then 4138 * ask to detach C, which would leave us with just A -- not what 4139 * the user wanted. To prevent this, we make sure that the 4140 * parent/child relationship hasn't changed -- in this example, 4141 * that C's parent is still the replacing vdev R. 4142 */ 4143 if (pvd->vdev_guid != pguid && pguid != 0) 4144 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4145 4146 /* 4147 * Only 'replacing' or 'spare' vdevs can be replaced. 4148 */ 4149 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4150 pvd->vdev_ops != &vdev_spare_ops) 4151 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4152 4153 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4154 spa_version(spa) >= SPA_VERSION_SPARES); 4155 4156 /* 4157 * Only mirror, replacing, and spare vdevs support detach. 4158 */ 4159 if (pvd->vdev_ops != &vdev_replacing_ops && 4160 pvd->vdev_ops != &vdev_mirror_ops && 4161 pvd->vdev_ops != &vdev_spare_ops) 4162 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4163 4164 /* 4165 * If this device has the only valid copy of some data, 4166 * we cannot safely detach it. 4167 */ 4168 if (vdev_dtl_required(vd)) 4169 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4170 4171 ASSERT(pvd->vdev_children >= 2); 4172 4173 /* 4174 * If we are detaching the second disk from a replacing vdev, then 4175 * check to see if we changed the original vdev's path to have "/old" 4176 * at the end in spa_vdev_attach(). If so, undo that change now. 
    /*
     * If we are detaching the second disk from a replacing vdev, then
     * check to see if we changed the original vdev's path to have "/old"
     * at the end in spa_vdev_attach().  If so, undo that change now.
     */
    if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
        vd->vdev_path != NULL) {
        size_t len = strlen(vd->vdev_path);

        for (int c = 0; c < pvd->vdev_children; c++) {
            cvd = pvd->vdev_child[c];

            if (cvd == vd || cvd->vdev_path == NULL)
                continue;

            if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
                strcmp(cvd->vdev_path + len, "/old") == 0) {
                spa_strfree(cvd->vdev_path);
                cvd->vdev_path = spa_strdup(vd->vdev_path);
                break;
            }
        }
    }

    /*
     * If we are detaching the original disk from a spare, then it implies
     * that the spare should become a real disk, and be removed from the
     * active spare list for the pool.
     */
    if (pvd->vdev_ops == &vdev_spare_ops &&
        vd->vdev_id == 0 &&
        pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
        unspare = B_TRUE;

    /*
     * Erase the disk labels so the disk can be used for other things.
     * This must be done after all other error cases are handled,
     * but before we disembowel vd (so we can still do I/O to it).
     * But if we can't do it, don't treat the error as fatal --
     * it may be that the unwritability of the disk is the reason
     * it's being detached!
     */
    error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

    /*
     * Remove vd from its parent and compact the parent's children.
     */
    vdev_remove_child(pvd, vd);
    vdev_compact_children(pvd);

    /*
     * Remember one of the remaining children so we can get tvd below.
     */
    cvd = pvd->vdev_child[pvd->vdev_children - 1];

    /*
     * If we need to remove the remaining child from the list of hot spares,
     * do it now, marking the vdev as no longer a spare in the process.
     * We must do this before vdev_remove_parent(), because that can
     * change the GUID if it creates a new toplevel GUID.  For a similar
     * reason, we must remove the spare now, in the same txg as the detach;
     * otherwise someone could attach a new sibling, change the GUID, and
     * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
     */
    if (unspare) {
        ASSERT(cvd->vdev_isspare);
        spa_spare_remove(cvd);
        unspare_guid = cvd->vdev_guid;
        (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
        cvd->vdev_unspare = B_TRUE;
    }

    /*
     * If the parent mirror/replacing vdev only has one child,
     * the parent is no longer needed.  Remove it from the tree.
     */
    if (pvd->vdev_children == 1) {
        if (pvd->vdev_ops == &vdev_spare_ops)
            cvd->vdev_unspare = B_FALSE;
        vdev_remove_parent(cvd);
        cvd->vdev_resilvering = B_FALSE;
    }

    /*
     * We don't set tvd until now because the parent we just removed
     * may have been the previous top-level vdev.
     */
    tvd = cvd->vdev_top;
    ASSERT(tvd->vdev_parent == rvd);

    /*
     * Reevaluate the parent vdev state.
     */
    vdev_propagate_state(cvd);

    /*
     * If the 'autoexpand' property is set on the pool then automatically
     * try to expand the size of the pool.  For example if the device we
     * just detached was smaller than the others, it may be possible to
     * add metaslabs (i.e. grow the pool).  We need to reopen the vdev
     * first so that we can obtain the updated sizes of the leaf vdevs.
     */
    if (spa->spa_autoexpand) {
        vdev_reopen(tvd);
        vdev_expand(tvd, txg);
    }

    vdev_config_dirty(tvd);

    /*
     * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
     * vd->vdev_detached is set and free vd's DTL object in syncing context.
     * But first make sure we're not on any *other* txg's DTL list, to
     * prevent vd from being accessed after it's freed.
     */
    vdpath = spa_strdup(vd->vdev_path);
    for (int t = 0; t < TXG_SIZE; t++)
        (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
    vd->vdev_detached = B_TRUE;
    vdev_dirty(tvd, VDD_DTL, vd, txg);

    spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

    /* hang on to the spa before we release the lock */
    spa_open_ref(spa, FTAG);

    error = spa_vdev_exit(spa, vd, txg, 0);

    spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
        "vdev=%s", vdpath);
    spa_strfree(vdpath);

    /*
     * If this was the removal of the original device in a hot spare vdev,
     * then we want to go through and remove the device from the hot spare
     * list of every other pool.
     */
    if (unspare) {
        spa_t *altspa = NULL;

        mutex_enter(&spa_namespace_lock);
        while ((altspa = spa_next(altspa)) != NULL) {
            if (altspa->spa_state != POOL_STATE_ACTIVE ||
                altspa == spa)
                continue;

            spa_open_ref(altspa, FTAG);
            mutex_exit(&spa_namespace_lock);
            (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
            mutex_enter(&spa_namespace_lock);
            spa_close(altspa, FTAG);
        }
        mutex_exit(&spa_namespace_lock);

        /* search the rest of the vdevs for spares to remove */
        spa_vdev_resilver_done(spa);
    }

    /* all done with the spa; OK to release */
    mutex_enter(&spa_namespace_lock);
    spa_close(spa, FTAG);
    mutex_exit(&spa_namespace_lock);

    return (error);
}

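/*
 * Overview of spa_vdev_split_mirror() below:
 *
 *	1. Flush the intent log and validate the new pool name.
 *	2. Check that every top-level data vdev is a healthy mirror and
 *	   collect the guid of the one leaf to take from each.
 *	3. Offline those leaves and record them under ZPOOL_CONFIG_SPLIT
 *	   in the config.
 *	4. Assemble and load the new pool from the offlined leaves.
 *	5. Detach the leaves from the original pool in a single txg, or
 *	   re-online them if anything failed.
 */
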
/*
 * Split a set of devices from their mirrors, and create a new pool from them.
 */
int
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
    nvlist_t *props, boolean_t exp)
{
    int error = 0;
    uint64_t txg, *glist;
    spa_t *newspa;
    uint_t c, children, lastlog;
    nvlist_t **child, *nvl, *tmp;
    dmu_tx_t *tx;
    char *altroot = NULL;
    vdev_t *rvd, **vml = NULL;    /* vdev modify list */
    boolean_t activate_slog;

    ASSERT(spa_writeable(spa));

    txg = spa_vdev_enter(spa);

    /* clear the log and flush everything up to now */
    activate_slog = spa_passivate_log(spa);
    (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
    error = spa_offline_log(spa);
    txg = spa_vdev_config_enter(spa);

    if (activate_slog)
        spa_activate_log(spa);

    if (error != 0)
        return (spa_vdev_exit(spa, NULL, txg, error));

    /* check new spa name before going any further */
    if (spa_lookup(newname) != NULL)
        return (spa_vdev_exit(spa, NULL, txg, EEXIST));

    /*
     * scan through all the children to ensure they're all mirrors
     */
    if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
        nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
        &children) != 0)
        return (spa_vdev_exit(spa, NULL, txg, EINVAL));

    /* first, check to ensure we've got the right child count */
    rvd = spa->spa_root_vdev;
    lastlog = 0;
    for (c = 0; c < rvd->vdev_children; c++) {
        vdev_t *vd = rvd->vdev_child[c];

        /* don't count the holes & logs as children */
        if (vd->vdev_islog || vd->vdev_ishole) {
            if (lastlog == 0)
                lastlog = c;
            continue;
        }

        lastlog = 0;
    }
    if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
        return (spa_vdev_exit(spa, NULL, txg, EINVAL));

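    /*
     * Note on the check above: after the loop, 'lastlog' is the index
     * of the first child in a trailing run of log/hole vdevs (or 0 if
     * the list ends in a data vdev), so the caller must supply one
     * entry per child up to, but not including, any trailing logs.
     */
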
    /* next, ensure no spare or cache devices are part of the split */
    if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
        nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
        return (spa_vdev_exit(spa, NULL, txg, EINVAL));

    vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
    glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

    /* then, loop over each vdev and validate it */
    for (c = 0; c < children; c++) {
        uint64_t is_hole = 0;

        (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
            &is_hole);

        if (is_hole != 0) {
            if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
                spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
                continue;
            } else {
                error = EINVAL;
                break;
            }
        }

        /* which disk is going to be split? */
        if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
            &glist[c]) != 0) {
            error = EINVAL;
            break;
        }

        /* look it up in the spa */
        vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
        if (vml[c] == NULL) {
            error = ENODEV;
            break;
        }

        /* make sure there's nothing stopping the split */
        if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
            vml[c]->vdev_islog ||
            vml[c]->vdev_ishole ||
            vml[c]->vdev_isspare ||
            vml[c]->vdev_isl2cache ||
            !vdev_writeable(vml[c]) ||
            vml[c]->vdev_children != 0 ||
            vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
            c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
            error = EINVAL;
            break;
        }

        if (vdev_dtl_required(vml[c])) {
            error = EBUSY;
            break;
        }

        /* we need certain info from the top level */
        VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
            vml[c]->vdev_top->vdev_ms_array) == 0);
        VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
            vml[c]->vdev_top->vdev_ms_shift) == 0);
        VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
            vml[c]->vdev_top->vdev_asize) == 0);
        VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
            vml[c]->vdev_top->vdev_ashift) == 0);
    }

    if (error != 0) {
        kmem_free(vml, children * sizeof (vdev_t *));
        kmem_free(glist, children * sizeof (uint64_t));
        return (spa_vdev_exit(spa, NULL, txg, error));
    }

    /* stop writers from using the disks */
    for (c = 0; c < children; c++) {
        if (vml[c] != NULL)
            vml[c]->vdev_offline = B_TRUE;
    }
    vdev_reopen(spa->spa_root_vdev);

    /*
     * Temporarily record the splitting vdevs in the spa config.  This
     * will disappear once the config is regenerated.
     */
    VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
        glist, children) == 0);
    kmem_free(glist, children * sizeof (uint64_t));

    mutex_enter(&spa->spa_props_lock);
    VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
        nvl) == 0);
    mutex_exit(&spa->spa_props_lock);
    spa->spa_config_splitting = nvl;
    vdev_config_dirty(spa->spa_root_vdev);

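    /*
     * Because the config was just dirtied, the split list above goes
     * out with the next config sync; this makes an interrupted split
     * detectable at the next import of either half.
     */
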
    /* configure and create the new pool */
    VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
    VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
        exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
    VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
        spa_version(spa)) == 0);
    VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
        spa->spa_config_txg) == 0);
    VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
        spa_generate_guid(NULL)) == 0);
    (void) nvlist_lookup_string(props,
        zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

    /* add the new pool to the namespace */
    newspa = spa_add(newname, config, altroot);
    newspa->spa_config_txg = spa->spa_config_txg;
    spa_set_log_state(newspa, SPA_LOG_CLEAR);

    /* release the spa config lock, retaining the namespace lock */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

    if (zio_injection_enabled)
        zio_handle_panic_injection(spa, FTAG, 1);

    spa_activate(newspa, spa_mode_global);
    spa_async_suspend(newspa);

#ifndef sun
    /* mark that we are creating new spa by splitting */
    newspa->spa_splitting_newspa = B_TRUE;
#endif
    /* create the new pool from the disks of the original pool */
    error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
#ifndef sun
    newspa->spa_splitting_newspa = B_FALSE;
#endif
    if (error)
        goto out;

    /* if that worked, generate a real config for the new pool */
    if (newspa->spa_root_vdev != NULL) {
        VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
            NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
            ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
        spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
            B_TRUE));
    }

    /* set the props */
    if (props != NULL) {
        spa_configfile_set(newspa, props, B_FALSE);
        error = spa_prop_set(newspa, props);
        if (error)
            goto out;
    }

    /* flush everything */
    txg = spa_vdev_config_enter(newspa);
    vdev_config_dirty(newspa->spa_root_vdev);
    (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

    if (zio_injection_enabled)
        zio_handle_panic_injection(spa, FTAG, 2);

    spa_async_resume(newspa);

    /* finally, update the original pool's config */
    txg = spa_vdev_config_enter(spa);
    tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
    error = dmu_tx_assign(tx, TXG_WAIT);
    if (error != 0)
        dmu_tx_abort(tx);
    for (c = 0; c < children; c++) {
        if (vml[c] != NULL) {
            vdev_split(vml[c]);
            if (error == 0)
                spa_history_log_internal(LOG_POOL_VDEV_DETACH,
                    spa, tx, "vdev=%s",
                    vml[c]->vdev_path);
            vdev_free(vml[c]);
        }
    }
    vdev_config_dirty(spa->spa_root_vdev);
    spa->spa_config_splitting = NULL;
    nvlist_free(nvl);
    if (error == 0)
        dmu_tx_commit(tx);
    (void) spa_vdev_exit(spa, NULL, txg, 0);

    if (zio_injection_enabled)
        zio_handle_panic_injection(spa, FTAG, 3);

    /* split is complete; log a history record */
    spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
        "split new pool %s from pool %s", newname, spa_name(spa));

    kmem_free(vml, children * sizeof (vdev_t *));

    /* if we're not going to mount the filesystems in userland, export */
    if (exp)
        error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
            B_FALSE, B_FALSE);

    return (error);

out:
    spa_unload(newspa);
    spa_deactivate(newspa);
    spa_remove(newspa);

    txg = spa_vdev_config_enter(spa);

    /* re-online all offlined disks */
    for (c = 0; c < children; c++) {
        if (vml[c] != NULL)
            vml[c]->vdev_offline = B_FALSE;
    }
    vdev_reopen(spa->spa_root_vdev);

    nvlist_free(spa->spa_config_splitting);
    spa->spa_config_splitting = NULL;
    (void) spa_vdev_exit(spa, NULL, txg, error);

    kmem_free(vml, children * sizeof (vdev_t *));
    return (error);
}

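/*
 * Caller's-eye sketch (hypothetical, corresponding to 'zpool split'):
 *
 *	config = ... vdev tree naming one leaf per top-level mirror ...
 *	error = spa_vdev_split_mirror(spa, "newpool", config, props,
 *	    B_FALSE);
 *
 * Passing exp = B_TRUE additionally exports the new pool instead of
 * leaving it imported on this host.
 */
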
static nvlist_t *
spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
{
    for (int i = 0; i < count; i++) {
        uint64_t guid;

        VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
            &guid) == 0);

        if (guid == target_guid)
            return (nvpp[i]);
    }

    return (NULL);
}

static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
    nvlist_t *dev_to_remove)
{
    nvlist_t **newdev = NULL;

    if (count > 1)
        newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);

    for (int i = 0, j = 0; i < count; i++) {
        if (dev[i] == dev_to_remove)
            continue;
        VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
    }

    VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
    VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);

    for (int i = 0; i < count - 1; i++)
        nvlist_free(newdev[i]);

    if (count > 1)
        kmem_free(newdev, (count - 1) * sizeof (void *));
}

/*
 * Evacuate the device.
 */
static int
spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
{
    uint64_t txg;
    int error = 0;

    ASSERT(MUTEX_HELD(&spa_namespace_lock));
    ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
    ASSERT(vd == vd->vdev_top);

    /*
     * Evacuate the device.  We don't hold the config lock as writer
     * since we need to do I/O but we do keep the
     * spa_namespace_lock held.  Once this completes the device
     * should no longer have any blocks allocated on it.
     */
    if (vd->vdev_islog) {
        if (vd->vdev_stat.vs_alloc != 0)
            error = spa_offline_log(spa);
    } else {
        error = ENOTSUP;
    }

    if (error)
        return (error);

    /*
     * The evacuation succeeded.  Remove any remaining MOS metadata
     * associated with this vdev, and wait for these changes to sync.
     */
    ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
    txg = spa_vdev_config_enter(spa);
    vd->vdev_removing = B_TRUE;
    vdev_dirty(vd, 0, NULL, txg);
    vdev_config_dirty(vd);
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

    return (0);
}

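/*
 * Note: evacuation is currently possible only for log devices; data
 * vdevs return ENOTSUP above because moving their allocated blocks
 * would require block-pointer rewrite.
 */
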
4729 */ 4730 if (vd->vdev_stat.vs_alloc != 0) 4731 return; 4732 4733 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4734 4735 if (list_link_active(&vd->vdev_state_dirty_node)) 4736 vdev_state_clean(vd); 4737 if (list_link_active(&vd->vdev_config_dirty_node)) 4738 vdev_config_clean(vd); 4739 4740 vdev_free(vd); 4741 4742 if (last_vdev) { 4743 vdev_compact_children(rvd); 4744 } else { 4745 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4746 vdev_add_child(rvd, vd); 4747 } 4748 vdev_config_dirty(rvd); 4749 4750 /* 4751 * Reassess the health of our root vdev. 4752 */ 4753 vdev_reopen(rvd); 4754} 4755 4756/* 4757 * Remove a device from the pool - 4758 * 4759 * Removing a device from the vdev namespace requires several steps 4760 * and can take a significant amount of time. As a result we use 4761 * the spa_vdev_config_[enter/exit] functions which allow us to 4762 * grab and release the spa_config_lock while still holding the namespace 4763 * lock. During each step the configuration is synced out. 4764 */ 4765 4766/* 4767 * Remove a device from the pool. Currently, this supports removing only hot 4768 * spares, slogs, and level 2 ARC devices. 4769 */ 4770int 4771spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4772{ 4773 vdev_t *vd; 4774 metaslab_group_t *mg; 4775 nvlist_t **spares, **l2cache, *nv; 4776 uint64_t txg = 0; 4777 uint_t nspares, nl2cache; 4778 int error = 0; 4779 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4780 4781 ASSERT(spa_writeable(spa)); 4782 4783 if (!locked) 4784 txg = spa_vdev_enter(spa); 4785 4786 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4787 4788 if (spa->spa_spares.sav_vdevs != NULL && 4789 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4790 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4791 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4792 /* 4793 * Only remove the hot spare if it's not currently in use 4794 * in this pool. 4795 */ 4796 if (vd == NULL || unspare) { 4797 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4798 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4799 spa_load_spares(spa); 4800 spa->spa_spares.sav_sync = B_TRUE; 4801 } else { 4802 error = EBUSY; 4803 } 4804 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4805 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4806 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4807 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4808 /* 4809 * Cache devices can always be removed. 4810 */ 4811 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4812 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4813 spa_load_l2cache(spa); 4814 spa->spa_l2cache.sav_sync = B_TRUE; 4815 } else if (vd != NULL && vd->vdev_islog) { 4816 ASSERT(!locked); 4817 ASSERT(vd == vd->vdev_top); 4818 4819 /* 4820 * XXX - Once we have bp-rewrite this should 4821 * become the common case. 4822 */ 4823 4824 mg = vd->vdev_mg; 4825 4826 /* 4827 * Stop allocating from this vdev. 4828 */ 4829 metaslab_group_passivate(mg); 4830 4831 /* 4832 * Wait for the youngest allocations and frees to sync, 4833 * and then wait for the deferral of those frees to finish. 4834 */ 4835 spa_vdev_config_exit(spa, NULL, 4836 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4837 4838 /* 4839 * Attempt to evacuate the vdev. 4840 */ 4841 error = spa_vdev_remove_evacuate(spa, vd); 4842 4843 txg = spa_vdev_config_enter(spa); 4844 4845 /* 4846 * If we couldn't evacuate the vdev, unwind. 
4847 */ 4848 if (error) { 4849 metaslab_group_activate(mg); 4850 return (spa_vdev_exit(spa, NULL, txg, error)); 4851 } 4852 4853 /* 4854 * Clean up the vdev namespace. 4855 */ 4856 spa_vdev_remove_from_namespace(spa, vd); 4857 4858 } else if (vd != NULL) { 4859 /* 4860 * Normal vdevs cannot be removed (yet). 4861 */ 4862 error = ENOTSUP; 4863 } else { 4864 /* 4865 * There is no vdev of any kind with the specified guid. 4866 */ 4867 error = ENOENT; 4868 } 4869 4870 if (!locked) 4871 return (spa_vdev_exit(spa, NULL, txg, error)); 4872 4873 return (error); 4874} 4875 4876/* 4877 * Find any device that's done replacing, or a vdev marked 'unspare' that's 4878 * current spared, so we can detach it. 4879 */ 4880static vdev_t * 4881spa_vdev_resilver_done_hunt(vdev_t *vd) 4882{ 4883 vdev_t *newvd, *oldvd; 4884 4885 for (int c = 0; c < vd->vdev_children; c++) { 4886 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 4887 if (oldvd != NULL) 4888 return (oldvd); 4889 } 4890 4891 /* 4892 * Check for a completed replacement. We always consider the first 4893 * vdev in the list to be the oldest vdev, and the last one to be 4894 * the newest (see spa_vdev_attach() for how that works). In 4895 * the case where the newest vdev is faulted, we will not automatically 4896 * remove it after a resilver completes. This is OK as it will require 4897 * user intervention to determine which disk the admin wishes to keep. 4898 */ 4899 if (vd->vdev_ops == &vdev_replacing_ops) { 4900 ASSERT(vd->vdev_children > 1); 4901 4902 newvd = vd->vdev_child[vd->vdev_children - 1]; 4903 oldvd = vd->vdev_child[0]; 4904 4905 if (vdev_dtl_empty(newvd, DTL_MISSING) && 4906 vdev_dtl_empty(newvd, DTL_OUTAGE) && 4907 !vdev_dtl_required(oldvd)) 4908 return (oldvd); 4909 } 4910 4911 /* 4912 * Check for a completed resilver with the 'unspare' flag set. 4913 */ 4914 if (vd->vdev_ops == &vdev_spare_ops) { 4915 vdev_t *first = vd->vdev_child[0]; 4916 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 4917 4918 if (last->vdev_unspare) { 4919 oldvd = first; 4920 newvd = last; 4921 } else if (first->vdev_unspare) { 4922 oldvd = last; 4923 newvd = first; 4924 } else { 4925 oldvd = NULL; 4926 } 4927 4928 if (oldvd != NULL && 4929 vdev_dtl_empty(newvd, DTL_MISSING) && 4930 vdev_dtl_empty(newvd, DTL_OUTAGE) && 4931 !vdev_dtl_required(oldvd)) 4932 return (oldvd); 4933 4934 /* 4935 * If there are more than two spares attached to a disk, 4936 * and those spares are not required, then we want to 4937 * attempt to free them up now so that they can be used 4938 * by other pools. Once we're back down to a single 4939 * disk+spare, we stop removing them. 
4940 */ 4941 if (vd->vdev_children > 2) { 4942 newvd = vd->vdev_child[1]; 4943 4944 if (newvd->vdev_isspare && last->vdev_isspare && 4945 vdev_dtl_empty(last, DTL_MISSING) && 4946 vdev_dtl_empty(last, DTL_OUTAGE) && 4947 !vdev_dtl_required(newvd)) 4948 return (newvd); 4949 } 4950 } 4951 4952 return (NULL); 4953} 4954 4955static void 4956spa_vdev_resilver_done(spa_t *spa) 4957{ 4958 vdev_t *vd, *pvd, *ppvd; 4959 uint64_t guid, sguid, pguid, ppguid; 4960 4961 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4962 4963 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 4964 pvd = vd->vdev_parent; 4965 ppvd = pvd->vdev_parent; 4966 guid = vd->vdev_guid; 4967 pguid = pvd->vdev_guid; 4968 ppguid = ppvd->vdev_guid; 4969 sguid = 0; 4970 /* 4971 * If we have just finished replacing a hot spared device, then 4972 * we need to detach the parent's first child (the original hot 4973 * spare) as well. 4974 */ 4975 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 4976 ppvd->vdev_children == 2) { 4977 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 4978 sguid = ppvd->vdev_child[1]->vdev_guid; 4979 } 4980 spa_config_exit(spa, SCL_ALL, FTAG); 4981 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 4982 return; 4983 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 4984 return; 4985 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4986 } 4987 4988 spa_config_exit(spa, SCL_ALL, FTAG); 4989} 4990 4991/* 4992 * Update the stored path or FRU for this vdev. 4993 */ 4994int 4995spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4996 boolean_t ispath) 4997{ 4998 vdev_t *vd; 4999 boolean_t sync = B_FALSE; 5000 5001 ASSERT(spa_writeable(spa)); 5002 5003 spa_vdev_state_enter(spa, SCL_ALL); 5004 5005 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5006 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5007 5008 if (!vd->vdev_ops->vdev_op_leaf) 5009 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5010 5011 if (ispath) { 5012 if (strcmp(value, vd->vdev_path) != 0) { 5013 spa_strfree(vd->vdev_path); 5014 vd->vdev_path = spa_strdup(value); 5015 sync = B_TRUE; 5016 } 5017 } else { 5018 if (vd->vdev_fru == NULL) { 5019 vd->vdev_fru = spa_strdup(value); 5020 sync = B_TRUE; 5021 } else if (strcmp(value, vd->vdev_fru) != 0) { 5022 spa_strfree(vd->vdev_fru); 5023 vd->vdev_fru = spa_strdup(value); 5024 sync = B_TRUE; 5025 } 5026 } 5027 5028 return (spa_vdev_state_exit(spa, sync ? 
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
    return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}

int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
    return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}

/*
 * ==========================================================================
 * SPA Scanning
 * ==========================================================================
 */

int
spa_scan_stop(spa_t *spa)
{
    ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
    if (dsl_scan_resilvering(spa->spa_dsl_pool))
        return (EBUSY);
    return (dsl_scan_cancel(spa->spa_dsl_pool));
}

int
spa_scan(spa_t *spa, pool_scan_func_t func)
{
    ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

    if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
        return (ENOTSUP);

    /*
     * If a resilver was requested, but there is no DTL on a
     * writeable leaf device, we have nothing to do.
     */
    if (func == POOL_SCAN_RESILVER &&
        !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
        spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
        return (0);
    }

    return (dsl_scan(spa->spa_dsl_pool, func));
}

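/*
 * Note that a resilver request with no missing data completes inline
 * above: queueing SPA_ASYNC_RESILVER_DONE still lets any finished
 * 'replacing' or 'spare' vdevs be cleaned up by the async thread.
 */
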
5098 */ 5099 vd->vdev_stat.vs_read_errors = 0; 5100 vd->vdev_stat.vs_write_errors = 0; 5101 vd->vdev_stat.vs_checksum_errors = 0; 5102 5103 vdev_state_dirty(vd->vdev_top); 5104 } 5105 5106 for (int c = 0; c < vd->vdev_children; c++) 5107 spa_async_remove(spa, vd->vdev_child[c]); 5108} 5109 5110static void 5111spa_async_probe(spa_t *spa, vdev_t *vd) 5112{ 5113 if (vd->vdev_probe_wanted) { 5114 vd->vdev_probe_wanted = B_FALSE; 5115 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5116 } 5117 5118 for (int c = 0; c < vd->vdev_children; c++) 5119 spa_async_probe(spa, vd->vdev_child[c]); 5120} 5121 5122static void 5123spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5124{ 5125 sysevent_id_t eid; 5126 nvlist_t *attr; 5127 char *physpath; 5128 5129 if (!spa->spa_autoexpand) 5130 return; 5131 5132 for (int c = 0; c < vd->vdev_children; c++) { 5133 vdev_t *cvd = vd->vdev_child[c]; 5134 spa_async_autoexpand(spa, cvd); 5135 } 5136 5137 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5138 return; 5139 5140 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5141 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5142 5143 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5144 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5145 5146 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5147 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5148 5149 nvlist_free(attr); 5150 kmem_free(physpath, MAXPATHLEN); 5151} 5152 5153static void 5154spa_async_thread(void *arg) 5155{ 5156 spa_t *spa = arg; 5157 int tasks; 5158 5159 ASSERT(spa->spa_sync_on); 5160 5161 mutex_enter(&spa->spa_async_lock); 5162 tasks = spa->spa_async_tasks; 5163 spa->spa_async_tasks = 0; 5164 mutex_exit(&spa->spa_async_lock); 5165 5166 /* 5167 * See if the config needs to be updated. 5168 */ 5169 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5170 uint64_t old_space, new_space; 5171 5172 mutex_enter(&spa_namespace_lock); 5173 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5174 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5175 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5176 mutex_exit(&spa_namespace_lock); 5177 5178 /* 5179 * If the pool grew as a result of the config update, 5180 * then log an internal history event. 5181 */ 5182 if (new_space != old_space) { 5183 spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5184 spa, NULL, 5185 "pool '%s' size: %llu(+%llu)", 5186 spa_name(spa), new_space, new_space - old_space); 5187 } 5188 } 5189 5190 /* 5191 * See if any devices need to be marked REMOVED. 5192 */ 5193 if (tasks & SPA_ASYNC_REMOVE) { 5194 spa_vdev_state_enter(spa, SCL_NONE); 5195 spa_async_remove(spa, spa->spa_root_vdev); 5196 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5197 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5198 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5199 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5200 (void) spa_vdev_state_exit(spa, NULL, 0); 5201 } 5202 5203 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5204 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5205 spa_async_autoexpand(spa, spa->spa_root_vdev); 5206 spa_config_exit(spa, SCL_CONFIG, FTAG); 5207 } 5208 5209 /* 5210 * See if any devices need to be probed. 5211 */ 5212 if (tasks & SPA_ASYNC_PROBE) { 5213 spa_vdev_state_enter(spa, SCL_NONE); 5214 spa_async_probe(spa, spa->spa_root_vdev); 5215 (void) spa_vdev_state_exit(spa, NULL, 0); 5216 } 5217 5218 /* 5219 * If any devices are done replacing, detach them. 
5220 */ 5221 if (tasks & SPA_ASYNC_RESILVER_DONE) 5222 spa_vdev_resilver_done(spa); 5223 5224 /* 5225 * Kick off a resilver. 5226 */ 5227 if (tasks & SPA_ASYNC_RESILVER) 5228 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5229 5230 /* 5231 * Let the world know that we're done. 5232 */ 5233 mutex_enter(&spa->spa_async_lock); 5234 spa->spa_async_thread = NULL; 5235 cv_broadcast(&spa->spa_async_cv); 5236 mutex_exit(&spa->spa_async_lock); 5237 thread_exit(); 5238} 5239 5240void 5241spa_async_suspend(spa_t *spa) 5242{ 5243 mutex_enter(&spa->spa_async_lock); 5244 spa->spa_async_suspended++; 5245 while (spa->spa_async_thread != NULL) 5246 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5247 mutex_exit(&spa->spa_async_lock); 5248} 5249 5250void 5251spa_async_resume(spa_t *spa) 5252{ 5253 mutex_enter(&spa->spa_async_lock); 5254 ASSERT(spa->spa_async_suspended != 0); 5255 spa->spa_async_suspended--; 5256 mutex_exit(&spa->spa_async_lock); 5257} 5258 5259static void 5260spa_async_dispatch(spa_t *spa) 5261{ 5262 mutex_enter(&spa->spa_async_lock); 5263 if (spa->spa_async_tasks && !spa->spa_async_suspended && 5264 spa->spa_async_thread == NULL && 5265 rootdir != NULL && !vn_is_readonly(rootdir)) 5266 spa->spa_async_thread = thread_create(NULL, 0, 5267 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5268 mutex_exit(&spa->spa_async_lock); 5269} 5270 5271void 5272spa_async_request(spa_t *spa, int task) 5273{ 5274 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5275 mutex_enter(&spa->spa_async_lock); 5276 spa->spa_async_tasks |= task; 5277 mutex_exit(&spa->spa_async_lock); 5278} 5279 5280/* 5281 * ========================================================================== 5282 * SPA syncing routines 5283 * ========================================================================== 5284 */ 5285 5286static int 5287bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5288{ 5289 bpobj_t *bpo = arg; 5290 bpobj_enqueue(bpo, bp, tx); 5291 return (0); 5292} 5293 5294static int 5295spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5296{ 5297 zio_t *zio = arg; 5298 5299 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5300 zio->io_flags)); 5301 return (0); 5302} 5303 5304static void 5305spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5306{ 5307 char *packed = NULL; 5308 size_t bufsize; 5309 size_t nvsize = 0; 5310 dmu_buf_t *db; 5311 5312 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5313 5314 /* 5315 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5316 * information. This avoids the dbuf_will_dirty() path and 5317 * saves us a pre-read to get data we don't actually care about. 
5318 */ 5319 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 5320 packed = kmem_alloc(bufsize, KM_SLEEP); 5321 5322 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5323 KM_SLEEP) == 0); 5324 bzero(packed + nvsize, bufsize - nvsize); 5325 5326 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5327 5328 kmem_free(packed, bufsize); 5329 5330 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5331 dmu_buf_will_dirty(db, tx); 5332 *(uint64_t *)db->db_data = nvsize; 5333 dmu_buf_rele(db, FTAG); 5334} 5335 5336static void 5337spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5338 const char *config, const char *entry) 5339{ 5340 nvlist_t *nvroot; 5341 nvlist_t **list; 5342 int i; 5343 5344 if (!sav->sav_sync) 5345 return; 5346 5347 /* 5348 * Update the MOS nvlist describing the list of available devices. 5349 * spa_validate_aux() will have already made sure this nvlist is 5350 * valid and the vdevs are labeled appropriately. 5351 */ 5352 if (sav->sav_object == 0) { 5353 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5354 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5355 sizeof (uint64_t), tx); 5356 VERIFY(zap_update(spa->spa_meta_objset, 5357 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5358 &sav->sav_object, tx) == 0); 5359 } 5360 5361 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5362 if (sav->sav_count == 0) { 5363 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5364 } else { 5365 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5366 for (i = 0; i < sav->sav_count; i++) 5367 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5368 B_FALSE, VDEV_CONFIG_L2CACHE); 5369 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5370 sav->sav_count) == 0); 5371 for (i = 0; i < sav->sav_count; i++) 5372 nvlist_free(list[i]); 5373 kmem_free(list, sav->sav_count * sizeof (void *)); 5374 } 5375 5376 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5377 nvlist_free(nvroot); 5378 5379 sav->sav_sync = B_FALSE; 5380} 5381 5382static void 5383spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5384{ 5385 nvlist_t *config; 5386 5387 if (list_is_empty(&spa->spa_config_dirty_list)) 5388 return; 5389 5390 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5391 5392 config = spa_config_generate(spa, spa->spa_root_vdev, 5393 dmu_tx_get_txg(tx), B_FALSE); 5394 5395 spa_config_exit(spa, SCL_STATE, FTAG); 5396 5397 if (spa->spa_config_syncing) 5398 nvlist_free(spa->spa_config_syncing); 5399 spa->spa_config_syncing = config; 5400 5401 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5402} 5403 5404/* 5405 * Set zpool properties. 5406 */ 5407static void 5408spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 5409{ 5410 spa_t *spa = arg1; 5411 objset_t *mos = spa->spa_meta_objset; 5412 nvlist_t *nvp = arg2; 5413 nvpair_t *elem; 5414 uint64_t intval; 5415 char *strval; 5416 zpool_prop_t prop; 5417 const char *propname; 5418 zprop_type_t proptype; 5419 5420 mutex_enter(&spa->spa_props_lock); 5421 5422 elem = NULL; 5423 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5424 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5425 case ZPOOL_PROP_VERSION: 5426 /* 5427 * Only set version for non-zpool-creation cases 5428 * (set/import). spa_create() needs special care 5429 * for version setting. 
5430 */ 5431 if (tx->tx_txg != TXG_INITIAL) { 5432 VERIFY(nvpair_value_uint64(elem, 5433 &intval) == 0); 5434 ASSERT(intval <= SPA_VERSION); 5435 ASSERT(intval >= spa_version(spa)); 5436 spa->spa_uberblock.ub_version = intval; 5437 vdev_config_dirty(spa->spa_root_vdev); 5438 } 5439 break; 5440 5441 case ZPOOL_PROP_ALTROOT: 5442 /* 5443 * 'altroot' is a non-persistent property. It should 5444 * have been set temporarily at creation or import time. 5445 */ 5446 ASSERT(spa->spa_root != NULL); 5447 break; 5448 5449 case ZPOOL_PROP_READONLY: 5450 case ZPOOL_PROP_CACHEFILE: 5451 /* 5452 * 'readonly' and 'cachefile' are also non-persisitent 5453 * properties. 5454 */ 5455 break;