/* spa_config.c revision 168714 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#pragma ident "%Z%%M% %I% %E% SMI" 28 29#include <sys/zfs_context.h> 30#include <sys/spa.h> 31#include <sys/spa_impl.h> 32#include <sys/nvpair.h> 33#include <sys/uio.h> 34#include <sys/fs/zfs.h> 35#include <sys/vdev_impl.h> 36#include <sys/zfs_ioctl.h> 37#include <sys/utsname.h> 38#ifdef _KERNEL 39#include <sys/kobj.h> 40#endif 41 42/* 43 * Pool configuration repository. 44 * 45 * The configuration for all pools, in addition to being stored on disk, is 46 * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains 47 * this list as pools are created, destroyed, or modified. 48 * 49 * We have a single nvlist which holds all the configuration information. When 50 * the module loads, we read this information from the cache and populate the 51 * SPA namespace. This namespace is maintained independently in spa.c. 
52 * Whenever the namespace is modified, or the configuration of a pool is 53 * changed, we call spa_config_sync(), which walks through all the active pools 54 * and writes the configuration to disk. 55 */ 56 57static uint64_t spa_config_generation = 1; 58 59/* 60 * This can be overridden in userland to preserve an alternate namespace for 61 * userland pools when doing testing. 62 */ 63const char *spa_config_dir = ZPOOL_CACHE_DIR; 64 65/* 66 * Called when the module is first loaded, this routine loads the configuration 67 * file into the SPA namespace. It does not actually open or load the pools; it 68 * only populates the namespace. 69 */ 70void 71spa_config_load(void) 72{ 73 void *buf = NULL; 74 nvlist_t *nvlist, *child; 75 nvpair_t *nvpair; 76 spa_t *spa; 77 char pathname[128]; 78 struct _buf *file; 79 uint64_t fsize; 80 81 /* 82 * Open the configuration file. 83 */ 84 (void) snprintf(pathname, sizeof (pathname), "%s/%s", 85 spa_config_dir, ZPOOL_CACHE_FILE); 86 87 file = kobj_open_file(pathname); 88 if (file == (struct _buf *)-1) { 89 ZFS_LOG(1, "Cannot open %s.", pathname); 90 return; 91 } 92 93 if (kobj_get_filesize(file, &fsize) != 0) { 94 ZFS_LOG(1, "Cannot get size of %s.", pathname); 95 goto out; 96 } 97 98 buf = kmem_alloc(fsize, KM_SLEEP); 99 100 /* 101 * Read the nvlist from the file. 102 */ 103 if (kobj_read_file(file, buf, fsize, 0) < 0) { 104 ZFS_LOG(1, "Cannot read %s.", pathname); 105 goto out; 106 } 107 108 /* 109 * Unpack the nvlist. 110 */ 111 if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) 112 goto out; 113 114 ZFS_LOG(1, "File %s loaded.", pathname); 115 116 /* 117 * Iterate over all elements in the nvlist, creating a new spa_t for 118 * each one with the specified configuration. 
119 */ 120 mutex_enter(&spa_namespace_lock); 121 nvpair = NULL; 122 while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { 123 124 if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) 125 continue; 126 127 VERIFY(nvpair_value_nvlist(nvpair, &child) == 0); 128 129 if (spa_lookup(nvpair_name(nvpair)) != NULL) 130 continue; 131 spa = spa_add(nvpair_name(nvpair), NULL); 132 133 /* 134 * We blindly duplicate the configuration here. If it's 135 * invalid, we will catch it when the pool is first opened. 136 */ 137 VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0); 138 } 139 mutex_exit(&spa_namespace_lock); 140 141 nvlist_free(nvlist); 142 143out: 144 if (buf != NULL) 145 kmem_free(buf, fsize); 146 147 kobj_close_file(file); 148} 149 150/* 151 * Synchronize all pools to disk. This must be called with the namespace lock 152 * held. 153 */ 154void 155spa_config_sync(void) 156{ 157 spa_t *spa = NULL; 158 nvlist_t *config; 159 size_t buflen; 160 char *buf; 161 vnode_t *vp; 162 int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; 163 char pathname[128]; 164 char pathname2[128]; 165 166 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 167 168 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); 169 170 /* 171 * Add all known pools to the configuration list, ignoring those with 172 * alternate root paths. 173 */ 174 spa = NULL; 175 while ((spa = spa_next(spa)) != NULL) { 176 mutex_enter(&spa->spa_config_cache_lock); 177 if (spa->spa_config && spa->spa_name && spa->spa_root == NULL) 178 VERIFY(nvlist_add_nvlist(config, spa->spa_name, 179 spa->spa_config) == 0); 180 mutex_exit(&spa->spa_config_cache_lock); 181 } 182 183 /* 184 * Pack the configuration into a buffer. 185 */ 186 VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0); 187 188 buf = kmem_alloc(buflen, KM_SLEEP); 189 190 VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 191 KM_SLEEP) == 0); 192 193 /* 194 * Write the configuration to disk. 
We need to do the traditional 195 * 'write to temporary file, sync, move over original' to make sure we 196 * always have a consistent view of the data. 197 */ 198 (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir, 199 ZPOOL_CACHE_TMP); 200 201 if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0) 202 goto out; 203 204 if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, 205 0, RLIM64_INFINITY, kcred, NULL) == 0 && 206 VOP_FSYNC(vp, FSYNC, kcred) == 0) { 207 (void) snprintf(pathname2, sizeof (pathname2), "%s/%s", 208 spa_config_dir, ZPOOL_CACHE_FILE); 209 (void) vn_rename(pathname, pathname2, UIO_SYSSPACE); 210 } 211 212 (void) VOP_CLOSE(vp, oflags, 1, 0, kcred); 213 VN_RELE(vp); 214 215out: 216 (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE); 217 spa_config_generation++; 218 219 kmem_free(buf, buflen); 220 nvlist_free(config); 221} 222 223/* 224 * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, 225 * and we don't want to allow the local zone to see all the pools anyway. 226 * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration 227 * information for all pool visible within the zone. 
228 */ 229nvlist_t * 230spa_all_configs(uint64_t *generation) 231{ 232 nvlist_t *pools; 233 spa_t *spa; 234 235 if (*generation == spa_config_generation) 236 return (NULL); 237 238 VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0); 239 240 spa = NULL; 241 mutex_enter(&spa_namespace_lock); 242 while ((spa = spa_next(spa)) != NULL) { 243 if (INGLOBALZONE(curproc) || 244 zone_dataset_visible(spa_name(spa), NULL)) { 245 mutex_enter(&spa->spa_config_cache_lock); 246 VERIFY(nvlist_add_nvlist(pools, spa_name(spa), 247 spa->spa_config) == 0); 248 mutex_exit(&spa->spa_config_cache_lock); 249 } 250 } 251 mutex_exit(&spa_namespace_lock); 252 253 *generation = spa_config_generation; 254 255 return (pools); 256} 257 258void 259spa_config_set(spa_t *spa, nvlist_t *config) 260{ 261 mutex_enter(&spa->spa_config_cache_lock); 262 if (spa->spa_config != NULL) 263 nvlist_free(spa->spa_config); 264 spa->spa_config = config; 265 mutex_exit(&spa->spa_config_cache_lock); 266} 267 268/* 269 * Generate the pool's configuration based on the current in-core state. 270 * We infer whether to generate a complete config or just one top-level config 271 * based on whether vd is the root vdev. 272 */ 273nvlist_t * 274spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) 275{ 276 nvlist_t *config, *nvroot; 277 vdev_t *rvd = spa->spa_root_vdev; 278 unsigned long hostid = 0; 279 280 ASSERT(spa_config_held(spa, RW_READER)); 281 282 if (vd == NULL) 283 vd = rvd; 284 285 /* 286 * If txg is -1, report the current value of spa->spa_config_txg. 
287 */ 288 if (txg == -1ULL) 289 txg = spa->spa_config_txg; 290 291 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); 292 293 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 294 spa_version(spa)) == 0); 295 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 296 spa_name(spa)) == 0); 297 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 298 spa_state(spa)) == 0); 299 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 300 txg) == 0); 301 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 302 spa_guid(spa)) == 0); 303 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 304 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, 305 hostid) == 0); 306 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, 307 utsname.nodename) == 0); 308 309 if (vd != rvd) { 310 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, 311 vd->vdev_top->vdev_guid) == 0); 312 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, 313 vd->vdev_guid) == 0); 314 if (vd->vdev_isspare) 315 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 316 1ULL) == 0); 317 vd = vd->vdev_top; /* label contains top config */ 318 } 319 320 nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE); 321 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 322 nvlist_free(nvroot); 323 324 return (config); 325} 326 327/* 328 * Update all disk labels, generate a fresh config based on the current 329 * in-core state, and sync the global config cache. 330 */ 331void 332spa_config_update(spa_t *spa, int what) 333{ 334 vdev_t *rvd = spa->spa_root_vdev; 335 uint64_t txg; 336 int c; 337 338 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 339 340 spa_config_enter(spa, RW_WRITER, FTAG); 341 txg = spa_last_synced_txg(spa) + 1; 342 if (what == SPA_CONFIG_UPDATE_POOL) { 343 vdev_config_dirty(rvd); 344 } else { 345 /* 346 * If we have top-level vdevs that were added but have 347 * not yet been prepared for allocation, do that now. 
348 * (It's safe now because the config cache is up to date, 349 * so it will be able to translate the new DVAs.) 350 * See comments in spa_vdev_add() for full details. 351 */ 352 for (c = 0; c < rvd->vdev_children; c++) { 353 vdev_t *tvd = rvd->vdev_child[c]; 354 if (tvd->vdev_ms_array == 0) { 355 vdev_init(tvd, txg); 356 vdev_config_dirty(tvd); 357 } 358 } 359 } 360 spa_config_exit(spa, FTAG); 361 362 /* 363 * Wait for the mosconfig to be regenerated and synced. 364 */ 365 txg_wait_synced(spa->spa_dsl_pool, txg); 366 367 /* 368 * Update the global config cache to reflect the new mosconfig. 369 */ 370 spa_config_sync(); 371 372 if (what == SPA_CONFIG_UPDATE_POOL) 373 spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); 374} 375