 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>

/*
 * The write throttle caps how much dirty data each transaction group
 * may accept: dsl_pool_tempreserve_space() pushes back (ERESTART) once
 * a txg's reservations exceed dp_write_limit, and dsl_pool_sync()
 * retunes that limit so a txg syncs in about zfs_txg_synctime_ms.
 */
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3;		/* 1/8th of physical memory */
int zfs_txg_synctime_ms = 1000;		/* target millisecs to sync a txg */

uint64_t zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
uint64_t zfs_write_limit_max = 0;		/* max data payload per txg */
uint64_t zfs_write_limit_inflated = 0;
uint64_t zfs_write_limit_override = 0;

kmutex_t zfs_write_limit_lock;

static pgcnt_t old_physmem = 0;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.no_write_throttle", &zfs_no_write_throttle);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_write_throttle, CTLFLAG_RDTUN,
    &zfs_no_write_throttle, 0, "Disable the write throttle");
TUNABLE_INT("vfs.zfs.write_limit_shift", &zfs_write_limit_shift);
SYSCTL_INT(_vfs_zfs, OID_AUTO, write_limit_shift, CTLFLAG_RDTUN,
    &zfs_write_limit_shift, 0, "Write limit is 1/2^N of physical memory");
SYSCTL_DECL(_vfs_zfs_txg);
TUNABLE_INT("vfs.zfs.txg.synctime_ms", &zfs_txg_synctime_ms);
SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime_ms, CTLFLAG_RDTUN,
    &zfs_txg_synctime_ms, 0, "Target milliseconds to sync a txg");

TUNABLE_QUAD("vfs.zfs.write_limit_min", &zfs_write_limit_min);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_min, CTLFLAG_RDTUN,
    &zfs_write_limit_min, 0, "Minimum write limit");
TUNABLE_QUAD("vfs.zfs.write_limit_max", &zfs_write_limit_max);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_max, CTLFLAG_RDTUN,
    &zfs_write_limit_max, 0, "Maximum data payload per txg");
TUNABLE_QUAD("vfs.zfs.write_limit_inflated", &zfs_write_limit_inflated);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_inflated, CTLFLAG_RDTUN,
    &zfs_write_limit_inflated, 0, "Write limit inflated for replication");
TUNABLE_QUAD("vfs.zfs.write_limit_override", &zfs_write_limit_override);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN,
    &zfs_write_limit_override, 0, "Force a fixed write limit");

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
}

static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
	dp->dp_write_limit = zfs_write_limit_min;
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_group_t, dstg_node));
	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);

	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
	    1, 4, 0);
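	/*
	 * Consumers fetch the taskq above via dsl_pool_vnrele_taskq()
	 * and use it to defer vnode releases to a separate thread.
	 */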

	return (dp);
}

int
dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err)
		goto out;

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
		    FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_close(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, txg);

out:
	rw_exit(&dp->dp_config_rwlock);
	if (err)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

void
dsl_pool_close(dsl_pool_t *dp)
{
	/* drop our references from dsl_pool_open() */

	/*
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
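	 * (dsl_dataset_hold_obj() skips taking ds_rwlock when called
	 * from sync context, so the matching release is
	 * dsl_dataset_drop_ref() rather than dsl_dataset_rele().)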
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_close(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_close(dp->dp_free_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_close(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);
	list_destroy(&dp->dp_synced_datasets);

	arc_flush(dp->dp_spa);
	txg_fini(dp);
	dsl_scan_fini(dp);
	rw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_t *os;
	dsl_dataset_t *ds;
	uint64_t obj;

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT3U(err, ==, 0);

	/* Initialize scan structures */
	VERIFY3U(0, ==, dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY(0 == dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	return (dp);
}

static int
deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
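	/*
	 * Invoked via bplist_iterate() from dsl_pool_sync(): append each
	 * dead block pointer to the dataset's on-disk deadlist.
	 */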
	dsl_deadlist_insert(dl, bp, tx);
	return (0);
}

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	dsl_sync_task_group_t *dstg;
	objset_t *mos = dp->dp_meta_objset;
	hrtime_t start, write_time;
	uint64_t data_written;
	int err;

	/*
	 * We need to copy dp_space_towrite before doing
	 * dsl_sync_task_group_sync(), because
	 * dsl_dataset_snapshot_reserve_space() will increase
	 * dp_space_towrite but not actually write anything.
	 */
	data_written = dp->dp_space_towrite[txg & TXG_MASK];

	tx = dmu_tx_create_assigned(dp, txg);

	dp->dp_read_overhead = 0;
	start = gethrtime();

	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them.  However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&dp->dp_synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	DTRACE_PROBE(pool_sync__1setup);
	err = zio_wait(zio);

	write_time = gethrtime() - start;
	ASSERT(err == 0);
	DTRACE_PROBE(pool_sync__2rootzio);

	for (ds = list_head(&dp->dp_synced_datasets); ds;
	    ds = list_next(&dp->dp_synced_datasets, ds))
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates.  This must be done before we process the
	 * sync tasks, because that could cause a snapshot of a dataset
	 * whose ds_bp will be rewritten when we do this 2nd sync.
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	err = zio_wait(zio);

	/*
	 * Move dead blocks from the pending deadlist to the on-disk
	 * deadlist.
	 */
	for (ds = list_head(&dp->dp_synced_datasets); ds;
	    ds = list_next(&dp->dp_synced_datasets, ds)) {
		bplist_iterate(&ds->ds_pending_deadlist,
		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
	}

	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
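		 * (Sync tasks only run in the first sync pass; the
		 * ASSERT below checks exactly that.)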
		 */
		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
		dsl_sync_task_group_sync(dstg, tx);
	}
	DTRACE_PROBE(pool_sync__3task);

	start = gethrtime();
	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
		dsl_dir_sync(dd, tx);
	write_time += gethrtime() - start;

	start = gethrtime();
	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dmu_objset_sync(mos, zio, tx);
		err = zio_wait(zio);
		ASSERT(err == 0);
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}
	write_time += gethrtime() - start;
	DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
	    hrtime_t, dp->dp_read_overhead);
	write_time -= dp->dp_read_overhead;

	dmu_tx_commit(tx);

	dp->dp_space_towrite[txg & TXG_MASK] = 0;
	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);

	/*
	 * If the write limit max has not been explicitly set, set it
	 * to a fraction of available physical memory (default 1/8th).
	 * Note that we must inflate the limit because the spa
	 * inflates write sizes to account for data replication.
	 * Check this each sync phase to catch changing memory size.
	 */
	if (physmem != old_physmem && zfs_write_limit_shift) {
		mutex_enter(&zfs_write_limit_lock);
		old_physmem = physmem;
		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
		zfs_write_limit_inflated = MAX(zfs_write_limit_min,
		    spa_get_asize(dp->dp_spa, zfs_write_limit_max));
		mutex_exit(&zfs_write_limit_lock);
	}

	/*
	 * Attempt to keep the sync time consistent by adjusting the
	 * amount of write traffic allowed into each transaction group.
	 * Weight the throughput calculation towards the current value:
	 * 	thru = 3/4 old_thru + 1/4 new_thru
	 *
	 * Note: write_time is in nanosecs, so write_time/MICROSEC
	 * yields millisecs
	 */
	ASSERT(zfs_write_limit_min > 0);
	if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
		uint64_t throughput = data_written / (write_time / MICROSEC);

		if (dp->dp_throughput)
			dp->dp_throughput = throughput / 4 +
			    3 * dp->dp_throughput / 4;
		else
			dp->dp_throughput = throughput;
		dp->dp_write_limit = MIN(zfs_write_limit_inflated,
		    MAX(zfs_write_limit_min,
		    dp->dp_throughput * zfs_txg_synctime_ms));
	}
}

void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	dsl_dataset_t *ds;
	objset_t *os;

	while (ds = list_head(&dp->dp_synced_datasets)) {
		list_remove(&dp->dp_synced_datasets, ds);
		os = ds->ds_objset;
		zil_clean(os->os_zil, txg);
		ASSERT(!dmu_objset_is_dirty(os, txg));
		dmu_buf_rele(ds->ds_dbuf, ds);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_get_dsl(dp->dp_spa) == NULL);
}

uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
	 * efficiency.
	 * XXX The intent log is not accounted for, so it must fit
	 * within this slop.
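	 * (For example, a 64GB pool reserves 64GB/64 = 1GB; the 32MB
	 * floor only matters for pools smaller than 2GB.)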
	 *
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}

int
dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
{
	uint64_t reserved = 0;
	uint64_t write_limit = (zfs_write_limit_override ?
	    zfs_write_limit_override : dp->dp_write_limit);

	if (zfs_no_write_throttle) {
		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
		    space);
		return (0);
	}

	/*
	 * Check to see if we have exceeded the maximum allowed IO for
	 * this transaction group.  We can do this without locks since
	 * a little slop here is ok.  Note that we do the reserved check
	 * with only half the requested reserve: this is because the
	 * reserve requests are worst-case, and we really don't want to
	 * throttle based off of worst-case estimates.
	 */
	if (write_limit > 0) {
		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;

		if (reserved && reserved > write_limit)
			return (ERESTART);
	}

	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);

	/*
	 * If this transaction group is over 7/8ths capacity, delay
	 * the caller 1 clock tick.  This will slow down the "fill"
	 * rate until the sync process can catch up with us.
	 */
	if (reserved && reserved > (write_limit - (write_limit >> 3)))
		txg_delay(dp, tx->tx_txg, 1);

	return (0);
}

void
dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
}

void
dsl_pool_memory_pressure(dsl_pool_t *dp)
{
	uint64_t space_inuse = 0;
	int i;

	if (dp->dp_write_limit == zfs_write_limit_min)
		return;

	for (i = 0; i < TXG_SIZE; i++) {
		space_inuse += dp->dp_space_towrite[i];
		space_inuse += dp->dp_tempreserved[i];
	}
	dp->dp_write_limit = MAX(zfs_write_limit_min,
	    MIN(dp->dp_write_limit, space_inuse / 4));
}

void
dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
		mutex_exit(&dp->dp_lock);
	}
}

/* ARGSUSED */
static int
upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_num_children++;

		if (ds->ds_phys->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
		}
	}

	ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);

	if (prev->ds_phys->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY(0 == zap_add_int(dp->dp_meta_objset,
	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN));
}

/* ARGSUSED */
static int
upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	if (ds->ds_dir->dd_phys->dd_origin_obj) {
		dsl_dataset_t *origin;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));

		if (origin->ds_dir->dd_phys->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
		}

		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));

		dsl_dataset_rele(origin, FTAG);
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT(dmu_tx_is_syncing(tx));

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support.  So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
	    dp->dp_meta_objset, obj));

	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);

	/* create the origin dir, ds, & snap-ds */
	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
	rw_exit(&dp->dp_config_rwlock);
}

taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_vnrele_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
 */
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
	zap_attribute_t za;
	zap_cursor_t zc;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;

	if (zapobj == 0)
		return;
	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		char *htag;
		uint64_t dsobj;

		/*
		 * Hold names have the form "<dsobj in hex>-<tag>" (see
		 * dsl_pool_user_hold_rele_impl()); split at the dash.
		 */
		htag = strchr(za.za_name, '-');
		*htag = '\0';
		++htag;
		dsobj = strtonum(za.za_name, NULL);
		(void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
	}
	zap_cursor_fini(&zc);
}

/*
 * Create the pool-wide zap object for storing temporary snapshot holds.
 */
void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(dp->dp_tmp_userrefs_obj == 0);
	ASSERT(dmu_tx_is_syncing(tx));

	dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
	    DMU_OT_NONE, 0, tx);

	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
	    sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
}

static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
    const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	char *name;
	int error;

	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * If the pool was created prior to SPA_VERSION_USERREFS, the
	 * zap object for temporary holds might not exist yet.
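	 * Create it on the first hold; for a release, a missing object
	 * simply means there is nothing to release.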
	 */
	if (zapobj == 0) {
		if (holding) {
			dsl_pool_user_hold_create_obj(dp, tx);
			zapobj = dp->dp_tmp_userrefs_obj;
		} else {
			return (ENOENT);
		}
	}

	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
	if (holding)
		error = zap_add(mos, zapobj, name, 8, 1, now, tx);
	else
		error = zap_remove(mos, zapobj, name, tx);
	strfree(name);

	return (error);
}

/*
 * Add a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t *now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
	    tx, B_FALSE));
}