1236884Smm/* 2236884Smm * CDDL HEADER START 3236884Smm * 4236884Smm * The contents of this file are subject to the terms of the 5236884Smm * Common Development and Distribution License (the "License"). 6236884Smm * You may not use this file except in compliance with the License. 7236884Smm * 8236884Smm * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9236884Smm * or http://www.opensolaris.org/os/licensing. 10236884Smm * See the License for the specific language governing permissions 11236884Smm * and limitations under the License. 12236884Smm * 13236884Smm * When distributing Covered Code, include this CDDL HEADER in each 14236884Smm * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15236884Smm * If applicable, add the following below this CDDL HEADER, with the 16236884Smm * fields enclosed by brackets "[]" replaced with your own identifying 17236884Smm * information: Portions Copyright [yyyy] [name of copyright owner] 18236884Smm * 19236884Smm * CDDL HEADER END 20236884Smm */ 21236884Smm 22236884Smm/* 23263398Sdelphij * Copyright (c) 2013 by Delphix. All rights reserved. 24236884Smm */ 25236884Smm 26236884Smm#include <sys/arc.h> 27236884Smm#include <sys/bptree.h> 28236884Smm#include <sys/dmu.h> 29236884Smm#include <sys/dmu_objset.h> 30236884Smm#include <sys/dmu_tx.h> 31236884Smm#include <sys/dmu_traverse.h> 32236884Smm#include <sys/dsl_dataset.h> 33236884Smm#include <sys/dsl_dir.h> 34236884Smm#include <sys/dsl_pool.h> 35236884Smm#include <sys/dnode.h> 36236884Smm#include <sys/refcount.h> 37236884Smm#include <sys/spa.h> 38236884Smm 39236884Smm/* 40236884Smm * A bptree is a queue of root block pointers from destroyed datasets. When a 41236884Smm * dataset is destroyed its root block pointer is put on the end of the pool's 42236884Smm * bptree queue so the dataset's blocks can be freed asynchronously by 43236884Smm * dsl_scan_sync. This allows the delete operation to finish without traversing 44236884Smm * all the dataset's blocks. 45236884Smm * 46252751Sdelphij * Note that while bt_begin and bt_end are only ever incremented in this code, 47236884Smm * they are effectively reset to 0 every time the entire bptree is freed because 48236884Smm * the bptree's object is destroyed and re-created. 49236884Smm */ 50236884Smm 51236884Smmstruct bptree_args { 52236884Smm bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ 53236884Smm boolean_t ba_free; /* true if freeing during traversal */ 54236884Smm 55236884Smm bptree_itor_t *ba_func; /* function to call for each blockpointer */ 56236884Smm void *ba_arg; /* caller supplied argument to ba_func */ 57236884Smm dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ 58236884Smm} bptree_args_t; 59236884Smm 60236884Smmuint64_t 61236884Smmbptree_alloc(objset_t *os, dmu_tx_t *tx) 62236884Smm{ 63236884Smm uint64_t obj; 64236884Smm dmu_buf_t *db; 65236884Smm bptree_phys_t *bt; 66236884Smm 67236884Smm obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, 68236884Smm SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, 69236884Smm sizeof (bptree_phys_t), tx); 70236884Smm 71236884Smm /* 72236884Smm * Bonus buffer contents are already initialized to 0, but for 73236884Smm * readability we make it explicit. 74236884Smm */ 75236884Smm VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 76236884Smm dmu_buf_will_dirty(db, tx); 77236884Smm bt = db->db_data; 78236884Smm bt->bt_begin = 0; 79236884Smm bt->bt_end = 0; 80236884Smm bt->bt_bytes = 0; 81236884Smm bt->bt_comp = 0; 82236884Smm bt->bt_uncomp = 0; 83236884Smm dmu_buf_rele(db, FTAG); 84236884Smm 85236884Smm return (obj); 86236884Smm} 87236884Smm 88236884Smmint 89236884Smmbptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 90236884Smm{ 91236884Smm dmu_buf_t *db; 92236884Smm bptree_phys_t *bt; 93236884Smm 94236884Smm VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 95236884Smm bt = db->db_data; 96236884Smm ASSERT3U(bt->bt_begin, ==, bt->bt_end); 97243674Smm ASSERT0(bt->bt_bytes); 98243674Smm ASSERT0(bt->bt_comp); 99243674Smm ASSERT0(bt->bt_uncomp); 100236884Smm dmu_buf_rele(db, FTAG); 101236884Smm 102236884Smm return (dmu_object_free(os, obj, tx)); 103236884Smm} 104236884Smm 105236884Smmvoid 106236884Smmbptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, 107236884Smm uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) 108236884Smm{ 109236884Smm dmu_buf_t *db; 110236884Smm bptree_phys_t *bt; 111236884Smm bptree_entry_phys_t bte; 112236884Smm 113236884Smm /* 114236884Smm * bptree objects are in the pool mos, therefore they can only be 115236884Smm * modified in syncing context. Furthermore, this is only modified 116236884Smm * by the sync thread, so no locking is necessary. 117236884Smm */ 118236884Smm ASSERT(dmu_tx_is_syncing(tx)); 119236884Smm 120236884Smm VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 121236884Smm bt = db->db_data; 122236884Smm 123236884Smm bte.be_birth_txg = birth_txg; 124236884Smm bte.be_bp = *bp; 125236884Smm bzero(&bte.be_zb, sizeof (bte.be_zb)); 126236884Smm dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); 127236884Smm 128236884Smm dmu_buf_will_dirty(db, tx); 129236884Smm bt->bt_end++; 130236884Smm bt->bt_bytes += bytes; 131236884Smm bt->bt_comp += comp; 132236884Smm bt->bt_uncomp += uncomp; 133236884Smm dmu_buf_rele(db, FTAG); 134236884Smm} 135236884Smm 136236884Smm/* ARGSUSED */ 137236884Smmstatic int 138247406Smmbptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 139236884Smm const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 140236884Smm{ 141236884Smm int err; 142236884Smm struct bptree_args *ba = arg; 143236884Smm 144263398Sdelphij if (BP_IS_HOLE(bp)) 145236884Smm return (0); 146236884Smm 147236884Smm err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); 148236884Smm if (err == 0 && ba->ba_free) { 149236884Smm ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); 150236884Smm ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); 151236884Smm ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); 152236884Smm } 153236884Smm return (err); 154236884Smm} 155236884Smm 156236884Smmint 157236884Smmbptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, 158236884Smm void *arg, dmu_tx_t *tx) 159236884Smm{ 160236884Smm int err; 161236884Smm uint64_t i; 162236884Smm dmu_buf_t *db; 163236884Smm struct bptree_args ba; 164236884Smm 165236884Smm ASSERT(!free || dmu_tx_is_syncing(tx)); 166236884Smm 167236884Smm err = dmu_bonus_hold(os, obj, FTAG, &db); 168236884Smm if (err != 0) 169236884Smm return (err); 170236884Smm 171236884Smm if (free) 172236884Smm dmu_buf_will_dirty(db, tx); 173236884Smm 174236884Smm ba.ba_phys = db->db_data; 175236884Smm ba.ba_free = free; 176236884Smm ba.ba_func = func; 177236884Smm ba.ba_arg = arg; 178236884Smm ba.ba_tx = tx; 179236884Smm 180236884Smm err = 0; 181236884Smm for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { 182236884Smm bptree_entry_phys_t bte; 183262118Savg int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; 184236884Smm 185236884Smm ASSERT(!free || i == ba.ba_phys->bt_begin); 186236884Smm 187236884Smm err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), 188236884Smm &bte, DMU_READ_NO_PREFETCH); 189236884Smm if (err != 0) 190236884Smm break; 191236884Smm 192262118Savg if (zfs_recover) 193262118Savg flags |= TRAVERSE_HARD; 194236884Smm err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, 195262118Savg bte.be_birth_txg, &bte.be_zb, flags, 196236884Smm bptree_visit_cb, &ba); 197236884Smm if (free) { 198262118Savg if (err == ERESTART) { 199236884Smm /* save bookmark for future resume */ 200236884Smm ASSERT3U(bte.be_zb.zb_objset, ==, 201236884Smm ZB_DESTROYED_OBJSET); 202243674Smm ASSERT0(bte.be_zb.zb_level); 203236884Smm dmu_write(os, obj, i * sizeof (bte), 204236884Smm sizeof (bte), &bte, tx); 205236884Smm break; 206236884Smm } 207262118Savg if (err != 0) { 208262118Savg /* 209262118Savg * We can not properly handle an i/o 210262118Savg * error, because the traversal code 211262118Savg * does not know how to resume from an 212262118Savg * arbitrary bookmark. 213262118Savg */ 214262118Savg zfs_panic_recover("error %u from " 215262118Savg "traverse_dataset_destroyed()", err); 216262118Savg } 217262118Savg 218262118Savg ba.ba_phys->bt_begin++; 219262118Savg (void) dmu_free_range(os, obj, 220262118Savg i * sizeof (bte), sizeof (bte), tx); 221236884Smm } 222236884Smm } 223236884Smm 224236884Smm ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); 225236884Smm 226236884Smm /* if all blocks are free there should be no used space */ 227236884Smm if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { 228243674Smm ASSERT0(ba.ba_phys->bt_bytes); 229243674Smm ASSERT0(ba.ba_phys->bt_comp); 230243674Smm ASSERT0(ba.ba_phys->bt_uncomp); 231236884Smm } 232236884Smm 233236884Smm dmu_buf_rele(db, FTAG); 234236884Smm 235236884Smm return (err); 236236884Smm} 237