1236884Smm/* 2236884Smm * CDDL HEADER START 3236884Smm * 4236884Smm * The contents of this file are subject to the terms of the 5236884Smm * Common Development and Distribution License (the "License"). 6236884Smm * You may not use this file except in compliance with the License. 7236884Smm * 8236884Smm * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9236884Smm * or http://www.opensolaris.org/os/licensing. 10236884Smm * See the License for the specific language governing permissions 11236884Smm * and limitations under the License. 12236884Smm * 13236884Smm * When distributing Covered Code, include this CDDL HEADER in each 14236884Smm * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15236884Smm * If applicable, add the following below this CDDL HEADER, with the 16236884Smm * fields enclosed by brackets "[]" replaced with your own identifying 17236884Smm * information: Portions Copyright [yyyy] [name of copyright owner] 18236884Smm * 19236884Smm * CDDL HEADER END 20236884Smm */ 21236884Smm 22236884Smm/* 23307123Smav * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24297112Smav * Copyright (c) 2014 Integros [integros.com] 25236884Smm */ 26236884Smm 27236884Smm#include <sys/arc.h> 28236884Smm#include <sys/bptree.h> 29236884Smm#include <sys/dmu.h> 30236884Smm#include <sys/dmu_objset.h> 31236884Smm#include <sys/dmu_tx.h> 32236884Smm#include <sys/dmu_traverse.h> 33236884Smm#include <sys/dsl_dataset.h> 34236884Smm#include <sys/dsl_dir.h> 35236884Smm#include <sys/dsl_pool.h> 36236884Smm#include <sys/dnode.h> 37236884Smm#include <sys/refcount.h> 38236884Smm#include <sys/spa.h> 39236884Smm 40236884Smm/* 41236884Smm * A bptree is a queue of root block pointers from destroyed datasets. When a 42236884Smm * dataset is destroyed its root block pointer is put on the end of the pool's 43236884Smm * bptree queue so the dataset's blocks can be freed asynchronously by 44236884Smm * dsl_scan_sync. This allows the delete operation to finish without traversing 45236884Smm * all the dataset's blocks. 46236884Smm * 47251631Sdelphij * Note that while bt_begin and bt_end are only ever incremented in this code, 48236884Smm * they are effectively reset to 0 every time the entire bptree is freed because 49236884Smm * the bptree's object is destroyed and re-created. 50236884Smm */ 51236884Smm 52236884Smmstruct bptree_args { 53236884Smm bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ 54236884Smm boolean_t ba_free; /* true if freeing during traversal */ 55236884Smm 56236884Smm bptree_itor_t *ba_func; /* function to call for each blockpointer */ 57236884Smm void *ba_arg; /* caller supplied argument to ba_func */ 58236884Smm dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ 59236884Smm} bptree_args_t; 60236884Smm 61236884Smmuint64_t 62236884Smmbptree_alloc(objset_t *os, dmu_tx_t *tx) 63236884Smm{ 64236884Smm uint64_t obj; 65236884Smm dmu_buf_t *db; 66236884Smm bptree_phys_t *bt; 67236884Smm 68236884Smm obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, 69276081Sdelphij SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, 70236884Smm sizeof (bptree_phys_t), tx); 71236884Smm 72236884Smm /* 73236884Smm * Bonus buffer contents are already initialized to 0, but for 74236884Smm * readability we make it explicit. 75236884Smm */ 76236884Smm VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 77236884Smm dmu_buf_will_dirty(db, tx); 78236884Smm bt = db->db_data; 79236884Smm bt->bt_begin = 0; 80236884Smm bt->bt_end = 0; 81236884Smm bt->bt_bytes = 0; 82236884Smm bt->bt_comp = 0; 83236884Smm bt->bt_uncomp = 0; 84236884Smm dmu_buf_rele(db, FTAG); 85236884Smm 86236884Smm return (obj); 87236884Smm} 88236884Smm 89236884Smmint 90236884Smmbptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 91236884Smm{ 92236884Smm dmu_buf_t *db; 93236884Smm bptree_phys_t *bt; 94236884Smm 95236884Smm VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 96236884Smm bt = db->db_data; 97236884Smm ASSERT3U(bt->bt_begin, ==, bt->bt_end); 98240415Smm ASSERT0(bt->bt_bytes); 99240415Smm ASSERT0(bt->bt_comp); 100240415Smm ASSERT0(bt->bt_uncomp); 101236884Smm dmu_buf_rele(db, FTAG); 102236884Smm 103236884Smm return (dmu_object_free(os, obj, tx)); 104236884Smm} 105236884Smm 106268650Sdelphijboolean_t 107268650Sdelphijbptree_is_empty(objset_t *os, uint64_t obj) 108268650Sdelphij{ 109268650Sdelphij dmu_buf_t *db; 110268650Sdelphij bptree_phys_t *bt; 111268650Sdelphij boolean_t rv; 112268650Sdelphij 113268650Sdelphij VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); 114268650Sdelphij bt = db->db_data; 115268650Sdelphij rv = (bt->bt_begin == bt->bt_end); 116268650Sdelphij dmu_buf_rele(db, FTAG); 117268650Sdelphij return (rv); 118268650Sdelphij} 119268650Sdelphij 120236884Smmvoid 121236884Smmbptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, 122236884Smm uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) 123236884Smm{ 124236884Smm dmu_buf_t *db; 125236884Smm bptree_phys_t *bt; 126268650Sdelphij bptree_entry_phys_t bte = { 0 }; 127236884Smm 128236884Smm /* 129236884Smm * bptree objects are in the pool mos, therefore they can only be 130236884Smm * modified in syncing context. Furthermore, this is only modified 131236884Smm * by the sync thread, so no locking is necessary. 132236884Smm */ 133236884Smm ASSERT(dmu_tx_is_syncing(tx)); 134236884Smm 135236884Smm VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 136236884Smm bt = db->db_data; 137236884Smm 138236884Smm bte.be_birth_txg = birth_txg; 139236884Smm bte.be_bp = *bp; 140236884Smm dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); 141236884Smm 142236884Smm dmu_buf_will_dirty(db, tx); 143236884Smm bt->bt_end++; 144236884Smm bt->bt_bytes += bytes; 145236884Smm bt->bt_comp += comp; 146236884Smm bt->bt_uncomp += uncomp; 147236884Smm dmu_buf_rele(db, FTAG); 148236884Smm} 149236884Smm 150236884Smm/* ARGSUSED */ 151236884Smmstatic int 152246666Smmbptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 153268657Sdelphij const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 154236884Smm{ 155236884Smm int err; 156236884Smm struct bptree_args *ba = arg; 157236884Smm 158288571Smav if (bp == NULL || BP_IS_HOLE(bp)) 159236884Smm return (0); 160236884Smm 161236884Smm err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); 162236884Smm if (err == 0 && ba->ba_free) { 163236884Smm ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); 164236884Smm ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); 165236884Smm ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); 166236884Smm } 167236884Smm return (err); 168236884Smm} 169236884Smm 170268650Sdelphij/* 171268650Sdelphij * If "free" is set: 172268650Sdelphij * - It is assumed that "func" will be freeing the block pointers. 173268650Sdelphij * - If "func" returns nonzero, the bookmark will be remembered and 174268650Sdelphij * iteration will be restarted from this point on next invocation. 175268650Sdelphij * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), 176268650Sdelphij * bptree_iterate will remember the bookmark, continue traversing 177268650Sdelphij * any additional entries, and return 0. 178268650Sdelphij * 179268650Sdelphij * If "free" is not set, traversal will stop and return an error if 180268650Sdelphij * an i/o error is encountered. 181268650Sdelphij * 182268650Sdelphij * In either case, if zfs_free_leak_on_eio is set, i/o errors will be 183268650Sdelphij * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to 184268650Sdelphij * traverse_dataset_destroyed()). 185268650Sdelphij */ 186236884Smmint 187236884Smmbptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, 188236884Smm void *arg, dmu_tx_t *tx) 189236884Smm{ 190268650Sdelphij boolean_t ioerr = B_FALSE; 191236884Smm int err; 192236884Smm uint64_t i; 193236884Smm dmu_buf_t *db; 194236884Smm struct bptree_args ba; 195236884Smm 196236884Smm ASSERT(!free || dmu_tx_is_syncing(tx)); 197236884Smm 198236884Smm err = dmu_bonus_hold(os, obj, FTAG, &db); 199236884Smm if (err != 0) 200236884Smm return (err); 201236884Smm 202236884Smm if (free) 203236884Smm dmu_buf_will_dirty(db, tx); 204236884Smm 205236884Smm ba.ba_phys = db->db_data; 206236884Smm ba.ba_free = free; 207236884Smm ba.ba_func = func; 208236884Smm ba.ba_arg = arg; 209236884Smm ba.ba_tx = tx; 210236884Smm 211236884Smm err = 0; 212236884Smm for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { 213236884Smm bptree_entry_phys_t bte; 214262120Savg int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; 215236884Smm 216236884Smm err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), 217236884Smm &bte, DMU_READ_NO_PREFETCH); 218236884Smm if (err != 0) 219236884Smm break; 220236884Smm 221268650Sdelphij if (zfs_free_leak_on_eio) 222262120Savg flags |= TRAVERSE_HARD; 223307123Smav zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " 224268650Sdelphij "bookmark %lld/%lld/%lld/%lld", 225307123Smav (longlong_t)i, 226307123Smav (longlong_t)bte.be_birth_txg, 227268650Sdelphij (longlong_t)bte.be_zb.zb_objset, 228268650Sdelphij (longlong_t)bte.be_zb.zb_object, 229268650Sdelphij (longlong_t)bte.be_zb.zb_level, 230268650Sdelphij (longlong_t)bte.be_zb.zb_blkid); 231236884Smm err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, 232262120Savg bte.be_birth_txg, &bte.be_zb, flags, 233236884Smm bptree_visit_cb, &ba); 234236884Smm if (free) { 235268650Sdelphij /* 236268650Sdelphij * The callback has freed the visited block pointers. 237268650Sdelphij * Record our traversal progress on disk, either by 238268650Sdelphij * updating this record's bookmark, or by logically 239268650Sdelphij * removing this record by advancing bt_begin. 240268650Sdelphij */ 241268650Sdelphij if (err != 0) { 242236884Smm /* save bookmark for future resume */ 243236884Smm ASSERT3U(bte.be_zb.zb_objset, ==, 244236884Smm ZB_DESTROYED_OBJSET); 245240415Smm ASSERT0(bte.be_zb.zb_level); 246236884Smm dmu_write(os, obj, i * sizeof (bte), 247236884Smm sizeof (bte), &bte, tx); 248268650Sdelphij if (err == EIO || err == ECKSUM || 249268650Sdelphij err == ENXIO) { 250268650Sdelphij /* 251268650Sdelphij * Skip the rest of this tree and 252268650Sdelphij * continue on to the next entry. 253268650Sdelphij */ 254268650Sdelphij err = 0; 255268650Sdelphij ioerr = B_TRUE; 256268650Sdelphij } else { 257268650Sdelphij break; 258268650Sdelphij } 259268650Sdelphij } else if (ioerr) { 260262120Savg /* 261268650Sdelphij * This entry is finished, but there were 262268650Sdelphij * i/o errors on previous entries, so we 263268650Sdelphij * can't adjust bt_begin. Set this entry's 264268650Sdelphij * be_birth_txg such that it will be 265268650Sdelphij * treated as a no-op in future traversals. 266262120Savg */ 267268650Sdelphij bte.be_birth_txg = UINT64_MAX; 268268650Sdelphij dmu_write(os, obj, i * sizeof (bte), 269268650Sdelphij sizeof (bte), &bte, tx); 270262120Savg } 271262120Savg 272268650Sdelphij if (!ioerr) { 273268650Sdelphij ba.ba_phys->bt_begin++; 274268650Sdelphij (void) dmu_free_range(os, obj, 275268650Sdelphij i * sizeof (bte), sizeof (bte), tx); 276268650Sdelphij } 277268650Sdelphij } else if (err != 0) { 278268650Sdelphij break; 279236884Smm } 280236884Smm } 281236884Smm 282268650Sdelphij ASSERT(!free || err != 0 || ioerr || 283268650Sdelphij ba.ba_phys->bt_begin == ba.ba_phys->bt_end); 284236884Smm 285236884Smm /* if all blocks are free there should be no used space */ 286236884Smm if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { 287268650Sdelphij if (zfs_free_leak_on_eio) { 288268650Sdelphij ba.ba_phys->bt_bytes = 0; 289268650Sdelphij ba.ba_phys->bt_comp = 0; 290268650Sdelphij ba.ba_phys->bt_uncomp = 0; 291268650Sdelphij } 292268650Sdelphij 293240415Smm ASSERT0(ba.ba_phys->bt_bytes); 294240415Smm ASSERT0(ba.ba_phys->bt_comp); 295240415Smm ASSERT0(ba.ba_phys->bt_uncomp); 296236884Smm } 297236884Smm 298236884Smm dmu_buf_rele(db, FTAG); 299236884Smm 300236884Smm return (err); 301236884Smm} 302