/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/range_tree.h>
#include <sys/zfeature.h>

/*
 * Raise the dnode's indirection level to dn_next_nlevels[txgoff]:
 * hold a dbuf at the new top level, copy the dnode's block pointers
 * into it, re-parent any cached children of the old top level onto the
 * new indirect buffer, and finally zero the dnode's embedded blkptrs.
 * Called with the dnode dirty in this txg; takes dn_struct_rwlock as
 * writer for the duration.
 */
static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
{
    dmu_buf_impl_t *db;
    int txgoff = tx->tx_txg & TXG_MASK;
    int nblkptr = dn->dn_phys->dn_nblkptr;
    int old_toplvl = dn->dn_phys->dn_nlevels - 1;
    int new_level = dn->dn_next_nlevels[txgoff];
    int i;

    rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

    /* this dnode can't be paged out because it's dirty */
    ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
    ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
    ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);

    db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
    ASSERT(db != NULL);

    dn->dn_phys->dn_nlevels = new_level;
    dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
        dn->dn_object, dn->dn_phys->dn_nlevels);

    /* transfer dnode's block pointers to new indirect block */
    (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
    ASSERT(db->db.db_data);
    ASSERT(arc_released(db->db_buf));
    ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
    bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
        sizeof (blkptr_t) * nblkptr);
    arc_buf_freeze(db->db_buf);

    /* set dbuf's parent pointers to new indirect buf */
    for (i = 0; i < nblkptr; i++) {
        /* dbuf_find() returns the child with its db_mtx held */
        dmu_buf_impl_t *child =
            dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);

        if (child == NULL)
            continue;
#ifdef DEBUG
        DB_DNODE_ENTER(child);
        ASSERT3P(DB_DNODE(child), ==, dn);
        DB_DNODE_EXIT(child);
#endif /* DEBUG */
        if (child->db_parent && child->db_parent != dn->dn_dbuf) {
            /* already re-parented onto some indirect buffer */
            ASSERT(child->db_parent->db_level == db->db_level);
            ASSERT(child->db_blkptr !=
                &dn->dn_phys->dn_blkptr[child->db_blkid]);
            mutex_exit(&child->db_mtx);
            continue;
        }
        ASSERT(child->db_parent == NULL ||
            child->db_parent == dn->dn_dbuf);

        child->db_parent = db;
        dbuf_add_ref(db, child);
        if (db->db.db_data)
            child->db_blkptr = (blkptr_t *)db->db.db_data + i;
        else
            child->db_blkptr = NULL;
        dprintf_dbuf_bp(child, child->db_blkptr,
            "changed db_blkptr to new indirect %s", "");

        mutex_exit(&child->db_mtx);
    }

    bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);

    dbuf_rele(db, FTAG);

    rw_exit(&dn->dn_struct_rwlock);
}

/*
 * Free the blocks referenced by the num block pointers starting at bp,
 * charging the freed bytes back against dn's space accounting.  When
 * the hole_birth feature is active, the punched holes retain the
 * logical size, type, indirection level, and a birth time of this txg,
 * so that zfs send can avoid transmitting redundant free records.
 */
static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
    dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
    uint64_t bytesfreed = 0;

    dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);

    for (int i = 0; i < num; i++, bp++) {
        if (BP_IS_HOLE(bp))
            continue;

        bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
        ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));

        /*
         * Save some useful information on the holes being
         * punched, including logical size, type, and indirection
         * level. Retaining birth time enables detection of when
         * holes are punched for reducing the number of free
         * records transmitted during a zfs send.
         */

        uint64_t lsize = BP_GET_LSIZE(bp);
        dmu_object_type_t type = BP_GET_TYPE(bp);
        uint64_t lvl = BP_GET_LEVEL(bp);

        bzero(bp, sizeof (blkptr_t));

        if (spa_feature_is_active(dn->dn_objset->os_spa,
            SPA_FEATURE_HOLE_BIRTH)) {
            BP_SET_LSIZE(bp, lsize);
            BP_SET_TYPE(bp, type);
            BP_SET_LEVEL(bp, lvl);
            BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
        }
    }
    dnode_diduse_space(dn, -bytesfreed);
}

#ifdef ZFS_DEBUG
/*
 * Debug-only verification that the level-0 children in blkid range
 * [start, end] under the level-1 indirect db have been zeroed:
 * both the dirty record's data for this txg (if any) and, when not
 * dirty in a future txg, the cached db_data itself must be all zeros.
 */
static void
free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
    int off, num;
    int i, err, epbs;
    uint64_t txg = tx->tx_txg;
    dnode_t *dn;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
    /* note: '*' binds tighter than '<<', so this is db_blkid << epbs */
    off = start - (db->db_blkid * 1<<epbs);
    num = end - start + 1;

    ASSERT3U(off, >=, 0);
    ASSERT3U(num, >=, 0);
    ASSERT3U(db->db_level, >, 0);
    ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
    ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
    ASSERT(db->db_blkptr != NULL);

    for (i = off; i < off+num; i++) {
        uint64_t *buf;
        dmu_buf_impl_t *child;
        dbuf_dirty_record_t *dr;
        int j;

        ASSERT(db->db_level == 1);

        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        err = dbuf_hold_impl(dn, db->db_level-1,
            (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
        rw_exit(&dn->dn_struct_rwlock);
        if (err == ENOENT)
            continue;
        ASSERT(err == 0);
        ASSERT(child->db_level == 0);
        dr = child->db_last_dirty;
        while (dr && dr->dr_txg > txg)
            dr = dr->dr_next;
        ASSERT(dr == NULL || dr->dr_txg == txg);

        /* data_old better be zeroed */
        if (dr) {
            buf = dr->dt.dl.dr_data->b_data;
            for (j = 0; j < child->db.db_size >> 3; j++) {
                if (buf[j] != 0) {
                    panic("freed data not zero: "
                        "child=%p i=%d off=%d num=%d\n",
                        (void *)child, i, off, num);
                }
            }
        }

        /*
         * db_data better be zeroed unless it's dirty in a
         * future txg.
         */
        mutex_enter(&child->db_mtx);
        buf = child->db.db_data;
        if (buf != NULL && child->db_state != DB_FILL &&
            child->db_last_dirty == NULL) {
            for (j = 0; j < child->db.db_size >> 3; j++) {
                if (buf[j] != 0) {
                    panic("freed data not zero: "
                        "child=%p i=%d off=%d num=%d\n",
                        (void *)child, i, off, num);
                }
            }
        }
        mutex_exit(&child->db_mtx);

        dbuf_rele(child, FTAG);
    }
    DB_DNODE_EXIT(db);
}
#endif

/*
 * We don't usually free the indirect blocks here. If in one txg we have a
 * free_range and a write to the same indirect block, it's important that we
 * preserve the hole's birth times. Therefore, we don't free any indirect
 * blocks in free_children(). If an indirect block happens to turn into all
 * holes, it will be freed by dbuf_write_children_ready, which happens at a
 * point in the syncing process where we know for certain the contents of the
 * indirect block.
 *
 * However, if we're freeing a dnode, its space accounting must go to zero
 * before we actually try to free the dnode, or we will trip an assertion. In
 * addition, we know the case described above cannot occur, because the dnode is
 * being freed. Therefore, we free the indirect blocks immediately in that
 * case.
 */
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
    boolean_t free_indirects, dmu_tx_t *tx)
{
    dnode_t *dn;
    blkptr_t *bp;
    dmu_buf_impl_t *subdb;
    uint64_t start, end, dbstart, dbend;
    unsigned int epbs, shift, i;

    /*
     * There is a small possibility that this block will not be cached:
     *   1 - if level > 1 and there are no children with level <= 1
     *   2 - if this block was evicted since we read it from
     *	 dmu_tx_hold_free().
     */
    if (db->db_state != DB_CACHED)
        (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);

    /*
     * If we modify this indirect block, and we are not freeing the
     * dnode (!free_indirects), then this indirect block needs to get
     * written to disk by dbuf_write().  If it is dirty, we know it will
     * be written (otherwise, we would have incorrect on-disk state
     * because the space would be freed but still referenced by the BP
     * in this indirect block).  Therefore we VERIFY that it is
     * dirty.
     *
     * Our VERIFY covers some cases that do not actually have to be
     * dirty, but the open-context code happens to dirty.  E.g. if the
     * blocks we are freeing are all holes, because in that case, we
     * are only freeing part of this indirect block, so it is an
     * ancestor of the first or last block to be freed.  The first and
     * last L1 indirect blocks are always dirtied by dnode_free_range().
     */
    VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);

    dbuf_release_bp(db);
    bp = db->db.db_data;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
    ASSERT3U(epbs, <, 31);
    shift = (db->db_level - 1) * epbs;
    dbstart = db->db_blkid << epbs;
    /* clamp [start, end] to the portion of the range under this db */
    start = blkid >> shift;
    if (dbstart < start) {
        bp += start - dbstart;
    } else {
        start = dbstart;
    }
    dbend = ((db->db_blkid + 1) << epbs) - 1;
    end = (blkid + nblks - 1) >> shift;
    if (dbend <= end)
        end = dbend;

    ASSERT3U(start, <=, end);

    if (db->db_level == 1) {
        FREE_VERIFY(db, start, end, tx);
        free_blocks(dn, bp, end-start+1, tx);
    } else {
        for (uint64_t id = start; id <= end; id++, bp++) {
            if (BP_IS_HOLE(bp))
                continue;
            rw_enter(&dn->dn_struct_rwlock, RW_READER);
            VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
                id, TRUE, FALSE, FTAG, &subdb));
            rw_exit(&dn->dn_struct_rwlock);
            ASSERT3P(bp, ==, subdb->db_blkptr);

            free_children(subdb, blkid, nblks, free_indirects, tx);
            dbuf_rele(subdb, FTAG);
        }
    }

    if (free_indirects) {
        /* the whole range under us was freed; this block is now empty */
        for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
            ASSERT(BP_IS_HOLE(bp));
        bzero(db->db.db_data, db->db.db_size);
        free_blocks(dn, db->db_blkptr, 1, tx);
    }

    DB_DNODE_EXIT(db);
    arc_buf_freeze(db->db_buf);
}

/*
 * Traverse the indicated range of the provided file
 * and "free" all the blocks contained there.
 */
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
    boolean_t free_indirects, dmu_tx_t *tx)
{
    blkptr_t *bp = dn->dn_phys->dn_blkptr;
    int dnlevel = dn->dn_phys->dn_nlevels;
    boolean_t trunc = B_FALSE;

    if (blkid > dn->dn_phys->dn_maxblkid)
        return;

    ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
    if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
        /* freeing past the last block: this is a truncation */
        nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
        trunc = B_TRUE;
    }

    /* There are no indirect blocks in the object */
    if (dnlevel == 1) {
        if (blkid >= dn->dn_phys->dn_nblkptr) {
            /* this range was never made persistent */
            return;
        }
        ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
        free_blocks(dn, bp + blkid, nblks, tx);
    } else {
        int shift = (dnlevel - 1) *
            (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
        int start = blkid >> shift;
        int end = (blkid + nblks - 1) >> shift;
        dmu_buf_impl_t *db;

        ASSERT(start < dn->dn_phys->dn_nblkptr);
        bp += start;
        for (int i = start; i <= end; i++, bp++) {
            if (BP_IS_HOLE(bp))
                continue;
            rw_enter(&dn->dn_struct_rwlock, RW_READER);
            VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
                TRUE, FALSE, FTAG, &db));
            rw_exit(&dn->dn_struct_rwlock);

            free_children(db, blkid, nblks, free_indirects, tx);
            dbuf_rele(db, FTAG);
        }
    }

    if (trunc) {
        dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;

        uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
            (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
        ASSERT(off < dn->dn_phys->dn_maxblkid ||
            dn->dn_phys->dn_maxblkid == 0 ||
            dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
    }
}

/* Context passed through range_tree_vacate() to dnode_sync_free_range(). */
typedef struct dnode_sync_free_range_arg {
    dnode_t *dsfra_dnode;
    dmu_tx_t *dsfra_tx;
    boolean_t dsfra_free_indirects;
} dnode_sync_free_range_arg_t;

/*
 * range_tree_vacate() callback: free one [blkid, blkid + nblks) range.
 * The caller holds dn_mtx; it is dropped across the actual free and
 * reacquired before returning.
 */
static void
dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
{
    dnode_sync_free_range_arg_t *dsfra = arg;
    dnode_t *dn = dsfra->dsfra_dnode;

    mutex_exit(&dn->dn_mtx);
    dnode_sync_free_range_impl(dn, blkid, nblks,
        dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
    mutex_enter(&dn->dn_mtx);
}

/*
 * Try to kick all the dnode's dbufs out of the cache...
 */
void
dnode_evict_dbufs(dnode_t *dn)
{
    dmu_buf_impl_t db_marker;
    dmu_buf_impl_t *db, *db_next;

    mutex_enter(&dn->dn_dbufs_mtx);
    for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {

#ifdef DEBUG
        DB_DNODE_ENTER(db);
        ASSERT3P(DB_DNODE(db), ==, dn);
        DB_DNODE_EXIT(db);
#endif /* DEBUG */

        mutex_enter(&db->db_mtx);
        if (db->db_state != DB_EVICTING &&
            refcount_is_zero(&db->db_holds)) {
            db_marker.db_level = db->db_level;
            db_marker.db_blkid = db->db_blkid;
            db_marker.db_state = DB_SEARCH;
            avl_insert_here(&dn->dn_dbufs, &db_marker, db,
                AVL_BEFORE);

            /*
             * We need to use the "marker" dbuf rather than
             * simply getting the next dbuf, because
             * dbuf_destroy() may actually remove multiple dbufs.
             * It can call itself recursively on the parent dbuf,
             * which may also be removed from dn_dbufs.  The code
             * flow would look like:
             *
             * dbuf_destroy():
             *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
             *	if (!cacheable || pending_evict)
             *	  dbuf_destroy()
             */
            dbuf_destroy(db);

            db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
            avl_remove(&dn->dn_dbufs, &db_marker);
        } else {
            /* held or already being evicted: destroy on last rele */
            db->db_pending_evict = TRUE;
            mutex_exit(&db->db_mtx);
            db_next = AVL_NEXT(&dn->dn_dbufs, db);
        }
    }
    mutex_exit(&dn->dn_dbufs_mtx);

    dnode_evict_bonus(dn);
}

/*
 * Evict the dnode's bonus dbuf if it has no holds; otherwise mark it
 * for eviction when the last hold is released.
 */
void
dnode_evict_bonus(dnode_t *dn)
{
    rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
    if (dn->dn_bonus != NULL) {
        if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
            mutex_enter(&dn->dn_bonus->db_mtx);
            dbuf_destroy(dn->dn_bonus);
            dn->dn_bonus = NULL;
        } else {
            dn->dn_bonus->db_pending_evict = TRUE;
        }
    }
    rw_exit(&dn->dn_struct_rwlock);
}

/*
 * Discard all the dirty records on the given list (recursing into the
 * children of indirect-level records), undoing the dirty state and
 * releasing the hold each record had on its dbuf.
 */
static void
dnode_undirty_dbufs(list_t *list)
{
    dbuf_dirty_record_t *dr;

    while (dr = list_head(list)) {
        dmu_buf_impl_t *db = dr->dr_dbuf;
        uint64_t txg = dr->dr_txg;

        if (db->db_level != 0)
            dnode_undirty_dbufs(&dr->dt.di.dr_children);

        mutex_enter(&db->db_mtx);
        /* XXX - use dbuf_undirty()? */
        list_remove(list, dr);
        ASSERT(db->db_last_dirty == dr);
        db->db_last_dirty = NULL;
        db->db_dirtycnt -= 1;
        if (db->db_level == 0) {
            ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
                dr->dt.dl.dr_data == db->db_buf);
            dbuf_unoverride(dr);
        } else {
            mutex_destroy(&dr->dt.di.dr_mtx);
            list_destroy(&dr->dt.di.dr_children);
        }
        kmem_free(dr, sizeof (dbuf_dirty_record_t));
        dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
    }
}

/*
 * Sync the destruction of this dnode: undirty and evict its dbufs,
 * zero its on-disk dnode_phys_t, reset the in-core state, and drop
 * the hold taken when the dnode was dirtied.
 */
static void
dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
{
    int txgoff = tx->tx_txg & TXG_MASK;

    ASSERT(dmu_tx_is_syncing(tx));

    /*
     * Our contents should have been freed in dnode_sync() by the
     * free range record inserted by the caller of dnode_free().
     */
    ASSERT0(DN_USED_BYTES(dn->dn_phys));
    ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));

    dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
    dnode_evict_dbufs(dn);

    /*
     * XXX - It would be nice to assert this, but we may still
     * have residual holds from async evictions from the arc...
     *
     * zfs_obj_to_path() also depends on this being
     * commented out.
     *
     * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
     */

    /* Undirty next bits */
    dn->dn_next_nlevels[txgoff] = 0;
    dn->dn_next_indblkshift[txgoff] = 0;
    dn->dn_next_blksz[txgoff] = 0;

    /* ASSERT(blkptrs are zero); */
    ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
    ASSERT(dn->dn_type != DMU_OT_NONE);

    ASSERT(dn->dn_free_txg > 0);
    if (dn->dn_allocated_txg != dn->dn_free_txg)
        dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
    bzero(dn->dn_phys, sizeof (dnode_phys_t));

    mutex_enter(&dn->dn_mtx);
    dn->dn_type = DMU_OT_NONE;
    dn->dn_maxblkid = 0;
    dn->dn_allocated_txg = 0;
    dn->dn_free_txg = 0;
    dn->dn_have_spill = B_FALSE;
    mutex_exit(&dn->dn_mtx);

    ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);

    dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
    /*
     * Now that we've released our hold, the dnode may
     * be evicted, so we mustn't access it.
     */
}

/*
 * Write out the dnode's dirty buffers.
 */
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
    dnode_phys_t *dnp = dn->dn_phys;
    int txgoff = tx->tx_txg & TXG_MASK;
    list_t *list = &dn->dn_dirty_records[txgoff];
    static const dnode_phys_t zerodn = { 0 };
    boolean_t kill_spill = B_FALSE;

    ASSERT(dmu_tx_is_syncing(tx));
    ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
    ASSERT(dnp->dn_type != DMU_OT_NONE ||
        bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
    DNODE_VERIFY(dn);

    ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));

    if (dmu_objset_userused_enabled(dn->dn_objset) &&
        !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
        /* snapshot the pre-sync accounting state for userquota */
        mutex_enter(&dn->dn_mtx);
        dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
        dn->dn_oldflags = dn->dn_phys->dn_flags;
        dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
        mutex_exit(&dn->dn_mtx);
        dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
    } else {
        /* Once we account for it, we should always account for it. */
        ASSERT(!(dn->dn_phys->dn_flags &
            DNODE_FLAG_USERUSED_ACCOUNTED));
    }

    mutex_enter(&dn->dn_mtx);
    if (dn->dn_allocated_txg == tx->tx_txg) {
        /* The dnode is newly allocated or reallocated */
        if (dnp->dn_type == DMU_OT_NONE) {
            /* this is a first alloc, not a realloc */
            dnp->dn_nlevels = 1;
            dnp->dn_nblkptr = dn->dn_nblkptr;
        }

        dnp->dn_type = dn->dn_type;
        dnp->dn_bonustype = dn->dn_bonustype;
        dnp->dn_bonuslen = dn->dn_bonuslen;
    }
    ASSERT(dnp->dn_nlevels > 1 ||
        BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
        BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
        BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
        dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
    ASSERT(dnp->dn_nlevels < 2 ||
        BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
        BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);

    /* apply any pending "next" state recorded in open context */
    if (dn->dn_next_type[txgoff] != 0) {
        dnp->dn_type = dn->dn_type;
        dn->dn_next_type[txgoff] = 0;
    }

    if (dn->dn_next_blksz[txgoff] != 0) {
        ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
            SPA_MINBLOCKSIZE) == 0);
        ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
            dn->dn_maxblkid == 0 || list_head(list) != NULL ||
            dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
            dnp->dn_datablkszsec ||
            !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
        dnp->dn_datablkszsec =
            dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
        dn->dn_next_blksz[txgoff] = 0;
    }

    if (dn->dn_next_bonuslen[txgoff] != 0) {
        if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
            dnp->dn_bonuslen = 0;
        else
            dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
        ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
        dn->dn_next_bonuslen[txgoff] = 0;
    }

    if (dn->dn_next_bonustype[txgoff] != 0) {
        ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
        dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
        dn->dn_next_bonustype[txgoff] = 0;
    }

    boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
        dn->dn_free_txg <= tx->tx_txg;

    /*
     * Remove the spill block if we have been explicitly asked to
     * remove it, or if the object is being removed.
     */
    if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
        if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
            kill_spill = B_TRUE;
        dn->dn_rm_spillblk[txgoff] = 0;
    }

    if (dn->dn_next_indblkshift[txgoff] != 0) {
        ASSERT(dnp->dn_nlevels == 1);
        dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
        dn->dn_next_indblkshift[txgoff] = 0;
    }

    /*
     * Just take the live (open-context) values for checksum and compress.
     * Strictly speaking it's a future leak, but nothing bad happens if we
     * start using the new checksum or compress algorithm a little early.
     */
    dnp->dn_checksum = dn->dn_checksum;
    dnp->dn_compress = dn->dn_compress;

    mutex_exit(&dn->dn_mtx);

    if (kill_spill) {
        free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
        mutex_enter(&dn->dn_mtx);
        dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
        mutex_exit(&dn->dn_mtx);
    }

    /* process all the "freed" ranges in the file */
    if (dn->dn_free_ranges[txgoff] != NULL) {
        dnode_sync_free_range_arg_t dsfra;
        dsfra.dsfra_dnode = dn;
        dsfra.dsfra_tx = tx;
        dsfra.dsfra_free_indirects = freeing_dnode;
        if (freeing_dnode) {
            ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
                0, dn->dn_maxblkid + 1));
        }
        mutex_enter(&dn->dn_mtx);
        range_tree_vacate(dn->dn_free_ranges[txgoff],
            dnode_sync_free_range, &dsfra);
        range_tree_destroy(dn->dn_free_ranges[txgoff]);
        dn->dn_free_ranges[txgoff] = NULL;
        mutex_exit(&dn->dn_mtx);
    }

    if (freeing_dnode) {
        dn->dn_objset->os_freed_dnodes++;
        dnode_sync_free(dn, tx);
        return;
    }

    if (dn->dn_next_nlevels[txgoff]) {
        dnode_increase_indirection(dn, tx);
        dn->dn_next_nlevels[txgoff] = 0;
    }

    if (dn->dn_next_nblkptr[txgoff]) {
        /* this should only happen on a realloc */
        ASSERT(dn->dn_allocated_txg == tx->tx_txg);
        if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
            /* zero the new blkptrs we are gaining */
            bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
                sizeof (blkptr_t) *
                (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
#ifdef ZFS_DEBUG
        } else {
            int i;
            ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
            /* the blkptrs we are losing better be unallocated */
            for (i = dn->dn_next_nblkptr[txgoff];
                i < dnp->dn_nblkptr; i++)
                ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
#endif
        }
        mutex_enter(&dn->dn_mtx);
        dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
        dn->dn_next_nblkptr[txgoff] = 0;
        mutex_exit(&dn->dn_mtx);
    }

    dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);

    if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
        ASSERT3P(list_head(list), ==, NULL);
        dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
    }

    /*
     * Although we have dropped our reference to the dnode, it
     * can't be evicted until it's written, and we haven't yet
     * initiated the IO for the dnode's dbuf.
     */
}