dbuf.c revision 286541
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23219636Spjd * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24284593Savg * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
27168404Spjd */ 28168404Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/dmu.h> 31253821Sdelphij#include <sys/dmu_send.h> 32168404Spjd#include <sys/dmu_impl.h> 33168404Spjd#include <sys/dbuf.h> 34168404Spjd#include <sys/dmu_objset.h> 35168404Spjd#include <sys/dsl_dataset.h> 36168404Spjd#include <sys/dsl_dir.h> 37168404Spjd#include <sys/dmu_tx.h> 38168404Spjd#include <sys/spa.h> 39168404Spjd#include <sys/zio.h> 40168404Spjd#include <sys/dmu_zfetch.h> 41219089Spjd#include <sys/sa.h> 42219089Spjd#include <sys/sa_impl.h> 43268075Sdelphij#include <sys/zfeature.h> 44268075Sdelphij#include <sys/blkptr.h> 45264669Sdelphij#include <sys/range_tree.h> 46168404Spjd 47254753Sdelphij/* 48254753Sdelphij * Number of times that zfs_free_range() took the slow path while doing 49254753Sdelphij * a zfs receive. A nonzero value indicates a potential performance problem. 50254753Sdelphij */ 51254753Sdelphijuint64_t zfs_free_range_recv_miss; 52254753Sdelphij 53168404Spjdstatic void dbuf_destroy(dmu_buf_impl_t *db); 54248571Smmstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 55185029Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 56168404Spjd 57168404Spjd/* 58168404Spjd * Global data structures and functions for the dbuf cache. 
59168404Spjd */ 60168404Spjdstatic kmem_cache_t *dbuf_cache; 61168404Spjd 62168404Spjd/* ARGSUSED */ 63168404Spjdstatic int 64168404Spjddbuf_cons(void *vdb, void *unused, int kmflag) 65168404Spjd{ 66168404Spjd dmu_buf_impl_t *db = vdb; 67168404Spjd bzero(db, sizeof (dmu_buf_impl_t)); 68168404Spjd 69168404Spjd mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 70168404Spjd cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 71168404Spjd refcount_create(&db->db_holds); 72269229Sdelphij 73168404Spjd return (0); 74168404Spjd} 75168404Spjd 76168404Spjd/* ARGSUSED */ 77168404Spjdstatic void 78168404Spjddbuf_dest(void *vdb, void *unused) 79168404Spjd{ 80168404Spjd dmu_buf_impl_t *db = vdb; 81168404Spjd mutex_destroy(&db->db_mtx); 82168404Spjd cv_destroy(&db->db_changed); 83168404Spjd refcount_destroy(&db->db_holds); 84168404Spjd} 85168404Spjd 86168404Spjd/* 87168404Spjd * dbuf hash table routines 88168404Spjd */ 89168404Spjdstatic dbuf_hash_table_t dbuf_hash_table; 90168404Spjd 91168404Spjdstatic uint64_t dbuf_hash_count; 92168404Spjd 93168404Spjdstatic uint64_t 94168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 95168404Spjd{ 96168404Spjd uintptr_t osv = (uintptr_t)os; 97168404Spjd uint64_t crc = -1ULL; 98168404Spjd 99168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 100168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 101168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 102168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 103168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 104168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 105168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 106168404Spjd 107168404Spjd crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 108168404Spjd 109168404Spjd return (crc); 110168404Spjd} 111168404Spjd 112168404Spjd#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, 
blkid); 113168404Spjd 114168404Spjd#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 115168404Spjd ((dbuf)->db.db_object == (obj) && \ 116168404Spjd (dbuf)->db_objset == (os) && \ 117168404Spjd (dbuf)->db_level == (level) && \ 118168404Spjd (dbuf)->db_blkid == (blkid)) 119168404Spjd 120168404Spjddmu_buf_impl_t * 121286541Smavdbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) 122168404Spjd{ 123168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 124168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 125168404Spjd uint64_t idx = hv & h->hash_table_mask; 126168404Spjd dmu_buf_impl_t *db; 127168404Spjd 128168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 129168404Spjd for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 130168404Spjd if (DBUF_EQUAL(db, os, obj, level, blkid)) { 131168404Spjd mutex_enter(&db->db_mtx); 132168404Spjd if (db->db_state != DB_EVICTING) { 133168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 134168404Spjd return (db); 135168404Spjd } 136168404Spjd mutex_exit(&db->db_mtx); 137168404Spjd } 138168404Spjd } 139168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 140168404Spjd return (NULL); 141168404Spjd} 142168404Spjd 143286541Smavstatic dmu_buf_impl_t * 144286541Smavdbuf_find_bonus(objset_t *os, uint64_t object) 145286541Smav{ 146286541Smav dnode_t *dn; 147286541Smav dmu_buf_impl_t *db = NULL; 148286541Smav 149286541Smav if (dnode_hold(os, object, FTAG, &dn) == 0) { 150286541Smav rw_enter(&dn->dn_struct_rwlock, RW_READER); 151286541Smav if (dn->dn_bonus != NULL) { 152286541Smav db = dn->dn_bonus; 153286541Smav mutex_enter(&db->db_mtx); 154286541Smav } 155286541Smav rw_exit(&dn->dn_struct_rwlock); 156286541Smav dnode_rele(dn, FTAG); 157286541Smav } 158286541Smav return (db); 159286541Smav} 160286541Smav 161168404Spjd/* 162168404Spjd * Insert an entry into the hash table. 
If there is already an element 163168404Spjd * equal to elem in the hash table, then the already existing element 164168404Spjd * will be returned and the new element will not be inserted. 165168404Spjd * Otherwise returns NULL. 166168404Spjd */ 167168404Spjdstatic dmu_buf_impl_t * 168168404Spjddbuf_hash_insert(dmu_buf_impl_t *db) 169168404Spjd{ 170168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 171219089Spjd objset_t *os = db->db_objset; 172168404Spjd uint64_t obj = db->db.db_object; 173168404Spjd int level = db->db_level; 174168404Spjd uint64_t blkid = db->db_blkid; 175168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 176168404Spjd uint64_t idx = hv & h->hash_table_mask; 177168404Spjd dmu_buf_impl_t *dbf; 178168404Spjd 179168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 180168404Spjd for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 181168404Spjd if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 182168404Spjd mutex_enter(&dbf->db_mtx); 183168404Spjd if (dbf->db_state != DB_EVICTING) { 184168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 185168404Spjd return (dbf); 186168404Spjd } 187168404Spjd mutex_exit(&dbf->db_mtx); 188168404Spjd } 189168404Spjd } 190168404Spjd 191168404Spjd mutex_enter(&db->db_mtx); 192168404Spjd db->db_hash_next = h->hash_table[idx]; 193168404Spjd h->hash_table[idx] = db; 194168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 195270247Sdelphij atomic_inc_64(&dbuf_hash_count); 196168404Spjd 197168404Spjd return (NULL); 198168404Spjd} 199168404Spjd 200168404Spjd/* 201268858Sdelphij * Remove an entry from the hash table. It must be in the EVICTING state. 
202168404Spjd */ 203168404Spjdstatic void 204168404Spjddbuf_hash_remove(dmu_buf_impl_t *db) 205168404Spjd{ 206168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 207168404Spjd uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 208168404Spjd db->db_level, db->db_blkid); 209168404Spjd uint64_t idx = hv & h->hash_table_mask; 210168404Spjd dmu_buf_impl_t *dbf, **dbp; 211168404Spjd 212168404Spjd /* 213268858Sdelphij * We musn't hold db_mtx to maintain lock ordering: 214168404Spjd * DBUF_HASH_MUTEX > db_mtx. 215168404Spjd */ 216168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 217168404Spjd ASSERT(db->db_state == DB_EVICTING); 218168404Spjd ASSERT(!MUTEX_HELD(&db->db_mtx)); 219168404Spjd 220168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 221168404Spjd dbp = &h->hash_table[idx]; 222168404Spjd while ((dbf = *dbp) != db) { 223168404Spjd dbp = &dbf->db_hash_next; 224168404Spjd ASSERT(dbf != NULL); 225168404Spjd } 226168404Spjd *dbp = db->db_hash_next; 227168404Spjd db->db_hash_next = NULL; 228168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 229270247Sdelphij atomic_dec_64(&dbuf_hash_count); 230168404Spjd} 231168404Spjd 232168404Spjdstatic arc_evict_func_t dbuf_do_evict; 233168404Spjd 234168404Spjdstatic void 235168404Spjddbuf_evict_user(dmu_buf_impl_t *db) 236168404Spjd{ 237168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 238168404Spjd 239168404Spjd if (db->db_level != 0 || db->db_evict_func == NULL) 240168404Spjd return; 241168404Spjd 242168404Spjd db->db_evict_func(&db->db, db->db_user_ptr); 243168404Spjd db->db_user_ptr = NULL; 244168404Spjd db->db_evict_func = NULL; 245168404Spjd} 246168404Spjd 247219089Spjdboolean_t 248219089Spjddbuf_is_metadata(dmu_buf_impl_t *db) 249219089Spjd{ 250219089Spjd if (db->db_level > 0) { 251219089Spjd return (B_TRUE); 252219089Spjd } else { 253219089Spjd boolean_t is_metadata; 254219089Spjd 255219089Spjd DB_DNODE_ENTER(db); 256236884Smm is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 257219089Spjd DB_DNODE_EXIT(db); 
258219089Spjd 259219089Spjd return (is_metadata); 260219089Spjd } 261219089Spjd} 262219089Spjd 263168404Spjdvoid 264168404Spjddbuf_evict(dmu_buf_impl_t *db) 265168404Spjd{ 266168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 267168404Spjd ASSERT(db->db_buf == NULL); 268168404Spjd ASSERT(db->db_data_pending == NULL); 269168404Spjd 270168404Spjd dbuf_clear(db); 271168404Spjd dbuf_destroy(db); 272168404Spjd} 273168404Spjd 274168404Spjdvoid 275168404Spjddbuf_init(void) 276168404Spjd{ 277168404Spjd uint64_t hsize = 1ULL << 16; 278168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 279168404Spjd int i; 280168404Spjd 281168404Spjd /* 282168404Spjd * The hash table is big enough to fill all of physical memory 283168404Spjd * with an average 4K block size. The table will take up 284168404Spjd * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 285168404Spjd */ 286168696Spjd while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 287168404Spjd hsize <<= 1; 288168404Spjd 289168404Spjdretry: 290168404Spjd h->hash_table_mask = hsize - 1; 291168404Spjd h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 292168404Spjd if (h->hash_table == NULL) { 293168404Spjd /* XXX - we should really return an error instead of assert */ 294168404Spjd ASSERT(hsize > (1ULL << 10)); 295168404Spjd hsize >>= 1; 296168404Spjd goto retry; 297168404Spjd } 298168404Spjd 299168404Spjd dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 300168404Spjd sizeof (dmu_buf_impl_t), 301168404Spjd 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 302168404Spjd 303168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 304168404Spjd mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 305168404Spjd} 306168404Spjd 307168404Spjdvoid 308168404Spjddbuf_fini(void) 309168404Spjd{ 310168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 311168404Spjd int i; 312168404Spjd 313168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 314168404Spjd mutex_destroy(&h->hash_mutexes[i]); 315168404Spjd kmem_free(h->hash_table, 
(h->hash_table_mask + 1) * sizeof (void *)); 316168404Spjd kmem_cache_destroy(dbuf_cache); 317168404Spjd} 318168404Spjd 319168404Spjd/* 320168404Spjd * Other stuff. 321168404Spjd */ 322168404Spjd 323168404Spjd#ifdef ZFS_DEBUG 324168404Spjdstatic void 325168404Spjddbuf_verify(dmu_buf_impl_t *db) 326168404Spjd{ 327219089Spjd dnode_t *dn; 328219089Spjd dbuf_dirty_record_t *dr; 329168404Spjd 330168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 331168404Spjd 332168404Spjd if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 333168404Spjd return; 334168404Spjd 335168404Spjd ASSERT(db->db_objset != NULL); 336219089Spjd DB_DNODE_ENTER(db); 337219089Spjd dn = DB_DNODE(db); 338168404Spjd if (dn == NULL) { 339168404Spjd ASSERT(db->db_parent == NULL); 340168404Spjd ASSERT(db->db_blkptr == NULL); 341168404Spjd } else { 342168404Spjd ASSERT3U(db->db.db_object, ==, dn->dn_object); 343168404Spjd ASSERT3P(db->db_objset, ==, dn->dn_objset); 344168404Spjd ASSERT3U(db->db_level, <, dn->dn_nlevels); 345219089Spjd ASSERT(db->db_blkid == DMU_BONUS_BLKID || 346219089Spjd db->db_blkid == DMU_SPILL_BLKID || 347269229Sdelphij !avl_is_empty(&dn->dn_dbufs)); 348168404Spjd } 349219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 350168404Spjd ASSERT(dn != NULL); 351185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 352219089Spjd ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 353219089Spjd } else if (db->db_blkid == DMU_SPILL_BLKID) { 354219089Spjd ASSERT(dn != NULL); 355219089Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 356240415Smm ASSERT0(db->db.db_offset); 357168404Spjd } else { 358168404Spjd ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 359168404Spjd } 360168404Spjd 361219089Spjd for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 362219089Spjd ASSERT(dr->dr_dbuf == db); 363219089Spjd 364219089Spjd for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 365219089Spjd ASSERT(dr->dr_dbuf == db); 366219089Spjd 367208047Smm /* 368208047Smm * We can't assert that 
db_size matches dn_datablksz because it 369208047Smm * can be momentarily different when another thread is doing 370208047Smm * dnode_set_blksz(). 371208047Smm */ 372208047Smm if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 373219089Spjd dr = db->db_data_pending; 374208047Smm /* 375208047Smm * It should only be modified in syncing context, so 376208047Smm * make sure we only have one copy of the data. 377208047Smm */ 378208047Smm ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 379168404Spjd } 380168404Spjd 381168404Spjd /* verify db->db_blkptr */ 382168404Spjd if (db->db_blkptr) { 383168404Spjd if (db->db_parent == dn->dn_dbuf) { 384168404Spjd /* db is pointed to by the dnode */ 385168404Spjd /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 386209962Smm if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 387168404Spjd ASSERT(db->db_parent == NULL); 388168404Spjd else 389168404Spjd ASSERT(db->db_parent != NULL); 390219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 391219089Spjd ASSERT3P(db->db_blkptr, ==, 392219089Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 393168404Spjd } else { 394168404Spjd /* db is pointed to by an indirect block */ 395168404Spjd int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 396168404Spjd ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 397168404Spjd ASSERT3U(db->db_parent->db.db_object, ==, 398168404Spjd db->db.db_object); 399168404Spjd /* 400168404Spjd * dnode_grow_indblksz() can make this fail if we don't 401168404Spjd * have the struct_rwlock. XXX indblksz no longer 402168404Spjd * grows. safe to do this now? 
403168404Spjd */ 404219089Spjd if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 405168404Spjd ASSERT3P(db->db_blkptr, ==, 406168404Spjd ((blkptr_t *)db->db_parent->db.db_data + 407168404Spjd db->db_blkid % epb)); 408168404Spjd } 409168404Spjd } 410168404Spjd } 411168404Spjd if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 412219089Spjd (db->db_buf == NULL || db->db_buf->b_data) && 413219089Spjd db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 414168404Spjd db->db_state != DB_FILL && !dn->dn_free_txg) { 415168404Spjd /* 416168404Spjd * If the blkptr isn't set but they have nonzero data, 417168404Spjd * it had better be dirty, otherwise we'll lose that 418168404Spjd * data when we evict this buffer. 419168404Spjd */ 420168404Spjd if (db->db_dirtycnt == 0) { 421168404Spjd uint64_t *buf = db->db.db_data; 422168404Spjd int i; 423168404Spjd 424168404Spjd for (i = 0; i < db->db.db_size >> 3; i++) { 425168404Spjd ASSERT(buf[i] == 0); 426168404Spjd } 427168404Spjd } 428168404Spjd } 429219089Spjd DB_DNODE_EXIT(db); 430168404Spjd} 431168404Spjd#endif 432168404Spjd 433168404Spjdstatic void 434168404Spjddbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 435168404Spjd{ 436168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 437168404Spjd db->db_buf = buf; 438168404Spjd if (buf != NULL) { 439168404Spjd ASSERT(buf->b_data != NULL); 440168404Spjd db->db.db_data = buf->b_data; 441168404Spjd if (!arc_released(buf)) 442168404Spjd arc_set_callback(buf, dbuf_do_evict, db); 443168404Spjd } else { 444168404Spjd dbuf_evict_user(db); 445168404Spjd db->db.db_data = NULL; 446219089Spjd if (db->db_state != DB_NOFILL) 447219089Spjd db->db_state = DB_UNCACHED; 448168404Spjd } 449168404Spjd} 450168404Spjd 451219089Spjd/* 452219089Spjd * Loan out an arc_buf for read. Return the loaned arc_buf. 
453219089Spjd */ 454219089Spjdarc_buf_t * 455219089Spjddbuf_loan_arcbuf(dmu_buf_impl_t *db) 456219089Spjd{ 457219089Spjd arc_buf_t *abuf; 458219089Spjd 459219089Spjd mutex_enter(&db->db_mtx); 460219089Spjd if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 461219089Spjd int blksz = db->db.db_size; 462260150Sdelphij spa_t *spa = db->db_objset->os_spa; 463219089Spjd 464219089Spjd mutex_exit(&db->db_mtx); 465219089Spjd abuf = arc_loan_buf(spa, blksz); 466219089Spjd bcopy(db->db.db_data, abuf->b_data, blksz); 467219089Spjd } else { 468219089Spjd abuf = db->db_buf; 469219089Spjd arc_loan_inuse_buf(abuf, db); 470219089Spjd dbuf_set_data(db, NULL); 471219089Spjd mutex_exit(&db->db_mtx); 472219089Spjd } 473219089Spjd return (abuf); 474219089Spjd} 475219089Spjd 476168404Spjduint64_t 477168404Spjddbuf_whichblock(dnode_t *dn, uint64_t offset) 478168404Spjd{ 479168404Spjd if (dn->dn_datablkshift) { 480168404Spjd return (offset >> dn->dn_datablkshift); 481168404Spjd } else { 482168404Spjd ASSERT3U(offset, <, dn->dn_datablksz); 483168404Spjd return (0); 484168404Spjd } 485168404Spjd} 486168404Spjd 487168404Spjdstatic void 488168404Spjddbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 489168404Spjd{ 490168404Spjd dmu_buf_impl_t *db = vdb; 491168404Spjd 492168404Spjd mutex_enter(&db->db_mtx); 493168404Spjd ASSERT3U(db->db_state, ==, DB_READ); 494168404Spjd /* 495168404Spjd * All reads are synchronous, so we must have a hold on the dbuf 496168404Spjd */ 497168404Spjd ASSERT(refcount_count(&db->db_holds) > 0); 498168404Spjd ASSERT(db->db_buf == NULL); 499168404Spjd ASSERT(db->db.db_data == NULL); 500168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 501168404Spjd /* we were freed in flight; disregard any error */ 502168404Spjd arc_release(buf, db); 503168404Spjd bzero(buf->b_data, db->db.db_size); 504168404Spjd arc_buf_freeze(buf); 505168404Spjd db->db_freed_in_flight = FALSE; 506168404Spjd dbuf_set_data(db, buf); 507168404Spjd db->db_state = 
DB_CACHED; 508168404Spjd } else if (zio == NULL || zio->io_error == 0) { 509168404Spjd dbuf_set_data(db, buf); 510168404Spjd db->db_state = DB_CACHED; 511168404Spjd } else { 512219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 513168404Spjd ASSERT3P(db->db_buf, ==, NULL); 514248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 515168404Spjd db->db_state = DB_UNCACHED; 516168404Spjd } 517168404Spjd cv_broadcast(&db->db_changed); 518219089Spjd dbuf_rele_and_unlock(db, NULL); 519168404Spjd} 520168404Spjd 521168404Spjdstatic void 522168404Spjddbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 523168404Spjd{ 524219089Spjd dnode_t *dn; 525268123Sdelphij zbookmark_phys_t zb; 526275811Sdelphij arc_flags_t aflags = ARC_FLAG_NOWAIT; 527168404Spjd 528219089Spjd DB_DNODE_ENTER(db); 529219089Spjd dn = DB_DNODE(db); 530168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 531168404Spjd /* We need the struct_rwlock to prevent db_blkptr from changing. */ 532185029Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 533168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 534168404Spjd ASSERT(db->db_state == DB_UNCACHED); 535168404Spjd ASSERT(db->db_buf == NULL); 536168404Spjd 537219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 538207624Smm int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 539185029Spjd 540185029Spjd ASSERT3U(bonuslen, <=, db->db.db_size); 541168404Spjd db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 542208373Smm arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 543185029Spjd if (bonuslen < DN_MAX_BONUSLEN) 544168404Spjd bzero(db->db.db_data, DN_MAX_BONUSLEN); 545207624Smm if (bonuslen) 546207624Smm bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 547219089Spjd DB_DNODE_EXIT(db); 548168404Spjd db->db_state = DB_CACHED; 549168404Spjd mutex_exit(&db->db_mtx); 550168404Spjd return; 551168404Spjd } 552168404Spjd 553185029Spjd /* 554185029Spjd * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 555185029Spjd * processes the delete 
record and clears the bp while we are waiting 556185029Spjd * for the dn_mtx (resulting in a "no" from block_freed). 557185029Spjd */ 558185029Spjd if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 559185029Spjd (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 560185029Spjd BP_IS_HOLE(db->db_blkptr)))) { 561168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 562168404Spjd 563260150Sdelphij DB_DNODE_EXIT(db); 564260150Sdelphij dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, 565168404Spjd db->db.db_size, db, type)); 566168404Spjd bzero(db->db.db_data, db->db.db_size); 567168404Spjd db->db_state = DB_CACHED; 568168404Spjd *flags |= DB_RF_CACHED; 569168404Spjd mutex_exit(&db->db_mtx); 570168404Spjd return; 571168404Spjd } 572168404Spjd 573219089Spjd DB_DNODE_EXIT(db); 574219089Spjd 575168404Spjd db->db_state = DB_READ; 576168404Spjd mutex_exit(&db->db_mtx); 577168404Spjd 578185029Spjd if (DBUF_IS_L2CACHEABLE(db)) 579275811Sdelphij aflags |= ARC_FLAG_L2CACHE; 580251478Sdelphij if (DBUF_IS_L2COMPRESSIBLE(db)) 581275811Sdelphij aflags |= ARC_FLAG_L2COMPRESS; 582185029Spjd 583219089Spjd SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 584219089Spjd db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 585219089Spjd db->db.db_object, db->db_level, db->db_blkid); 586168404Spjd 587168404Spjd dbuf_add_ref(db, NULL); 588185029Spjd 589260150Sdelphij (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 590168404Spjd dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 591168404Spjd (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 592168404Spjd &aflags, &zb); 593275811Sdelphij if (aflags & ARC_FLAG_CACHED) 594168404Spjd *flags |= DB_RF_CACHED; 595168404Spjd} 596168404Spjd 597168404Spjdint 598168404Spjddbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 599168404Spjd{ 600168404Spjd int err = 0; 601260150Sdelphij boolean_t havepzio = (zio != NULL); 602260150Sdelphij boolean_t prefetch; 603219089Spjd dnode_t *dn; 604168404Spjd 605168404Spjd /* 606168404Spjd * We don't have to hold the mutex to check db_state because it 607168404Spjd * can't be freed while we have a hold on the buffer. 608168404Spjd */ 609168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 610168404Spjd 611219089Spjd if (db->db_state == DB_NOFILL) 612249195Smm return (SET_ERROR(EIO)); 613219089Spjd 614219089Spjd DB_DNODE_ENTER(db); 615219089Spjd dn = DB_DNODE(db); 616168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 617219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 618168404Spjd 619219089Spjd prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 620219089Spjd (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 621185029Spjd DBUF_IS_CACHEABLE(db); 622168404Spjd 623168404Spjd mutex_enter(&db->db_mtx); 624168404Spjd if (db->db_state == DB_CACHED) { 625168404Spjd mutex_exit(&db->db_mtx); 626168404Spjd if (prefetch) 627219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 628168404Spjd db->db.db_size, TRUE); 629168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 630219089Spjd rw_exit(&dn->dn_struct_rwlock); 631219089Spjd DB_DNODE_EXIT(db); 632168404Spjd } else if (db->db_state == DB_UNCACHED) { 633219089Spjd spa_t *spa = dn->dn_objset->os_spa; 634219089Spjd 635219089Spjd if (zio == NULL) 636219089Spjd zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 637168404Spjd dbuf_read_impl(db, zio, &flags); 638168404Spjd 639168404Spjd /* dbuf_read_impl has dropped db_mtx for us */ 640168404Spjd 641168404Spjd if (prefetch) 642219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 
643168404Spjd db->db.db_size, flags & DB_RF_CACHED); 644168404Spjd 645168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 646219089Spjd rw_exit(&dn->dn_struct_rwlock); 647219089Spjd DB_DNODE_EXIT(db); 648168404Spjd 649168404Spjd if (!havepzio) 650168404Spjd err = zio_wait(zio); 651168404Spjd } else { 652251629Sdelphij /* 653251629Sdelphij * Another reader came in while the dbuf was in flight 654251629Sdelphij * between UNCACHED and CACHED. Either a writer will finish 655251629Sdelphij * writing the buffer (sending the dbuf to CACHED) or the 656251629Sdelphij * first reader's request will reach the read_done callback 657251629Sdelphij * and send the dbuf to CACHED. Otherwise, a failure 658251629Sdelphij * occurred and the dbuf went to UNCACHED. 659251629Sdelphij */ 660168404Spjd mutex_exit(&db->db_mtx); 661168404Spjd if (prefetch) 662219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 663168404Spjd db->db.db_size, TRUE); 664168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 665219089Spjd rw_exit(&dn->dn_struct_rwlock); 666219089Spjd DB_DNODE_EXIT(db); 667168404Spjd 668251629Sdelphij /* Skip the wait per the caller's request. 
*/ 669168404Spjd mutex_enter(&db->db_mtx); 670168404Spjd if ((flags & DB_RF_NEVERWAIT) == 0) { 671168404Spjd while (db->db_state == DB_READ || 672168404Spjd db->db_state == DB_FILL) { 673168404Spjd ASSERT(db->db_state == DB_READ || 674168404Spjd (flags & DB_RF_HAVESTRUCT) == 0); 675272511Sdelphij DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, 676272511Sdelphij db, zio_t *, zio); 677168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 678168404Spjd } 679168404Spjd if (db->db_state == DB_UNCACHED) 680249195Smm err = SET_ERROR(EIO); 681168404Spjd } 682168404Spjd mutex_exit(&db->db_mtx); 683168404Spjd } 684168404Spjd 685168404Spjd ASSERT(err || havepzio || db->db_state == DB_CACHED); 686168404Spjd return (err); 687168404Spjd} 688168404Spjd 689168404Spjdstatic void 690168404Spjddbuf_noread(dmu_buf_impl_t *db) 691168404Spjd{ 692168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 693219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 694168404Spjd mutex_enter(&db->db_mtx); 695168404Spjd while (db->db_state == DB_READ || db->db_state == DB_FILL) 696168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 697168404Spjd if (db->db_state == DB_UNCACHED) { 698168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 699260150Sdelphij spa_t *spa = db->db_objset->os_spa; 700168404Spjd 701168404Spjd ASSERT(db->db_buf == NULL); 702168404Spjd ASSERT(db->db.db_data == NULL); 703219089Spjd dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 704168404Spjd db->db_state = DB_FILL; 705219089Spjd } else if (db->db_state == DB_NOFILL) { 706219089Spjd dbuf_set_data(db, NULL); 707168404Spjd } else { 708168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 709168404Spjd } 710168404Spjd mutex_exit(&db->db_mtx); 711168404Spjd} 712168404Spjd 713168404Spjd/* 714168404Spjd * This is our just-in-time copy function. It makes a copy of 715168404Spjd * buffers, that have been modified in a previous transaction 716168404Spjd * group, before we modify them in the current active group. 
717168404Spjd * 718168404Spjd * This function is used in two places: when we are dirtying a 719168404Spjd * buffer for the first time in a txg, and when we are freeing 720168404Spjd * a range in a dnode that includes this buffer. 721168404Spjd * 722168404Spjd * Note that when we are called from dbuf_free_range() we do 723168404Spjd * not put a hold on the buffer, we just traverse the active 724168404Spjd * dbuf list for the dnode. 725168404Spjd */ 726168404Spjdstatic void 727168404Spjddbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 728168404Spjd{ 729168404Spjd dbuf_dirty_record_t *dr = db->db_last_dirty; 730168404Spjd 731168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 732168404Spjd ASSERT(db->db.db_data != NULL); 733168404Spjd ASSERT(db->db_level == 0); 734168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 735168404Spjd 736168404Spjd if (dr == NULL || 737168404Spjd (dr->dt.dl.dr_data != 738219089Spjd ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 739168404Spjd return; 740168404Spjd 741168404Spjd /* 742168404Spjd * If the last dirty record for this dbuf has not yet synced 743168404Spjd * and its referencing the dbuf data, either: 744219089Spjd * reset the reference to point to a new copy, 745168404Spjd * or (if there a no active holders) 746168404Spjd * just null out the current db_data pointer. 
747168404Spjd */ 748168404Spjd ASSERT(dr->dr_txg >= txg - 2); 749219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 750168404Spjd /* Note that the data bufs here are zio_bufs */ 751168404Spjd dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 752208373Smm arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 753168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 754168404Spjd } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 755168404Spjd int size = db->db.db_size; 756168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 757260150Sdelphij spa_t *spa = db->db_objset->os_spa; 758219089Spjd 759219089Spjd dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); 760168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 761168404Spjd } else { 762168404Spjd dbuf_set_data(db, NULL); 763168404Spjd } 764168404Spjd} 765168404Spjd 766168404Spjdvoid 767168404Spjddbuf_unoverride(dbuf_dirty_record_t *dr) 768168404Spjd{ 769168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 770219089Spjd blkptr_t *bp = &dr->dt.dl.dr_overridden_by; 771168404Spjd uint64_t txg = dr->dr_txg; 772168404Spjd 773168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 774168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 775168404Spjd ASSERT(db->db_level == 0); 776168404Spjd 777219089Spjd if (db->db_blkid == DMU_BONUS_BLKID || 778168404Spjd dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 779168404Spjd return; 780168404Spjd 781219089Spjd ASSERT(db->db_data_pending != dr); 782219089Spjd 783168404Spjd /* free this block */ 784260150Sdelphij if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) 785260150Sdelphij zio_free(db->db_objset->os_spa, txg, bp); 786219089Spjd 787168404Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 788243524Smm dr->dt.dl.dr_nopwrite = B_FALSE; 789243524Smm 790168404Spjd /* 791168404Spjd * Release the already-written buffer, so we leave it in 792168404Spjd * a consistent dirty state. 
Note that all callers are 793168404Spjd * modifying the buffer, so they will immediately do 794168404Spjd * another (redundant) arc_release(). Therefore, leave 795168404Spjd * the buf thawed to save the effort of freezing & 796168404Spjd * immediately re-thawing it. 797168404Spjd */ 798168404Spjd arc_release(dr->dt.dl.dr_data, db); 799168404Spjd} 800168404Spjd 801185029Spjd/* 802185029Spjd * Evict (if its unreferenced) or clear (if its referenced) any level-0 803185029Spjd * data blocks in the free range, so that any future readers will find 804260150Sdelphij * empty blocks. 805253821Sdelphij * 806253821Sdelphij * This is a no-op if the dataset is in the middle of an incremental 807253821Sdelphij * receive; see comment below for details. 808185029Spjd */ 809168404Spjdvoid 810269229Sdelphijdbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, 811269229Sdelphij dmu_tx_t *tx) 812168404Spjd{ 813269229Sdelphij dmu_buf_impl_t *db, *db_next, db_search; 814168404Spjd uint64_t txg = tx->tx_txg; 815269229Sdelphij avl_index_t where; 816168404Spjd 817269229Sdelphij if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID)) 818269229Sdelphij end_blkid = dn->dn_maxblkid; 819269229Sdelphij dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); 820253821Sdelphij 821269229Sdelphij db_search.db_level = 0; 822269229Sdelphij db_search.db_blkid = start_blkid; 823270383Sdelphij db_search.db_state = DB_SEARCH; 824269229Sdelphij 825254753Sdelphij mutex_enter(&dn->dn_dbufs_mtx); 826269229Sdelphij if (start_blkid >= dn->dn_unlisted_l0_blkid) { 827254753Sdelphij /* There can't be any dbufs in this range; no need to search. 
 */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	/*
	 * Walk the candidate dbufs in blkid order; stop at the first dbuf
	 * that is not level 0 or lies past end_blkid.
	 */
	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Return whether the block backing this dbuf could be freed: it must
 * have a nonzero birth txg (taken from the newest dirty record when one
 * exists, otherwise from the on-disk block pointer) and, if the objset
 * has a dataset, dsl_dataset_block_freeable() must agree.
 */
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block don't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}

/*
 * Change the size of this dbuf's data buffer to "size": dirty the
 * buffer in tx, allocate a new ARC buffer of the new size, copy
 * MIN(osize, size) bytes of the old contents and zero-fill any growth.
 * Not valid for the bonus buffer; requires the caller to hold
 * dn_struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

/*
 * Release this dbuf's ARC buffer.  Only legal from a DSL pool sync
 * context, with the objset and parent buffers already released
 * (asserted below).
 */
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

/*
 * Mark this dbuf dirty in tx's txg: find or create the dirty record,
 * snapshot old data where needed so older txgs stay consistent, charge
 * space accounting, and recursively dirty the parent indirect block (or
 * link the record onto the dnode's dirty-record list).  Returns the
 * dirty record for this txg.
 */
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int
 drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 * The dirty-record list is kept newest-txg-first; walk past any
	 * records newer than this tx.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	/* No space accounting for the bonus buffer or dataset-less objsets. */
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}

/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction.  Return whether this evicted the dbuf.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);

	/*
	 * Due to our use of dn_nlevels below, this can only be called
	 * in open context, unless we are operating on the MOS.
	 * From syncing context, dn_nlevels may be different from the
	 * dn_nlevels used when dbuf was dirtied.
 */
	ASSERT(db->db_objset ==
	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* Give back the space this dirty record had been charged. */
	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
	    dr->dr_accounted, txg);

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level + 1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	/* If that was the last hold, evict the dbuf (drops db_mtx). */
	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Public entry point: read this buffer in (read must succeed) and mark
 * it dirty in tx.
 */
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

 DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

/*
 * The caller intends to completely overwrite this buffer: mark it
 * DB_NOFILL so the old contents are never read, then go through the
 * normal will-fill path.
 */
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

/*
 * The caller intends to fill this entire buffer: dirty it without
 * reading the old contents in (dbuf_noread).  Not valid for the bonus
 * buffer or indirect blocks.
 */
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

/*
 * The caller has finished filling the buffer: transition it from
 * DB_FILL to DB_CACHED and wake any waiters.  If the buffer was freed
 * while the fill was in flight, zero its contents instead of keeping
 * the now-stale data.
 */
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * Store the given data directly in this buffer's dirty record as an
 * embedded block pointer (the data lives in the bp itself, encoded by
 * encode_embedded_bp_compressed()), overriding the normal write path.
 */
void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	/* Other holders besides us: must copy rather than swap buffers. */
	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_clear_callback()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			/* bonus data is a plain zio_buf, not an ARC buffer */
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		avl_remove(&dn->dn_dbufs, db);
		atomic_dec_32(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

/*
 * Find the parent dbuf and block pointer that reference block
 * (level, blkid) of this dnode.  On success, *parentp holds a new hold
 * on the parent (or on the dnode's own dbuf, when the block is
 * referenced directly from the dnode) and *bpp points at the block
 * pointer within it.  Returns ENOENT when the buffer has no parent yet.
 */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle =
dn->dn_handle; 1728168404Spjd db->db_parent = parent; 1729168404Spjd db->db_blkptr = blkptr; 1730168404Spjd 1731168404Spjd db->db_user_ptr = NULL; 1732168404Spjd db->db_evict_func = NULL; 1733168404Spjd db->db_immediate_evict = 0; 1734168404Spjd db->db_freed_in_flight = 0; 1735168404Spjd 1736219089Spjd if (blkid == DMU_BONUS_BLKID) { 1737168404Spjd ASSERT3P(parent, ==, dn->dn_dbuf); 1738185029Spjd db->db.db_size = DN_MAX_BONUSLEN - 1739185029Spjd (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1740185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1741219089Spjd db->db.db_offset = DMU_BONUS_BLKID; 1742168404Spjd db->db_state = DB_UNCACHED; 1743168404Spjd /* the bonus dbuf is not placed in the hash table */ 1744208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1745168404Spjd return (db); 1746219089Spjd } else if (blkid == DMU_SPILL_BLKID) { 1747219089Spjd db->db.db_size = (blkptr != NULL) ? 1748219089Spjd BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 1749219089Spjd db->db.db_offset = 0; 1750168404Spjd } else { 1751168404Spjd int blocksize = 1752258632Savg db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; 1753168404Spjd db->db.db_size = blocksize; 1754168404Spjd db->db.db_offset = db->db_blkid * blocksize; 1755168404Spjd } 1756168404Spjd 1757168404Spjd /* 1758168404Spjd * Hold the dn_dbufs_mtx while we get the new dbuf 1759168404Spjd * in the hash table *and* added to the dbufs list. 1760168404Spjd * This prevents a possible deadlock with someone 1761168404Spjd * trying to look up this dbuf before its added to the 1762168404Spjd * dn_dbufs list. 
1763168404Spjd */ 1764168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 1765168404Spjd db->db_state = DB_EVICTING; 1766168404Spjd if ((odb = dbuf_hash_insert(db)) != NULL) { 1767168404Spjd /* someone else inserted it first */ 1768168404Spjd kmem_cache_free(dbuf_cache, db); 1769168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1770168404Spjd return (odb); 1771168404Spjd } 1772269229Sdelphij avl_add(&dn->dn_dbufs, db); 1773254753Sdelphij if (db->db_level == 0 && db->db_blkid >= 1774254753Sdelphij dn->dn_unlisted_l0_blkid) 1775254753Sdelphij dn->dn_unlisted_l0_blkid = db->db_blkid + 1; 1776168404Spjd db->db_state = DB_UNCACHED; 1777168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1778208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1779168404Spjd 1780168404Spjd if (parent && parent != dn->dn_dbuf) 1781168404Spjd dbuf_add_ref(parent, db); 1782168404Spjd 1783168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1784168404Spjd refcount_count(&dn->dn_holds) > 0); 1785168404Spjd (void) refcount_add(&dn->dn_holds, db); 1786270248Sdelphij atomic_inc_32(&dn->dn_dbufs_count); 1787168404Spjd 1788168404Spjd dprintf_dbuf(db, "db=%p\n", db); 1789168404Spjd 1790168404Spjd return (db); 1791168404Spjd} 1792168404Spjd 1793168404Spjdstatic int 1794168404Spjddbuf_do_evict(void *private) 1795168404Spjd{ 1796268858Sdelphij dmu_buf_impl_t *db = private; 1797168404Spjd 1798168404Spjd if (!MUTEX_HELD(&db->db_mtx)) 1799168404Spjd mutex_enter(&db->db_mtx); 1800168404Spjd 1801168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1802168404Spjd 1803168404Spjd if (db->db_state != DB_EVICTING) { 1804168404Spjd ASSERT(db->db_state == DB_CACHED); 1805168404Spjd DBUF_VERIFY(db); 1806168404Spjd db->db_buf = NULL; 1807168404Spjd dbuf_evict(db); 1808168404Spjd } else { 1809168404Spjd mutex_exit(&db->db_mtx); 1810168404Spjd dbuf_destroy(db); 1811168404Spjd } 1812168404Spjd return (0); 1813168404Spjd} 1814168404Spjd 1815168404Spjdstatic void 1816168404Spjddbuf_destroy(dmu_buf_impl_t *db) 
1817168404Spjd{ 1818168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1819168404Spjd 1820219089Spjd if (db->db_blkid != DMU_BONUS_BLKID) { 1821168404Spjd /* 1822168404Spjd * If this dbuf is still on the dn_dbufs list, 1823168404Spjd * remove it from that list. 1824168404Spjd */ 1825219089Spjd if (db->db_dnode_handle != NULL) { 1826219089Spjd dnode_t *dn; 1827185029Spjd 1828219089Spjd DB_DNODE_ENTER(db); 1829219089Spjd dn = DB_DNODE(db); 1830168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 1831269229Sdelphij avl_remove(&dn->dn_dbufs, db); 1832270248Sdelphij atomic_dec_32(&dn->dn_dbufs_count); 1833168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1834219089Spjd DB_DNODE_EXIT(db); 1835219089Spjd /* 1836219089Spjd * Decrementing the dbuf count means that the hold 1837219089Spjd * corresponding to the removed dbuf is no longer 1838219089Spjd * discounted in dnode_move(), so the dnode cannot be 1839219089Spjd * moved until after we release the hold. 1840219089Spjd */ 1841168404Spjd dnode_rele(dn, db); 1842219089Spjd db->db_dnode_handle = NULL; 1843168404Spjd } 1844168404Spjd dbuf_hash_remove(db); 1845168404Spjd } 1846168404Spjd db->db_parent = NULL; 1847168404Spjd db->db_buf = NULL; 1848168404Spjd 1849168404Spjd ASSERT(db->db.db_data == NULL); 1850168404Spjd ASSERT(db->db_hash_next == NULL); 1851168404Spjd ASSERT(db->db_blkptr == NULL); 1852168404Spjd ASSERT(db->db_data_pending == NULL); 1853168404Spjd 1854168404Spjd kmem_cache_free(dbuf_cache, db); 1855208373Smm arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1856168404Spjd} 1857168404Spjd 1858168404Spjdvoid 1859258632Savgdbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) 1860168404Spjd{ 1861168404Spjd dmu_buf_impl_t *db = NULL; 1862168404Spjd blkptr_t *bp = NULL; 1863168404Spjd 1864219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1865168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1866168404Spjd 1867168404Spjd if (dnode_block_freed(dn, blkid)) 1868168404Spjd return; 1869168404Spjd 1870168404Spjd /* 
dbuf_find() returns with db_mtx held */ 1871286541Smav if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) { 1872219089Spjd /* 1873219089Spjd * This dbuf is already in the cache. We assume that 1874219089Spjd * it is already CACHED, or else about to be either 1875219089Spjd * read or filled. 1876219089Spjd */ 1877168404Spjd mutex_exit(&db->db_mtx); 1878219089Spjd return; 1879168404Spjd } 1880168404Spjd 1881168404Spjd if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1882268075Sdelphij if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { 1883219089Spjd dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1884275811Sdelphij arc_flags_t aflags = 1885275811Sdelphij ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 1886268123Sdelphij zbookmark_phys_t zb; 1887168404Spjd 1888219089Spjd SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1889219089Spjd dn->dn_object, 0, blkid); 1890219089Spjd 1891246666Smm (void) arc_read(NULL, dn->dn_objset->os_spa, 1892258632Savg bp, NULL, NULL, prio, 1893168404Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1894168404Spjd &aflags, &zb); 1895168404Spjd } 1896168404Spjd if (db) 1897168404Spjd dbuf_rele(db, NULL); 1898168404Spjd } 1899168404Spjd} 1900168404Spjd 1901168404Spjd/* 1902168404Spjd * Returns with db_holds incremented, and db_mtx not held. 1903168404Spjd * Note: dn_struct_rwlock must be held. 
1904168404Spjd */ 1905168404Spjdint 1906168404Spjddbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1907168404Spjd void *tag, dmu_buf_impl_t **dbp) 1908168404Spjd{ 1909168404Spjd dmu_buf_impl_t *db, *parent = NULL; 1910168404Spjd 1911219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1912168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1913168404Spjd ASSERT3U(dn->dn_nlevels, >, level); 1914168404Spjd 1915168404Spjd *dbp = NULL; 1916168404Spjdtop: 1917168404Spjd /* dbuf_find() returns with db_mtx held */ 1918286541Smav db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); 1919168404Spjd 1920168404Spjd if (db == NULL) { 1921168404Spjd blkptr_t *bp = NULL; 1922168404Spjd int err; 1923168404Spjd 1924168404Spjd ASSERT3P(parent, ==, NULL); 1925168404Spjd err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1926168404Spjd if (fail_sparse) { 1927168404Spjd if (err == 0 && bp && BP_IS_HOLE(bp)) 1928249195Smm err = SET_ERROR(ENOENT); 1929168404Spjd if (err) { 1930168404Spjd if (parent) 1931168404Spjd dbuf_rele(parent, NULL); 1932168404Spjd return (err); 1933168404Spjd } 1934168404Spjd } 1935168404Spjd if (err && err != ENOENT) 1936168404Spjd return (err); 1937168404Spjd db = dbuf_create(dn, level, blkid, parent, bp); 1938168404Spjd } 1939168404Spjd 1940168404Spjd if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1941168404Spjd arc_buf_add_ref(db->db_buf, db); 1942168404Spjd if (db->db_buf->b_data == NULL) { 1943168404Spjd dbuf_clear(db); 1944168404Spjd if (parent) { 1945168404Spjd dbuf_rele(parent, NULL); 1946168404Spjd parent = NULL; 1947168404Spjd } 1948168404Spjd goto top; 1949168404Spjd } 1950168404Spjd ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1951168404Spjd } 1952168404Spjd 1953168404Spjd ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1954168404Spjd 1955168404Spjd /* 1956168404Spjd * If this buffer is currently syncing out, and we are are 1957168404Spjd * still referencing it from db_data, we need to 
make a copy 1958168404Spjd * of it in case we decide we want to dirty it again in this txg. 1959168404Spjd */ 1960219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1961168404Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 1962168404Spjd db->db_state == DB_CACHED && db->db_data_pending) { 1963168404Spjd dbuf_dirty_record_t *dr = db->db_data_pending; 1964168404Spjd 1965168404Spjd if (dr->dt.dl.dr_data == db->db_buf) { 1966168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1967168404Spjd 1968168404Spjd dbuf_set_data(db, 1969219089Spjd arc_buf_alloc(dn->dn_objset->os_spa, 1970168404Spjd db->db.db_size, db, type)); 1971168404Spjd bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1972168404Spjd db->db.db_size); 1973168404Spjd } 1974168404Spjd } 1975168404Spjd 1976168404Spjd (void) refcount_add(&db->db_holds, tag); 1977168404Spjd DBUF_VERIFY(db); 1978168404Spjd mutex_exit(&db->db_mtx); 1979168404Spjd 1980168404Spjd /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1981168404Spjd if (parent) 1982168404Spjd dbuf_rele(parent, NULL); 1983168404Spjd 1984219089Spjd ASSERT3P(DB_DNODE(db), ==, dn); 1985168404Spjd ASSERT3U(db->db_blkid, ==, blkid); 1986168404Spjd ASSERT3U(db->db_level, ==, level); 1987168404Spjd *dbp = db; 1988168404Spjd 1989168404Spjd return (0); 1990168404Spjd} 1991168404Spjd 1992168404Spjddmu_buf_impl_t * 1993168404Spjddbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1994168404Spjd{ 1995168404Spjd dmu_buf_impl_t *db; 1996168404Spjd int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1997168404Spjd return (err ? NULL : db); 1998168404Spjd} 1999168404Spjd 2000168404Spjddmu_buf_impl_t * 2001168404Spjddbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 2002168404Spjd{ 2003168404Spjd dmu_buf_impl_t *db; 2004168404Spjd int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 2005168404Spjd return (err ? 
NULL : db); 2006168404Spjd} 2007168404Spjd 2008185029Spjdvoid 2009168404Spjddbuf_create_bonus(dnode_t *dn) 2010168404Spjd{ 2011168404Spjd ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2012168404Spjd 2013168404Spjd ASSERT(dn->dn_bonus == NULL); 2014219089Spjd dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2015168404Spjd} 2016168404Spjd 2017219089Spjdint 2018219089Spjddbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 2019219089Spjd{ 2020219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2021219089Spjd dnode_t *dn; 2022219089Spjd 2023219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 2024249195Smm return (SET_ERROR(ENOTSUP)); 2025219089Spjd if (blksz == 0) 2026219089Spjd blksz = SPA_MINBLOCKSIZE; 2027274337Sdelphij ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 2028274337Sdelphij blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2029219089Spjd 2030219089Spjd DB_DNODE_ENTER(db); 2031219089Spjd dn = DB_DNODE(db); 2032219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2033219089Spjd dbuf_new_size(db, blksz, tx); 2034219089Spjd rw_exit(&dn->dn_struct_rwlock); 2035219089Spjd DB_DNODE_EXIT(db); 2036219089Spjd 2037219089Spjd return (0); 2038219089Spjd} 2039219089Spjd 2040219089Spjdvoid 2041219089Spjddbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2042219089Spjd{ 2043219089Spjd dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2044219089Spjd} 2045219089Spjd 2046168404Spjd#pragma weak dmu_buf_add_ref = dbuf_add_ref 2047168404Spjdvoid 2048168404Spjddbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2049168404Spjd{ 2050168404Spjd int64_t holds = refcount_add(&db->db_holds, tag); 2051168404Spjd ASSERT(holds > 1); 2052168404Spjd} 2053168404Spjd 2054286541Smav#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref 2055286541Smavboolean_t 2056286541Smavdbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, 2057286541Smav void *tag) 2058286541Smav{ 2059286541Smav dmu_buf_impl_t *db = 
(dmu_buf_impl_t *)db_fake; 2060286541Smav dmu_buf_impl_t *found_db; 2061286541Smav boolean_t result = B_FALSE; 2062286541Smav 2063286541Smav if (db->db_blkid == DMU_BONUS_BLKID) 2064286541Smav found_db = dbuf_find_bonus(os, obj); 2065286541Smav else 2066286541Smav found_db = dbuf_find(os, obj, 0, blkid); 2067286541Smav 2068286541Smav if (found_db != NULL) { 2069286541Smav if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { 2070286541Smav (void) refcount_add(&db->db_holds, tag); 2071286541Smav result = B_TRUE; 2072286541Smav } 2073286541Smav mutex_exit(&db->db_mtx); 2074286541Smav } 2075286541Smav return (result); 2076286541Smav} 2077286541Smav 2078219089Spjd/* 2079219089Spjd * If you call dbuf_rele() you had better not be referencing the dnode handle 2080219089Spjd * unless you have some other direct or indirect hold on the dnode. (An indirect 2081219089Spjd * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2082219089Spjd * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2083219089Spjd * dnode's parent dbuf evicting its dnode handles. 2084219089Spjd */ 2085168404Spjdvoid 2086168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag) 2087168404Spjd{ 2088219089Spjd mutex_enter(&db->db_mtx); 2089219089Spjd dbuf_rele_and_unlock(db, tag); 2090219089Spjd} 2091219089Spjd 2092260150Sdelphijvoid 2093260150Sdelphijdmu_buf_rele(dmu_buf_t *db, void *tag) 2094260150Sdelphij{ 2095260150Sdelphij dbuf_rele((dmu_buf_impl_t *)db, tag); 2096260150Sdelphij} 2097260150Sdelphij 2098219089Spjd/* 2099219089Spjd * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2100219089Spjd * db_dirtycnt and db_holds to be updated atomically. 
2101219089Spjd */ 2102219089Spjdvoid 2103219089Spjddbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2104219089Spjd{ 2105168404Spjd int64_t holds; 2106168404Spjd 2107219089Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2108168404Spjd DBUF_VERIFY(db); 2109168404Spjd 2110219089Spjd /* 2111219089Spjd * Remove the reference to the dbuf before removing its hold on the 2112219089Spjd * dnode so we can guarantee in dnode_move() that a referenced bonus 2113219089Spjd * buffer has a corresponding dnode hold. 2114219089Spjd */ 2115168404Spjd holds = refcount_remove(&db->db_holds, tag); 2116168404Spjd ASSERT(holds >= 0); 2117168404Spjd 2118168404Spjd /* 2119168404Spjd * We can't freeze indirects if there is a possibility that they 2120168404Spjd * may be modified in the current syncing context. 2121168404Spjd */ 2122168404Spjd if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 2123168404Spjd arc_buf_freeze(db->db_buf); 2124168404Spjd 2125168404Spjd if (holds == db->db_dirtycnt && 2126168404Spjd db->db_level == 0 && db->db_immediate_evict) 2127168404Spjd dbuf_evict_user(db); 2128168404Spjd 2129168404Spjd if (holds == 0) { 2130219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2131168404Spjd mutex_exit(&db->db_mtx); 2132219089Spjd 2133219089Spjd /* 2134219089Spjd * If the dnode moves here, we cannot cross this barrier 2135219089Spjd * until the move completes. 2136219089Spjd */ 2137219089Spjd DB_DNODE_ENTER(db); 2138270248Sdelphij atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count); 2139219089Spjd DB_DNODE_EXIT(db); 2140219089Spjd /* 2141219089Spjd * The bonus buffer's dnode hold is no longer discounted 2142219089Spjd * in dnode_move(). The dnode cannot move until after 2143219089Spjd * the dnode_rele(). 2144219089Spjd */ 2145219089Spjd dnode_rele(DB_DNODE(db), db); 2146168404Spjd } else if (db->db_buf == NULL) { 2147168404Spjd /* 2148168404Spjd * This is a special case: we never associated this 2149168404Spjd * dbuf with any data allocated from the ARC. 
2150168404Spjd */ 2151219089Spjd ASSERT(db->db_state == DB_UNCACHED || 2152219089Spjd db->db_state == DB_NOFILL); 2153168404Spjd dbuf_evict(db); 2154168404Spjd } else if (arc_released(db->db_buf)) { 2155168404Spjd arc_buf_t *buf = db->db_buf; 2156168404Spjd /* 2157168404Spjd * This dbuf has anonymous data associated with it. 2158168404Spjd */ 2159168404Spjd dbuf_set_data(db, NULL); 2160248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 2161168404Spjd dbuf_evict(db); 2162168404Spjd } else { 2163248571Smm VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2164242845Sdelphij 2165242845Sdelphij /* 2166242845Sdelphij * A dbuf will be eligible for eviction if either the 2167242845Sdelphij * 'primarycache' property is set or a duplicate 2168242845Sdelphij * copy of this buffer is already cached in the arc. 2169242845Sdelphij * 2170242845Sdelphij * In the case of the 'primarycache' a buffer 2171242845Sdelphij * is considered for eviction if it matches the 2172242845Sdelphij * criteria set in the property. 2173242845Sdelphij * 2174242845Sdelphij * To decide if our buffer is considered a 2175242845Sdelphij * duplicate, we must call into the arc to determine 2176242845Sdelphij * if multiple buffers are referencing the same 2177242845Sdelphij * block on-disk. If so, then we simply evict 2178242845Sdelphij * ourselves. 
2179242845Sdelphij */ 2180268858Sdelphij if (!DBUF_IS_CACHEABLE(db)) { 2181268858Sdelphij if (db->db_blkptr != NULL && 2182268858Sdelphij !BP_IS_HOLE(db->db_blkptr) && 2183268858Sdelphij !BP_IS_EMBEDDED(db->db_blkptr)) { 2184268858Sdelphij spa_t *spa = 2185268858Sdelphij dmu_objset_spa(db->db_objset); 2186268858Sdelphij blkptr_t bp = *db->db_blkptr; 2187268858Sdelphij dbuf_clear(db); 2188268858Sdelphij arc_freed(spa, &bp); 2189268858Sdelphij } else { 2190268858Sdelphij dbuf_clear(db); 2191268858Sdelphij } 2192268858Sdelphij } else if (arc_buf_eviction_needed(db->db_buf)) { 2193185029Spjd dbuf_clear(db); 2194268858Sdelphij } else { 2195185029Spjd mutex_exit(&db->db_mtx); 2196268858Sdelphij } 2197168404Spjd } 2198168404Spjd } else { 2199168404Spjd mutex_exit(&db->db_mtx); 2200168404Spjd } 2201168404Spjd} 2202168404Spjd 2203168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount 2204168404Spjduint64_t 2205168404Spjddbuf_refcount(dmu_buf_impl_t *db) 2206168404Spjd{ 2207168404Spjd return (refcount_count(&db->db_holds)); 2208168404Spjd} 2209168404Spjd 2210168404Spjdvoid * 2211275782Sdelphijdmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, 2212168404Spjd dmu_buf_evict_func_t *evict_func) 2213168404Spjd{ 2214275782Sdelphij return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); 2215168404Spjd} 2216168404Spjd 2217168404Spjdvoid * 2218275782Sdelphijdmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, 2219168404Spjd dmu_buf_evict_func_t *evict_func) 2220168404Spjd{ 2221168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2222168404Spjd 2223168404Spjd db->db_immediate_evict = TRUE; 2224275782Sdelphij return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); 2225168404Spjd} 2226168404Spjd 2227168404Spjdvoid * 2228168404Spjddmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 2229275782Sdelphij dmu_buf_evict_func_t *evict_func) 2230168404Spjd{ 2231168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2232168404Spjd 
ASSERT(db->db_level == 0); 2233168404Spjd 2234168404Spjd ASSERT((user_ptr == NULL) == (evict_func == NULL)); 2235168404Spjd 2236168404Spjd mutex_enter(&db->db_mtx); 2237168404Spjd 2238168404Spjd if (db->db_user_ptr == old_user_ptr) { 2239168404Spjd db->db_user_ptr = user_ptr; 2240168404Spjd db->db_evict_func = evict_func; 2241168404Spjd } else { 2242168404Spjd old_user_ptr = db->db_user_ptr; 2243168404Spjd } 2244168404Spjd 2245168404Spjd mutex_exit(&db->db_mtx); 2246168404Spjd return (old_user_ptr); 2247168404Spjd} 2248168404Spjd 2249168404Spjdvoid * 2250168404Spjddmu_buf_get_user(dmu_buf_t *db_fake) 2251168404Spjd{ 2252168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2253168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 2254168404Spjd 2255168404Spjd return (db->db_user_ptr); 2256168404Spjd} 2257168404Spjd 2258209962Smmboolean_t 2259209962Smmdmu_buf_freeable(dmu_buf_t *dbuf) 2260209962Smm{ 2261209962Smm boolean_t res = B_FALSE; 2262209962Smm dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2263209962Smm 2264209962Smm if (db->db_blkptr) 2265209962Smm res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2266219089Spjd db->db_blkptr, db->db_blkptr->blk_birth); 2267209962Smm 2268209962Smm return (res); 2269209962Smm} 2270209962Smm 2271243524Smmblkptr_t * 2272243524Smmdmu_buf_get_blkptr(dmu_buf_t *db) 2273243524Smm{ 2274243524Smm dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2275243524Smm return (dbi->db_blkptr); 2276243524Smm} 2277243524Smm 2278168404Spjdstatic void 2279168404Spjddbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2280168404Spjd{ 2281168404Spjd /* ASSERT(dmu_tx_is_syncing(tx) */ 2282168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2283168404Spjd 2284168404Spjd if (db->db_blkptr != NULL) 2285168404Spjd return; 2286168404Spjd 2287219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2288219089Spjd db->db_blkptr = &dn->dn_phys->dn_spill; 2289219089Spjd BP_ZERO(db->db_blkptr); 2290219089Spjd return; 2291219089Spjd } 2292168404Spjd if (db->db_level 
== dn->dn_phys->dn_nlevels-1) { 2293168404Spjd /* 2294168404Spjd * This buffer was allocated at a time when there was 2295168404Spjd * no available blkptrs from the dnode, or it was 2296168404Spjd * inappropriate to hook it in (i.e., nlevels mis-match). 2297168404Spjd */ 2298168404Spjd ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2299168404Spjd ASSERT(db->db_parent == NULL); 2300168404Spjd db->db_parent = dn->dn_dbuf; 2301168404Spjd db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2302168404Spjd DBUF_VERIFY(db); 2303168404Spjd } else { 2304168404Spjd dmu_buf_impl_t *parent = db->db_parent; 2305168404Spjd int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2306168404Spjd 2307168404Spjd ASSERT(dn->dn_phys->dn_nlevels > 1); 2308168404Spjd if (parent == NULL) { 2309168404Spjd mutex_exit(&db->db_mtx); 2310168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 2311168404Spjd (void) dbuf_hold_impl(dn, db->db_level+1, 2312168404Spjd db->db_blkid >> epbs, FALSE, db, &parent); 2313168404Spjd rw_exit(&dn->dn_struct_rwlock); 2314168404Spjd mutex_enter(&db->db_mtx); 2315168404Spjd db->db_parent = parent; 2316168404Spjd } 2317168404Spjd db->db_blkptr = (blkptr_t *)parent->db.db_data + 2318168404Spjd (db->db_blkid & ((1ULL << epbs) - 1)); 2319168404Spjd DBUF_VERIFY(db); 2320168404Spjd } 2321168404Spjd} 2322168404Spjd 2323168404Spjdstatic void 2324168404Spjddbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2325168404Spjd{ 2326168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2327219089Spjd dnode_t *dn; 2328168404Spjd zio_t *zio; 2329168404Spjd 2330168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 2331168404Spjd 2332168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2333168404Spjd 2334168404Spjd mutex_enter(&db->db_mtx); 2335168404Spjd 2336168404Spjd ASSERT(db->db_level > 0); 2337168404Spjd DBUF_VERIFY(db); 2338168404Spjd 2339251629Sdelphij /* Read the block if it hasn't been read yet. 
*/ 2340168404Spjd if (db->db_buf == NULL) { 2341168404Spjd mutex_exit(&db->db_mtx); 2342168404Spjd (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2343168404Spjd mutex_enter(&db->db_mtx); 2344168404Spjd } 2345168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 2346168404Spjd ASSERT(db->db_buf != NULL); 2347168404Spjd 2348219089Spjd DB_DNODE_ENTER(db); 2349219089Spjd dn = DB_DNODE(db); 2350251629Sdelphij /* Indirect block size must match what the dnode thinks it is. */ 2351219089Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2352168404Spjd dbuf_check_blkptr(dn, db); 2353219089Spjd DB_DNODE_EXIT(db); 2354168404Spjd 2355251629Sdelphij /* Provide the pending dirty record to child dbufs */ 2356168404Spjd db->db_data_pending = dr; 2357168404Spjd 2358168404Spjd mutex_exit(&db->db_mtx); 2359185029Spjd dbuf_write(dr, db->db_buf, tx); 2360168404Spjd 2361168404Spjd zio = dr->dr_zio; 2362168404Spjd mutex_enter(&dr->dt.di.dr_mtx); 2363284593Savg dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); 2364168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2365168404Spjd mutex_exit(&dr->dt.di.dr_mtx); 2366168404Spjd zio_nowait(zio); 2367168404Spjd} 2368168404Spjd 2369168404Spjdstatic void 2370168404Spjddbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2371168404Spjd{ 2372168404Spjd arc_buf_t **datap = &dr->dt.dl.dr_data; 2373168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2374219089Spjd dnode_t *dn; 2375219089Spjd objset_t *os; 2376168404Spjd uint64_t txg = tx->tx_txg; 2377168404Spjd 2378168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 2379168404Spjd 2380168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2381168404Spjd 2382168404Spjd mutex_enter(&db->db_mtx); 2383168404Spjd /* 2384168404Spjd * To be synced, we must be dirtied. But we 2385168404Spjd * might have been freed after the dirty. 
	/*
	 * NOTE(review): this is the tail of dbuf_sync_leaf(); the locals
	 * db, dr, dn, os, datap, txg and tx are declared in the portion of
	 * the function above this chunk — confirm against the full file.
	 */
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/*
	 * A dirty spill block is recorded in the dnode's flags so that the
	 * on-disk dnode advertises the extra block pointer.
	 */
	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		/* Free the private copy made for this sync, if any. */
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		/* Unlink this dirty record from db_last_dirty and free it. */
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		if (dr->dr_dbuf->db_level != 0) {
			list_destroy(&dr->dt.di.dr_children);
			mutex_destroy(&dr->dt.di.dr_mtx);
		}
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		/* Drops db_mtx and releases the dirty hold for this txg. */
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNONE_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		/*
		 * Meta-dnode records are queued so the caller can zio_wait()
		 * on them after all child IOs have been initiated (see
		 * dbuf_sync_list() below).
		 */
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write it seems plausible
		 * that the IO could actually be completed before the nowait
		 * returns.  We need to DB_DNODE_EXIT() first in case
		 * zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}

/*
 * Sync every dirty record on "list" for this txg.  Indirect blocks are
 * handed to dbuf_sync_indirect() and leaf blocks to dbuf_sync_leaf();
 * records whose zio is already initialized belong to the meta-dnode and
 * terminate the walk (see the comment in the loop).
 */
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		/* Bonus and spill blocks are not required to match "level". */
		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

/*
 * ZIO "ready" callback for dbuf writes (installed by dbuf_write()):
 * updates space accounting and computes the block's fill count.
 * (Body continues below.)
 */
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT3P(db->db_blkptr, ==, bp);

	/*
	 * Charge the dnode for the change in allocated size between the
	 * new bp and the original bp (relative to what was previously
	 * charged for this zio).
	 */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
		    BP_IS_EMBEDDED(bp));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		/* Track the highest block written (spill blocks excluded). */
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/*
			 * For a dnode block, fill is the number of
			 * non-empty dnode slots in the block.
			 */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			/* For an ordinary leaf block, fill is 0 or 1. */
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		/*
		 * For an indirect block, fill is the sum of the fill
		 * counts of all non-hole children.
		 */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += BP_GET_FILL(ibp);
		}
	}
	DB_DNODE_EXIT(db);

	/* Embedded bps store data where blk_fill would live; skip them. */
	if (!BP_IS_EMBEDDED(bp))
		bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}

/*
 * The SPA will call this callback several times for each zio - once
 * for every physical child i/o (zio->io_phys_children times).  This
 * allows the DMU to monitor the progress of each logical i/o.  For example,
 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
 * block.  There may be a long delay before all copies/fragments are completed,
 * so this callback allows us to retire dirty space gradually, as the physical
 * i/os complete.
 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times.  Retire one
	 * portion of our dirty space each time we are called.  Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
	 */
	delta = dr->dr_accounted / zio->io_phys_children;
	dsl_pool_undirty_space(dp, delta, zio->io_txg);
}

/*
 * ZIO "done" callback for dbuf writes: finalize dataset block accounting,
 * unlink and free the dirty record, release or re-arm the ARC buffer,
 * then wake waiters and drop the dirty hold on the dbuf.
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		/* Retire the old block and account for the new one. */
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Unlink the pending dirty record from db_last_dirty. */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			/*
			 * Drop the private copy made in dbuf_sync_leaf(),
			 * or re-arm eviction on the shared buffer.
			 */
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	/* Drops db_mtx and releases the dirty hold for this txg. */
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}

static void
dbuf_write_nofill_ready(zio_t *zio)
{
	/* Adapt the plain zio callback (no arc_buf_t) to dbuf_write_ready(). */
	dbuf_write_ready(zio, NULL, zio->io_private);
}

static void
dbuf_write_nofill_done(zio_t *zio)
{
	/* Adapt the plain zio callback (no arc_buf_t) to dbuf_write_done(). */
	dbuf_write_done(zio, NULL, zio->io_private);
}

static void
dbuf_write_override_ready(zio_t *zio)
{
	/* For override writes, io_private is the dirty record, not the dbuf. */
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}

static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	/*
	 * If the final bp differs from the one provided in open context,
	 * free the overriding block and release the data buffer.
	 */
	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}

/*
 * Issue I/O to commit a dirty buffer to disk.
 *
 * "dr" is the dirty record being synced, "data" the buffer to write
 * (db->db_buf, or the private copy made in dbuf_sync_leaf()), and "tx"
 * the syncing transaction.  The resulting zio is stored in dr->dr_zio
 * as a child of the parent block's zio (or the dnode's zio).
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync() or dmu_buf_write_embedded()).
		 */
		void *contents = (data != NULL) ? data->b_data : NULL;

		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, contents, db->db.db_size, &zp,
		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		/* NOFILL writes allocate space but carry no data payload. */
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		/* The common case: write the ARC buffer through arc_write(). */
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}