/* dbuf.c — FreeBSD SVN revision 249195 (svn-annotate prefixes stripped below) */
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23219636Spjd * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24249195Smm * Copyright (c) 2013 by Delphix. All rights reserved. 
25168404Spjd */ 26168404Spjd 27168404Spjd#include <sys/zfs_context.h> 28168404Spjd#include <sys/dmu.h> 29168404Spjd#include <sys/dmu_impl.h> 30168404Spjd#include <sys/dbuf.h> 31168404Spjd#include <sys/dmu_objset.h> 32168404Spjd#include <sys/dsl_dataset.h> 33168404Spjd#include <sys/dsl_dir.h> 34168404Spjd#include <sys/dmu_tx.h> 35168404Spjd#include <sys/spa.h> 36168404Spjd#include <sys/zio.h> 37168404Spjd#include <sys/dmu_zfetch.h> 38219089Spjd#include <sys/sa.h> 39219089Spjd#include <sys/sa_impl.h> 40168404Spjd 41168404Spjdstatic void dbuf_destroy(dmu_buf_impl_t *db); 42248571Smmstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 43185029Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 44168404Spjd 45168404Spjd/* 46168404Spjd * Global data structures and functions for the dbuf cache. 47168404Spjd */ 48168404Spjdstatic kmem_cache_t *dbuf_cache; 49168404Spjd 50168404Spjd/* ARGSUSED */ 51168404Spjdstatic int 52168404Spjddbuf_cons(void *vdb, void *unused, int kmflag) 53168404Spjd{ 54168404Spjd dmu_buf_impl_t *db = vdb; 55168404Spjd bzero(db, sizeof (dmu_buf_impl_t)); 56168404Spjd 57168404Spjd mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 58168404Spjd cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 59168404Spjd refcount_create(&db->db_holds); 60168404Spjd return (0); 61168404Spjd} 62168404Spjd 63168404Spjd/* ARGSUSED */ 64168404Spjdstatic void 65168404Spjddbuf_dest(void *vdb, void *unused) 66168404Spjd{ 67168404Spjd dmu_buf_impl_t *db = vdb; 68168404Spjd mutex_destroy(&db->db_mtx); 69168404Spjd cv_destroy(&db->db_changed); 70168404Spjd refcount_destroy(&db->db_holds); 71168404Spjd} 72168404Spjd 73168404Spjd/* 74168404Spjd * dbuf hash table routines 75168404Spjd */ 76168404Spjdstatic dbuf_hash_table_t dbuf_hash_table; 77168404Spjd 78168404Spjdstatic uint64_t dbuf_hash_count; 79168404Spjd 80168404Spjdstatic uint64_t 81168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 82168404Spjd{ 83168404Spjd 
uintptr_t osv = (uintptr_t)os; 84168404Spjd uint64_t crc = -1ULL; 85168404Spjd 86168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 87168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 88168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 89168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 90168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 91168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 92168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 93168404Spjd 94168404Spjd crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 95168404Spjd 96168404Spjd return (crc); 97168404Spjd} 98168404Spjd 99168404Spjd#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 100168404Spjd 101168404Spjd#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 102168404Spjd ((dbuf)->db.db_object == (obj) && \ 103168404Spjd (dbuf)->db_objset == (os) && \ 104168404Spjd (dbuf)->db_level == (level) && \ 105168404Spjd (dbuf)->db_blkid == (blkid)) 106168404Spjd 107168404Spjddmu_buf_impl_t * 108168404Spjddbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 109168404Spjd{ 110168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 111219089Spjd objset_t *os = dn->dn_objset; 112168404Spjd uint64_t obj = dn->dn_object; 113168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 114168404Spjd uint64_t idx = hv & h->hash_table_mask; 115168404Spjd dmu_buf_impl_t *db; 116168404Spjd 117168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 118168404Spjd for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 119168404Spjd if (DBUF_EQUAL(db, os, obj, level, blkid)) { 120168404Spjd mutex_enter(&db->db_mtx); 121168404Spjd if (db->db_state != DB_EVICTING) { 122168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 123168404Spjd return (db); 124168404Spjd } 125168404Spjd mutex_exit(&db->db_mtx); 126168404Spjd } 127168404Spjd } 128168404Spjd 
mutex_exit(DBUF_HASH_MUTEX(h, idx)); 129168404Spjd return (NULL); 130168404Spjd} 131168404Spjd 132168404Spjd/* 133168404Spjd * Insert an entry into the hash table. If there is already an element 134168404Spjd * equal to elem in the hash table, then the already existing element 135168404Spjd * will be returned and the new element will not be inserted. 136168404Spjd * Otherwise returns NULL. 137168404Spjd */ 138168404Spjdstatic dmu_buf_impl_t * 139168404Spjddbuf_hash_insert(dmu_buf_impl_t *db) 140168404Spjd{ 141168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 142219089Spjd objset_t *os = db->db_objset; 143168404Spjd uint64_t obj = db->db.db_object; 144168404Spjd int level = db->db_level; 145168404Spjd uint64_t blkid = db->db_blkid; 146168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 147168404Spjd uint64_t idx = hv & h->hash_table_mask; 148168404Spjd dmu_buf_impl_t *dbf; 149168404Spjd 150168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 151168404Spjd for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 152168404Spjd if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 153168404Spjd mutex_enter(&dbf->db_mtx); 154168404Spjd if (dbf->db_state != DB_EVICTING) { 155168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 156168404Spjd return (dbf); 157168404Spjd } 158168404Spjd mutex_exit(&dbf->db_mtx); 159168404Spjd } 160168404Spjd } 161168404Spjd 162168404Spjd mutex_enter(&db->db_mtx); 163168404Spjd db->db_hash_next = h->hash_table[idx]; 164168404Spjd h->hash_table[idx] = db; 165168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 166168404Spjd atomic_add_64(&dbuf_hash_count, 1); 167168404Spjd 168168404Spjd return (NULL); 169168404Spjd} 170168404Spjd 171168404Spjd/* 172168404Spjd * Remove an entry from the hash table. This operation will 173168404Spjd * fail if there are any existing holds on the db. 
174168404Spjd */ 175168404Spjdstatic void 176168404Spjddbuf_hash_remove(dmu_buf_impl_t *db) 177168404Spjd{ 178168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 179168404Spjd uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 180168404Spjd db->db_level, db->db_blkid); 181168404Spjd uint64_t idx = hv & h->hash_table_mask; 182168404Spjd dmu_buf_impl_t *dbf, **dbp; 183168404Spjd 184168404Spjd /* 185168404Spjd * We musn't hold db_mtx to maintin lock ordering: 186168404Spjd * DBUF_HASH_MUTEX > db_mtx. 187168404Spjd */ 188168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 189168404Spjd ASSERT(db->db_state == DB_EVICTING); 190168404Spjd ASSERT(!MUTEX_HELD(&db->db_mtx)); 191168404Spjd 192168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 193168404Spjd dbp = &h->hash_table[idx]; 194168404Spjd while ((dbf = *dbp) != db) { 195168404Spjd dbp = &dbf->db_hash_next; 196168404Spjd ASSERT(dbf != NULL); 197168404Spjd } 198168404Spjd *dbp = db->db_hash_next; 199168404Spjd db->db_hash_next = NULL; 200168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 201168404Spjd atomic_add_64(&dbuf_hash_count, -1); 202168404Spjd} 203168404Spjd 204168404Spjdstatic arc_evict_func_t dbuf_do_evict; 205168404Spjd 206168404Spjdstatic void 207168404Spjddbuf_evict_user(dmu_buf_impl_t *db) 208168404Spjd{ 209168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 210168404Spjd 211168404Spjd if (db->db_level != 0 || db->db_evict_func == NULL) 212168404Spjd return; 213168404Spjd 214168404Spjd if (db->db_user_data_ptr_ptr) 215168404Spjd *db->db_user_data_ptr_ptr = db->db.db_data; 216168404Spjd db->db_evict_func(&db->db, db->db_user_ptr); 217168404Spjd db->db_user_ptr = NULL; 218168404Spjd db->db_user_data_ptr_ptr = NULL; 219168404Spjd db->db_evict_func = NULL; 220168404Spjd} 221168404Spjd 222219089Spjdboolean_t 223219089Spjddbuf_is_metadata(dmu_buf_impl_t *db) 224219089Spjd{ 225219089Spjd if (db->db_level > 0) { 226219089Spjd return (B_TRUE); 227219089Spjd } else { 228219089Spjd boolean_t is_metadata; 229219089Spjd 
230219089Spjd DB_DNODE_ENTER(db); 231236884Smm is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 232219089Spjd DB_DNODE_EXIT(db); 233219089Spjd 234219089Spjd return (is_metadata); 235219089Spjd } 236219089Spjd} 237219089Spjd 238168404Spjdvoid 239168404Spjddbuf_evict(dmu_buf_impl_t *db) 240168404Spjd{ 241168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 242168404Spjd ASSERT(db->db_buf == NULL); 243168404Spjd ASSERT(db->db_data_pending == NULL); 244168404Spjd 245168404Spjd dbuf_clear(db); 246168404Spjd dbuf_destroy(db); 247168404Spjd} 248168404Spjd 249168404Spjdvoid 250168404Spjddbuf_init(void) 251168404Spjd{ 252168404Spjd uint64_t hsize = 1ULL << 16; 253168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 254168404Spjd int i; 255168404Spjd 256168404Spjd /* 257168404Spjd * The hash table is big enough to fill all of physical memory 258168404Spjd * with an average 4K block size. The table will take up 259168404Spjd * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 260168404Spjd */ 261168696Spjd while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 262168404Spjd hsize <<= 1; 263168404Spjd 264168404Spjdretry: 265168404Spjd h->hash_table_mask = hsize - 1; 266168404Spjd h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 267168404Spjd if (h->hash_table == NULL) { 268168404Spjd /* XXX - we should really return an error instead of assert */ 269168404Spjd ASSERT(hsize > (1ULL << 10)); 270168404Spjd hsize >>= 1; 271168404Spjd goto retry; 272168404Spjd } 273168404Spjd 274168404Spjd dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 275168404Spjd sizeof (dmu_buf_impl_t), 276168404Spjd 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 277168404Spjd 278168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 279168404Spjd mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 280168404Spjd} 281168404Spjd 282168404Spjdvoid 283168404Spjddbuf_fini(void) 284168404Spjd{ 285168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 286168404Spjd int i; 287168404Spjd 
288168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 289168404Spjd mutex_destroy(&h->hash_mutexes[i]); 290168404Spjd kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 291168404Spjd kmem_cache_destroy(dbuf_cache); 292168404Spjd} 293168404Spjd 294168404Spjd/* 295168404Spjd * Other stuff. 296168404Spjd */ 297168404Spjd 298168404Spjd#ifdef ZFS_DEBUG 299168404Spjdstatic void 300168404Spjddbuf_verify(dmu_buf_impl_t *db) 301168404Spjd{ 302219089Spjd dnode_t *dn; 303219089Spjd dbuf_dirty_record_t *dr; 304168404Spjd 305168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 306168404Spjd 307168404Spjd if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 308168404Spjd return; 309168404Spjd 310168404Spjd ASSERT(db->db_objset != NULL); 311219089Spjd DB_DNODE_ENTER(db); 312219089Spjd dn = DB_DNODE(db); 313168404Spjd if (dn == NULL) { 314168404Spjd ASSERT(db->db_parent == NULL); 315168404Spjd ASSERT(db->db_blkptr == NULL); 316168404Spjd } else { 317168404Spjd ASSERT3U(db->db.db_object, ==, dn->dn_object); 318168404Spjd ASSERT3P(db->db_objset, ==, dn->dn_objset); 319168404Spjd ASSERT3U(db->db_level, <, dn->dn_nlevels); 320219089Spjd ASSERT(db->db_blkid == DMU_BONUS_BLKID || 321219089Spjd db->db_blkid == DMU_SPILL_BLKID || 322219089Spjd !list_is_empty(&dn->dn_dbufs)); 323168404Spjd } 324219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 325168404Spjd ASSERT(dn != NULL); 326185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 327219089Spjd ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 328219089Spjd } else if (db->db_blkid == DMU_SPILL_BLKID) { 329219089Spjd ASSERT(dn != NULL); 330219089Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 331240415Smm ASSERT0(db->db.db_offset); 332168404Spjd } else { 333168404Spjd ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 334168404Spjd } 335168404Spjd 336219089Spjd for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 337219089Spjd ASSERT(dr->dr_dbuf == db); 338219089Spjd 339219089Spjd for (dr = db->db_last_dirty; dr != 
NULL; dr = dr->dr_next) 340219089Spjd ASSERT(dr->dr_dbuf == db); 341219089Spjd 342208047Smm /* 343208047Smm * We can't assert that db_size matches dn_datablksz because it 344208047Smm * can be momentarily different when another thread is doing 345208047Smm * dnode_set_blksz(). 346208047Smm */ 347208047Smm if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 348219089Spjd dr = db->db_data_pending; 349208047Smm /* 350208047Smm * It should only be modified in syncing context, so 351208047Smm * make sure we only have one copy of the data. 352208047Smm */ 353208047Smm ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 354168404Spjd } 355168404Spjd 356168404Spjd /* verify db->db_blkptr */ 357168404Spjd if (db->db_blkptr) { 358168404Spjd if (db->db_parent == dn->dn_dbuf) { 359168404Spjd /* db is pointed to by the dnode */ 360168404Spjd /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 361209962Smm if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 362168404Spjd ASSERT(db->db_parent == NULL); 363168404Spjd else 364168404Spjd ASSERT(db->db_parent != NULL); 365219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 366219089Spjd ASSERT3P(db->db_blkptr, ==, 367219089Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 368168404Spjd } else { 369168404Spjd /* db is pointed to by an indirect block */ 370168404Spjd int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 371168404Spjd ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 372168404Spjd ASSERT3U(db->db_parent->db.db_object, ==, 373168404Spjd db->db.db_object); 374168404Spjd /* 375168404Spjd * dnode_grow_indblksz() can make this fail if we don't 376168404Spjd * have the struct_rwlock. XXX indblksz no longer 377168404Spjd * grows. safe to do this now? 
378168404Spjd */ 379219089Spjd if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 380168404Spjd ASSERT3P(db->db_blkptr, ==, 381168404Spjd ((blkptr_t *)db->db_parent->db.db_data + 382168404Spjd db->db_blkid % epb)); 383168404Spjd } 384168404Spjd } 385168404Spjd } 386168404Spjd if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 387219089Spjd (db->db_buf == NULL || db->db_buf->b_data) && 388219089Spjd db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 389168404Spjd db->db_state != DB_FILL && !dn->dn_free_txg) { 390168404Spjd /* 391168404Spjd * If the blkptr isn't set but they have nonzero data, 392168404Spjd * it had better be dirty, otherwise we'll lose that 393168404Spjd * data when we evict this buffer. 394168404Spjd */ 395168404Spjd if (db->db_dirtycnt == 0) { 396168404Spjd uint64_t *buf = db->db.db_data; 397168404Spjd int i; 398168404Spjd 399168404Spjd for (i = 0; i < db->db.db_size >> 3; i++) { 400168404Spjd ASSERT(buf[i] == 0); 401168404Spjd } 402168404Spjd } 403168404Spjd } 404219089Spjd DB_DNODE_EXIT(db); 405168404Spjd} 406168404Spjd#endif 407168404Spjd 408168404Spjdstatic void 409168404Spjddbuf_update_data(dmu_buf_impl_t *db) 410168404Spjd{ 411168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 412168404Spjd if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 413168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 414168404Spjd *db->db_user_data_ptr_ptr = db->db.db_data; 415168404Spjd } 416168404Spjd} 417168404Spjd 418168404Spjdstatic void 419168404Spjddbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 420168404Spjd{ 421168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 422168404Spjd ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 423168404Spjd db->db_buf = buf; 424168404Spjd if (buf != NULL) { 425168404Spjd ASSERT(buf->b_data != NULL); 426168404Spjd db->db.db_data = buf->b_data; 427168404Spjd if (!arc_released(buf)) 428168404Spjd arc_set_callback(buf, dbuf_do_evict, db); 429168404Spjd dbuf_update_data(db); 430168404Spjd } else { 431168404Spjd 
dbuf_evict_user(db); 432168404Spjd db->db.db_data = NULL; 433219089Spjd if (db->db_state != DB_NOFILL) 434219089Spjd db->db_state = DB_UNCACHED; 435168404Spjd } 436168404Spjd} 437168404Spjd 438219089Spjd/* 439219089Spjd * Loan out an arc_buf for read. Return the loaned arc_buf. 440219089Spjd */ 441219089Spjdarc_buf_t * 442219089Spjddbuf_loan_arcbuf(dmu_buf_impl_t *db) 443219089Spjd{ 444219089Spjd arc_buf_t *abuf; 445219089Spjd 446219089Spjd mutex_enter(&db->db_mtx); 447219089Spjd if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 448219089Spjd int blksz = db->db.db_size; 449219089Spjd spa_t *spa; 450219089Spjd 451219089Spjd mutex_exit(&db->db_mtx); 452219089Spjd DB_GET_SPA(&spa, db); 453219089Spjd abuf = arc_loan_buf(spa, blksz); 454219089Spjd bcopy(db->db.db_data, abuf->b_data, blksz); 455219089Spjd } else { 456219089Spjd abuf = db->db_buf; 457219089Spjd arc_loan_inuse_buf(abuf, db); 458219089Spjd dbuf_set_data(db, NULL); 459219089Spjd mutex_exit(&db->db_mtx); 460219089Spjd } 461219089Spjd return (abuf); 462219089Spjd} 463219089Spjd 464168404Spjduint64_t 465168404Spjddbuf_whichblock(dnode_t *dn, uint64_t offset) 466168404Spjd{ 467168404Spjd if (dn->dn_datablkshift) { 468168404Spjd return (offset >> dn->dn_datablkshift); 469168404Spjd } else { 470168404Spjd ASSERT3U(offset, <, dn->dn_datablksz); 471168404Spjd return (0); 472168404Spjd } 473168404Spjd} 474168404Spjd 475168404Spjdstatic void 476168404Spjddbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 477168404Spjd{ 478168404Spjd dmu_buf_impl_t *db = vdb; 479168404Spjd 480168404Spjd mutex_enter(&db->db_mtx); 481168404Spjd ASSERT3U(db->db_state, ==, DB_READ); 482168404Spjd /* 483168404Spjd * All reads are synchronous, so we must have a hold on the dbuf 484168404Spjd */ 485168404Spjd ASSERT(refcount_count(&db->db_holds) > 0); 486168404Spjd ASSERT(db->db_buf == NULL); 487168404Spjd ASSERT(db->db.db_data == NULL); 488168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 489168404Spjd /* we 
were freed in flight; disregard any error */ 490168404Spjd arc_release(buf, db); 491168404Spjd bzero(buf->b_data, db->db.db_size); 492168404Spjd arc_buf_freeze(buf); 493168404Spjd db->db_freed_in_flight = FALSE; 494168404Spjd dbuf_set_data(db, buf); 495168404Spjd db->db_state = DB_CACHED; 496168404Spjd } else if (zio == NULL || zio->io_error == 0) { 497168404Spjd dbuf_set_data(db, buf); 498168404Spjd db->db_state = DB_CACHED; 499168404Spjd } else { 500219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 501168404Spjd ASSERT3P(db->db_buf, ==, NULL); 502248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 503168404Spjd db->db_state = DB_UNCACHED; 504168404Spjd } 505168404Spjd cv_broadcast(&db->db_changed); 506219089Spjd dbuf_rele_and_unlock(db, NULL); 507168404Spjd} 508168404Spjd 509168404Spjdstatic void 510168404Spjddbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 511168404Spjd{ 512219089Spjd dnode_t *dn; 513219089Spjd spa_t *spa; 514168404Spjd zbookmark_t zb; 515168404Spjd uint32_t aflags = ARC_NOWAIT; 516168404Spjd 517219089Spjd DB_DNODE_ENTER(db); 518219089Spjd dn = DB_DNODE(db); 519168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 520168404Spjd /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ 521185029Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 522168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 523168404Spjd ASSERT(db->db_state == DB_UNCACHED); 524168404Spjd ASSERT(db->db_buf == NULL); 525168404Spjd 526219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 527207624Smm int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 528185029Spjd 529185029Spjd ASSERT3U(bonuslen, <=, db->db.db_size); 530168404Spjd db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 531208373Smm arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 532185029Spjd if (bonuslen < DN_MAX_BONUSLEN) 533168404Spjd bzero(db->db.db_data, DN_MAX_BONUSLEN); 534207624Smm if (bonuslen) 535207624Smm bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 536219089Spjd DB_DNODE_EXIT(db); 537168404Spjd dbuf_update_data(db); 538168404Spjd db->db_state = DB_CACHED; 539168404Spjd mutex_exit(&db->db_mtx); 540168404Spjd return; 541168404Spjd } 542168404Spjd 543185029Spjd /* 544185029Spjd * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 545185029Spjd * processes the delete record and clears the bp while we are waiting 546185029Spjd * for the dn_mtx (resulting in a "no" from block_freed). 
547185029Spjd */ 548185029Spjd if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 549185029Spjd (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 550185029Spjd BP_IS_HOLE(db->db_blkptr)))) { 551168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 552168404Spjd 553185029Spjd dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, 554168404Spjd db->db.db_size, db, type)); 555219089Spjd DB_DNODE_EXIT(db); 556168404Spjd bzero(db->db.db_data, db->db.db_size); 557168404Spjd db->db_state = DB_CACHED; 558168404Spjd *flags |= DB_RF_CACHED; 559168404Spjd mutex_exit(&db->db_mtx); 560168404Spjd return; 561168404Spjd } 562168404Spjd 563219089Spjd spa = dn->dn_objset->os_spa; 564219089Spjd DB_DNODE_EXIT(db); 565219089Spjd 566168404Spjd db->db_state = DB_READ; 567168404Spjd mutex_exit(&db->db_mtx); 568168404Spjd 569185029Spjd if (DBUF_IS_L2CACHEABLE(db)) 570185029Spjd aflags |= ARC_L2CACHE; 571185029Spjd 572219089Spjd SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 573219089Spjd db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 574219089Spjd db->db.db_object, db->db_level, db->db_blkid); 575168404Spjd 576168404Spjd dbuf_add_ref(db, NULL); 577185029Spjd 578246666Smm (void) arc_read(zio, spa, db->db_blkptr, 579168404Spjd dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 580168404Spjd (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 581168404Spjd &aflags, &zb); 582168404Spjd if (aflags & ARC_CACHED) 583168404Spjd *flags |= DB_RF_CACHED; 584168404Spjd} 585168404Spjd 586168404Spjdint 587168404Spjddbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 588168404Spjd{ 589168404Spjd int err = 0; 590168404Spjd int havepzio = (zio != NULL); 591168404Spjd int prefetch; 592219089Spjd dnode_t *dn; 593168404Spjd 594168404Spjd /* 595168404Spjd * We don't have to hold the mutex to check db_state because it 596168404Spjd * can't be freed while we have a hold on the buffer. 
597168404Spjd */ 598168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 599168404Spjd 600219089Spjd if (db->db_state == DB_NOFILL) 601249195Smm return (SET_ERROR(EIO)); 602219089Spjd 603219089Spjd DB_DNODE_ENTER(db); 604219089Spjd dn = DB_DNODE(db); 605168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 606219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 607168404Spjd 608219089Spjd prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 609219089Spjd (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 610185029Spjd DBUF_IS_CACHEABLE(db); 611168404Spjd 612168404Spjd mutex_enter(&db->db_mtx); 613168404Spjd if (db->db_state == DB_CACHED) { 614168404Spjd mutex_exit(&db->db_mtx); 615168404Spjd if (prefetch) 616219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 617168404Spjd db->db.db_size, TRUE); 618168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 619219089Spjd rw_exit(&dn->dn_struct_rwlock); 620219089Spjd DB_DNODE_EXIT(db); 621168404Spjd } else if (db->db_state == DB_UNCACHED) { 622219089Spjd spa_t *spa = dn->dn_objset->os_spa; 623219089Spjd 624219089Spjd if (zio == NULL) 625219089Spjd zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 626168404Spjd dbuf_read_impl(db, zio, &flags); 627168404Spjd 628168404Spjd /* dbuf_read_impl has dropped db_mtx for us */ 629168404Spjd 630168404Spjd if (prefetch) 631219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 632168404Spjd db->db.db_size, flags & DB_RF_CACHED); 633168404Spjd 634168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 635219089Spjd rw_exit(&dn->dn_struct_rwlock); 636219089Spjd DB_DNODE_EXIT(db); 637168404Spjd 638168404Spjd if (!havepzio) 639168404Spjd err = zio_wait(zio); 640168404Spjd } else { 641168404Spjd mutex_exit(&db->db_mtx); 642168404Spjd if (prefetch) 643219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 644168404Spjd db->db.db_size, TRUE); 645168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 646219089Spjd rw_exit(&dn->dn_struct_rwlock); 647219089Spjd DB_DNODE_EXIT(db); 648168404Spjd 
649168404Spjd mutex_enter(&db->db_mtx); 650168404Spjd if ((flags & DB_RF_NEVERWAIT) == 0) { 651168404Spjd while (db->db_state == DB_READ || 652168404Spjd db->db_state == DB_FILL) { 653168404Spjd ASSERT(db->db_state == DB_READ || 654168404Spjd (flags & DB_RF_HAVESTRUCT) == 0); 655168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 656168404Spjd } 657168404Spjd if (db->db_state == DB_UNCACHED) 658249195Smm err = SET_ERROR(EIO); 659168404Spjd } 660168404Spjd mutex_exit(&db->db_mtx); 661168404Spjd } 662168404Spjd 663168404Spjd ASSERT(err || havepzio || db->db_state == DB_CACHED); 664168404Spjd return (err); 665168404Spjd} 666168404Spjd 667168404Spjdstatic void 668168404Spjddbuf_noread(dmu_buf_impl_t *db) 669168404Spjd{ 670168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 671219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 672168404Spjd mutex_enter(&db->db_mtx); 673168404Spjd while (db->db_state == DB_READ || db->db_state == DB_FILL) 674168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 675168404Spjd if (db->db_state == DB_UNCACHED) { 676168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 677219089Spjd spa_t *spa; 678168404Spjd 679168404Spjd ASSERT(db->db_buf == NULL); 680168404Spjd ASSERT(db->db.db_data == NULL); 681219089Spjd DB_GET_SPA(&spa, db); 682219089Spjd dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 683168404Spjd db->db_state = DB_FILL; 684219089Spjd } else if (db->db_state == DB_NOFILL) { 685219089Spjd dbuf_set_data(db, NULL); 686168404Spjd } else { 687168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 688168404Spjd } 689168404Spjd mutex_exit(&db->db_mtx); 690168404Spjd} 691168404Spjd 692168404Spjd/* 693168404Spjd * This is our just-in-time copy function. It makes a copy of 694168404Spjd * buffers, that have been modified in a previous transaction 695168404Spjd * group, before we modify them in the current active group. 
696168404Spjd * 697168404Spjd * This function is used in two places: when we are dirtying a 698168404Spjd * buffer for the first time in a txg, and when we are freeing 699168404Spjd * a range in a dnode that includes this buffer. 700168404Spjd * 701168404Spjd * Note that when we are called from dbuf_free_range() we do 702168404Spjd * not put a hold on the buffer, we just traverse the active 703168404Spjd * dbuf list for the dnode. 704168404Spjd */ 705168404Spjdstatic void 706168404Spjddbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 707168404Spjd{ 708168404Spjd dbuf_dirty_record_t *dr = db->db_last_dirty; 709168404Spjd 710168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 711168404Spjd ASSERT(db->db.db_data != NULL); 712168404Spjd ASSERT(db->db_level == 0); 713168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 714168404Spjd 715168404Spjd if (dr == NULL || 716168404Spjd (dr->dt.dl.dr_data != 717219089Spjd ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 718168404Spjd return; 719168404Spjd 720168404Spjd /* 721168404Spjd * If the last dirty record for this dbuf has not yet synced 722168404Spjd * and its referencing the dbuf data, either: 723219089Spjd * reset the reference to point to a new copy, 724168404Spjd * or (if there a no active holders) 725168404Spjd * just null out the current db_data pointer. 
726168404Spjd */ 727168404Spjd ASSERT(dr->dr_txg >= txg - 2); 728219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 729168404Spjd /* Note that the data bufs here are zio_bufs */ 730168404Spjd dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 731208373Smm arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 732168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 733168404Spjd } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 734168404Spjd int size = db->db.db_size; 735168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 736219089Spjd spa_t *spa; 737219089Spjd 738219089Spjd DB_GET_SPA(&spa, db); 739219089Spjd dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); 740168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 741168404Spjd } else { 742168404Spjd dbuf_set_data(db, NULL); 743168404Spjd } 744168404Spjd} 745168404Spjd 746168404Spjdvoid 747168404Spjddbuf_unoverride(dbuf_dirty_record_t *dr) 748168404Spjd{ 749168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 750219089Spjd blkptr_t *bp = &dr->dt.dl.dr_overridden_by; 751168404Spjd uint64_t txg = dr->dr_txg; 752168404Spjd 753168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 754168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 755168404Spjd ASSERT(db->db_level == 0); 756168404Spjd 757219089Spjd if (db->db_blkid == DMU_BONUS_BLKID || 758168404Spjd dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 759168404Spjd return; 760168404Spjd 761219089Spjd ASSERT(db->db_data_pending != dr); 762219089Spjd 763168404Spjd /* free this block */ 764243524Smm if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) { 765219089Spjd spa_t *spa; 766219089Spjd 767219089Spjd DB_GET_SPA(&spa, db); 768219089Spjd zio_free(spa, txg, bp); 769168404Spjd } 770168404Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 771243524Smm dr->dt.dl.dr_nopwrite = B_FALSE; 772243524Smm 773168404Spjd /* 774168404Spjd * Release the already-written buffer, so we leave it in 775168404Spjd * a consistent dirty state. 
/*
 * Evict (if its unreferenced) or clear (if its referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
 *
 * 'start' and 'end' are inclusive level-0 block ids; 'end' is clamped
 * to dn_maxblkid unless it names the spill block.  Caller supplies the
 * open-context transaction 'tx' used to (re)dirty surviving buffers.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	/* level-1 block ids covering the level-0 range [start, end] */
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		/* grab next first, since dbuf_undirty/dbuf_clear may drop db */
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				/* keep this indirect in memory for the free */
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Return nonzero if the block backing this dbuf could be freed
 * (i.e. would actually release space) — presumably meaning it was
 * born after the most recent snapshot; the exact semantics live in
 * dsl_dataset_block_freeable().  Caller must hold db_mtx.
 */
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/*
	 * If we don't exist or are in a snapshot, we can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (FALSE);
}
/*
 * Resize the in-core buffer of 'db' to 'size' bytes: allocate a new ARC
 * buffer, copy the old contents (zero-extending if growing), swap it in,
 * and dirty the dbuf in 'tx'.  Caller must hold dn_struct_rwlock as
 * writer; the bonus buffer is not eligible.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		/* the dirty record must point at the new buffer too */
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

/*
 * Release (arc_release) the ARC buffer backing this dbuf.  Only legal
 * from syncing context, as the assertions verify.
 */
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os;

	DB_GET_OBJSET(&os, db);
	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}
/*
 * Mark this dbuf dirty in transaction 'tx', creating (or reusing) the
 * dirty record for tx's txg, and recursively dirty the parent chain up
 * to the dnode.  Returns the dirty record for this (db, txg) pair.
 * Caller must already hold the dbuf (db_holds != 0).
 */
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		/* 1-byte tag recording who dirtied first (debug aid) */
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	/* dirty records are kept newest-first; find our txg's slot */
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		/* bonus/spill blocks hang directly off the dnode */
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		/* dirty the parent indirect and attach our record to it */
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		/* top level: record goes on the dnode's dirty list */
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}
/*
 * Remove this dbuf's dirty record for tx's txg, if any, undoing the
 * bookkeeping done by dbuf_dirty().  Return TRUE if this evicted the
 * dbuf (in which case db_mtx has been dropped and the dbuf destroyed);
 * FALSE otherwise (db_mtx still held).  Caller must hold db_mtx; only
 * level-0, non-bonus buffers are eligible.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/*
	 * Note:  This code will probably work even if there are concurrent
	 * holders, but it is untested in that scenerio, as the ZPL and
	 * ztest have additional locking (the range locks) that prevents
	 * that type of concurrent access.
	 */
	ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	/* unlink the record from the dbuf's dirty list */
	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		/* drop the copy made for the older txg, if any */
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	/* drop the hold dbuf_dirty() took for this txg */
	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1364168404Spjd rf |= DB_RF_HAVESTRUCT; 1365219089Spjd DB_DNODE_EXIT(db); 1366168404Spjd (void) dbuf_read(db, NULL, rf); 1367168404Spjd (void) dbuf_dirty(db, tx); 1368168404Spjd} 1369168404Spjd 1370168404Spjdvoid 1371219089Spjddmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1372219089Spjd{ 1373219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1374219089Spjd 1375219089Spjd db->db_state = DB_NOFILL; 1376219089Spjd 1377219089Spjd dmu_buf_will_fill(db_fake, tx); 1378219089Spjd} 1379219089Spjd 1380219089Spjdvoid 1381168404Spjddmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1382168404Spjd{ 1383168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1384168404Spjd 1385219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1386168404Spjd ASSERT(tx->tx_txg != 0); 1387168404Spjd ASSERT(db->db_level == 0); 1388168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1389168404Spjd 1390168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1391168404Spjd dmu_tx_private_ok(tx)); 1392168404Spjd 1393168404Spjd dbuf_noread(db); 1394168404Spjd (void) dbuf_dirty(db, tx); 1395168404Spjd} 1396168404Spjd 1397168404Spjd#pragma weak dmu_buf_fill_done = dbuf_fill_done 1398168404Spjd/* ARGSUSED */ 1399168404Spjdvoid 1400168404Spjddbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1401168404Spjd{ 1402168404Spjd mutex_enter(&db->db_mtx); 1403168404Spjd DBUF_VERIFY(db); 1404168404Spjd 1405168404Spjd if (db->db_state == DB_FILL) { 1406168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 1407219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1408168404Spjd /* we were freed while filling */ 1409168404Spjd /* XXX dbuf_undirty? 
/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
 *
 * In both cases ownership of 'buf' transfers to this function; the caller
 * must not use it afterwards.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	/* wait for any in-flight read or fill to finish */
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		/* other holders exist: must copy rather than swap buffers */
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			/* already dirty this txg: swap the dirty data too */
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dbuf_fill_done(db, tx);
}
For callers from the DMU we will usually see: 1488168404Spjd * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() 1489168404Spjd * For the arc callback, we will usually see: 1490219089Spjd * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1491168404Spjd * Sometimes, though, we will get a mix of these two: 1492168404Spjd * DMU: dbuf_clear()->arc_buf_evict() 1493168404Spjd * ARC: dbuf_do_evict()->dbuf_destroy() 1494168404Spjd */ 1495168404Spjdvoid 1496168404Spjddbuf_clear(dmu_buf_impl_t *db) 1497168404Spjd{ 1498219089Spjd dnode_t *dn; 1499168404Spjd dmu_buf_impl_t *parent = db->db_parent; 1500219089Spjd dmu_buf_impl_t *dndb; 1501168404Spjd int dbuf_gone = FALSE; 1502168404Spjd 1503168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 1504168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1505168404Spjd 1506168404Spjd dbuf_evict_user(db); 1507168404Spjd 1508168404Spjd if (db->db_state == DB_CACHED) { 1509168404Spjd ASSERT(db->db.db_data != NULL); 1510219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 1511168404Spjd zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1512208373Smm arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1513185029Spjd } 1514168404Spjd db->db.db_data = NULL; 1515168404Spjd db->db_state = DB_UNCACHED; 1516168404Spjd } 1517168404Spjd 1518219089Spjd ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1519168404Spjd ASSERT(db->db_data_pending == NULL); 1520168404Spjd 1521168404Spjd db->db_state = DB_EVICTING; 1522168404Spjd db->db_blkptr = NULL; 1523168404Spjd 1524219089Spjd DB_DNODE_ENTER(db); 1525219089Spjd dn = DB_DNODE(db); 1526219089Spjd dndb = dn->dn_dbuf; 1527219089Spjd if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1528168404Spjd list_remove(&dn->dn_dbufs, db); 1529219089Spjd (void) atomic_dec_32_nv(&dn->dn_dbufs_count); 1530219089Spjd membar_producer(); 1531219089Spjd DB_DNODE_EXIT(db); 1532219089Spjd /* 1533219089Spjd * Decrementing the dbuf count means that the hold corresponding 1534219089Spjd * to 
the removed dbuf is no longer discounted in dnode_move(), 1535219089Spjd * so the dnode cannot be moved until after we release the hold. 1536219089Spjd * The membar_producer() ensures visibility of the decremented 1537219089Spjd * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 1538219089Spjd * release any lock. 1539219089Spjd */ 1540168404Spjd dnode_rele(dn, db); 1541219089Spjd db->db_dnode_handle = NULL; 1542219089Spjd } else { 1543219089Spjd DB_DNODE_EXIT(db); 1544168404Spjd } 1545168404Spjd 1546168404Spjd if (db->db_buf) 1547168404Spjd dbuf_gone = arc_buf_evict(db->db_buf); 1548168404Spjd 1549168404Spjd if (!dbuf_gone) 1550168404Spjd mutex_exit(&db->db_mtx); 1551168404Spjd 1552168404Spjd /* 1553219089Spjd * If this dbuf is referenced from an indirect dbuf, 1554168404Spjd * decrement the ref count on the indirect dbuf. 1555168404Spjd */ 1556168404Spjd if (parent && parent != dndb) 1557168404Spjd dbuf_rele(parent, db); 1558168404Spjd} 1559168404Spjd 1560168404Spjdstatic int 1561168404Spjddbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1562168404Spjd dmu_buf_impl_t **parentp, blkptr_t **bpp) 1563168404Spjd{ 1564168404Spjd int nlevels, epbs; 1565168404Spjd 1566168404Spjd *parentp = NULL; 1567168404Spjd *bpp = NULL; 1568168404Spjd 1569219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1570168404Spjd 1571219089Spjd if (blkid == DMU_SPILL_BLKID) { 1572219089Spjd mutex_enter(&dn->dn_mtx); 1573219089Spjd if (dn->dn_have_spill && 1574219089Spjd (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1575219089Spjd *bpp = &dn->dn_phys->dn_spill; 1576219089Spjd else 1577219089Spjd *bpp = NULL; 1578219089Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 1579219089Spjd *parentp = dn->dn_dbuf; 1580219089Spjd mutex_exit(&dn->dn_mtx); 1581219089Spjd return (0); 1582219089Spjd } 1583219089Spjd 1584168404Spjd if (dn->dn_phys->dn_nlevels == 0) 1585168404Spjd nlevels = 1; 1586168404Spjd else 1587168404Spjd nlevels = dn->dn_phys->dn_nlevels; 1588168404Spjd 1589168404Spjd 
/*
 * Allocate and initialize a new dbuf for (level, blkid), insert it into
 * the dbuf hash table and the dnode's dn_dbufs list, and return it with
 * state DB_UNCACHED.  If another thread raced us and inserted an
 * equivalent dbuf first, our allocation is freed and the winner is
 * returned instead.  The bonus dbuf is special: it is never hashed.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		/* bonus size is what remains of the dnode after the blkptrs */
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}
1682168404Spjd */ 1683168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 1684168404Spjd db->db_state = DB_EVICTING; 1685168404Spjd if ((odb = dbuf_hash_insert(db)) != NULL) { 1686168404Spjd /* someone else inserted it first */ 1687168404Spjd kmem_cache_free(dbuf_cache, db); 1688168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1689168404Spjd return (odb); 1690168404Spjd } 1691168404Spjd list_insert_head(&dn->dn_dbufs, db); 1692168404Spjd db->db_state = DB_UNCACHED; 1693168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1694208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1695168404Spjd 1696168404Spjd if (parent && parent != dn->dn_dbuf) 1697168404Spjd dbuf_add_ref(parent, db); 1698168404Spjd 1699168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1700168404Spjd refcount_count(&dn->dn_holds) > 0); 1701168404Spjd (void) refcount_add(&dn->dn_holds, db); 1702219089Spjd (void) atomic_inc_32_nv(&dn->dn_dbufs_count); 1703168404Spjd 1704168404Spjd dprintf_dbuf(db, "db=%p\n", db); 1705168404Spjd 1706168404Spjd return (db); 1707168404Spjd} 1708168404Spjd 1709168404Spjdstatic int 1710168404Spjddbuf_do_evict(void *private) 1711168404Spjd{ 1712168404Spjd arc_buf_t *buf = private; 1713168404Spjd dmu_buf_impl_t *db = buf->b_private; 1714168404Spjd 1715168404Spjd if (!MUTEX_HELD(&db->db_mtx)) 1716168404Spjd mutex_enter(&db->db_mtx); 1717168404Spjd 1718168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1719168404Spjd 1720168404Spjd if (db->db_state != DB_EVICTING) { 1721168404Spjd ASSERT(db->db_state == DB_CACHED); 1722168404Spjd DBUF_VERIFY(db); 1723168404Spjd db->db_buf = NULL; 1724168404Spjd dbuf_evict(db); 1725168404Spjd } else { 1726168404Spjd mutex_exit(&db->db_mtx); 1727168404Spjd dbuf_destroy(db); 1728168404Spjd } 1729168404Spjd return (0); 1730168404Spjd} 1731168404Spjd 1732168404Spjdstatic void 1733168404Spjddbuf_destroy(dmu_buf_impl_t *db) 1734168404Spjd{ 1735168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1736168404Spjd 1737219089Spjd if (db->db_blkid != 
DMU_BONUS_BLKID) { 1738168404Spjd /* 1739168404Spjd * If this dbuf is still on the dn_dbufs list, 1740168404Spjd * remove it from that list. 1741168404Spjd */ 1742219089Spjd if (db->db_dnode_handle != NULL) { 1743219089Spjd dnode_t *dn; 1744185029Spjd 1745219089Spjd DB_DNODE_ENTER(db); 1746219089Spjd dn = DB_DNODE(db); 1747168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 1748168404Spjd list_remove(&dn->dn_dbufs, db); 1749219089Spjd (void) atomic_dec_32_nv(&dn->dn_dbufs_count); 1750168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1751219089Spjd DB_DNODE_EXIT(db); 1752219089Spjd /* 1753219089Spjd * Decrementing the dbuf count means that the hold 1754219089Spjd * corresponding to the removed dbuf is no longer 1755219089Spjd * discounted in dnode_move(), so the dnode cannot be 1756219089Spjd * moved until after we release the hold. 1757219089Spjd */ 1758168404Spjd dnode_rele(dn, db); 1759219089Spjd db->db_dnode_handle = NULL; 1760168404Spjd } 1761168404Spjd dbuf_hash_remove(db); 1762168404Spjd } 1763168404Spjd db->db_parent = NULL; 1764168404Spjd db->db_buf = NULL; 1765168404Spjd 1766185029Spjd ASSERT(!list_link_active(&db->db_link)); 1767168404Spjd ASSERT(db->db.db_data == NULL); 1768168404Spjd ASSERT(db->db_hash_next == NULL); 1769168404Spjd ASSERT(db->db_blkptr == NULL); 1770168404Spjd ASSERT(db->db_data_pending == NULL); 1771168404Spjd 1772168404Spjd kmem_cache_free(dbuf_cache, db); 1773208373Smm arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1774168404Spjd} 1775168404Spjd 1776168404Spjdvoid 1777168404Spjddbuf_prefetch(dnode_t *dn, uint64_t blkid) 1778168404Spjd{ 1779168404Spjd dmu_buf_impl_t *db = NULL; 1780168404Spjd blkptr_t *bp = NULL; 1781168404Spjd 1782219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1783168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1784168404Spjd 1785168404Spjd if (dnode_block_freed(dn, blkid)) 1786168404Spjd return; 1787168404Spjd 1788168404Spjd /* dbuf_find() returns with db_mtx held */ 1789168404Spjd if (db = dbuf_find(dn, 0, 
blkid)) { 1790219089Spjd /* 1791219089Spjd * This dbuf is already in the cache. We assume that 1792219089Spjd * it is already CACHED, or else about to be either 1793219089Spjd * read or filled. 1794219089Spjd */ 1795168404Spjd mutex_exit(&db->db_mtx); 1796219089Spjd return; 1797168404Spjd } 1798168404Spjd 1799168404Spjd if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1800168404Spjd if (bp && !BP_IS_HOLE(bp)) { 1801219089Spjd int priority = dn->dn_type == DMU_OT_DDT_ZAP ? 1802219089Spjd ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; 1803219089Spjd dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1804168404Spjd uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1805168404Spjd zbookmark_t zb; 1806168404Spjd 1807219089Spjd SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1808219089Spjd dn->dn_object, 0, blkid); 1809219089Spjd 1810246666Smm (void) arc_read(NULL, dn->dn_objset->os_spa, 1811246666Smm bp, NULL, NULL, priority, 1812168404Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1813168404Spjd &aflags, &zb); 1814168404Spjd } 1815168404Spjd if (db) 1816168404Spjd dbuf_rele(db, NULL); 1817168404Spjd } 1818168404Spjd} 1819168404Spjd 1820168404Spjd/* 1821168404Spjd * Returns with db_holds incremented, and db_mtx not held. 1822168404Spjd * Note: dn_struct_rwlock must be held. 
1823168404Spjd */ 1824168404Spjdint 1825168404Spjddbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1826168404Spjd void *tag, dmu_buf_impl_t **dbp) 1827168404Spjd{ 1828168404Spjd dmu_buf_impl_t *db, *parent = NULL; 1829168404Spjd 1830219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1831168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1832168404Spjd ASSERT3U(dn->dn_nlevels, >, level); 1833168404Spjd 1834168404Spjd *dbp = NULL; 1835168404Spjdtop: 1836168404Spjd /* dbuf_find() returns with db_mtx held */ 1837168404Spjd db = dbuf_find(dn, level, blkid); 1838168404Spjd 1839168404Spjd if (db == NULL) { 1840168404Spjd blkptr_t *bp = NULL; 1841168404Spjd int err; 1842168404Spjd 1843168404Spjd ASSERT3P(parent, ==, NULL); 1844168404Spjd err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1845168404Spjd if (fail_sparse) { 1846168404Spjd if (err == 0 && bp && BP_IS_HOLE(bp)) 1847249195Smm err = SET_ERROR(ENOENT); 1848168404Spjd if (err) { 1849168404Spjd if (parent) 1850168404Spjd dbuf_rele(parent, NULL); 1851168404Spjd return (err); 1852168404Spjd } 1853168404Spjd } 1854168404Spjd if (err && err != ENOENT) 1855168404Spjd return (err); 1856168404Spjd db = dbuf_create(dn, level, blkid, parent, bp); 1857168404Spjd } 1858168404Spjd 1859168404Spjd if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1860168404Spjd arc_buf_add_ref(db->db_buf, db); 1861168404Spjd if (db->db_buf->b_data == NULL) { 1862168404Spjd dbuf_clear(db); 1863168404Spjd if (parent) { 1864168404Spjd dbuf_rele(parent, NULL); 1865168404Spjd parent = NULL; 1866168404Spjd } 1867168404Spjd goto top; 1868168404Spjd } 1869168404Spjd ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1870168404Spjd } 1871168404Spjd 1872168404Spjd ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1873168404Spjd 1874168404Spjd /* 1875168404Spjd * If this buffer is currently syncing out, and we are are 1876168404Spjd * still referencing it from db_data, we need to make a copy 1877168404Spjd * 
of it in case we decide we want to dirty it again in this txg. 1878168404Spjd */ 1879219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1880168404Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 1881168404Spjd db->db_state == DB_CACHED && db->db_data_pending) { 1882168404Spjd dbuf_dirty_record_t *dr = db->db_data_pending; 1883168404Spjd 1884168404Spjd if (dr->dt.dl.dr_data == db->db_buf) { 1885168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1886168404Spjd 1887168404Spjd dbuf_set_data(db, 1888219089Spjd arc_buf_alloc(dn->dn_objset->os_spa, 1889168404Spjd db->db.db_size, db, type)); 1890168404Spjd bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1891168404Spjd db->db.db_size); 1892168404Spjd } 1893168404Spjd } 1894168404Spjd 1895168404Spjd (void) refcount_add(&db->db_holds, tag); 1896168404Spjd dbuf_update_data(db); 1897168404Spjd DBUF_VERIFY(db); 1898168404Spjd mutex_exit(&db->db_mtx); 1899168404Spjd 1900168404Spjd /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1901168404Spjd if (parent) 1902168404Spjd dbuf_rele(parent, NULL); 1903168404Spjd 1904219089Spjd ASSERT3P(DB_DNODE(db), ==, dn); 1905168404Spjd ASSERT3U(db->db_blkid, ==, blkid); 1906168404Spjd ASSERT3U(db->db_level, ==, level); 1907168404Spjd *dbp = db; 1908168404Spjd 1909168404Spjd return (0); 1910168404Spjd} 1911168404Spjd 1912168404Spjddmu_buf_impl_t * 1913168404Spjddbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1914168404Spjd{ 1915168404Spjd dmu_buf_impl_t *db; 1916168404Spjd int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1917168404Spjd return (err ? NULL : db); 1918168404Spjd} 1919168404Spjd 1920168404Spjddmu_buf_impl_t * 1921168404Spjddbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1922168404Spjd{ 1923168404Spjd dmu_buf_impl_t *db; 1924168404Spjd int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1925168404Spjd return (err ? 
NULL : db); 1926168404Spjd} 1927168404Spjd 1928185029Spjdvoid 1929168404Spjddbuf_create_bonus(dnode_t *dn) 1930168404Spjd{ 1931168404Spjd ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1932168404Spjd 1933168404Spjd ASSERT(dn->dn_bonus == NULL); 1934219089Spjd dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 1935168404Spjd} 1936168404Spjd 1937219089Spjdint 1938219089Spjddbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 1939219089Spjd{ 1940219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1941219089Spjd dnode_t *dn; 1942219089Spjd 1943219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 1944249195Smm return (SET_ERROR(ENOTSUP)); 1945219089Spjd if (blksz == 0) 1946219089Spjd blksz = SPA_MINBLOCKSIZE; 1947219089Spjd if (blksz > SPA_MAXBLOCKSIZE) 1948219089Spjd blksz = SPA_MAXBLOCKSIZE; 1949219089Spjd else 1950219089Spjd blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 1951219089Spjd 1952219089Spjd DB_DNODE_ENTER(db); 1953219089Spjd dn = DB_DNODE(db); 1954219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 1955219089Spjd dbuf_new_size(db, blksz, tx); 1956219089Spjd rw_exit(&dn->dn_struct_rwlock); 1957219089Spjd DB_DNODE_EXIT(db); 1958219089Spjd 1959219089Spjd return (0); 1960219089Spjd} 1961219089Spjd 1962219089Spjdvoid 1963219089Spjddbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 1964219089Spjd{ 1965219089Spjd dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 1966219089Spjd} 1967219089Spjd 1968168404Spjd#pragma weak dmu_buf_add_ref = dbuf_add_ref 1969168404Spjdvoid 1970168404Spjddbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1971168404Spjd{ 1972168404Spjd int64_t holds = refcount_add(&db->db_holds, tag); 1973168404Spjd ASSERT(holds > 1); 1974168404Spjd} 1975168404Spjd 1976219089Spjd/* 1977219089Spjd * If you call dbuf_rele() you had better not be referencing the dnode handle 1978219089Spjd * unless you have some other direct or indirect hold on the dnode. 
(An indirect 1979219089Spjd * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 1980219089Spjd * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 1981219089Spjd * dnode's parent dbuf evicting its dnode handles. 1982219089Spjd */ 1983168404Spjd#pragma weak dmu_buf_rele = dbuf_rele 1984168404Spjdvoid 1985168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag) 1986168404Spjd{ 1987219089Spjd mutex_enter(&db->db_mtx); 1988219089Spjd dbuf_rele_and_unlock(db, tag); 1989219089Spjd} 1990219089Spjd 1991219089Spjd/* 1992219089Spjd * dbuf_rele() for an already-locked dbuf. This is necessary to allow 1993219089Spjd * db_dirtycnt and db_holds to be updated atomically. 1994219089Spjd */ 1995219089Spjdvoid 1996219089Spjddbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 1997219089Spjd{ 1998168404Spjd int64_t holds; 1999168404Spjd 2000219089Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2001168404Spjd DBUF_VERIFY(db); 2002168404Spjd 2003219089Spjd /* 2004219089Spjd * Remove the reference to the dbuf before removing its hold on the 2005219089Spjd * dnode so we can guarantee in dnode_move() that a referenced bonus 2006219089Spjd * buffer has a corresponding dnode hold. 2007219089Spjd */ 2008168404Spjd holds = refcount_remove(&db->db_holds, tag); 2009168404Spjd ASSERT(holds >= 0); 2010168404Spjd 2011168404Spjd /* 2012168404Spjd * We can't freeze indirects if there is a possibility that they 2013168404Spjd * may be modified in the current syncing context. 2014168404Spjd */ 2015168404Spjd if (db->db_buf && holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) 2016168404Spjd arc_buf_freeze(db->db_buf); 2017168404Spjd 2018168404Spjd if (holds == db->db_dirtycnt && 2019168404Spjd db->db_level == 0 && db->db_immediate_evict) 2020168404Spjd dbuf_evict_user(db); 2021168404Spjd 2022168404Spjd if (holds == 0) { 2023219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2024168404Spjd mutex_exit(&db->db_mtx); 2025219089Spjd 2026219089Spjd /* 2027219089Spjd * If the dnode moves here, we cannot cross this barrier 2028219089Spjd * until the move completes. 2029219089Spjd */ 2030219089Spjd DB_DNODE_ENTER(db); 2031219089Spjd (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); 2032219089Spjd DB_DNODE_EXIT(db); 2033219089Spjd /* 2034219089Spjd * The bonus buffer's dnode hold is no longer discounted 2035219089Spjd * in dnode_move(). The dnode cannot move until after 2036219089Spjd * the dnode_rele(). 2037219089Spjd */ 2038219089Spjd dnode_rele(DB_DNODE(db), db); 2039168404Spjd } else if (db->db_buf == NULL) { 2040168404Spjd /* 2041168404Spjd * This is a special case: we never associated this 2042168404Spjd * dbuf with any data allocated from the ARC. 2043168404Spjd */ 2044219089Spjd ASSERT(db->db_state == DB_UNCACHED || 2045219089Spjd db->db_state == DB_NOFILL); 2046168404Spjd dbuf_evict(db); 2047168404Spjd } else if (arc_released(db->db_buf)) { 2048168404Spjd arc_buf_t *buf = db->db_buf; 2049168404Spjd /* 2050168404Spjd * This dbuf has anonymous data associated with it. 2051168404Spjd */ 2052168404Spjd dbuf_set_data(db, NULL); 2053248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 2054168404Spjd dbuf_evict(db); 2055168404Spjd } else { 2056248571Smm VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2057242845Sdelphij 2058242845Sdelphij /* 2059242845Sdelphij * A dbuf will be eligible for eviction if either the 2060242845Sdelphij * 'primarycache' property is set or a duplicate 2061242845Sdelphij * copy of this buffer is already cached in the arc. 
2062242845Sdelphij * 2063242845Sdelphij * In the case of the 'primarycache' a buffer 2064242845Sdelphij * is considered for eviction if it matches the 2065242845Sdelphij * criteria set in the property. 2066242845Sdelphij * 2067242845Sdelphij * To decide if our buffer is considered a 2068242845Sdelphij * duplicate, we must call into the arc to determine 2069242845Sdelphij * if multiple buffers are referencing the same 2070242845Sdelphij * block on-disk. If so, then we simply evict 2071242845Sdelphij * ourselves. 2072242845Sdelphij */ 2073242845Sdelphij if (!DBUF_IS_CACHEABLE(db) || 2074242845Sdelphij arc_buf_eviction_needed(db->db_buf)) 2075185029Spjd dbuf_clear(db); 2076185029Spjd else 2077185029Spjd mutex_exit(&db->db_mtx); 2078168404Spjd } 2079168404Spjd } else { 2080168404Spjd mutex_exit(&db->db_mtx); 2081168404Spjd } 2082168404Spjd} 2083168404Spjd 2084168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount 2085168404Spjduint64_t 2086168404Spjddbuf_refcount(dmu_buf_impl_t *db) 2087168404Spjd{ 2088168404Spjd return (refcount_count(&db->db_holds)); 2089168404Spjd} 2090168404Spjd 2091168404Spjdvoid * 2092168404Spjddmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2093168404Spjd dmu_buf_evict_func_t *evict_func) 2094168404Spjd{ 2095168404Spjd return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2096168404Spjd user_data_ptr_ptr, evict_func)); 2097168404Spjd} 2098168404Spjd 2099168404Spjdvoid * 2100168404Spjddmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2101168404Spjd dmu_buf_evict_func_t *evict_func) 2102168404Spjd{ 2103168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2104168404Spjd 2105168404Spjd db->db_immediate_evict = TRUE; 2106168404Spjd return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2107168404Spjd user_data_ptr_ptr, evict_func)); 2108168404Spjd} 2109168404Spjd 2110168404Spjdvoid * 2111168404Spjddmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 2112168404Spjd 
void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 2113168404Spjd{ 2114168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2115168404Spjd ASSERT(db->db_level == 0); 2116168404Spjd 2117168404Spjd ASSERT((user_ptr == NULL) == (evict_func == NULL)); 2118168404Spjd 2119168404Spjd mutex_enter(&db->db_mtx); 2120168404Spjd 2121168404Spjd if (db->db_user_ptr == old_user_ptr) { 2122168404Spjd db->db_user_ptr = user_ptr; 2123168404Spjd db->db_user_data_ptr_ptr = user_data_ptr_ptr; 2124168404Spjd db->db_evict_func = evict_func; 2125168404Spjd 2126168404Spjd dbuf_update_data(db); 2127168404Spjd } else { 2128168404Spjd old_user_ptr = db->db_user_ptr; 2129168404Spjd } 2130168404Spjd 2131168404Spjd mutex_exit(&db->db_mtx); 2132168404Spjd return (old_user_ptr); 2133168404Spjd} 2134168404Spjd 2135168404Spjdvoid * 2136168404Spjddmu_buf_get_user(dmu_buf_t *db_fake) 2137168404Spjd{ 2138168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2139168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 2140168404Spjd 2141168404Spjd return (db->db_user_ptr); 2142168404Spjd} 2143168404Spjd 2144209962Smmboolean_t 2145209962Smmdmu_buf_freeable(dmu_buf_t *dbuf) 2146209962Smm{ 2147209962Smm boolean_t res = B_FALSE; 2148209962Smm dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2149209962Smm 2150209962Smm if (db->db_blkptr) 2151209962Smm res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2152219089Spjd db->db_blkptr, db->db_blkptr->blk_birth); 2153209962Smm 2154209962Smm return (res); 2155209962Smm} 2156209962Smm 2157243524Smmblkptr_t * 2158243524Smmdmu_buf_get_blkptr(dmu_buf_t *db) 2159243524Smm{ 2160243524Smm dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2161243524Smm return (dbi->db_blkptr); 2162243524Smm} 2163243524Smm 2164168404Spjdstatic void 2165168404Spjddbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2166168404Spjd{ 2167168404Spjd /* ASSERT(dmu_tx_is_syncing(tx) */ 2168168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2169168404Spjd 2170168404Spjd if (db->db_blkptr 
!= NULL) 2171168404Spjd return; 2172168404Spjd 2173219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2174219089Spjd db->db_blkptr = &dn->dn_phys->dn_spill; 2175219089Spjd BP_ZERO(db->db_blkptr); 2176219089Spjd return; 2177219089Spjd } 2178168404Spjd if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2179168404Spjd /* 2180168404Spjd * This buffer was allocated at a time when there was 2181168404Spjd * no available blkptrs from the dnode, or it was 2182168404Spjd * inappropriate to hook it in (i.e., nlevels mis-match). 2183168404Spjd */ 2184168404Spjd ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2185168404Spjd ASSERT(db->db_parent == NULL); 2186168404Spjd db->db_parent = dn->dn_dbuf; 2187168404Spjd db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2188168404Spjd DBUF_VERIFY(db); 2189168404Spjd } else { 2190168404Spjd dmu_buf_impl_t *parent = db->db_parent; 2191168404Spjd int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2192168404Spjd 2193168404Spjd ASSERT(dn->dn_phys->dn_nlevels > 1); 2194168404Spjd if (parent == NULL) { 2195168404Spjd mutex_exit(&db->db_mtx); 2196168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 2197168404Spjd (void) dbuf_hold_impl(dn, db->db_level+1, 2198168404Spjd db->db_blkid >> epbs, FALSE, db, &parent); 2199168404Spjd rw_exit(&dn->dn_struct_rwlock); 2200168404Spjd mutex_enter(&db->db_mtx); 2201168404Spjd db->db_parent = parent; 2202168404Spjd } 2203168404Spjd db->db_blkptr = (blkptr_t *)parent->db.db_data + 2204168404Spjd (db->db_blkid & ((1ULL << epbs) - 1)); 2205168404Spjd DBUF_VERIFY(db); 2206168404Spjd } 2207168404Spjd} 2208168404Spjd 2209168404Spjdstatic void 2210168404Spjddbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2211168404Spjd{ 2212168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2213219089Spjd dnode_t *dn; 2214168404Spjd zio_t *zio; 2215168404Spjd 2216168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 2217168404Spjd 2218168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2219168404Spjd 
2220168404Spjd mutex_enter(&db->db_mtx); 2221168404Spjd 2222168404Spjd ASSERT(db->db_level > 0); 2223168404Spjd DBUF_VERIFY(db); 2224168404Spjd 2225168404Spjd if (db->db_buf == NULL) { 2226168404Spjd mutex_exit(&db->db_mtx); 2227168404Spjd (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2228168404Spjd mutex_enter(&db->db_mtx); 2229168404Spjd } 2230168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 2231168404Spjd ASSERT(db->db_buf != NULL); 2232168404Spjd 2233219089Spjd DB_DNODE_ENTER(db); 2234219089Spjd dn = DB_DNODE(db); 2235219089Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2236168404Spjd dbuf_check_blkptr(dn, db); 2237219089Spjd DB_DNODE_EXIT(db); 2238168404Spjd 2239168404Spjd db->db_data_pending = dr; 2240168404Spjd 2241168404Spjd mutex_exit(&db->db_mtx); 2242185029Spjd dbuf_write(dr, db->db_buf, tx); 2243168404Spjd 2244168404Spjd zio = dr->dr_zio; 2245168404Spjd mutex_enter(&dr->dt.di.dr_mtx); 2246168404Spjd dbuf_sync_list(&dr->dt.di.dr_children, tx); 2247168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2248168404Spjd mutex_exit(&dr->dt.di.dr_mtx); 2249168404Spjd zio_nowait(zio); 2250168404Spjd} 2251168404Spjd 2252168404Spjdstatic void 2253168404Spjddbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2254168404Spjd{ 2255168404Spjd arc_buf_t **datap = &dr->dt.dl.dr_data; 2256168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2257219089Spjd dnode_t *dn; 2258219089Spjd objset_t *os; 2259168404Spjd uint64_t txg = tx->tx_txg; 2260168404Spjd 2261168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 2262168404Spjd 2263168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2264168404Spjd 2265168404Spjd mutex_enter(&db->db_mtx); 2266168404Spjd /* 2267168404Spjd * To be synced, we must be dirtied. But we 2268168404Spjd * might have been freed after the dirty. 
2269168404Spjd */ 2270168404Spjd if (db->db_state == DB_UNCACHED) { 2271168404Spjd /* This buffer has been freed since it was dirtied */ 2272168404Spjd ASSERT(db->db.db_data == NULL); 2273168404Spjd } else if (db->db_state == DB_FILL) { 2274168404Spjd /* This buffer was freed and is now being re-filled */ 2275168404Spjd ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2276168404Spjd } else { 2277219089Spjd ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2278168404Spjd } 2279168404Spjd DBUF_VERIFY(db); 2280168404Spjd 2281219089Spjd DB_DNODE_ENTER(db); 2282219089Spjd dn = DB_DNODE(db); 2283219089Spjd 2284219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2285219089Spjd mutex_enter(&dn->dn_mtx); 2286219089Spjd dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 2287219089Spjd mutex_exit(&dn->dn_mtx); 2288219089Spjd } 2289219089Spjd 2290168404Spjd /* 2291168404Spjd * If this is a bonus buffer, simply copy the bonus data into the 2292168404Spjd * dnode. It will be written out when the dnode is synced (and it 2293168404Spjd * will be synced, since it must have been dirty for dbuf_sync to 2294168404Spjd * be called). 
2295168404Spjd */ 2296219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2297168404Spjd dbuf_dirty_record_t **drp; 2298185029Spjd 2299168404Spjd ASSERT(*datap != NULL); 2300240415Smm ASSERT0(db->db_level); 2301168404Spjd ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2302168404Spjd bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2303219089Spjd DB_DNODE_EXIT(db); 2304219089Spjd 2305185029Spjd if (*datap != db->db.db_data) { 2306168404Spjd zio_buf_free(*datap, DN_MAX_BONUSLEN); 2307208373Smm arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2308185029Spjd } 2309168404Spjd db->db_data_pending = NULL; 2310168404Spjd drp = &db->db_last_dirty; 2311168404Spjd while (*drp != dr) 2312168404Spjd drp = &(*drp)->dr_next; 2313185029Spjd ASSERT(dr->dr_next == NULL); 2314219089Spjd ASSERT(dr->dr_dbuf == db); 2315185029Spjd *drp = dr->dr_next; 2316169325Spjd if (dr->dr_dbuf->db_level != 0) { 2317169325Spjd list_destroy(&dr->dt.di.dr_children); 2318169325Spjd mutex_destroy(&dr->dt.di.dr_mtx); 2319169325Spjd } 2320168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2321168404Spjd ASSERT(db->db_dirtycnt > 0); 2322168404Spjd db->db_dirtycnt -= 1; 2323219089Spjd dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2324168404Spjd return; 2325168404Spjd } 2326168404Spjd 2327219089Spjd os = dn->dn_objset; 2328219089Spjd 2329168404Spjd /* 2330185029Spjd * This function may have dropped the db_mtx lock allowing a dmu_sync 2331185029Spjd * operation to sneak in. As a result, we need to ensure that we 2332185029Spjd * don't check the dr_override_state until we have returned from 2333185029Spjd * dbuf_check_blkptr. 2334185029Spjd */ 2335185029Spjd dbuf_check_blkptr(dn, db); 2336185029Spjd 2337185029Spjd /* 2338219089Spjd * If this buffer is in the middle of an immediate write, 2339168404Spjd * wait for the synchronous IO to complete. 
2340168404Spjd */ 2341168404Spjd while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2342168404Spjd ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2343168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 2344168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2345168404Spjd } 2346168404Spjd 2347219089Spjd if (db->db_state != DB_NOFILL && 2348219089Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 2349208050Smm refcount_count(&db->db_holds) > 1 && 2350219089Spjd dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2351208050Smm *datap == db->db_buf) { 2352168404Spjd /* 2353208050Smm * If this buffer is currently "in use" (i.e., there 2354208050Smm * are active holds and db_data still references it), 2355208050Smm * then make a copy before we start the write so that 2356208050Smm * any modifications from the open txg will not leak 2357208050Smm * into this write. 2358168404Spjd * 2359208050Smm * NOTE: this copy does not need to be made for 2360208050Smm * objects only modified in the syncing context (e.g. 2361208050Smm * DNONE_DNODE blocks). 2362168404Spjd */ 2363208050Smm int blksz = arc_buf_size(*datap); 2364208050Smm arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2365208050Smm *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2366208050Smm bcopy(db->db.db_data, (*datap)->b_data, blksz); 2367168404Spjd } 2368168404Spjd db->db_data_pending = dr; 2369168404Spjd 2370168404Spjd mutex_exit(&db->db_mtx); 2371168404Spjd 2372185029Spjd dbuf_write(dr, *datap, tx); 2373168404Spjd 2374168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 2375219089Spjd if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2376168404Spjd list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2377219089Spjd DB_DNODE_EXIT(db); 2378219089Spjd } else { 2379219089Spjd /* 2380219089Spjd * Although zio_nowait() does not "wait for an IO", it does 2381219089Spjd * initiate the IO. 
If this is an empty write it seems plausible 2382219089Spjd * that the IO could actually be completed before the nowait 2383219089Spjd * returns. We need to DB_DNODE_EXIT() first in case 2384219089Spjd * zio_nowait() invalidates the dbuf. 2385219089Spjd */ 2386219089Spjd DB_DNODE_EXIT(db); 2387168404Spjd zio_nowait(dr->dr_zio); 2388219089Spjd } 2389168404Spjd} 2390168404Spjd 2391168404Spjdvoid 2392168404Spjddbuf_sync_list(list_t *list, dmu_tx_t *tx) 2393168404Spjd{ 2394168404Spjd dbuf_dirty_record_t *dr; 2395168404Spjd 2396168404Spjd while (dr = list_head(list)) { 2397168404Spjd if (dr->dr_zio != NULL) { 2398168404Spjd /* 2399168404Spjd * If we find an already initialized zio then we 2400168404Spjd * are processing the meta-dnode, and we have finished. 2401168404Spjd * The dbufs for all dnodes are put back on the list 2402168404Spjd * during processing, so that we can zio_wait() 2403168404Spjd * these IOs after initiating all child IOs. 2404168404Spjd */ 2405168404Spjd ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2406168404Spjd DMU_META_DNODE_OBJECT); 2407168404Spjd break; 2408168404Spjd } 2409168404Spjd list_remove(list, dr); 2410168404Spjd if (dr->dr_dbuf->db_level > 0) 2411168404Spjd dbuf_sync_indirect(dr, tx); 2412168404Spjd else 2413168404Spjd dbuf_sync_leaf(dr, tx); 2414168404Spjd } 2415168404Spjd} 2416168404Spjd 2417168404Spjd/* ARGSUSED */ 2418168404Spjdstatic void 2419168404Spjddbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2420168404Spjd{ 2421168404Spjd dmu_buf_impl_t *db = vdb; 2422219089Spjd dnode_t *dn; 2423185029Spjd blkptr_t *bp = zio->io_bp; 2424168404Spjd blkptr_t *bp_orig = &zio->io_bp_orig; 2425219089Spjd spa_t *spa = zio->io_spa; 2426219089Spjd int64_t delta; 2427168404Spjd uint64_t fill = 0; 2428219089Spjd int i; 2429168404Spjd 2430185029Spjd ASSERT(db->db_blkptr == bp); 2431185029Spjd 2432219089Spjd DB_DNODE_ENTER(db); 2433219089Spjd dn = DB_DNODE(db); 2434219089Spjd delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 
2435219089Spjd dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2436219089Spjd zio->io_prev_space_delta = delta; 2437168404Spjd 2438185029Spjd if (BP_IS_HOLE(bp)) { 2439219089Spjd ASSERT(bp->blk_fill == 0); 2440219089Spjd DB_DNODE_EXIT(db); 2441168404Spjd return; 2442168404Spjd } 2443168404Spjd 2444219089Spjd ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2445219089Spjd BP_GET_TYPE(bp) == dn->dn_type) || 2446219089Spjd (db->db_blkid == DMU_SPILL_BLKID && 2447219089Spjd BP_GET_TYPE(bp) == dn->dn_bonustype)); 2448185029Spjd ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2449185029Spjd 2450168404Spjd mutex_enter(&db->db_mtx); 2451168404Spjd 2452219089Spjd#ifdef ZFS_DEBUG 2453219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2454219089Spjd ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2455219089Spjd ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2456219089Spjd db->db_blkptr == &dn->dn_phys->dn_spill); 2457219089Spjd } 2458219089Spjd#endif 2459219089Spjd 2460168404Spjd if (db->db_level == 0) { 2461168404Spjd mutex_enter(&dn->dn_mtx); 2462219089Spjd if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2463219089Spjd db->db_blkid != DMU_SPILL_BLKID) 2464168404Spjd dn->dn_phys->dn_maxblkid = db->db_blkid; 2465168404Spjd mutex_exit(&dn->dn_mtx); 2466168404Spjd 2467168404Spjd if (dn->dn_type == DMU_OT_DNODE) { 2468168404Spjd dnode_phys_t *dnp = db->db.db_data; 2469168404Spjd for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2470168404Spjd i--, dnp++) { 2471168404Spjd if (dnp->dn_type != DMU_OT_NONE) 2472168404Spjd fill++; 2473168404Spjd } 2474168404Spjd } else { 2475168404Spjd fill = 1; 2476168404Spjd } 2477168404Spjd } else { 2478185029Spjd blkptr_t *ibp = db->db.db_data; 2479168404Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2480185029Spjd for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2481185029Spjd if (BP_IS_HOLE(ibp)) 2482168404Spjd continue; 2483185029Spjd fill += ibp->blk_fill; 2484168404Spjd } 2485168404Spjd } 2486219089Spjd 
	DB_DNODE_EXIT(db);

	/* Record the computed fill count in the block pointer. */
	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}

/*
 * The "done" callback for a dbuf write zio.  Performs dataset block
 * accounting (kill the overwritten bp, birth the new one) unless the
 * write was a rewrite or nopwrite, in which case the bp must match the
 * original and accounting is bypassed.  Then unlinks and frees the
 * dirty record being synced, tears down per-record state (indirect
 * child list/mutex, or the level-0 data buffer), and drops the dbuf
 * hold that was taken when the write was issued (tagged with the txg).
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		objset_t *os;
		dsl_dataset_t *ds;
		dmu_tx_t *tx;

		DB_GET_OBJSET(&os, db);
		ds = os->os_dsl_dataset;
		tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/*
	 * Walk the dirty-record list to find the record being synced
	 * (db_data_pending) and unlink it; it must be the oldest record,
	 * hence dr_next == NULL.
	 */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			/*
			 * Release the dirty record's private copy of the
			 * data, or arrange for eviction of the shared buf.
			 */
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		/* All child dirty records must already be synced. */
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	/* Drop the hold taken for this write; the tag is the syncing txg. */
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}

/*
 * Ready callback used for DB_NOFILL writes; forwards to
 * dbuf_write_ready() with no ARC buffer.
 */
static void
2591219089Spjddbuf_write_nofill_ready(zio_t *zio) 2592219089Spjd{ 2593219089Spjd dbuf_write_ready(zio, NULL, zio->io_private); 2594219089Spjd} 2595219089Spjd 2596219089Spjdstatic void 2597219089Spjddbuf_write_nofill_done(zio_t *zio) 2598219089Spjd{ 2599219089Spjd dbuf_write_done(zio, NULL, zio->io_private); 2600219089Spjd} 2601219089Spjd 2602219089Spjdstatic void 2603219089Spjddbuf_write_override_ready(zio_t *zio) 2604219089Spjd{ 2605219089Spjd dbuf_dirty_record_t *dr = zio->io_private; 2606219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2607219089Spjd 2608219089Spjd dbuf_write_ready(zio, NULL, db); 2609219089Spjd} 2610219089Spjd 2611219089Spjdstatic void 2612219089Spjddbuf_write_override_done(zio_t *zio) 2613219089Spjd{ 2614219089Spjd dbuf_dirty_record_t *dr = zio->io_private; 2615219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2616219089Spjd blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2617219089Spjd 2618219089Spjd mutex_enter(&db->db_mtx); 2619219089Spjd if (!BP_EQUAL(zio->io_bp, obp)) { 2620219089Spjd if (!BP_IS_HOLE(obp)) 2621219089Spjd dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2622219089Spjd arc_release(dr->dt.dl.dr_data, db); 2623219089Spjd } 2624168404Spjd mutex_exit(&db->db_mtx); 2625168404Spjd 2626219089Spjd dbuf_write_done(zio, NULL, db); 2627219089Spjd} 2628168404Spjd 2629219089Spjdstatic void 2630219089Spjddbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2631219089Spjd{ 2632219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2633219089Spjd dnode_t *dn; 2634219089Spjd objset_t *os; 2635219089Spjd dmu_buf_impl_t *parent = db->db_parent; 2636219089Spjd uint64_t txg = tx->tx_txg; 2637219089Spjd zbookmark_t zb; 2638219089Spjd zio_prop_t zp; 2639219089Spjd zio_t *zio; 2640219089Spjd int wp_flag = 0; 2641219089Spjd 2642219089Spjd DB_DNODE_ENTER(db); 2643219089Spjd dn = DB_DNODE(db); 2644219089Spjd os = dn->dn_objset; 2645219089Spjd 2646219089Spjd if (db->db_state != DB_NOFILL) { 2647219089Spjd if (db->db_level > 0 || dn->dn_type == 
DMU_OT_DNODE) { 2648219089Spjd /* 2649219089Spjd * Private object buffers are released here rather 2650219089Spjd * than in dbuf_dirty() since they are only modified 2651219089Spjd * in the syncing context and we don't want the 2652219089Spjd * overhead of making multiple copies of the data. 2653219089Spjd */ 2654219089Spjd if (BP_IS_HOLE(db->db_blkptr)) { 2655219089Spjd arc_buf_thaw(data); 2656219089Spjd } else { 2657219089Spjd dbuf_release_bp(db); 2658219089Spjd } 2659219089Spjd } 2660219089Spjd } 2661219089Spjd 2662219089Spjd if (parent != dn->dn_dbuf) { 2663219089Spjd ASSERT(parent && parent->db_data_pending); 2664219089Spjd ASSERT(db->db_level == parent->db_level-1); 2665219089Spjd ASSERT(arc_released(parent->db_buf)); 2666219089Spjd zio = parent->db_data_pending->dr_zio; 2667219089Spjd } else { 2668219089Spjd ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2669219089Spjd db->db_blkid != DMU_SPILL_BLKID) || 2670219089Spjd (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2671219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 2672219089Spjd ASSERT3P(db->db_blkptr, ==, 2673219089Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 2674219089Spjd zio = dn->dn_zio; 2675219089Spjd } 2676219089Spjd 2677219089Spjd ASSERT(db->db_level == 0 || data == db->db_buf); 2678219089Spjd ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2679219089Spjd ASSERT(zio); 2680219089Spjd 2681219089Spjd SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2682219089Spjd os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2683219089Spjd db->db.db_object, db->db_level, db->db_blkid); 2684219089Spjd 2685219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) 2686219089Spjd wp_flag = WP_SPILL; 2687219089Spjd wp_flag |= (db->db_state == DB_NOFILL) ? 
WP_NOFILL : 0; 2688219089Spjd 2689219089Spjd dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 2690219089Spjd DB_DNODE_EXIT(db); 2691219089Spjd 2692219089Spjd if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2693219089Spjd ASSERT(db->db_state != DB_NOFILL); 2694219089Spjd dr->dr_zio = zio_write(zio, os->os_spa, txg, 2695219089Spjd db->db_blkptr, data->b_data, arc_buf_size(data), &zp, 2696219089Spjd dbuf_write_override_ready, dbuf_write_override_done, dr, 2697219089Spjd ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2698219089Spjd mutex_enter(&db->db_mtx); 2699219089Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2700219089Spjd zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2701243524Smm dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 2702219089Spjd mutex_exit(&db->db_mtx); 2703219089Spjd } else if (db->db_state == DB_NOFILL) { 2704219089Spjd ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); 2705219089Spjd dr->dr_zio = zio_write(zio, os->os_spa, txg, 2706219089Spjd db->db_blkptr, NULL, db->db.db_size, &zp, 2707219089Spjd dbuf_write_nofill_ready, dbuf_write_nofill_done, db, 2708219089Spjd ZIO_PRIORITY_ASYNC_WRITE, 2709219089Spjd ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2710219089Spjd } else { 2711219089Spjd ASSERT(arc_released(data)); 2712219089Spjd dr->dr_zio = arc_write(zio, os->os_spa, txg, 2713219089Spjd db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, 2714219089Spjd dbuf_write_ready, dbuf_write_done, db, 2715219089Spjd ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2716219089Spjd } 2717168404Spjd} 2718