dbuf.c revision 275782
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23219636Spjd * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24264669Sdelphij * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 25251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
27168404Spjd */ 28168404Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/dmu.h> 31253821Sdelphij#include <sys/dmu_send.h> 32168404Spjd#include <sys/dmu_impl.h> 33168404Spjd#include <sys/dbuf.h> 34168404Spjd#include <sys/dmu_objset.h> 35168404Spjd#include <sys/dsl_dataset.h> 36168404Spjd#include <sys/dsl_dir.h> 37168404Spjd#include <sys/dmu_tx.h> 38168404Spjd#include <sys/spa.h> 39168404Spjd#include <sys/zio.h> 40168404Spjd#include <sys/dmu_zfetch.h> 41219089Spjd#include <sys/sa.h> 42219089Spjd#include <sys/sa_impl.h> 43268075Sdelphij#include <sys/zfeature.h> 44268075Sdelphij#include <sys/blkptr.h> 45264669Sdelphij#include <sys/range_tree.h> 46168404Spjd 47254753Sdelphij/* 48254753Sdelphij * Number of times that zfs_free_range() took the slow path while doing 49254753Sdelphij * a zfs receive. A nonzero value indicates a potential performance problem. 50254753Sdelphij */ 51254753Sdelphijuint64_t zfs_free_range_recv_miss; 52254753Sdelphij 53168404Spjdstatic void dbuf_destroy(dmu_buf_impl_t *db); 54248571Smmstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 55185029Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 56168404Spjd 57168404Spjd/* 58168404Spjd * Global data structures and functions for the dbuf cache. 
59168404Spjd */ 60168404Spjdstatic kmem_cache_t *dbuf_cache; 61168404Spjd 62168404Spjd/* ARGSUSED */ 63168404Spjdstatic int 64168404Spjddbuf_cons(void *vdb, void *unused, int kmflag) 65168404Spjd{ 66168404Spjd dmu_buf_impl_t *db = vdb; 67168404Spjd bzero(db, sizeof (dmu_buf_impl_t)); 68168404Spjd 69168404Spjd mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 70168404Spjd cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 71168404Spjd refcount_create(&db->db_holds); 72269229Sdelphij 73168404Spjd return (0); 74168404Spjd} 75168404Spjd 76168404Spjd/* ARGSUSED */ 77168404Spjdstatic void 78168404Spjddbuf_dest(void *vdb, void *unused) 79168404Spjd{ 80168404Spjd dmu_buf_impl_t *db = vdb; 81168404Spjd mutex_destroy(&db->db_mtx); 82168404Spjd cv_destroy(&db->db_changed); 83168404Spjd refcount_destroy(&db->db_holds); 84168404Spjd} 85168404Spjd 86168404Spjd/* 87168404Spjd * dbuf hash table routines 88168404Spjd */ 89168404Spjdstatic dbuf_hash_table_t dbuf_hash_table; 90168404Spjd 91168404Spjdstatic uint64_t dbuf_hash_count; 92168404Spjd 93168404Spjdstatic uint64_t 94168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 95168404Spjd{ 96168404Spjd uintptr_t osv = (uintptr_t)os; 97168404Spjd uint64_t crc = -1ULL; 98168404Spjd 99168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 100168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 101168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 102168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 103168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 104168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 105168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 106168404Spjd 107168404Spjd crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 108168404Spjd 109168404Spjd return (crc); 110168404Spjd} 111168404Spjd 112168404Spjd#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, 
blkid); 113168404Spjd 114168404Spjd#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 115168404Spjd ((dbuf)->db.db_object == (obj) && \ 116168404Spjd (dbuf)->db_objset == (os) && \ 117168404Spjd (dbuf)->db_level == (level) && \ 118168404Spjd (dbuf)->db_blkid == (blkid)) 119168404Spjd 120168404Spjddmu_buf_impl_t * 121168404Spjddbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 122168404Spjd{ 123168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 124219089Spjd objset_t *os = dn->dn_objset; 125168404Spjd uint64_t obj = dn->dn_object; 126168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 127168404Spjd uint64_t idx = hv & h->hash_table_mask; 128168404Spjd dmu_buf_impl_t *db; 129168404Spjd 130168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 131168404Spjd for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 132168404Spjd if (DBUF_EQUAL(db, os, obj, level, blkid)) { 133168404Spjd mutex_enter(&db->db_mtx); 134168404Spjd if (db->db_state != DB_EVICTING) { 135168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 136168404Spjd return (db); 137168404Spjd } 138168404Spjd mutex_exit(&db->db_mtx); 139168404Spjd } 140168404Spjd } 141168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 142168404Spjd return (NULL); 143168404Spjd} 144168404Spjd 145168404Spjd/* 146168404Spjd * Insert an entry into the hash table. If there is already an element 147168404Spjd * equal to elem in the hash table, then the already existing element 148168404Spjd * will be returned and the new element will not be inserted. 149168404Spjd * Otherwise returns NULL. 
150168404Spjd */ 151168404Spjdstatic dmu_buf_impl_t * 152168404Spjddbuf_hash_insert(dmu_buf_impl_t *db) 153168404Spjd{ 154168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 155219089Spjd objset_t *os = db->db_objset; 156168404Spjd uint64_t obj = db->db.db_object; 157168404Spjd int level = db->db_level; 158168404Spjd uint64_t blkid = db->db_blkid; 159168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 160168404Spjd uint64_t idx = hv & h->hash_table_mask; 161168404Spjd dmu_buf_impl_t *dbf; 162168404Spjd 163168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 164168404Spjd for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 165168404Spjd if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 166168404Spjd mutex_enter(&dbf->db_mtx); 167168404Spjd if (dbf->db_state != DB_EVICTING) { 168168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 169168404Spjd return (dbf); 170168404Spjd } 171168404Spjd mutex_exit(&dbf->db_mtx); 172168404Spjd } 173168404Spjd } 174168404Spjd 175168404Spjd mutex_enter(&db->db_mtx); 176168404Spjd db->db_hash_next = h->hash_table[idx]; 177168404Spjd h->hash_table[idx] = db; 178168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 179270247Sdelphij atomic_inc_64(&dbuf_hash_count); 180168404Spjd 181168404Spjd return (NULL); 182168404Spjd} 183168404Spjd 184168404Spjd/* 185268858Sdelphij * Remove an entry from the hash table. It must be in the EVICTING state. 186168404Spjd */ 187168404Spjdstatic void 188168404Spjddbuf_hash_remove(dmu_buf_impl_t *db) 189168404Spjd{ 190168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 191168404Spjd uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 192168404Spjd db->db_level, db->db_blkid); 193168404Spjd uint64_t idx = hv & h->hash_table_mask; 194168404Spjd dmu_buf_impl_t *dbf, **dbp; 195168404Spjd 196168404Spjd /* 197268858Sdelphij * We musn't hold db_mtx to maintain lock ordering: 198168404Spjd * DBUF_HASH_MUTEX > db_mtx. 
199168404Spjd */ 200168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 201168404Spjd ASSERT(db->db_state == DB_EVICTING); 202168404Spjd ASSERT(!MUTEX_HELD(&db->db_mtx)); 203168404Spjd 204168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 205168404Spjd dbp = &h->hash_table[idx]; 206168404Spjd while ((dbf = *dbp) != db) { 207168404Spjd dbp = &dbf->db_hash_next; 208168404Spjd ASSERT(dbf != NULL); 209168404Spjd } 210168404Spjd *dbp = db->db_hash_next; 211168404Spjd db->db_hash_next = NULL; 212168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 213270247Sdelphij atomic_dec_64(&dbuf_hash_count); 214168404Spjd} 215168404Spjd 216168404Spjdstatic arc_evict_func_t dbuf_do_evict; 217168404Spjd 218168404Spjdstatic void 219168404Spjddbuf_evict_user(dmu_buf_impl_t *db) 220168404Spjd{ 221168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 222168404Spjd 223168404Spjd if (db->db_level != 0 || db->db_evict_func == NULL) 224168404Spjd return; 225168404Spjd 226168404Spjd db->db_evict_func(&db->db, db->db_user_ptr); 227168404Spjd db->db_user_ptr = NULL; 228168404Spjd db->db_evict_func = NULL; 229168404Spjd} 230168404Spjd 231219089Spjdboolean_t 232219089Spjddbuf_is_metadata(dmu_buf_impl_t *db) 233219089Spjd{ 234219089Spjd if (db->db_level > 0) { 235219089Spjd return (B_TRUE); 236219089Spjd } else { 237219089Spjd boolean_t is_metadata; 238219089Spjd 239219089Spjd DB_DNODE_ENTER(db); 240236884Smm is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 241219089Spjd DB_DNODE_EXIT(db); 242219089Spjd 243219089Spjd return (is_metadata); 244219089Spjd } 245219089Spjd} 246219089Spjd 247168404Spjdvoid 248168404Spjddbuf_evict(dmu_buf_impl_t *db) 249168404Spjd{ 250168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 251168404Spjd ASSERT(db->db_buf == NULL); 252168404Spjd ASSERT(db->db_data_pending == NULL); 253168404Spjd 254168404Spjd dbuf_clear(db); 255168404Spjd dbuf_destroy(db); 256168404Spjd} 257168404Spjd 258168404Spjdvoid 259168404Spjddbuf_init(void) 260168404Spjd{ 261168404Spjd uint64_t hsize = 1ULL << 16; 
262168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 263168404Spjd int i; 264168404Spjd 265168404Spjd /* 266168404Spjd * The hash table is big enough to fill all of physical memory 267168404Spjd * with an average 4K block size. The table will take up 268168404Spjd * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 269168404Spjd */ 270168696Spjd while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 271168404Spjd hsize <<= 1; 272168404Spjd 273168404Spjdretry: 274168404Spjd h->hash_table_mask = hsize - 1; 275168404Spjd h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 276168404Spjd if (h->hash_table == NULL) { 277168404Spjd /* XXX - we should really return an error instead of assert */ 278168404Spjd ASSERT(hsize > (1ULL << 10)); 279168404Spjd hsize >>= 1; 280168404Spjd goto retry; 281168404Spjd } 282168404Spjd 283168404Spjd dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 284168404Spjd sizeof (dmu_buf_impl_t), 285168404Spjd 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 286168404Spjd 287168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 288168404Spjd mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 289168404Spjd} 290168404Spjd 291168404Spjdvoid 292168404Spjddbuf_fini(void) 293168404Spjd{ 294168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 295168404Spjd int i; 296168404Spjd 297168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 298168404Spjd mutex_destroy(&h->hash_mutexes[i]); 299168404Spjd kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 300168404Spjd kmem_cache_destroy(dbuf_cache); 301168404Spjd} 302168404Spjd 303168404Spjd/* 304168404Spjd * Other stuff. 
305168404Spjd */ 306168404Spjd 307168404Spjd#ifdef ZFS_DEBUG 308168404Spjdstatic void 309168404Spjddbuf_verify(dmu_buf_impl_t *db) 310168404Spjd{ 311219089Spjd dnode_t *dn; 312219089Spjd dbuf_dirty_record_t *dr; 313168404Spjd 314168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 315168404Spjd 316168404Spjd if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 317168404Spjd return; 318168404Spjd 319168404Spjd ASSERT(db->db_objset != NULL); 320219089Spjd DB_DNODE_ENTER(db); 321219089Spjd dn = DB_DNODE(db); 322168404Spjd if (dn == NULL) { 323168404Spjd ASSERT(db->db_parent == NULL); 324168404Spjd ASSERT(db->db_blkptr == NULL); 325168404Spjd } else { 326168404Spjd ASSERT3U(db->db.db_object, ==, dn->dn_object); 327168404Spjd ASSERT3P(db->db_objset, ==, dn->dn_objset); 328168404Spjd ASSERT3U(db->db_level, <, dn->dn_nlevels); 329219089Spjd ASSERT(db->db_blkid == DMU_BONUS_BLKID || 330219089Spjd db->db_blkid == DMU_SPILL_BLKID || 331269229Sdelphij !avl_is_empty(&dn->dn_dbufs)); 332168404Spjd } 333219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 334168404Spjd ASSERT(dn != NULL); 335185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 336219089Spjd ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 337219089Spjd } else if (db->db_blkid == DMU_SPILL_BLKID) { 338219089Spjd ASSERT(dn != NULL); 339219089Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 340240415Smm ASSERT0(db->db.db_offset); 341168404Spjd } else { 342168404Spjd ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 343168404Spjd } 344168404Spjd 345219089Spjd for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 346219089Spjd ASSERT(dr->dr_dbuf == db); 347219089Spjd 348219089Spjd for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 349219089Spjd ASSERT(dr->dr_dbuf == db); 350219089Spjd 351208047Smm /* 352208047Smm * We can't assert that db_size matches dn_datablksz because it 353208047Smm * can be momentarily different when another thread is doing 354208047Smm * dnode_set_blksz(). 
355208047Smm */ 356208047Smm if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 357219089Spjd dr = db->db_data_pending; 358208047Smm /* 359208047Smm * It should only be modified in syncing context, so 360208047Smm * make sure we only have one copy of the data. 361208047Smm */ 362208047Smm ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 363168404Spjd } 364168404Spjd 365168404Spjd /* verify db->db_blkptr */ 366168404Spjd if (db->db_blkptr) { 367168404Spjd if (db->db_parent == dn->dn_dbuf) { 368168404Spjd /* db is pointed to by the dnode */ 369168404Spjd /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 370209962Smm if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 371168404Spjd ASSERT(db->db_parent == NULL); 372168404Spjd else 373168404Spjd ASSERT(db->db_parent != NULL); 374219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 375219089Spjd ASSERT3P(db->db_blkptr, ==, 376219089Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 377168404Spjd } else { 378168404Spjd /* db is pointed to by an indirect block */ 379168404Spjd int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 380168404Spjd ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 381168404Spjd ASSERT3U(db->db_parent->db.db_object, ==, 382168404Spjd db->db.db_object); 383168404Spjd /* 384168404Spjd * dnode_grow_indblksz() can make this fail if we don't 385168404Spjd * have the struct_rwlock. XXX indblksz no longer 386168404Spjd * grows. safe to do this now? 
387168404Spjd */ 388219089Spjd if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 389168404Spjd ASSERT3P(db->db_blkptr, ==, 390168404Spjd ((blkptr_t *)db->db_parent->db.db_data + 391168404Spjd db->db_blkid % epb)); 392168404Spjd } 393168404Spjd } 394168404Spjd } 395168404Spjd if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 396219089Spjd (db->db_buf == NULL || db->db_buf->b_data) && 397219089Spjd db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 398168404Spjd db->db_state != DB_FILL && !dn->dn_free_txg) { 399168404Spjd /* 400168404Spjd * If the blkptr isn't set but they have nonzero data, 401168404Spjd * it had better be dirty, otherwise we'll lose that 402168404Spjd * data when we evict this buffer. 403168404Spjd */ 404168404Spjd if (db->db_dirtycnt == 0) { 405168404Spjd uint64_t *buf = db->db.db_data; 406168404Spjd int i; 407168404Spjd 408168404Spjd for (i = 0; i < db->db.db_size >> 3; i++) { 409168404Spjd ASSERT(buf[i] == 0); 410168404Spjd } 411168404Spjd } 412168404Spjd } 413219089Spjd DB_DNODE_EXIT(db); 414168404Spjd} 415168404Spjd#endif 416168404Spjd 417168404Spjdstatic void 418168404Spjddbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 419168404Spjd{ 420168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 421168404Spjd db->db_buf = buf; 422168404Spjd if (buf != NULL) { 423168404Spjd ASSERT(buf->b_data != NULL); 424168404Spjd db->db.db_data = buf->b_data; 425168404Spjd if (!arc_released(buf)) 426168404Spjd arc_set_callback(buf, dbuf_do_evict, db); 427168404Spjd } else { 428168404Spjd dbuf_evict_user(db); 429168404Spjd db->db.db_data = NULL; 430219089Spjd if (db->db_state != DB_NOFILL) 431219089Spjd db->db_state = DB_UNCACHED; 432168404Spjd } 433168404Spjd} 434168404Spjd 435219089Spjd/* 436219089Spjd * Loan out an arc_buf for read. Return the loaned arc_buf. 
437219089Spjd */ 438219089Spjdarc_buf_t * 439219089Spjddbuf_loan_arcbuf(dmu_buf_impl_t *db) 440219089Spjd{ 441219089Spjd arc_buf_t *abuf; 442219089Spjd 443219089Spjd mutex_enter(&db->db_mtx); 444219089Spjd if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 445219089Spjd int blksz = db->db.db_size; 446260150Sdelphij spa_t *spa = db->db_objset->os_spa; 447219089Spjd 448219089Spjd mutex_exit(&db->db_mtx); 449219089Spjd abuf = arc_loan_buf(spa, blksz); 450219089Spjd bcopy(db->db.db_data, abuf->b_data, blksz); 451219089Spjd } else { 452219089Spjd abuf = db->db_buf; 453219089Spjd arc_loan_inuse_buf(abuf, db); 454219089Spjd dbuf_set_data(db, NULL); 455219089Spjd mutex_exit(&db->db_mtx); 456219089Spjd } 457219089Spjd return (abuf); 458219089Spjd} 459219089Spjd 460168404Spjduint64_t 461168404Spjddbuf_whichblock(dnode_t *dn, uint64_t offset) 462168404Spjd{ 463168404Spjd if (dn->dn_datablkshift) { 464168404Spjd return (offset >> dn->dn_datablkshift); 465168404Spjd } else { 466168404Spjd ASSERT3U(offset, <, dn->dn_datablksz); 467168404Spjd return (0); 468168404Spjd } 469168404Spjd} 470168404Spjd 471168404Spjdstatic void 472168404Spjddbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 473168404Spjd{ 474168404Spjd dmu_buf_impl_t *db = vdb; 475168404Spjd 476168404Spjd mutex_enter(&db->db_mtx); 477168404Spjd ASSERT3U(db->db_state, ==, DB_READ); 478168404Spjd /* 479168404Spjd * All reads are synchronous, so we must have a hold on the dbuf 480168404Spjd */ 481168404Spjd ASSERT(refcount_count(&db->db_holds) > 0); 482168404Spjd ASSERT(db->db_buf == NULL); 483168404Spjd ASSERT(db->db.db_data == NULL); 484168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 485168404Spjd /* we were freed in flight; disregard any error */ 486168404Spjd arc_release(buf, db); 487168404Spjd bzero(buf->b_data, db->db.db_size); 488168404Spjd arc_buf_freeze(buf); 489168404Spjd db->db_freed_in_flight = FALSE; 490168404Spjd dbuf_set_data(db, buf); 491168404Spjd db->db_state = 
DB_CACHED; 492168404Spjd } else if (zio == NULL || zio->io_error == 0) { 493168404Spjd dbuf_set_data(db, buf); 494168404Spjd db->db_state = DB_CACHED; 495168404Spjd } else { 496219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 497168404Spjd ASSERT3P(db->db_buf, ==, NULL); 498248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 499168404Spjd db->db_state = DB_UNCACHED; 500168404Spjd } 501168404Spjd cv_broadcast(&db->db_changed); 502219089Spjd dbuf_rele_and_unlock(db, NULL); 503168404Spjd} 504168404Spjd 505168404Spjdstatic void 506168404Spjddbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 507168404Spjd{ 508219089Spjd dnode_t *dn; 509268123Sdelphij zbookmark_phys_t zb; 510168404Spjd uint32_t aflags = ARC_NOWAIT; 511168404Spjd 512219089Spjd DB_DNODE_ENTER(db); 513219089Spjd dn = DB_DNODE(db); 514168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 515168404Spjd /* We need the struct_rwlock to prevent db_blkptr from changing. */ 516185029Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 517168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 518168404Spjd ASSERT(db->db_state == DB_UNCACHED); 519168404Spjd ASSERT(db->db_buf == NULL); 520168404Spjd 521219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 522207624Smm int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 523185029Spjd 524185029Spjd ASSERT3U(bonuslen, <=, db->db.db_size); 525168404Spjd db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 526208373Smm arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 527185029Spjd if (bonuslen < DN_MAX_BONUSLEN) 528168404Spjd bzero(db->db.db_data, DN_MAX_BONUSLEN); 529207624Smm if (bonuslen) 530207624Smm bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 531219089Spjd DB_DNODE_EXIT(db); 532168404Spjd db->db_state = DB_CACHED; 533168404Spjd mutex_exit(&db->db_mtx); 534168404Spjd return; 535168404Spjd } 536168404Spjd 537185029Spjd /* 538185029Spjd * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 539185029Spjd * processes the delete record and 
clears the bp while we are waiting 540185029Spjd * for the dn_mtx (resulting in a "no" from block_freed). 541185029Spjd */ 542185029Spjd if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 543185029Spjd (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 544185029Spjd BP_IS_HOLE(db->db_blkptr)))) { 545168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 546168404Spjd 547260150Sdelphij DB_DNODE_EXIT(db); 548260150Sdelphij dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, 549168404Spjd db->db.db_size, db, type)); 550168404Spjd bzero(db->db.db_data, db->db.db_size); 551168404Spjd db->db_state = DB_CACHED; 552168404Spjd *flags |= DB_RF_CACHED; 553168404Spjd mutex_exit(&db->db_mtx); 554168404Spjd return; 555168404Spjd } 556168404Spjd 557219089Spjd DB_DNODE_EXIT(db); 558219089Spjd 559168404Spjd db->db_state = DB_READ; 560168404Spjd mutex_exit(&db->db_mtx); 561168404Spjd 562185029Spjd if (DBUF_IS_L2CACHEABLE(db)) 563185029Spjd aflags |= ARC_L2CACHE; 564251478Sdelphij if (DBUF_IS_L2COMPRESSIBLE(db)) 565251478Sdelphij aflags |= ARC_L2COMPRESS; 566185029Spjd 567219089Spjd SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 568219089Spjd db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 569219089Spjd db->db.db_object, db->db_level, db->db_blkid); 570168404Spjd 571168404Spjd dbuf_add_ref(db, NULL); 572185029Spjd 573260150Sdelphij (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 574168404Spjd dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 575168404Spjd (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 576168404Spjd &aflags, &zb); 577168404Spjd if (aflags & ARC_CACHED) 578168404Spjd *flags |= DB_RF_CACHED; 579168404Spjd} 580168404Spjd 581168404Spjdint 582168404Spjddbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 583168404Spjd{ 584168404Spjd int err = 0; 585260150Sdelphij boolean_t havepzio = (zio != NULL); 586260150Sdelphij boolean_t prefetch; 587219089Spjd dnode_t *dn; 588168404Spjd 589168404Spjd /* 590168404Spjd * We don't have to hold the mutex to check db_state because it 591168404Spjd * can't be freed while we have a hold on the buffer. 592168404Spjd */ 593168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 594168404Spjd 595219089Spjd if (db->db_state == DB_NOFILL) 596249195Smm return (SET_ERROR(EIO)); 597219089Spjd 598219089Spjd DB_DNODE_ENTER(db); 599219089Spjd dn = DB_DNODE(db); 600168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 601219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 602168404Spjd 603219089Spjd prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 604219089Spjd (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 605185029Spjd DBUF_IS_CACHEABLE(db); 606168404Spjd 607168404Spjd mutex_enter(&db->db_mtx); 608168404Spjd if (db->db_state == DB_CACHED) { 609168404Spjd mutex_exit(&db->db_mtx); 610168404Spjd if (prefetch) 611219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 612168404Spjd db->db.db_size, TRUE); 613168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 614219089Spjd rw_exit(&dn->dn_struct_rwlock); 615219089Spjd DB_DNODE_EXIT(db); 616168404Spjd } else if (db->db_state == DB_UNCACHED) { 617219089Spjd spa_t *spa = dn->dn_objset->os_spa; 618219089Spjd 619219089Spjd if (zio == NULL) 620219089Spjd zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 621168404Spjd dbuf_read_impl(db, zio, &flags); 622168404Spjd 623168404Spjd /* dbuf_read_impl has dropped db_mtx for us */ 624168404Spjd 625168404Spjd if (prefetch) 626219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 
627168404Spjd db->db.db_size, flags & DB_RF_CACHED); 628168404Spjd 629168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 630219089Spjd rw_exit(&dn->dn_struct_rwlock); 631219089Spjd DB_DNODE_EXIT(db); 632168404Spjd 633168404Spjd if (!havepzio) 634168404Spjd err = zio_wait(zio); 635168404Spjd } else { 636251629Sdelphij /* 637251629Sdelphij * Another reader came in while the dbuf was in flight 638251629Sdelphij * between UNCACHED and CACHED. Either a writer will finish 639251629Sdelphij * writing the buffer (sending the dbuf to CACHED) or the 640251629Sdelphij * first reader's request will reach the read_done callback 641251629Sdelphij * and send the dbuf to CACHED. Otherwise, a failure 642251629Sdelphij * occurred and the dbuf went to UNCACHED. 643251629Sdelphij */ 644168404Spjd mutex_exit(&db->db_mtx); 645168404Spjd if (prefetch) 646219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 647168404Spjd db->db.db_size, TRUE); 648168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 649219089Spjd rw_exit(&dn->dn_struct_rwlock); 650219089Spjd DB_DNODE_EXIT(db); 651168404Spjd 652251629Sdelphij /* Skip the wait per the caller's request. 
*/ 653168404Spjd mutex_enter(&db->db_mtx); 654168404Spjd if ((flags & DB_RF_NEVERWAIT) == 0) { 655168404Spjd while (db->db_state == DB_READ || 656168404Spjd db->db_state == DB_FILL) { 657168404Spjd ASSERT(db->db_state == DB_READ || 658168404Spjd (flags & DB_RF_HAVESTRUCT) == 0); 659272511Sdelphij DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, 660272511Sdelphij db, zio_t *, zio); 661168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 662168404Spjd } 663168404Spjd if (db->db_state == DB_UNCACHED) 664249195Smm err = SET_ERROR(EIO); 665168404Spjd } 666168404Spjd mutex_exit(&db->db_mtx); 667168404Spjd } 668168404Spjd 669168404Spjd ASSERT(err || havepzio || db->db_state == DB_CACHED); 670168404Spjd return (err); 671168404Spjd} 672168404Spjd 673168404Spjdstatic void 674168404Spjddbuf_noread(dmu_buf_impl_t *db) 675168404Spjd{ 676168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 677219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 678168404Spjd mutex_enter(&db->db_mtx); 679168404Spjd while (db->db_state == DB_READ || db->db_state == DB_FILL) 680168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 681168404Spjd if (db->db_state == DB_UNCACHED) { 682168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 683260150Sdelphij spa_t *spa = db->db_objset->os_spa; 684168404Spjd 685168404Spjd ASSERT(db->db_buf == NULL); 686168404Spjd ASSERT(db->db.db_data == NULL); 687219089Spjd dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 688168404Spjd db->db_state = DB_FILL; 689219089Spjd } else if (db->db_state == DB_NOFILL) { 690219089Spjd dbuf_set_data(db, NULL); 691168404Spjd } else { 692168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 693168404Spjd } 694168404Spjd mutex_exit(&db->db_mtx); 695168404Spjd} 696168404Spjd 697168404Spjd/* 698168404Spjd * This is our just-in-time copy function. It makes a copy of 699168404Spjd * buffers, that have been modified in a previous transaction 700168404Spjd * group, before we modify them in the current active group. 
701168404Spjd * 702168404Spjd * This function is used in two places: when we are dirtying a 703168404Spjd * buffer for the first time in a txg, and when we are freeing 704168404Spjd * a range in a dnode that includes this buffer. 705168404Spjd * 706168404Spjd * Note that when we are called from dbuf_free_range() we do 707168404Spjd * not put a hold on the buffer, we just traverse the active 708168404Spjd * dbuf list for the dnode. 709168404Spjd */ 710168404Spjdstatic void 711168404Spjddbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 712168404Spjd{ 713168404Spjd dbuf_dirty_record_t *dr = db->db_last_dirty; 714168404Spjd 715168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 716168404Spjd ASSERT(db->db.db_data != NULL); 717168404Spjd ASSERT(db->db_level == 0); 718168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 719168404Spjd 720168404Spjd if (dr == NULL || 721168404Spjd (dr->dt.dl.dr_data != 722219089Spjd ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 723168404Spjd return; 724168404Spjd 725168404Spjd /* 726168404Spjd * If the last dirty record for this dbuf has not yet synced 727168404Spjd * and its referencing the dbuf data, either: 728219089Spjd * reset the reference to point to a new copy, 729168404Spjd * or (if there a no active holders) 730168404Spjd * just null out the current db_data pointer. 
731168404Spjd */ 732168404Spjd ASSERT(dr->dr_txg >= txg - 2); 733219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 734168404Spjd /* Note that the data bufs here are zio_bufs */ 735168404Spjd dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 736208373Smm arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 737168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 738168404Spjd } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 739168404Spjd int size = db->db.db_size; 740168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 741260150Sdelphij spa_t *spa = db->db_objset->os_spa; 742219089Spjd 743219089Spjd dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); 744168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 745168404Spjd } else { 746168404Spjd dbuf_set_data(db, NULL); 747168404Spjd } 748168404Spjd} 749168404Spjd 750168404Spjdvoid 751168404Spjddbuf_unoverride(dbuf_dirty_record_t *dr) 752168404Spjd{ 753168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 754219089Spjd blkptr_t *bp = &dr->dt.dl.dr_overridden_by; 755168404Spjd uint64_t txg = dr->dr_txg; 756168404Spjd 757168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 758168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 759168404Spjd ASSERT(db->db_level == 0); 760168404Spjd 761219089Spjd if (db->db_blkid == DMU_BONUS_BLKID || 762168404Spjd dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 763168404Spjd return; 764168404Spjd 765219089Spjd ASSERT(db->db_data_pending != dr); 766219089Spjd 767168404Spjd /* free this block */ 768260150Sdelphij if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) 769260150Sdelphij zio_free(db->db_objset->os_spa, txg, bp); 770219089Spjd 771168404Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 772243524Smm dr->dt.dl.dr_nopwrite = B_FALSE; 773243524Smm 774168404Spjd /* 775168404Spjd * Release the already-written buffer, so we leave it in 776168404Spjd * a consistent dirty state. 
Note that all callers are 777168404Spjd * modifying the buffer, so they will immediately do 778168404Spjd * another (redundant) arc_release(). Therefore, leave 779168404Spjd * the buf thawed to save the effort of freezing & 780168404Spjd * immediately re-thawing it. 781168404Spjd */ 782168404Spjd arc_release(dr->dt.dl.dr_data, db); 783168404Spjd} 784168404Spjd 785185029Spjd/* 786185029Spjd * Evict (if its unreferenced) or clear (if its referenced) any level-0 787185029Spjd * data blocks in the free range, so that any future readers will find 788260150Sdelphij * empty blocks. 789253821Sdelphij * 790253821Sdelphij * This is a no-op if the dataset is in the middle of an incremental 791253821Sdelphij * receive; see comment below for details. 792185029Spjd */ 793168404Spjdvoid 794269229Sdelphijdbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, 795269229Sdelphij dmu_tx_t *tx) 796168404Spjd{ 797269229Sdelphij dmu_buf_impl_t *db, *db_next, db_search; 798168404Spjd uint64_t txg = tx->tx_txg; 799269229Sdelphij avl_index_t where; 800168404Spjd 801269229Sdelphij if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID)) 802269229Sdelphij end_blkid = dn->dn_maxblkid; 803269229Sdelphij dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); 804253821Sdelphij 805269229Sdelphij db_search.db_level = 0; 806269229Sdelphij db_search.db_blkid = start_blkid; 807270383Sdelphij db_search.db_state = DB_SEARCH; 808269229Sdelphij 809254753Sdelphij mutex_enter(&dn->dn_dbufs_mtx); 810269229Sdelphij if (start_blkid >= dn->dn_unlisted_l0_blkid) { 811254753Sdelphij /* There can't be any dbufs in this range; no need to search. 
*/ 812269229Sdelphij#ifdef DEBUG 813269229Sdelphij db = avl_find(&dn->dn_dbufs, &db_search, &where); 814269229Sdelphij ASSERT3P(db, ==, NULL); 815269229Sdelphij db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 816269229Sdelphij ASSERT(db == NULL || db->db_level > 0); 817269229Sdelphij#endif 818254753Sdelphij mutex_exit(&dn->dn_dbufs_mtx); 819254753Sdelphij return; 820254753Sdelphij } else if (dmu_objset_is_receiving(dn->dn_objset)) { 821253821Sdelphij /* 822254753Sdelphij * If we are receiving, we expect there to be no dbufs in 823254753Sdelphij * the range to be freed, because receive modifies each 824254753Sdelphij * block at most once, and in offset order. If this is 825254753Sdelphij * not the case, it can lead to performance problems, 826254753Sdelphij * so note that we unexpectedly took the slow path. 827253821Sdelphij */ 828254753Sdelphij atomic_inc_64(&zfs_free_range_recv_miss); 829253821Sdelphij } 830253821Sdelphij 831269229Sdelphij db = avl_find(&dn->dn_dbufs, &db_search, &where); 832269229Sdelphij ASSERT3P(db, ==, NULL); 833269229Sdelphij db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 834269229Sdelphij 835269229Sdelphij for (; db != NULL; db = db_next) { 836269229Sdelphij db_next = AVL_NEXT(&dn->dn_dbufs, db); 837219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 838185029Spjd 839269229Sdelphij if (db->db_level != 0 || db->db_blkid > end_blkid) { 840269229Sdelphij break; 841269229Sdelphij } 842269229Sdelphij ASSERT3U(db->db_blkid, >=, start_blkid); 843168404Spjd 844168404Spjd /* found a level 0 buffer in the range */ 845248571Smm mutex_enter(&db->db_mtx); 846248571Smm if (dbuf_undirty(db, tx)) { 847248571Smm /* mutex has been dropped and dbuf destroyed */ 848168404Spjd continue; 849248571Smm } 850168404Spjd 851168404Spjd if (db->db_state == DB_UNCACHED || 852219089Spjd db->db_state == DB_NOFILL || 853168404Spjd db->db_state == DB_EVICTING) { 854168404Spjd ASSERT(db->db.db_data == NULL); 855168404Spjd mutex_exit(&db->db_mtx); 856168404Spjd 
continue; 857168404Spjd } 858168404Spjd if (db->db_state == DB_READ || db->db_state == DB_FILL) { 859168404Spjd /* will be handled in dbuf_read_done or dbuf_rele */ 860168404Spjd db->db_freed_in_flight = TRUE; 861168404Spjd mutex_exit(&db->db_mtx); 862168404Spjd continue; 863168404Spjd } 864168404Spjd if (refcount_count(&db->db_holds) == 0) { 865168404Spjd ASSERT(db->db_buf); 866168404Spjd dbuf_clear(db); 867168404Spjd continue; 868168404Spjd } 869168404Spjd /* The dbuf is referenced */ 870168404Spjd 871168404Spjd if (db->db_last_dirty != NULL) { 872168404Spjd dbuf_dirty_record_t *dr = db->db_last_dirty; 873168404Spjd 874168404Spjd if (dr->dr_txg == txg) { 875168404Spjd /* 876168404Spjd * This buffer is "in-use", re-adjust the file 877168404Spjd * size to reflect that this buffer may 878168404Spjd * contain new data when we sync. 879168404Spjd */ 880219089Spjd if (db->db_blkid != DMU_SPILL_BLKID && 881219089Spjd db->db_blkid > dn->dn_maxblkid) 882168404Spjd dn->dn_maxblkid = db->db_blkid; 883168404Spjd dbuf_unoverride(dr); 884168404Spjd } else { 885168404Spjd /* 886168404Spjd * This dbuf is not dirty in the open context. 887168404Spjd * Either uncache it (if its not referenced in 888168404Spjd * the open context) or reset its contents to 889168404Spjd * empty. 
890168404Spjd */ 891168404Spjd dbuf_fix_old_data(db, txg); 892168404Spjd } 893168404Spjd } 894168404Spjd /* clear the contents if its cached */ 895168404Spjd if (db->db_state == DB_CACHED) { 896168404Spjd ASSERT(db->db.db_data != NULL); 897168404Spjd arc_release(db->db_buf, db); 898168404Spjd bzero(db->db.db_data, db->db.db_size); 899168404Spjd arc_buf_freeze(db->db_buf); 900168404Spjd } 901168404Spjd 902168404Spjd mutex_exit(&db->db_mtx); 903168404Spjd } 904168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 905168404Spjd} 906168404Spjd 907168404Spjdstatic int 908185029Spjddbuf_block_freeable(dmu_buf_impl_t *db) 909168404Spjd{ 910168404Spjd dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 911168404Spjd uint64_t birth_txg = 0; 912168404Spjd 913168404Spjd /* 914168404Spjd * We don't need any locking to protect db_blkptr: 915168404Spjd * If it's syncing, then db_last_dirty will be set 916168404Spjd * so we'll ignore db_blkptr. 917260150Sdelphij * 918260150Sdelphij * This logic ensures that only block births for 919260150Sdelphij * filled blocks are considered. 920168404Spjd */ 921168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 922260150Sdelphij if (db->db_last_dirty && (db->db_blkptr == NULL || 923260150Sdelphij !BP_IS_HOLE(db->db_blkptr))) { 924168404Spjd birth_txg = db->db_last_dirty->dr_txg; 925260150Sdelphij } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { 926168404Spjd birth_txg = db->db_blkptr->blk_birth; 927260150Sdelphij } 928168404Spjd 929219089Spjd /* 930260150Sdelphij * If this block don't exist or is in a snapshot, it can't be freed. 931219089Spjd * Don't pass the bp to dsl_dataset_block_freeable() since we 932219089Spjd * are holding the db_mtx lock and might deadlock if we are 933219089Spjd * prefetching a dedup-ed block. 
934219089Spjd */ 935260150Sdelphij if (birth_txg != 0) 936185029Spjd return (ds == NULL || 937219089Spjd dsl_dataset_block_freeable(ds, NULL, birth_txg)); 938168404Spjd else 939260150Sdelphij return (B_FALSE); 940168404Spjd} 941168404Spjd 942168404Spjdvoid 943168404Spjddbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 944168404Spjd{ 945168404Spjd arc_buf_t *buf, *obuf; 946168404Spjd int osize = db->db.db_size; 947168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 948219089Spjd dnode_t *dn; 949168404Spjd 950219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 951168404Spjd 952219089Spjd DB_DNODE_ENTER(db); 953219089Spjd dn = DB_DNODE(db); 954219089Spjd 955168404Spjd /* XXX does *this* func really need the lock? */ 956219089Spjd ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 957168404Spjd 958168404Spjd /* 959260150Sdelphij * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held 960168404Spjd * is OK, because there can be no other references to the db 961168404Spjd * when we are changing its size, so no concurrent DB_FILL can 962168404Spjd * be happening. 
963168404Spjd */ 964168404Spjd /* 965168404Spjd * XXX we should be doing a dbuf_read, checking the return 966168404Spjd * value and returning that up to our callers 967168404Spjd */ 968260150Sdelphij dmu_buf_will_dirty(&db->db, tx); 969168404Spjd 970168404Spjd /* create the data buffer for the new block */ 971219089Spjd buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); 972168404Spjd 973168404Spjd /* copy old block data to the new block */ 974168404Spjd obuf = db->db_buf; 975168404Spjd bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 976168404Spjd /* zero the remainder */ 977168404Spjd if (size > osize) 978168404Spjd bzero((uint8_t *)buf->b_data + osize, size - osize); 979168404Spjd 980168404Spjd mutex_enter(&db->db_mtx); 981168404Spjd dbuf_set_data(db, buf); 982248571Smm VERIFY(arc_buf_remove_ref(obuf, db)); 983168404Spjd db->db.db_size = size; 984168404Spjd 985168404Spjd if (db->db_level == 0) { 986168404Spjd ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 987168404Spjd db->db_last_dirty->dt.dl.dr_data = buf; 988168404Spjd } 989168404Spjd mutex_exit(&db->db_mtx); 990168404Spjd 991219089Spjd dnode_willuse_space(dn, size-osize, tx); 992219089Spjd DB_DNODE_EXIT(db); 993168404Spjd} 994168404Spjd 995219089Spjdvoid 996219089Spjddbuf_release_bp(dmu_buf_impl_t *db) 997219089Spjd{ 998260150Sdelphij objset_t *os = db->db_objset; 999219089Spjd 1000219089Spjd ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 1001219089Spjd ASSERT(arc_released(os->os_phys_buf) || 1002219089Spjd list_link_active(&os->os_dsl_dataset->ds_synced_link)); 1003219089Spjd ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 1004219089Spjd 1005246666Smm (void) arc_release(db->db_buf, db); 1006219089Spjd} 1007219089Spjd 1008168404Spjddbuf_dirty_record_t * 1009168404Spjddbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1010168404Spjd{ 1011219089Spjd dnode_t *dn; 1012219089Spjd objset_t *os; 1013168404Spjd dbuf_dirty_record_t **drp, *dr; 1014168404Spjd int drop_struct_lock 
= FALSE; 1015185029Spjd boolean_t do_free_accounting = B_FALSE; 1016168404Spjd int txgoff = tx->tx_txg & TXG_MASK; 1017168404Spjd 1018168404Spjd ASSERT(tx->tx_txg != 0); 1019168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1020168404Spjd DMU_TX_DIRTY_BUF(tx, db); 1021168404Spjd 1022219089Spjd DB_DNODE_ENTER(db); 1023219089Spjd dn = DB_DNODE(db); 1024168404Spjd /* 1025168404Spjd * Shouldn't dirty a regular buffer in syncing context. Private 1026168404Spjd * objects may be dirtied in syncing context, but only if they 1027168404Spjd * were already pre-dirtied in open context. 1028168404Spjd */ 1029168404Spjd ASSERT(!dmu_tx_is_syncing(tx) || 1030168404Spjd BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1031209962Smm DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1032209962Smm dn->dn_objset->os_dsl_dataset == NULL); 1033168404Spjd /* 1034168404Spjd * We make this assert for private objects as well, but after we 1035168404Spjd * check if we're already dirty. They are allowed to re-dirty 1036168404Spjd * in syncing context. 1037168404Spjd */ 1038168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1039168404Spjd dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1040168404Spjd (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1041168404Spjd 1042168404Spjd mutex_enter(&db->db_mtx); 1043168404Spjd /* 1044168404Spjd * XXX make this true for indirects too? The problem is that 1045168404Spjd * transactions created with dmu_tx_create_assigned() from 1046168404Spjd * syncing context don't bother holding ahead. 1047168404Spjd */ 1048168404Spjd ASSERT(db->db_level != 0 || 1049219089Spjd db->db_state == DB_CACHED || db->db_state == DB_FILL || 1050219089Spjd db->db_state == DB_NOFILL); 1051168404Spjd 1052168404Spjd mutex_enter(&dn->dn_mtx); 1053168404Spjd /* 1054168404Spjd * Don't set dirtyctx to SYNC if we're just modifying this as we 1055168404Spjd * initialize the objset. 
1056168404Spjd */ 1057168404Spjd if (dn->dn_dirtyctx == DN_UNDIRTIED && 1058168404Spjd !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1059168404Spjd dn->dn_dirtyctx = 1060168404Spjd (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1061168404Spjd ASSERT(dn->dn_dirtyctx_firstset == NULL); 1062168404Spjd dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1063168404Spjd } 1064168404Spjd mutex_exit(&dn->dn_mtx); 1065168404Spjd 1066219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) 1067219089Spjd dn->dn_have_spill = B_TRUE; 1068219089Spjd 1069168404Spjd /* 1070168404Spjd * If this buffer is already dirty, we're done. 1071168404Spjd */ 1072168404Spjd drp = &db->db_last_dirty; 1073168404Spjd ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1074168404Spjd db->db.db_object == DMU_META_DNODE_OBJECT); 1075185029Spjd while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1076185029Spjd drp = &dr->dr_next; 1077185029Spjd if (dr && dr->dr_txg == tx->tx_txg) { 1078219089Spjd DB_DNODE_EXIT(db); 1079219089Spjd 1080219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1081168404Spjd /* 1082168404Spjd * If this buffer has already been written out, 1083168404Spjd * we now need to reset its state. 1084168404Spjd */ 1085185029Spjd dbuf_unoverride(dr); 1086219089Spjd if (db->db.db_object != DMU_META_DNODE_OBJECT && 1087219089Spjd db->db_state != DB_NOFILL) 1088168404Spjd arc_buf_thaw(db->db_buf); 1089168404Spjd } 1090168404Spjd mutex_exit(&db->db_mtx); 1091185029Spjd return (dr); 1092168404Spjd } 1093168404Spjd 1094168404Spjd /* 1095168404Spjd * Only valid if not already dirty. 1096168404Spjd */ 1097209962Smm ASSERT(dn->dn_object == 0 || 1098209962Smm dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1099168404Spjd (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1100168404Spjd 1101168404Spjd ASSERT3U(dn->dn_nlevels, >, db->db_level); 1102168404Spjd ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1103168404Spjd dn->dn_phys->dn_nlevels > db->db_level || 1104168404Spjd dn->dn_next_nlevels[txgoff] > db->db_level || 1105168404Spjd dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1106168404Spjd dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1107168404Spjd 1108168404Spjd /* 1109168404Spjd * We should only be dirtying in syncing context if it's the 1110209962Smm * mos or we're initializing the os or it's a special object. 1111209962Smm * However, we are allowed to dirty in syncing context provided 1112209962Smm * we already dirtied it in open context. Hence we must make 1113209962Smm * this assertion only if we're not already dirty. 1114168404Spjd */ 1115219089Spjd os = dn->dn_objset; 1116209962Smm ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1117209962Smm os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1118168404Spjd ASSERT(db->db.db_size != 0); 1119168404Spjd 1120168404Spjd dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1121168404Spjd 1122219089Spjd if (db->db_blkid != DMU_BONUS_BLKID) { 1123185029Spjd /* 1124185029Spjd * Update the accounting. 1125185029Spjd * Note: we delay "free accounting" until after we drop 1126185029Spjd * the db_mtx. This keeps us from grabbing other locks 1127219089Spjd * (and possibly deadlocking) in bp_get_dsize() while 1128185029Spjd * also holding the db_mtx. 
1129185029Spjd */ 1130185029Spjd dnode_willuse_space(dn, db->db.db_size, tx); 1131185029Spjd do_free_accounting = dbuf_block_freeable(db); 1132185029Spjd } 1133185029Spjd 1134168404Spjd /* 1135168404Spjd * If this buffer is dirty in an old transaction group we need 1136168404Spjd * to make a copy of it so that the changes we make in this 1137168404Spjd * transaction group won't leak out when we sync the older txg. 1138168404Spjd */ 1139168404Spjd dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1140168404Spjd if (db->db_level == 0) { 1141168404Spjd void *data_old = db->db_buf; 1142168404Spjd 1143219089Spjd if (db->db_state != DB_NOFILL) { 1144219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 1145219089Spjd dbuf_fix_old_data(db, tx->tx_txg); 1146219089Spjd data_old = db->db.db_data; 1147219089Spjd } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1148219089Spjd /* 1149219089Spjd * Release the data buffer from the cache so 1150219089Spjd * that we can modify it without impacting 1151219089Spjd * possible other users of this cached data 1152219089Spjd * block. Note that indirect blocks and 1153219089Spjd * private objects are not released until the 1154219089Spjd * syncing state (since they are only modified 1155219089Spjd * then). 
1156219089Spjd */ 1157219089Spjd arc_release(db->db_buf, db); 1158219089Spjd dbuf_fix_old_data(db, tx->tx_txg); 1159219089Spjd data_old = db->db_buf; 1160219089Spjd } 1161219089Spjd ASSERT(data_old != NULL); 1162168404Spjd } 1163168404Spjd dr->dt.dl.dr_data = data_old; 1164168404Spjd } else { 1165168404Spjd mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1166168404Spjd list_create(&dr->dt.di.dr_children, 1167168404Spjd sizeof (dbuf_dirty_record_t), 1168168404Spjd offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1169168404Spjd } 1170258632Savg if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1171258632Savg dr->dr_accounted = db->db.db_size; 1172168404Spjd dr->dr_dbuf = db; 1173168404Spjd dr->dr_txg = tx->tx_txg; 1174168404Spjd dr->dr_next = *drp; 1175168404Spjd *drp = dr; 1176168404Spjd 1177168404Spjd /* 1178168404Spjd * We could have been freed_in_flight between the dbuf_noread 1179168404Spjd * and dbuf_dirty. We win, as though the dbuf_noread() had 1180168404Spjd * happened after the free. 
1181168404Spjd */ 1182219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1183219089Spjd db->db_blkid != DMU_SPILL_BLKID) { 1184168404Spjd mutex_enter(&dn->dn_mtx); 1185264669Sdelphij if (dn->dn_free_ranges[txgoff] != NULL) { 1186264669Sdelphij range_tree_clear(dn->dn_free_ranges[txgoff], 1187264669Sdelphij db->db_blkid, 1); 1188264669Sdelphij } 1189168404Spjd mutex_exit(&dn->dn_mtx); 1190168404Spjd db->db_freed_in_flight = FALSE; 1191168404Spjd } 1192168404Spjd 1193168404Spjd /* 1194168404Spjd * This buffer is now part of this txg 1195168404Spjd */ 1196168404Spjd dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1197168404Spjd db->db_dirtycnt += 1; 1198168404Spjd ASSERT3U(db->db_dirtycnt, <=, 3); 1199168404Spjd 1200168404Spjd mutex_exit(&db->db_mtx); 1201168404Spjd 1202219089Spjd if (db->db_blkid == DMU_BONUS_BLKID || 1203219089Spjd db->db_blkid == DMU_SPILL_BLKID) { 1204168404Spjd mutex_enter(&dn->dn_mtx); 1205168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1206168404Spjd list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1207168404Spjd mutex_exit(&dn->dn_mtx); 1208168404Spjd dnode_setdirty(dn, tx); 1209219089Spjd DB_DNODE_EXIT(db); 1210168404Spjd return (dr); 1211185029Spjd } else if (do_free_accounting) { 1212185029Spjd blkptr_t *bp = db->db_blkptr; 1213185029Spjd int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1214219089Spjd bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1215185029Spjd /* 1216185029Spjd * This is only a guess -- if the dbuf is dirty 1217185029Spjd * in a previous txg, we don't know how much 1218185029Spjd * space it will use on disk yet. We should 1219185029Spjd * really have the struct_rwlock to access 1220185029Spjd * db_blkptr, but since this is just a guess, 1221185029Spjd * it's OK if we get an odd answer. 
1222185029Spjd */ 1223219089Spjd ddt_prefetch(os->os_spa, bp); 1224185029Spjd dnode_willuse_space(dn, -willfree, tx); 1225168404Spjd } 1226168404Spjd 1227168404Spjd if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1228168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 1229168404Spjd drop_struct_lock = TRUE; 1230168404Spjd } 1231168404Spjd 1232185029Spjd if (db->db_level == 0) { 1233185029Spjd dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1234185029Spjd ASSERT(dn->dn_maxblkid >= db->db_blkid); 1235185029Spjd } 1236185029Spjd 1237168404Spjd if (db->db_level+1 < dn->dn_nlevels) { 1238168404Spjd dmu_buf_impl_t *parent = db->db_parent; 1239168404Spjd dbuf_dirty_record_t *di; 1240168404Spjd int parent_held = FALSE; 1241168404Spjd 1242168404Spjd if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1243168404Spjd int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1244168404Spjd 1245168404Spjd parent = dbuf_hold_level(dn, db->db_level+1, 1246168404Spjd db->db_blkid >> epbs, FTAG); 1247219089Spjd ASSERT(parent != NULL); 1248168404Spjd parent_held = TRUE; 1249168404Spjd } 1250168404Spjd if (drop_struct_lock) 1251168404Spjd rw_exit(&dn->dn_struct_rwlock); 1252168404Spjd ASSERT3U(db->db_level+1, ==, parent->db_level); 1253168404Spjd di = dbuf_dirty(parent, tx); 1254168404Spjd if (parent_held) 1255168404Spjd dbuf_rele(parent, FTAG); 1256168404Spjd 1257168404Spjd mutex_enter(&db->db_mtx); 1258258632Savg /* 1259258632Savg * Since we've dropped the mutex, it's possible that 1260258632Savg * dbuf_undirty() might have changed this out from under us. 
1261258632Savg */ 1262168404Spjd if (db->db_last_dirty == dr || 1263168404Spjd dn->dn_object == DMU_META_DNODE_OBJECT) { 1264168404Spjd mutex_enter(&di->dt.di.dr_mtx); 1265168404Spjd ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1266168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1267168404Spjd list_insert_tail(&di->dt.di.dr_children, dr); 1268168404Spjd mutex_exit(&di->dt.di.dr_mtx); 1269168404Spjd dr->dr_parent = di; 1270168404Spjd } 1271168404Spjd mutex_exit(&db->db_mtx); 1272168404Spjd } else { 1273168404Spjd ASSERT(db->db_level+1 == dn->dn_nlevels); 1274168404Spjd ASSERT(db->db_blkid < dn->dn_nblkptr); 1275219089Spjd ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1276168404Spjd mutex_enter(&dn->dn_mtx); 1277168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1278168404Spjd list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1279168404Spjd mutex_exit(&dn->dn_mtx); 1280168404Spjd if (drop_struct_lock) 1281168404Spjd rw_exit(&dn->dn_struct_rwlock); 1282168404Spjd } 1283168404Spjd 1284168404Spjd dnode_setdirty(dn, tx); 1285219089Spjd DB_DNODE_EXIT(db); 1286168404Spjd return (dr); 1287168404Spjd} 1288168404Spjd 1289248571Smm/* 1290251629Sdelphij * Undirty a buffer in the transaction group referenced by the given 1291251629Sdelphij * transaction. Return whether this evicted the dbuf. 1292248571Smm */ 1293248571Smmstatic boolean_t 1294168404Spjddbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1295168404Spjd{ 1296219089Spjd dnode_t *dn; 1297168404Spjd uint64_t txg = tx->tx_txg; 1298185029Spjd dbuf_dirty_record_t *dr, **drp; 1299168404Spjd 1300168404Spjd ASSERT(txg != 0); 1301219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1302248571Smm ASSERT0(db->db_level); 1303248571Smm ASSERT(MUTEX_HELD(&db->db_mtx)); 1304168404Spjd 1305168404Spjd /* 1306168404Spjd * If this buffer is not dirty, we're done. 
1307168404Spjd */ 1308185029Spjd for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1309168404Spjd if (dr->dr_txg <= txg) 1310168404Spjd break; 1311248571Smm if (dr == NULL || dr->dr_txg < txg) 1312248571Smm return (B_FALSE); 1313168404Spjd ASSERT(dr->dr_txg == txg); 1314219089Spjd ASSERT(dr->dr_dbuf == db); 1315168404Spjd 1316219089Spjd DB_DNODE_ENTER(db); 1317219089Spjd dn = DB_DNODE(db); 1318219089Spjd 1319168404Spjd dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1320168404Spjd 1321168404Spjd ASSERT(db->db.db_size != 0); 1322168404Spjd 1323258632Savg /* 1324258632Savg * Any space we accounted for in dp_dirty_* will be cleaned up by 1325258632Savg * dsl_pool_sync(). This is relatively rare so the discrepancy 1326258632Savg * is not a big deal. 1327258632Savg */ 1328168404Spjd 1329185029Spjd *drp = dr->dr_next; 1330168404Spjd 1331219636Spjd /* 1332219636Spjd * Note that there are three places in dbuf_dirty() 1333219636Spjd * where this dirty record may be put on a list. 1334219636Spjd * Make sure to do a list_remove corresponding to 1335219636Spjd * every one of those list_insert calls. 
1336219636Spjd */ 1337168404Spjd if (dr->dr_parent) { 1338168404Spjd mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1339168404Spjd list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1340168404Spjd mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1341219636Spjd } else if (db->db_blkid == DMU_SPILL_BLKID || 1342219636Spjd db->db_level+1 == dn->dn_nlevels) { 1343185029Spjd ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1344168404Spjd mutex_enter(&dn->dn_mtx); 1345168404Spjd list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1346168404Spjd mutex_exit(&dn->dn_mtx); 1347168404Spjd } 1348219089Spjd DB_DNODE_EXIT(db); 1349168404Spjd 1350248571Smm if (db->db_state != DB_NOFILL) { 1351248571Smm dbuf_unoverride(dr); 1352168404Spjd 1353168404Spjd ASSERT(db->db_buf != NULL); 1354248571Smm ASSERT(dr->dt.dl.dr_data != NULL); 1355248571Smm if (dr->dt.dl.dr_data != db->db_buf) 1356248571Smm VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); 1357168404Spjd } 1358268713Sdelphij 1359268713Sdelphij if (db->db_level != 0) { 1360268713Sdelphij mutex_destroy(&dr->dt.di.dr_mtx); 1361268713Sdelphij list_destroy(&dr->dt.di.dr_children); 1362268713Sdelphij } 1363268713Sdelphij 1364168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1365168404Spjd 1366168404Spjd ASSERT(db->db_dirtycnt > 0); 1367168404Spjd db->db_dirtycnt -= 1; 1368168404Spjd 1369168404Spjd if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1370168404Spjd arc_buf_t *buf = db->db_buf; 1371168404Spjd 1372219089Spjd ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1373168404Spjd dbuf_set_data(db, NULL); 1374248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 1375168404Spjd dbuf_evict(db); 1376248571Smm return (B_TRUE); 1377168404Spjd } 1378168404Spjd 1379248571Smm return (B_FALSE); 1380168404Spjd} 1381168404Spjd 1382168404Spjdvoid 1383260150Sdelphijdmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) 1384168404Spjd{ 1385260150Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1386185029Spjd 
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1387168404Spjd 1388168404Spjd ASSERT(tx->tx_txg != 0); 1389168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1390168404Spjd 1391219089Spjd DB_DNODE_ENTER(db); 1392219089Spjd if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1393168404Spjd rf |= DB_RF_HAVESTRUCT; 1394219089Spjd DB_DNODE_EXIT(db); 1395168404Spjd (void) dbuf_read(db, NULL, rf); 1396168404Spjd (void) dbuf_dirty(db, tx); 1397168404Spjd} 1398168404Spjd 1399168404Spjdvoid 1400219089Spjddmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1401219089Spjd{ 1402219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1403219089Spjd 1404219089Spjd db->db_state = DB_NOFILL; 1405219089Spjd 1406219089Spjd dmu_buf_will_fill(db_fake, tx); 1407219089Spjd} 1408219089Spjd 1409219089Spjdvoid 1410168404Spjddmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1411168404Spjd{ 1412168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1413168404Spjd 1414219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1415168404Spjd ASSERT(tx->tx_txg != 0); 1416168404Spjd ASSERT(db->db_level == 0); 1417168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1418168404Spjd 1419168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1420168404Spjd dmu_tx_private_ok(tx)); 1421168404Spjd 1422168404Spjd dbuf_noread(db); 1423168404Spjd (void) dbuf_dirty(db, tx); 1424168404Spjd} 1425168404Spjd 1426168404Spjd#pragma weak dmu_buf_fill_done = dbuf_fill_done 1427168404Spjd/* ARGSUSED */ 1428168404Spjdvoid 1429168404Spjddbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1430168404Spjd{ 1431168404Spjd mutex_enter(&db->db_mtx); 1432168404Spjd DBUF_VERIFY(db); 1433168404Spjd 1434168404Spjd if (db->db_state == DB_FILL) { 1435168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 1436219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1437168404Spjd /* we were freed while filling */ 1438168404Spjd /* XXX dbuf_undirty? 
*/ 1439168404Spjd bzero(db->db.db_data, db->db.db_size); 1440168404Spjd db->db_freed_in_flight = FALSE; 1441168404Spjd } 1442168404Spjd db->db_state = DB_CACHED; 1443168404Spjd cv_broadcast(&db->db_changed); 1444168404Spjd } 1445168404Spjd mutex_exit(&db->db_mtx); 1446168404Spjd} 1447168404Spjd 1448268075Sdelphijvoid 1449268075Sdelphijdmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 1450268075Sdelphij bp_embedded_type_t etype, enum zio_compress comp, 1451268075Sdelphij int uncompressed_size, int compressed_size, int byteorder, 1452268075Sdelphij dmu_tx_t *tx) 1453268075Sdelphij{ 1454268075Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 1455268075Sdelphij struct dirty_leaf *dl; 1456268075Sdelphij dmu_object_type_t type; 1457268075Sdelphij 1458268075Sdelphij DB_DNODE_ENTER(db); 1459268075Sdelphij type = DB_DNODE(db)->dn_type; 1460268075Sdelphij DB_DNODE_EXIT(db); 1461268075Sdelphij 1462268075Sdelphij ASSERT0(db->db_level); 1463268075Sdelphij ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1464268075Sdelphij 1465268075Sdelphij dmu_buf_will_not_fill(dbuf, tx); 1466268075Sdelphij 1467268075Sdelphij ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1468268075Sdelphij dl = &db->db_last_dirty->dt.dl; 1469268075Sdelphij encode_embedded_bp_compressed(&dl->dr_overridden_by, 1470268075Sdelphij data, comp, uncompressed_size, compressed_size); 1471268075Sdelphij BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 1472268075Sdelphij BP_SET_TYPE(&dl->dr_overridden_by, type); 1473268075Sdelphij BP_SET_LEVEL(&dl->dr_overridden_by, 0); 1474268075Sdelphij BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 1475268075Sdelphij 1476268075Sdelphij dl->dr_override_state = DR_OVERRIDDEN; 1477268075Sdelphij dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 1478268075Sdelphij} 1479268075Sdelphij 1480168404Spjd/* 1481209962Smm * Directly assign a provided arc buf to a given dbuf if it's not referenced 1482209962Smm * by anybody except our caller. 
Otherwise copy arcbuf's contents to dbuf. 1483209962Smm */ 1484209962Smmvoid 1485209962Smmdbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 1486209962Smm{ 1487209962Smm ASSERT(!refcount_is_zero(&db->db_holds)); 1488219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1489209962Smm ASSERT(db->db_level == 0); 1490209962Smm ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1491209962Smm ASSERT(buf != NULL); 1492209962Smm ASSERT(arc_buf_size(buf) == db->db.db_size); 1493209962Smm ASSERT(tx->tx_txg != 0); 1494209962Smm 1495209962Smm arc_return_buf(buf, db); 1496209962Smm ASSERT(arc_released(buf)); 1497209962Smm 1498209962Smm mutex_enter(&db->db_mtx); 1499209962Smm 1500209962Smm while (db->db_state == DB_READ || db->db_state == DB_FILL) 1501209962Smm cv_wait(&db->db_changed, &db->db_mtx); 1502209962Smm 1503209962Smm ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 1504209962Smm 1505209962Smm if (db->db_state == DB_CACHED && 1506209962Smm refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 1507209962Smm mutex_exit(&db->db_mtx); 1508209962Smm (void) dbuf_dirty(db, tx); 1509209962Smm bcopy(buf->b_data, db->db.db_data, db->db.db_size); 1510248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 1511219089Spjd xuio_stat_wbuf_copied(); 1512209962Smm return; 1513209962Smm } 1514209962Smm 1515219089Spjd xuio_stat_wbuf_nocopy(); 1516209962Smm if (db->db_state == DB_CACHED) { 1517209962Smm dbuf_dirty_record_t *dr = db->db_last_dirty; 1518209962Smm 1519209962Smm ASSERT(db->db_buf != NULL); 1520209962Smm if (dr != NULL && dr->dr_txg == tx->tx_txg) { 1521209962Smm ASSERT(dr->dt.dl.dr_data == db->db_buf); 1522209962Smm if (!arc_released(db->db_buf)) { 1523209962Smm ASSERT(dr->dt.dl.dr_override_state == 1524209962Smm DR_OVERRIDDEN); 1525209962Smm arc_release(db->db_buf, db); 1526209962Smm } 1527209962Smm dr->dt.dl.dr_data = buf; 1528248571Smm VERIFY(arc_buf_remove_ref(db->db_buf, db)); 1529209962Smm } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 
1530209962Smm arc_release(db->db_buf, db); 1531248571Smm VERIFY(arc_buf_remove_ref(db->db_buf, db)); 1532209962Smm } 1533209962Smm db->db_buf = NULL; 1534209962Smm } 1535209962Smm ASSERT(db->db_buf == NULL); 1536209962Smm dbuf_set_data(db, buf); 1537209962Smm db->db_state = DB_FILL; 1538209962Smm mutex_exit(&db->db_mtx); 1539209962Smm (void) dbuf_dirty(db, tx); 1540260150Sdelphij dmu_buf_fill_done(&db->db, tx); 1541209962Smm} 1542209962Smm 1543209962Smm/* 1544168404Spjd * "Clear" the contents of this dbuf. This will mark the dbuf 1545258632Savg * EVICTING and clear *most* of its references. Unfortunately, 1546168404Spjd * when we are not holding the dn_dbufs_mtx, we can't clear the 1547168404Spjd * entry in the dn_dbufs list. We have to wait until dbuf_destroy() 1548168404Spjd * in this case. For callers from the DMU we will usually see: 1549268858Sdelphij * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() 1550168404Spjd * For the arc callback, we will usually see: 1551219089Spjd * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1552168404Spjd * Sometimes, though, we will get a mix of these two: 1553268858Sdelphij * DMU: dbuf_clear()->arc_clear_callback() 1554168404Spjd * ARC: dbuf_do_evict()->dbuf_destroy() 1555268858Sdelphij * 1556268858Sdelphij * This routine will dissociate the dbuf from the arc, by calling 1557268858Sdelphij * arc_clear_callback(), but will not evict the data from the ARC. 
1558168404Spjd */ 1559168404Spjdvoid 1560168404Spjddbuf_clear(dmu_buf_impl_t *db) 1561168404Spjd{ 1562219089Spjd dnode_t *dn; 1563168404Spjd dmu_buf_impl_t *parent = db->db_parent; 1564219089Spjd dmu_buf_impl_t *dndb; 1565268858Sdelphij boolean_t dbuf_gone = B_FALSE; 1566168404Spjd 1567168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 1568168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1569168404Spjd 1570168404Spjd dbuf_evict_user(db); 1571168404Spjd 1572168404Spjd if (db->db_state == DB_CACHED) { 1573168404Spjd ASSERT(db->db.db_data != NULL); 1574219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 1575168404Spjd zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1576208373Smm arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1577185029Spjd } 1578168404Spjd db->db.db_data = NULL; 1579168404Spjd db->db_state = DB_UNCACHED; 1580168404Spjd } 1581168404Spjd 1582219089Spjd ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1583168404Spjd ASSERT(db->db_data_pending == NULL); 1584168404Spjd 1585168404Spjd db->db_state = DB_EVICTING; 1586168404Spjd db->db_blkptr = NULL; 1587168404Spjd 1588219089Spjd DB_DNODE_ENTER(db); 1589219089Spjd dn = DB_DNODE(db); 1590219089Spjd dndb = dn->dn_dbuf; 1591219089Spjd if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1592269229Sdelphij avl_remove(&dn->dn_dbufs, db); 1593270248Sdelphij atomic_dec_32(&dn->dn_dbufs_count); 1594219089Spjd membar_producer(); 1595219089Spjd DB_DNODE_EXIT(db); 1596219089Spjd /* 1597219089Spjd * Decrementing the dbuf count means that the hold corresponding 1598219089Spjd * to the removed dbuf is no longer discounted in dnode_move(), 1599219089Spjd * so the dnode cannot be moved until after we release the hold. 1600219089Spjd * The membar_producer() ensures visibility of the decremented 1601219089Spjd * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 1602219089Spjd * release any lock. 
1603219089Spjd */ 1604168404Spjd dnode_rele(dn, db); 1605219089Spjd db->db_dnode_handle = NULL; 1606219089Spjd } else { 1607219089Spjd DB_DNODE_EXIT(db); 1608168404Spjd } 1609168404Spjd 1610168404Spjd if (db->db_buf) 1611268858Sdelphij dbuf_gone = arc_clear_callback(db->db_buf); 1612168404Spjd 1613168404Spjd if (!dbuf_gone) 1614168404Spjd mutex_exit(&db->db_mtx); 1615168404Spjd 1616168404Spjd /* 1617219089Spjd * If this dbuf is referenced from an indirect dbuf, 1618168404Spjd * decrement the ref count on the indirect dbuf. 1619168404Spjd */ 1620168404Spjd if (parent && parent != dndb) 1621168404Spjd dbuf_rele(parent, db); 1622168404Spjd} 1623168404Spjd 1624168404Spjdstatic int 1625168404Spjddbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1626168404Spjd dmu_buf_impl_t **parentp, blkptr_t **bpp) 1627168404Spjd{ 1628168404Spjd int nlevels, epbs; 1629168404Spjd 1630168404Spjd *parentp = NULL; 1631168404Spjd *bpp = NULL; 1632168404Spjd 1633219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1634168404Spjd 1635219089Spjd if (blkid == DMU_SPILL_BLKID) { 1636219089Spjd mutex_enter(&dn->dn_mtx); 1637219089Spjd if (dn->dn_have_spill && 1638219089Spjd (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1639219089Spjd *bpp = &dn->dn_phys->dn_spill; 1640219089Spjd else 1641219089Spjd *bpp = NULL; 1642219089Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 1643219089Spjd *parentp = dn->dn_dbuf; 1644219089Spjd mutex_exit(&dn->dn_mtx); 1645219089Spjd return (0); 1646219089Spjd } 1647219089Spjd 1648168404Spjd if (dn->dn_phys->dn_nlevels == 0) 1649168404Spjd nlevels = 1; 1650168404Spjd else 1651168404Spjd nlevels = dn->dn_phys->dn_nlevels; 1652168404Spjd 1653168404Spjd epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1654168404Spjd 1655168404Spjd ASSERT3U(level * epbs, <, 64); 1656168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1657168404Spjd if (level >= nlevels || 1658168404Spjd (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1659168404Spjd /* the buffer has no 
parent yet */ 1660249195Smm return (SET_ERROR(ENOENT)); 1661168404Spjd } else if (level < nlevels-1) { 1662168404Spjd /* this block is referenced from an indirect block */ 1663168404Spjd int err = dbuf_hold_impl(dn, level+1, 1664168404Spjd blkid >> epbs, fail_sparse, NULL, parentp); 1665168404Spjd if (err) 1666168404Spjd return (err); 1667168404Spjd err = dbuf_read(*parentp, NULL, 1668168404Spjd (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1669168404Spjd if (err) { 1670168404Spjd dbuf_rele(*parentp, NULL); 1671168404Spjd *parentp = NULL; 1672168404Spjd return (err); 1673168404Spjd } 1674168404Spjd *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1675168404Spjd (blkid & ((1ULL << epbs) - 1)); 1676168404Spjd return (0); 1677168404Spjd } else { 1678168404Spjd /* the block is referenced from the dnode */ 1679168404Spjd ASSERT3U(level, ==, nlevels-1); 1680168404Spjd ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1681168404Spjd blkid < dn->dn_phys->dn_nblkptr); 1682168404Spjd if (dn->dn_dbuf) { 1683168404Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 1684168404Spjd *parentp = dn->dn_dbuf; 1685168404Spjd } 1686168404Spjd *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1687168404Spjd return (0); 1688168404Spjd } 1689168404Spjd} 1690168404Spjd 1691168404Spjdstatic dmu_buf_impl_t * 1692168404Spjddbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1693168404Spjd dmu_buf_impl_t *parent, blkptr_t *blkptr) 1694168404Spjd{ 1695219089Spjd objset_t *os = dn->dn_objset; 1696168404Spjd dmu_buf_impl_t *db, *odb; 1697168404Spjd 1698168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1699168404Spjd ASSERT(dn->dn_type != DMU_OT_NONE); 1700168404Spjd 1701168404Spjd db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1702168404Spjd 1703168404Spjd db->db_objset = os; 1704168404Spjd db->db.db_object = dn->dn_object; 1705168404Spjd db->db_level = level; 1706168404Spjd db->db_blkid = blkid; 1707168404Spjd db->db_last_dirty = NULL; 1708168404Spjd db->db_dirtycnt = 0; 1709219089Spjd db->db_dnode_handle = 
dn->dn_handle; 1710168404Spjd db->db_parent = parent; 1711168404Spjd db->db_blkptr = blkptr; 1712168404Spjd 1713168404Spjd db->db_user_ptr = NULL; 1714168404Spjd db->db_evict_func = NULL; 1715168404Spjd db->db_immediate_evict = 0; 1716168404Spjd db->db_freed_in_flight = 0; 1717168404Spjd 1718219089Spjd if (blkid == DMU_BONUS_BLKID) { 1719168404Spjd ASSERT3P(parent, ==, dn->dn_dbuf); 1720185029Spjd db->db.db_size = DN_MAX_BONUSLEN - 1721185029Spjd (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1722185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1723219089Spjd db->db.db_offset = DMU_BONUS_BLKID; 1724168404Spjd db->db_state = DB_UNCACHED; 1725168404Spjd /* the bonus dbuf is not placed in the hash table */ 1726208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1727168404Spjd return (db); 1728219089Spjd } else if (blkid == DMU_SPILL_BLKID) { 1729219089Spjd db->db.db_size = (blkptr != NULL) ? 1730219089Spjd BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 1731219089Spjd db->db.db_offset = 0; 1732168404Spjd } else { 1733168404Spjd int blocksize = 1734258632Savg db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; 1735168404Spjd db->db.db_size = blocksize; 1736168404Spjd db->db.db_offset = db->db_blkid * blocksize; 1737168404Spjd } 1738168404Spjd 1739168404Spjd /* 1740168404Spjd * Hold the dn_dbufs_mtx while we get the new dbuf 1741168404Spjd * in the hash table *and* added to the dbufs list. 1742168404Spjd * This prevents a possible deadlock with someone 1743168404Spjd * trying to look up this dbuf before its added to the 1744168404Spjd * dn_dbufs list. 
1745168404Spjd */ 1746168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 1747168404Spjd db->db_state = DB_EVICTING; 1748168404Spjd if ((odb = dbuf_hash_insert(db)) != NULL) { 1749168404Spjd /* someone else inserted it first */ 1750168404Spjd kmem_cache_free(dbuf_cache, db); 1751168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1752168404Spjd return (odb); 1753168404Spjd } 1754269229Sdelphij avl_add(&dn->dn_dbufs, db); 1755254753Sdelphij if (db->db_level == 0 && db->db_blkid >= 1756254753Sdelphij dn->dn_unlisted_l0_blkid) 1757254753Sdelphij dn->dn_unlisted_l0_blkid = db->db_blkid + 1; 1758168404Spjd db->db_state = DB_UNCACHED; 1759168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1760208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1761168404Spjd 1762168404Spjd if (parent && parent != dn->dn_dbuf) 1763168404Spjd dbuf_add_ref(parent, db); 1764168404Spjd 1765168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1766168404Spjd refcount_count(&dn->dn_holds) > 0); 1767168404Spjd (void) refcount_add(&dn->dn_holds, db); 1768270248Sdelphij atomic_inc_32(&dn->dn_dbufs_count); 1769168404Spjd 1770168404Spjd dprintf_dbuf(db, "db=%p\n", db); 1771168404Spjd 1772168404Spjd return (db); 1773168404Spjd} 1774168404Spjd 1775168404Spjdstatic int 1776168404Spjddbuf_do_evict(void *private) 1777168404Spjd{ 1778268858Sdelphij dmu_buf_impl_t *db = private; 1779168404Spjd 1780168404Spjd if (!MUTEX_HELD(&db->db_mtx)) 1781168404Spjd mutex_enter(&db->db_mtx); 1782168404Spjd 1783168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1784168404Spjd 1785168404Spjd if (db->db_state != DB_EVICTING) { 1786168404Spjd ASSERT(db->db_state == DB_CACHED); 1787168404Spjd DBUF_VERIFY(db); 1788168404Spjd db->db_buf = NULL; 1789168404Spjd dbuf_evict(db); 1790168404Spjd } else { 1791168404Spjd mutex_exit(&db->db_mtx); 1792168404Spjd dbuf_destroy(db); 1793168404Spjd } 1794168404Spjd return (0); 1795168404Spjd} 1796168404Spjd 1797168404Spjdstatic void 1798168404Spjddbuf_destroy(dmu_buf_impl_t *db) 
1799168404Spjd{ 1800168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1801168404Spjd 1802219089Spjd if (db->db_blkid != DMU_BONUS_BLKID) { 1803168404Spjd /* 1804168404Spjd * If this dbuf is still on the dn_dbufs list, 1805168404Spjd * remove it from that list. 1806168404Spjd */ 1807219089Spjd if (db->db_dnode_handle != NULL) { 1808219089Spjd dnode_t *dn; 1809185029Spjd 1810219089Spjd DB_DNODE_ENTER(db); 1811219089Spjd dn = DB_DNODE(db); 1812168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 1813269229Sdelphij avl_remove(&dn->dn_dbufs, db); 1814270248Sdelphij atomic_dec_32(&dn->dn_dbufs_count); 1815168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1816219089Spjd DB_DNODE_EXIT(db); 1817219089Spjd /* 1818219089Spjd * Decrementing the dbuf count means that the hold 1819219089Spjd * corresponding to the removed dbuf is no longer 1820219089Spjd * discounted in dnode_move(), so the dnode cannot be 1821219089Spjd * moved until after we release the hold. 1822219089Spjd */ 1823168404Spjd dnode_rele(dn, db); 1824219089Spjd db->db_dnode_handle = NULL; 1825168404Spjd } 1826168404Spjd dbuf_hash_remove(db); 1827168404Spjd } 1828168404Spjd db->db_parent = NULL; 1829168404Spjd db->db_buf = NULL; 1830168404Spjd 1831168404Spjd ASSERT(db->db.db_data == NULL); 1832168404Spjd ASSERT(db->db_hash_next == NULL); 1833168404Spjd ASSERT(db->db_blkptr == NULL); 1834168404Spjd ASSERT(db->db_data_pending == NULL); 1835168404Spjd 1836168404Spjd kmem_cache_free(dbuf_cache, db); 1837208373Smm arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1838168404Spjd} 1839168404Spjd 1840168404Spjdvoid 1841258632Savgdbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) 1842168404Spjd{ 1843168404Spjd dmu_buf_impl_t *db = NULL; 1844168404Spjd blkptr_t *bp = NULL; 1845168404Spjd 1846219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1847168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1848168404Spjd 1849168404Spjd if (dnode_block_freed(dn, blkid)) 1850168404Spjd return; 1851168404Spjd 1852168404Spjd /* 
dbuf_find() returns with db_mtx held */ 1853168404Spjd if (db = dbuf_find(dn, 0, blkid)) { 1854219089Spjd /* 1855219089Spjd * This dbuf is already in the cache. We assume that 1856219089Spjd * it is already CACHED, or else about to be either 1857219089Spjd * read or filled. 1858219089Spjd */ 1859168404Spjd mutex_exit(&db->db_mtx); 1860219089Spjd return; 1861168404Spjd } 1862168404Spjd 1863168404Spjd if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1864268075Sdelphij if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { 1865219089Spjd dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1866168404Spjd uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1867268123Sdelphij zbookmark_phys_t zb; 1868168404Spjd 1869219089Spjd SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1870219089Spjd dn->dn_object, 0, blkid); 1871219089Spjd 1872246666Smm (void) arc_read(NULL, dn->dn_objset->os_spa, 1873258632Savg bp, NULL, NULL, prio, 1874168404Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1875168404Spjd &aflags, &zb); 1876168404Spjd } 1877168404Spjd if (db) 1878168404Spjd dbuf_rele(db, NULL); 1879168404Spjd } 1880168404Spjd} 1881168404Spjd 1882168404Spjd/* 1883168404Spjd * Returns with db_holds incremented, and db_mtx not held. 1884168404Spjd * Note: dn_struct_rwlock must be held. 
1885168404Spjd */ 1886168404Spjdint 1887168404Spjddbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1888168404Spjd void *tag, dmu_buf_impl_t **dbp) 1889168404Spjd{ 1890168404Spjd dmu_buf_impl_t *db, *parent = NULL; 1891168404Spjd 1892219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1893168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1894168404Spjd ASSERT3U(dn->dn_nlevels, >, level); 1895168404Spjd 1896168404Spjd *dbp = NULL; 1897168404Spjdtop: 1898168404Spjd /* dbuf_find() returns with db_mtx held */ 1899168404Spjd db = dbuf_find(dn, level, blkid); 1900168404Spjd 1901168404Spjd if (db == NULL) { 1902168404Spjd blkptr_t *bp = NULL; 1903168404Spjd int err; 1904168404Spjd 1905168404Spjd ASSERT3P(parent, ==, NULL); 1906168404Spjd err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1907168404Spjd if (fail_sparse) { 1908168404Spjd if (err == 0 && bp && BP_IS_HOLE(bp)) 1909249195Smm err = SET_ERROR(ENOENT); 1910168404Spjd if (err) { 1911168404Spjd if (parent) 1912168404Spjd dbuf_rele(parent, NULL); 1913168404Spjd return (err); 1914168404Spjd } 1915168404Spjd } 1916168404Spjd if (err && err != ENOENT) 1917168404Spjd return (err); 1918168404Spjd db = dbuf_create(dn, level, blkid, parent, bp); 1919168404Spjd } 1920168404Spjd 1921168404Spjd if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1922168404Spjd arc_buf_add_ref(db->db_buf, db); 1923168404Spjd if (db->db_buf->b_data == NULL) { 1924168404Spjd dbuf_clear(db); 1925168404Spjd if (parent) { 1926168404Spjd dbuf_rele(parent, NULL); 1927168404Spjd parent = NULL; 1928168404Spjd } 1929168404Spjd goto top; 1930168404Spjd } 1931168404Spjd ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1932168404Spjd } 1933168404Spjd 1934168404Spjd ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1935168404Spjd 1936168404Spjd /* 1937168404Spjd * If this buffer is currently syncing out, and we are are 1938168404Spjd * still referencing it from db_data, we need to make a copy 1939168404Spjd * 
of it in case we decide we want to dirty it again in this txg. 1940168404Spjd */ 1941219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1942168404Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 1943168404Spjd db->db_state == DB_CACHED && db->db_data_pending) { 1944168404Spjd dbuf_dirty_record_t *dr = db->db_data_pending; 1945168404Spjd 1946168404Spjd if (dr->dt.dl.dr_data == db->db_buf) { 1947168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1948168404Spjd 1949168404Spjd dbuf_set_data(db, 1950219089Spjd arc_buf_alloc(dn->dn_objset->os_spa, 1951168404Spjd db->db.db_size, db, type)); 1952168404Spjd bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1953168404Spjd db->db.db_size); 1954168404Spjd } 1955168404Spjd } 1956168404Spjd 1957168404Spjd (void) refcount_add(&db->db_holds, tag); 1958168404Spjd DBUF_VERIFY(db); 1959168404Spjd mutex_exit(&db->db_mtx); 1960168404Spjd 1961168404Spjd /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1962168404Spjd if (parent) 1963168404Spjd dbuf_rele(parent, NULL); 1964168404Spjd 1965219089Spjd ASSERT3P(DB_DNODE(db), ==, dn); 1966168404Spjd ASSERT3U(db->db_blkid, ==, blkid); 1967168404Spjd ASSERT3U(db->db_level, ==, level); 1968168404Spjd *dbp = db; 1969168404Spjd 1970168404Spjd return (0); 1971168404Spjd} 1972168404Spjd 1973168404Spjddmu_buf_impl_t * 1974168404Spjddbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1975168404Spjd{ 1976168404Spjd dmu_buf_impl_t *db; 1977168404Spjd int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1978168404Spjd return (err ? NULL : db); 1979168404Spjd} 1980168404Spjd 1981168404Spjddmu_buf_impl_t * 1982168404Spjddbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1983168404Spjd{ 1984168404Spjd dmu_buf_impl_t *db; 1985168404Spjd int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1986168404Spjd return (err ? 
NULL : db); 1987168404Spjd} 1988168404Spjd 1989185029Spjdvoid 1990168404Spjddbuf_create_bonus(dnode_t *dn) 1991168404Spjd{ 1992168404Spjd ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1993168404Spjd 1994168404Spjd ASSERT(dn->dn_bonus == NULL); 1995219089Spjd dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 1996168404Spjd} 1997168404Spjd 1998219089Spjdint 1999219089Spjddbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 2000219089Spjd{ 2001219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2002219089Spjd dnode_t *dn; 2003219089Spjd 2004219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 2005249195Smm return (SET_ERROR(ENOTSUP)); 2006219089Spjd if (blksz == 0) 2007219089Spjd blksz = SPA_MINBLOCKSIZE; 2008274337Sdelphij ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 2009274337Sdelphij blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2010219089Spjd 2011219089Spjd DB_DNODE_ENTER(db); 2012219089Spjd dn = DB_DNODE(db); 2013219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2014219089Spjd dbuf_new_size(db, blksz, tx); 2015219089Spjd rw_exit(&dn->dn_struct_rwlock); 2016219089Spjd DB_DNODE_EXIT(db); 2017219089Spjd 2018219089Spjd return (0); 2019219089Spjd} 2020219089Spjd 2021219089Spjdvoid 2022219089Spjddbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2023219089Spjd{ 2024219089Spjd dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2025219089Spjd} 2026219089Spjd 2027168404Spjd#pragma weak dmu_buf_add_ref = dbuf_add_ref 2028168404Spjdvoid 2029168404Spjddbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2030168404Spjd{ 2031168404Spjd int64_t holds = refcount_add(&db->db_holds, tag); 2032168404Spjd ASSERT(holds > 1); 2033168404Spjd} 2034168404Spjd 2035219089Spjd/* 2036219089Spjd * If you call dbuf_rele() you had better not be referencing the dnode handle 2037219089Spjd * unless you have some other direct or indirect hold on the dnode. 
(An indirect 2038219089Spjd * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2039219089Spjd * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2040219089Spjd * dnode's parent dbuf evicting its dnode handles. 2041219089Spjd */ 2042168404Spjdvoid 2043168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag) 2044168404Spjd{ 2045219089Spjd mutex_enter(&db->db_mtx); 2046219089Spjd dbuf_rele_and_unlock(db, tag); 2047219089Spjd} 2048219089Spjd 2049260150Sdelphijvoid 2050260150Sdelphijdmu_buf_rele(dmu_buf_t *db, void *tag) 2051260150Sdelphij{ 2052260150Sdelphij dbuf_rele((dmu_buf_impl_t *)db, tag); 2053260150Sdelphij} 2054260150Sdelphij 2055219089Spjd/* 2056219089Spjd * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2057219089Spjd * db_dirtycnt and db_holds to be updated atomically. 2058219089Spjd */ 2059219089Spjdvoid 2060219089Spjddbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2061219089Spjd{ 2062168404Spjd int64_t holds; 2063168404Spjd 2064219089Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2065168404Spjd DBUF_VERIFY(db); 2066168404Spjd 2067219089Spjd /* 2068219089Spjd * Remove the reference to the dbuf before removing its hold on the 2069219089Spjd * dnode so we can guarantee in dnode_move() that a referenced bonus 2070219089Spjd * buffer has a corresponding dnode hold. 2071219089Spjd */ 2072168404Spjd holds = refcount_remove(&db->db_holds, tag); 2073168404Spjd ASSERT(holds >= 0); 2074168404Spjd 2075168404Spjd /* 2076168404Spjd * We can't freeze indirects if there is a possibility that they 2077168404Spjd * may be modified in the current syncing context. 2078168404Spjd */ 2079168404Spjd if (db->db_buf && holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) 2080168404Spjd arc_buf_freeze(db->db_buf); 2081168404Spjd 2082168404Spjd if (holds == db->db_dirtycnt && 2083168404Spjd db->db_level == 0 && db->db_immediate_evict) 2084168404Spjd dbuf_evict_user(db); 2085168404Spjd 2086168404Spjd if (holds == 0) { 2087219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2088168404Spjd mutex_exit(&db->db_mtx); 2089219089Spjd 2090219089Spjd /* 2091219089Spjd * If the dnode moves here, we cannot cross this barrier 2092219089Spjd * until the move completes. 2093219089Spjd */ 2094219089Spjd DB_DNODE_ENTER(db); 2095270248Sdelphij atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count); 2096219089Spjd DB_DNODE_EXIT(db); 2097219089Spjd /* 2098219089Spjd * The bonus buffer's dnode hold is no longer discounted 2099219089Spjd * in dnode_move(). The dnode cannot move until after 2100219089Spjd * the dnode_rele(). 2101219089Spjd */ 2102219089Spjd dnode_rele(DB_DNODE(db), db); 2103168404Spjd } else if (db->db_buf == NULL) { 2104168404Spjd /* 2105168404Spjd * This is a special case: we never associated this 2106168404Spjd * dbuf with any data allocated from the ARC. 2107168404Spjd */ 2108219089Spjd ASSERT(db->db_state == DB_UNCACHED || 2109219089Spjd db->db_state == DB_NOFILL); 2110168404Spjd dbuf_evict(db); 2111168404Spjd } else if (arc_released(db->db_buf)) { 2112168404Spjd arc_buf_t *buf = db->db_buf; 2113168404Spjd /* 2114168404Spjd * This dbuf has anonymous data associated with it. 2115168404Spjd */ 2116168404Spjd dbuf_set_data(db, NULL); 2117248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 2118168404Spjd dbuf_evict(db); 2119168404Spjd } else { 2120248571Smm VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2121242845Sdelphij 2122242845Sdelphij /* 2123242845Sdelphij * A dbuf will be eligible for eviction if either the 2124242845Sdelphij * 'primarycache' property is set or a duplicate 2125242845Sdelphij * copy of this buffer is already cached in the arc. 
2126242845Sdelphij * 2127242845Sdelphij * In the case of the 'primarycache' a buffer 2128242845Sdelphij * is considered for eviction if it matches the 2129242845Sdelphij * criteria set in the property. 2130242845Sdelphij * 2131242845Sdelphij * To decide if our buffer is considered a 2132242845Sdelphij * duplicate, we must call into the arc to determine 2133242845Sdelphij * if multiple buffers are referencing the same 2134242845Sdelphij * block on-disk. If so, then we simply evict 2135242845Sdelphij * ourselves. 2136242845Sdelphij */ 2137268858Sdelphij if (!DBUF_IS_CACHEABLE(db)) { 2138268858Sdelphij if (db->db_blkptr != NULL && 2139268858Sdelphij !BP_IS_HOLE(db->db_blkptr) && 2140268858Sdelphij !BP_IS_EMBEDDED(db->db_blkptr)) { 2141268858Sdelphij spa_t *spa = 2142268858Sdelphij dmu_objset_spa(db->db_objset); 2143268858Sdelphij blkptr_t bp = *db->db_blkptr; 2144268858Sdelphij dbuf_clear(db); 2145268858Sdelphij arc_freed(spa, &bp); 2146268858Sdelphij } else { 2147268858Sdelphij dbuf_clear(db); 2148268858Sdelphij } 2149268858Sdelphij } else if (arc_buf_eviction_needed(db->db_buf)) { 2150185029Spjd dbuf_clear(db); 2151268858Sdelphij } else { 2152185029Spjd mutex_exit(&db->db_mtx); 2153268858Sdelphij } 2154168404Spjd } 2155168404Spjd } else { 2156168404Spjd mutex_exit(&db->db_mtx); 2157168404Spjd } 2158168404Spjd} 2159168404Spjd 2160168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount 2161168404Spjduint64_t 2162168404Spjddbuf_refcount(dmu_buf_impl_t *db) 2163168404Spjd{ 2164168404Spjd return (refcount_count(&db->db_holds)); 2165168404Spjd} 2166168404Spjd 2167168404Spjdvoid * 2168275782Sdelphijdmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, 2169168404Spjd dmu_buf_evict_func_t *evict_func) 2170168404Spjd{ 2171275782Sdelphij return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); 2172168404Spjd} 2173168404Spjd 2174168404Spjdvoid * 2175275782Sdelphijdmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, 2176168404Spjd dmu_buf_evict_func_t *evict_func) 
2177168404Spjd{ 2178168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2179168404Spjd 2180168404Spjd db->db_immediate_evict = TRUE; 2181275782Sdelphij return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); 2182168404Spjd} 2183168404Spjd 2184168404Spjdvoid * 2185168404Spjddmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 2186275782Sdelphij dmu_buf_evict_func_t *evict_func) 2187168404Spjd{ 2188168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2189168404Spjd ASSERT(db->db_level == 0); 2190168404Spjd 2191168404Spjd ASSERT((user_ptr == NULL) == (evict_func == NULL)); 2192168404Spjd 2193168404Spjd mutex_enter(&db->db_mtx); 2194168404Spjd 2195168404Spjd if (db->db_user_ptr == old_user_ptr) { 2196168404Spjd db->db_user_ptr = user_ptr; 2197168404Spjd db->db_evict_func = evict_func; 2198168404Spjd } else { 2199168404Spjd old_user_ptr = db->db_user_ptr; 2200168404Spjd } 2201168404Spjd 2202168404Spjd mutex_exit(&db->db_mtx); 2203168404Spjd return (old_user_ptr); 2204168404Spjd} 2205168404Spjd 2206168404Spjdvoid * 2207168404Spjddmu_buf_get_user(dmu_buf_t *db_fake) 2208168404Spjd{ 2209168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2210168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 2211168404Spjd 2212168404Spjd return (db->db_user_ptr); 2213168404Spjd} 2214168404Spjd 2215209962Smmboolean_t 2216209962Smmdmu_buf_freeable(dmu_buf_t *dbuf) 2217209962Smm{ 2218209962Smm boolean_t res = B_FALSE; 2219209962Smm dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2220209962Smm 2221209962Smm if (db->db_blkptr) 2222209962Smm res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2223219089Spjd db->db_blkptr, db->db_blkptr->blk_birth); 2224209962Smm 2225209962Smm return (res); 2226209962Smm} 2227209962Smm 2228243524Smmblkptr_t * 2229243524Smmdmu_buf_get_blkptr(dmu_buf_t *db) 2230243524Smm{ 2231243524Smm dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2232243524Smm return (dbi->db_blkptr); 2233243524Smm} 2234243524Smm 
2235168404Spjdstatic void 2236168404Spjddbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2237168404Spjd{ 2238168404Spjd /* ASSERT(dmu_tx_is_syncing(tx) */ 2239168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2240168404Spjd 2241168404Spjd if (db->db_blkptr != NULL) 2242168404Spjd return; 2243168404Spjd 2244219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2245219089Spjd db->db_blkptr = &dn->dn_phys->dn_spill; 2246219089Spjd BP_ZERO(db->db_blkptr); 2247219089Spjd return; 2248219089Spjd } 2249168404Spjd if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2250168404Spjd /* 2251168404Spjd * This buffer was allocated at a time when there was 2252168404Spjd * no available blkptrs from the dnode, or it was 2253168404Spjd * inappropriate to hook it in (i.e., nlevels mis-match). 2254168404Spjd */ 2255168404Spjd ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2256168404Spjd ASSERT(db->db_parent == NULL); 2257168404Spjd db->db_parent = dn->dn_dbuf; 2258168404Spjd db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2259168404Spjd DBUF_VERIFY(db); 2260168404Spjd } else { 2261168404Spjd dmu_buf_impl_t *parent = db->db_parent; 2262168404Spjd int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2263168404Spjd 2264168404Spjd ASSERT(dn->dn_phys->dn_nlevels > 1); 2265168404Spjd if (parent == NULL) { 2266168404Spjd mutex_exit(&db->db_mtx); 2267168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 2268168404Spjd (void) dbuf_hold_impl(dn, db->db_level+1, 2269168404Spjd db->db_blkid >> epbs, FALSE, db, &parent); 2270168404Spjd rw_exit(&dn->dn_struct_rwlock); 2271168404Spjd mutex_enter(&db->db_mtx); 2272168404Spjd db->db_parent = parent; 2273168404Spjd } 2274168404Spjd db->db_blkptr = (blkptr_t *)parent->db.db_data + 2275168404Spjd (db->db_blkid & ((1ULL << epbs) - 1)); 2276168404Spjd DBUF_VERIFY(db); 2277168404Spjd } 2278168404Spjd} 2279168404Spjd 2280168404Spjdstatic void 2281168404Spjddbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2282168404Spjd{ 2283168404Spjd 
dmu_buf_impl_t *db = dr->dr_dbuf; 2284219089Spjd dnode_t *dn; 2285168404Spjd zio_t *zio; 2286168404Spjd 2287168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 2288168404Spjd 2289168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2290168404Spjd 2291168404Spjd mutex_enter(&db->db_mtx); 2292168404Spjd 2293168404Spjd ASSERT(db->db_level > 0); 2294168404Spjd DBUF_VERIFY(db); 2295168404Spjd 2296251629Sdelphij /* Read the block if it hasn't been read yet. */ 2297168404Spjd if (db->db_buf == NULL) { 2298168404Spjd mutex_exit(&db->db_mtx); 2299168404Spjd (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2300168404Spjd mutex_enter(&db->db_mtx); 2301168404Spjd } 2302168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 2303168404Spjd ASSERT(db->db_buf != NULL); 2304168404Spjd 2305219089Spjd DB_DNODE_ENTER(db); 2306219089Spjd dn = DB_DNODE(db); 2307251629Sdelphij /* Indirect block size must match what the dnode thinks it is. */ 2308219089Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2309168404Spjd dbuf_check_blkptr(dn, db); 2310219089Spjd DB_DNODE_EXIT(db); 2311168404Spjd 2312251629Sdelphij /* Provide the pending dirty record to child dbufs */ 2313168404Spjd db->db_data_pending = dr; 2314168404Spjd 2315168404Spjd mutex_exit(&db->db_mtx); 2316185029Spjd dbuf_write(dr, db->db_buf, tx); 2317168404Spjd 2318168404Spjd zio = dr->dr_zio; 2319168404Spjd mutex_enter(&dr->dt.di.dr_mtx); 2320168404Spjd dbuf_sync_list(&dr->dt.di.dr_children, tx); 2321168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2322168404Spjd mutex_exit(&dr->dt.di.dr_mtx); 2323168404Spjd zio_nowait(zio); 2324168404Spjd} 2325168404Spjd 2326168404Spjdstatic void 2327168404Spjddbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2328168404Spjd{ 2329168404Spjd arc_buf_t **datap = &dr->dt.dl.dr_data; 2330168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2331219089Spjd dnode_t *dn; 2332219089Spjd objset_t *os; 2333168404Spjd uint64_t txg = tx->tx_txg; 2334168404Spjd 2335168404Spjd 
ASSERT(dmu_tx_is_syncing(tx)); 2336168404Spjd 2337168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2338168404Spjd 2339168404Spjd mutex_enter(&db->db_mtx); 2340168404Spjd /* 2341168404Spjd * To be synced, we must be dirtied. But we 2342168404Spjd * might have been freed after the dirty. 2343168404Spjd */ 2344168404Spjd if (db->db_state == DB_UNCACHED) { 2345168404Spjd /* This buffer has been freed since it was dirtied */ 2346168404Spjd ASSERT(db->db.db_data == NULL); 2347168404Spjd } else if (db->db_state == DB_FILL) { 2348168404Spjd /* This buffer was freed and is now being re-filled */ 2349168404Spjd ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2350168404Spjd } else { 2351219089Spjd ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2352168404Spjd } 2353168404Spjd DBUF_VERIFY(db); 2354168404Spjd 2355219089Spjd DB_DNODE_ENTER(db); 2356219089Spjd dn = DB_DNODE(db); 2357219089Spjd 2358219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2359219089Spjd mutex_enter(&dn->dn_mtx); 2360219089Spjd dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 2361219089Spjd mutex_exit(&dn->dn_mtx); 2362219089Spjd } 2363219089Spjd 2364168404Spjd /* 2365168404Spjd * If this is a bonus buffer, simply copy the bonus data into the 2366168404Spjd * dnode. It will be written out when the dnode is synced (and it 2367168404Spjd * will be synced, since it must have been dirty for dbuf_sync to 2368168404Spjd * be called). 
2369168404Spjd */ 2370219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2371168404Spjd dbuf_dirty_record_t **drp; 2372185029Spjd 2373168404Spjd ASSERT(*datap != NULL); 2374240415Smm ASSERT0(db->db_level); 2375168404Spjd ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2376168404Spjd bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2377219089Spjd DB_DNODE_EXIT(db); 2378219089Spjd 2379185029Spjd if (*datap != db->db.db_data) { 2380168404Spjd zio_buf_free(*datap, DN_MAX_BONUSLEN); 2381208373Smm arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2382185029Spjd } 2383168404Spjd db->db_data_pending = NULL; 2384168404Spjd drp = &db->db_last_dirty; 2385168404Spjd while (*drp != dr) 2386168404Spjd drp = &(*drp)->dr_next; 2387185029Spjd ASSERT(dr->dr_next == NULL); 2388219089Spjd ASSERT(dr->dr_dbuf == db); 2389185029Spjd *drp = dr->dr_next; 2390169325Spjd if (dr->dr_dbuf->db_level != 0) { 2391169325Spjd list_destroy(&dr->dt.di.dr_children); 2392169325Spjd mutex_destroy(&dr->dt.di.dr_mtx); 2393169325Spjd } 2394168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2395168404Spjd ASSERT(db->db_dirtycnt > 0); 2396168404Spjd db->db_dirtycnt -= 1; 2397219089Spjd dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2398168404Spjd return; 2399168404Spjd } 2400168404Spjd 2401219089Spjd os = dn->dn_objset; 2402219089Spjd 2403168404Spjd /* 2404185029Spjd * This function may have dropped the db_mtx lock allowing a dmu_sync 2405185029Spjd * operation to sneak in. As a result, we need to ensure that we 2406185029Spjd * don't check the dr_override_state until we have returned from 2407185029Spjd * dbuf_check_blkptr. 2408185029Spjd */ 2409185029Spjd dbuf_check_blkptr(dn, db); 2410185029Spjd 2411185029Spjd /* 2412219089Spjd * If this buffer is in the middle of an immediate write, 2413168404Spjd * wait for the synchronous IO to complete. 
2414168404Spjd */ 2415168404Spjd while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2416168404Spjd ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2417168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 2418168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2419168404Spjd } 2420168404Spjd 2421219089Spjd if (db->db_state != DB_NOFILL && 2422219089Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 2423208050Smm refcount_count(&db->db_holds) > 1 && 2424219089Spjd dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2425208050Smm *datap == db->db_buf) { 2426168404Spjd /* 2427208050Smm * If this buffer is currently "in use" (i.e., there 2428208050Smm * are active holds and db_data still references it), 2429208050Smm * then make a copy before we start the write so that 2430208050Smm * any modifications from the open txg will not leak 2431208050Smm * into this write. 2432168404Spjd * 2433208050Smm * NOTE: this copy does not need to be made for 2434208050Smm * objects only modified in the syncing context (e.g. 2435208050Smm * DNONE_DNODE blocks). 2436168404Spjd */ 2437208050Smm int blksz = arc_buf_size(*datap); 2438208050Smm arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2439208050Smm *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2440208050Smm bcopy(db->db.db_data, (*datap)->b_data, blksz); 2441168404Spjd } 2442168404Spjd db->db_data_pending = dr; 2443168404Spjd 2444168404Spjd mutex_exit(&db->db_mtx); 2445168404Spjd 2446185029Spjd dbuf_write(dr, *datap, tx); 2447168404Spjd 2448168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 2449219089Spjd if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2450168404Spjd list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2451219089Spjd DB_DNODE_EXIT(db); 2452219089Spjd } else { 2453219089Spjd /* 2454219089Spjd * Although zio_nowait() does not "wait for an IO", it does 2455219089Spjd * initiate the IO. 
If this is an empty write it seems plausible 2456219089Spjd * that the IO could actually be completed before the nowait 2457219089Spjd * returns. We need to DB_DNODE_EXIT() first in case 2458219089Spjd * zio_nowait() invalidates the dbuf. 2459219089Spjd */ 2460219089Spjd DB_DNODE_EXIT(db); 2461168404Spjd zio_nowait(dr->dr_zio); 2462219089Spjd } 2463168404Spjd} 2464168404Spjd 2465168404Spjdvoid 2466168404Spjddbuf_sync_list(list_t *list, dmu_tx_t *tx) 2467168404Spjd{ 2468168404Spjd dbuf_dirty_record_t *dr; 2469168404Spjd 2470168404Spjd while (dr = list_head(list)) { 2471168404Spjd if (dr->dr_zio != NULL) { 2472168404Spjd /* 2473168404Spjd * If we find an already initialized zio then we 2474168404Spjd * are processing the meta-dnode, and we have finished. 2475168404Spjd * The dbufs for all dnodes are put back on the list 2476168404Spjd * during processing, so that we can zio_wait() 2477168404Spjd * these IOs after initiating all child IOs. 2478168404Spjd */ 2479168404Spjd ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2480168404Spjd DMU_META_DNODE_OBJECT); 2481168404Spjd break; 2482168404Spjd } 2483168404Spjd list_remove(list, dr); 2484168404Spjd if (dr->dr_dbuf->db_level > 0) 2485168404Spjd dbuf_sync_indirect(dr, tx); 2486168404Spjd else 2487168404Spjd dbuf_sync_leaf(dr, tx); 2488168404Spjd } 2489168404Spjd} 2490168404Spjd 2491168404Spjd/* ARGSUSED */ 2492168404Spjdstatic void 2493168404Spjddbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2494168404Spjd{ 2495168404Spjd dmu_buf_impl_t *db = vdb; 2496219089Spjd dnode_t *dn; 2497185029Spjd blkptr_t *bp = zio->io_bp; 2498168404Spjd blkptr_t *bp_orig = &zio->io_bp_orig; 2499219089Spjd spa_t *spa = zio->io_spa; 2500219089Spjd int64_t delta; 2501168404Spjd uint64_t fill = 0; 2502219089Spjd int i; 2503168404Spjd 2504268075Sdelphij ASSERT3P(db->db_blkptr, ==, bp); 2505185029Spjd 2506219089Spjd DB_DNODE_ENTER(db); 2507219089Spjd dn = DB_DNODE(db); 2508219089Spjd delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, 
bp_orig); 2509219089Spjd dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2510219089Spjd zio->io_prev_space_delta = delta; 2511168404Spjd 2512260150Sdelphij if (bp->blk_birth != 0) { 2513260150Sdelphij ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2514260150Sdelphij BP_GET_TYPE(bp) == dn->dn_type) || 2515260150Sdelphij (db->db_blkid == DMU_SPILL_BLKID && 2516268075Sdelphij BP_GET_TYPE(bp) == dn->dn_bonustype) || 2517268075Sdelphij BP_IS_EMBEDDED(bp)); 2518260150Sdelphij ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2519168404Spjd } 2520168404Spjd 2521168404Spjd mutex_enter(&db->db_mtx); 2522168404Spjd 2523219089Spjd#ifdef ZFS_DEBUG 2524219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2525219089Spjd ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2526219089Spjd ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2527219089Spjd db->db_blkptr == &dn->dn_phys->dn_spill); 2528219089Spjd } 2529219089Spjd#endif 2530219089Spjd 2531168404Spjd if (db->db_level == 0) { 2532168404Spjd mutex_enter(&dn->dn_mtx); 2533219089Spjd if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2534219089Spjd db->db_blkid != DMU_SPILL_BLKID) 2535168404Spjd dn->dn_phys->dn_maxblkid = db->db_blkid; 2536168404Spjd mutex_exit(&dn->dn_mtx); 2537168404Spjd 2538168404Spjd if (dn->dn_type == DMU_OT_DNODE) { 2539168404Spjd dnode_phys_t *dnp = db->db.db_data; 2540168404Spjd for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2541168404Spjd i--, dnp++) { 2542168404Spjd if (dnp->dn_type != DMU_OT_NONE) 2543168404Spjd fill++; 2544168404Spjd } 2545168404Spjd } else { 2546260150Sdelphij if (BP_IS_HOLE(bp)) { 2547260150Sdelphij fill = 0; 2548260150Sdelphij } else { 2549260150Sdelphij fill = 1; 2550260150Sdelphij } 2551168404Spjd } 2552168404Spjd } else { 2553185029Spjd blkptr_t *ibp = db->db.db_data; 2554168404Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2555185029Spjd for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2556185029Spjd if (BP_IS_HOLE(ibp)) 2557168404Spjd continue; 
2558268075Sdelphij fill += BP_GET_FILL(ibp); 2559168404Spjd } 2560168404Spjd } 2561219089Spjd DB_DNODE_EXIT(db); 2562168404Spjd 2563268075Sdelphij if (!BP_IS_EMBEDDED(bp)) 2564268075Sdelphij bp->blk_fill = fill; 2565168404Spjd 2566168404Spjd mutex_exit(&db->db_mtx); 2567168404Spjd} 2568168404Spjd 2569258632Savg/* 2570258632Savg * The SPA will call this callback several times for each zio - once 2571258632Savg * for every physical child i/o (zio->io_phys_children times). This 2572258632Savg * allows the DMU to monitor the progress of each logical i/o. For example, 2573258632Savg * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 2574258632Savg * block. There may be a long delay before all copies/fragments are completed, 2575258632Savg * so this callback allows us to retire dirty space gradually, as the physical 2576258632Savg * i/os complete. 2577258632Savg */ 2578168404Spjd/* ARGSUSED */ 2579168404Spjdstatic void 2580258632Savgdbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 2581258632Savg{ 2582258632Savg dmu_buf_impl_t *db = arg; 2583258632Savg objset_t *os = db->db_objset; 2584258632Savg dsl_pool_t *dp = dmu_objset_pool(os); 2585258632Savg dbuf_dirty_record_t *dr; 2586258632Savg int delta = 0; 2587258632Savg 2588258632Savg dr = db->db_data_pending; 2589258632Savg ASSERT3U(dr->dr_txg, ==, zio->io_txg); 2590258632Savg 2591258632Savg /* 2592258632Savg * The callback will be called io_phys_children times. Retire one 2593258632Savg * portion of our dirty space each time we are called. Any rounding 2594258632Savg * error will be cleaned up by dsl_pool_sync()'s call to 2595258632Savg * dsl_pool_undirty_space(). 
2596258632Savg */ 2597258632Savg delta = dr->dr_accounted / zio->io_phys_children; 2598258632Savg dsl_pool_undirty_space(dp, delta, zio->io_txg); 2599258632Savg} 2600258632Savg 2601258632Savg/* ARGSUSED */ 2602258632Savgstatic void 2603168404Spjddbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2604168404Spjd{ 2605168404Spjd dmu_buf_impl_t *db = vdb; 2606219089Spjd blkptr_t *bp_orig = &zio->io_bp_orig; 2607260150Sdelphij blkptr_t *bp = db->db_blkptr; 2608260150Sdelphij objset_t *os = db->db_objset; 2609260150Sdelphij dmu_tx_t *tx = os->os_synctx; 2610168404Spjd dbuf_dirty_record_t **drp, *dr; 2611168404Spjd 2612240415Smm ASSERT0(zio->io_error); 2613219089Spjd ASSERT(db->db_blkptr == bp); 2614168404Spjd 2615243524Smm /* 2616243524Smm * For nopwrites and rewrites we ensure that the bp matches our 2617243524Smm * original and bypass all the accounting. 2618243524Smm */ 2619243524Smm if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2620219089Spjd ASSERT(BP_EQUAL(bp, bp_orig)); 2621219089Spjd } else { 2622260150Sdelphij dsl_dataset_t *ds = os->os_dsl_dataset; 2623219089Spjd (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2624219089Spjd dsl_dataset_block_born(ds, bp, tx); 2625219089Spjd } 2626219089Spjd 2627168404Spjd mutex_enter(&db->db_mtx); 2628168404Spjd 2629219089Spjd DBUF_VERIFY(db); 2630219089Spjd 2631168404Spjd drp = &db->db_last_dirty; 2632185029Spjd while ((dr = *drp) != db->db_data_pending) 2633185029Spjd drp = &dr->dr_next; 2634185029Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 2635219089Spjd ASSERT(dr->dr_dbuf == db); 2636185029Spjd ASSERT(dr->dr_next == NULL); 2637185029Spjd *drp = dr->dr_next; 2638168404Spjd 2639219089Spjd#ifdef ZFS_DEBUG 2640219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2641219089Spjd dnode_t *dn; 2642219089Spjd 2643219089Spjd DB_DNODE_ENTER(db); 2644219089Spjd dn = DB_DNODE(db); 2645219089Spjd ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2646219089Spjd 
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2647219089Spjd db->db_blkptr == &dn->dn_phys->dn_spill); 2648219089Spjd DB_DNODE_EXIT(db); 2649219089Spjd } 2650219089Spjd#endif 2651219089Spjd 2652168404Spjd if (db->db_level == 0) { 2653219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2654168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2655219089Spjd if (db->db_state != DB_NOFILL) { 2656219089Spjd if (dr->dt.dl.dr_data != db->db_buf) 2657219089Spjd VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2658248571Smm db)); 2659219089Spjd else if (!arc_released(db->db_buf)) 2660219089Spjd arc_set_callback(db->db_buf, dbuf_do_evict, db); 2661219089Spjd } 2662168404Spjd } else { 2663219089Spjd dnode_t *dn; 2664168404Spjd 2665219089Spjd DB_DNODE_ENTER(db); 2666219089Spjd dn = DB_DNODE(db); 2667168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2668260150Sdelphij ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 2669168404Spjd if (!BP_IS_HOLE(db->db_blkptr)) { 2670168404Spjd int epbs = 2671168404Spjd dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2672260150Sdelphij ASSERT3U(db->db_blkid, <=, 2673260150Sdelphij dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 2674168404Spjd ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2675168404Spjd db->db.db_size); 2676268075Sdelphij if (!arc_released(db->db_buf)) 2677268075Sdelphij arc_set_callback(db->db_buf, dbuf_do_evict, db); 2678168404Spjd } 2679219089Spjd DB_DNODE_EXIT(db); 2680185029Spjd mutex_destroy(&dr->dt.di.dr_mtx); 2681169325Spjd list_destroy(&dr->dt.di.dr_children); 2682168404Spjd } 2683168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2684168404Spjd 2685168404Spjd cv_broadcast(&db->db_changed); 2686168404Spjd ASSERT(db->db_dirtycnt > 0); 2687168404Spjd db->db_dirtycnt -= 1; 2688168404Spjd db->db_data_pending = NULL; 2689260150Sdelphij dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 2690219089Spjd} 2691219089Spjd 2692219089Spjdstatic void 
2693219089Spjddbuf_write_nofill_ready(zio_t *zio) 2694219089Spjd{ 2695219089Spjd dbuf_write_ready(zio, NULL, zio->io_private); 2696219089Spjd} 2697219089Spjd 2698219089Spjdstatic void 2699219089Spjddbuf_write_nofill_done(zio_t *zio) 2700219089Spjd{ 2701219089Spjd dbuf_write_done(zio, NULL, zio->io_private); 2702219089Spjd} 2703219089Spjd 2704219089Spjdstatic void 2705219089Spjddbuf_write_override_ready(zio_t *zio) 2706219089Spjd{ 2707219089Spjd dbuf_dirty_record_t *dr = zio->io_private; 2708219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2709219089Spjd 2710219089Spjd dbuf_write_ready(zio, NULL, db); 2711219089Spjd} 2712219089Spjd 2713219089Spjdstatic void 2714219089Spjddbuf_write_override_done(zio_t *zio) 2715219089Spjd{ 2716219089Spjd dbuf_dirty_record_t *dr = zio->io_private; 2717219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2718219089Spjd blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2719219089Spjd 2720219089Spjd mutex_enter(&db->db_mtx); 2721219089Spjd if (!BP_EQUAL(zio->io_bp, obp)) { 2722219089Spjd if (!BP_IS_HOLE(obp)) 2723219089Spjd dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2724219089Spjd arc_release(dr->dt.dl.dr_data, db); 2725219089Spjd } 2726168404Spjd mutex_exit(&db->db_mtx); 2727168404Spjd 2728219089Spjd dbuf_write_done(zio, NULL, db); 2729219089Spjd} 2730168404Spjd 2731251629Sdelphij/* Issue I/O to commit a dirty buffer to disk. 
*/
/*
 * Create the write zio for dirty record 'dr' and store it in dr->dr_zio.
 *
 *   dr   - dirty record being synced; dr->dr_dbuf is the dbuf to write.
 *   data - arc buffer holding the contents (may be NULL on the override
 *          path, which checks for that).
 *   tx   - syncing transaction; its txg is the txg of the write.
 *
 * The new zio is made a child of the parent indirect block's pending zio,
 * or of the dnode's zio when the dnode itself is the parent.  One of three
 * constructors is used:
 *   - level-0 record in DR_OVERRIDDEN state: zio_write() followed by
 *     zio_write_override() with the saved block pointer;
 *   - DB_NOFILL dbuf: zio_write() with no data and ZIO_FLAG_NODATA;
 *   - everything else: arc_write().
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dmu_buf_impl_t *pdb = db->db_parent;
	uint64_t txg = tx->tx_txg;
	uint64_t dsobj;
	dnode_t *dn;
	objset_t *os;
	zio_t *pio;
	zio_prop_t props;
	zbookmark_phys_t bookmark;
	int wp = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	/*
	 * Private object buffers are released here rather than in
	 * dbuf_dirty() since they are only modified in the syncing
	 * context and we don't want the overhead of making multiple
	 * copies of the data.
	 */
	if (db->db_state != DB_NOFILL &&
	    (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
		if (BP_IS_HOLE(db->db_blkptr))
			arc_buf_thaw(data);
		else
			dbuf_release_bp(db);
	}

	if (pdb == dn->dn_dbuf) {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID) {
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		}
		pio = dn->dn_zio;
	} else {
		/*
		 * Our parent is an indirect block: it must be dirty and
		 * already scheduled for write, sit exactly one level above
		 * us, and be released, because filling in our block pointer
		 * modifies the parent's data.
		 */
		ASSERT(pdb && pdb->db_data_pending);
		ASSERT(db->db_level == pdb->db_level-1);
		ASSERT(arc_released(pdb->db_buf));
		pio = pdb->db_data_pending->dr_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(pio);

	/* An objset without a dataset is bookmarked as the meta objset. */
	dsobj = os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET;
	SET_BOOKMARK(&bookmark, dsobj,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp |= WP_SPILL;
	if (db->db_state == DB_NOFILL)
		wp |= WP_NOFILL;

	dmu_write_policy(os, dn, db->db_level, wp, &props);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync() or dmu_buf_write_embedded()).
		 */
		void *payload = NULL;

		if (data != NULL)
			payload = data->b_data;

		dr->dr_zio = zio_write(pio, os->os_spa, txg,
		    db->db_blkptr, payload, db->db.db_size, &props,
		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED,
		    &bookmark);

		/* Clear the override state and apply it to the zio. */
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(props.zp_checksum == ZIO_CHECKSUM_OFF ||
		    props.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(pio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &props,
		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &bookmark);
	} else {
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(pio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
		    DBUF_IS_L2COMPRESSIBLE(db), &props, dbuf_write_ready,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED,
		    &bookmark);
	}
}