1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23219636Spjd * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24307287Smav * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 25251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27288549Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
28297112Smav * Copyright (c) 2014 Integros [integros.com] 29168404Spjd */ 30168404Spjd 31168404Spjd#include <sys/zfs_context.h> 32168404Spjd#include <sys/dmu.h> 33253821Sdelphij#include <sys/dmu_send.h> 34168404Spjd#include <sys/dmu_impl.h> 35168404Spjd#include <sys/dbuf.h> 36168404Spjd#include <sys/dmu_objset.h> 37168404Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/dsl_dir.h> 39168404Spjd#include <sys/dmu_tx.h> 40168404Spjd#include <sys/spa.h> 41168404Spjd#include <sys/zio.h> 42168404Spjd#include <sys/dmu_zfetch.h> 43219089Spjd#include <sys/sa.h> 44219089Spjd#include <sys/sa_impl.h> 45268649Sdelphij#include <sys/zfeature.h> 46268649Sdelphij#include <sys/blkptr.h> 47265740Sdelphij#include <sys/range_tree.h> 48307266Smav#include <sys/callb.h> 49168404Spjd 50307266Smavuint_t zfs_dbuf_evict_key; 51307266Smav 52254753Sdelphij/* 53254753Sdelphij * Number of times that zfs_free_range() took the slow path while doing 54254753Sdelphij * a zfs receive. A nonzero value indicates a potential performance problem. 55254753Sdelphij */ 56254753Sdelphijuint64_t zfs_free_range_recv_miss; 57254753Sdelphij 58248571Smmstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 59185029Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 60168404Spjd 61168404Spjd/* 62168404Spjd * Global data structures and functions for the dbuf cache. 63168404Spjd */ 64307266Smavstatic kmem_cache_t *dbuf_kmem_cache; 65288549Smavstatic taskq_t *dbu_evict_taskq; 66168404Spjd 67307266Smavstatic kthread_t *dbuf_cache_evict_thread; 68307266Smavstatic kmutex_t dbuf_evict_lock; 69307266Smavstatic kcondvar_t dbuf_evict_cv; 70307266Smavstatic boolean_t dbuf_evict_thread_exit; 71307266Smav 72307266Smav/* 73307266Smav * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that 74307266Smav * are not currently held but have been recently released. These dbufs 75307266Smav * are not eligible for arc eviction until they are aged out of the cache. 
76307266Smav * Dbufs are added to the dbuf cache once the last hold is released. If a 77307266Smav * dbuf is later accessed and still exists in the dbuf cache, then it will 78307266Smav * be removed from the cache and later re-added to the head of the cache. 79307266Smav * Dbufs that are aged out of the cache will be immediately destroyed and 80307266Smav * become eligible for arc eviction. 81307266Smav */ 82307266Smavstatic multilist_t dbuf_cache; 83307266Smavstatic refcount_t dbuf_cache_size; 84307266Smavuint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024; 85307266Smav 86307266Smav/* Cap the size of the dbuf cache to log2 fraction of arc size. */ 87307266Smavint dbuf_cache_max_shift = 5; 88307266Smav 89307266Smav/* 90307266Smav * The dbuf cache uses a three-stage eviction policy: 91307266Smav * - A low water marker designates when the dbuf eviction thread 92307266Smav * should stop evicting from the dbuf cache. 93307266Smav * - When we reach the maximum size (aka mid water mark), we 94307266Smav * signal the eviction thread to run. 95307266Smav * - The high water mark indicates when the eviction thread 96307266Smav * is unable to keep up with the incoming load and eviction must 97307266Smav * happen in the context of the calling thread. 98307266Smav * 99307266Smav * The dbuf cache: 100307266Smav * (max size) 101307266Smav * low water mid water hi water 102307266Smav * +----------------------------------------+----------+----------+ 103307266Smav * | | | | 104307266Smav * | | | | 105307266Smav * | | | | 106307266Smav * | | | | 107307266Smav * +----------------------------------------+----------+----------+ 108307266Smav * stop signal evict 109307266Smav * evicting eviction directly 110307266Smav * thread 111307266Smav * 112307266Smav * The high and low water marks indicate the operating range for the eviction 113307266Smav * thread. 
The low water mark is, by default, 90% of the total size of the 114307266Smav * cache and the high water mark is at 110% (both of these percentages can be 115307266Smav * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, 116307266Smav * respectively). The eviction thread will try to ensure that the cache remains 117307266Smav * within this range by waking up every second and checking if the cache is 118307266Smav * above the low water mark. The thread can also be woken up by callers adding 119307266Smav * elements into the cache if the cache is larger than the mid water (i.e max 120307266Smav * cache size). Once the eviction thread is woken up and eviction is required, 121307266Smav * it will continue evicting buffers until it's able to reduce the cache size 122307266Smav * to the low water mark. If the cache size continues to grow and hits the high 123307266Smav * water mark, then callers adding elments to the cache will begin to evict 124307266Smav * directly from the cache until the cache is no longer above the high water 125307266Smav * mark. 126307266Smav */ 127307266Smav 128307266Smav/* 129307266Smav * The percentage above and below the maximum cache size. 
130307266Smav */ 131307266Smavuint_t dbuf_cache_hiwater_pct = 10; 132307266Smavuint_t dbuf_cache_lowater_pct = 10; 133307266Smav 134168404Spjd/* ARGSUSED */ 135168404Spjdstatic int 136168404Spjddbuf_cons(void *vdb, void *unused, int kmflag) 137168404Spjd{ 138168404Spjd dmu_buf_impl_t *db = vdb; 139168404Spjd bzero(db, sizeof (dmu_buf_impl_t)); 140168404Spjd 141168404Spjd mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 142168404Spjd cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 143307266Smav multilist_link_init(&db->db_cache_link); 144168404Spjd refcount_create(&db->db_holds); 145269845Sdelphij 146168404Spjd return (0); 147168404Spjd} 148168404Spjd 149168404Spjd/* ARGSUSED */ 150168404Spjdstatic void 151168404Spjddbuf_dest(void *vdb, void *unused) 152168404Spjd{ 153168404Spjd dmu_buf_impl_t *db = vdb; 154168404Spjd mutex_destroy(&db->db_mtx); 155168404Spjd cv_destroy(&db->db_changed); 156307266Smav ASSERT(!multilist_link_active(&db->db_cache_link)); 157168404Spjd refcount_destroy(&db->db_holds); 158168404Spjd} 159168404Spjd 160168404Spjd/* 161168404Spjd * dbuf hash table routines 162168404Spjd */ 163168404Spjdstatic dbuf_hash_table_t dbuf_hash_table; 164168404Spjd 165168404Spjdstatic uint64_t dbuf_hash_count; 166168404Spjd 167168404Spjdstatic uint64_t 168168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 169168404Spjd{ 170168404Spjd uintptr_t osv = (uintptr_t)os; 171168404Spjd uint64_t crc = -1ULL; 172168404Spjd 173168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 174168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 175168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 176168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 177168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 178168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 179168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 
180168404Spjd 181168404Spjd crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 182168404Spjd 183168404Spjd return (crc); 184168404Spjd} 185168404Spjd 186168404Spjd#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 187168404Spjd ((dbuf)->db.db_object == (obj) && \ 188168404Spjd (dbuf)->db_objset == (os) && \ 189168404Spjd (dbuf)->db_level == (level) && \ 190168404Spjd (dbuf)->db_blkid == (blkid)) 191168404Spjd 192168404Spjddmu_buf_impl_t * 193288538Smavdbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) 194168404Spjd{ 195168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 196307266Smav uint64_t hv = dbuf_hash(os, obj, level, blkid); 197168404Spjd uint64_t idx = hv & h->hash_table_mask; 198168404Spjd dmu_buf_impl_t *db; 199168404Spjd 200168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 201168404Spjd for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 202168404Spjd if (DBUF_EQUAL(db, os, obj, level, blkid)) { 203168404Spjd mutex_enter(&db->db_mtx); 204168404Spjd if (db->db_state != DB_EVICTING) { 205168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 206168404Spjd return (db); 207168404Spjd } 208168404Spjd mutex_exit(&db->db_mtx); 209168404Spjd } 210168404Spjd } 211168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 212168404Spjd return (NULL); 213168404Spjd} 214168404Spjd 215288538Smavstatic dmu_buf_impl_t * 216288538Smavdbuf_find_bonus(objset_t *os, uint64_t object) 217288538Smav{ 218288538Smav dnode_t *dn; 219288538Smav dmu_buf_impl_t *db = NULL; 220288538Smav 221288538Smav if (dnode_hold(os, object, FTAG, &dn) == 0) { 222288538Smav rw_enter(&dn->dn_struct_rwlock, RW_READER); 223288538Smav if (dn->dn_bonus != NULL) { 224288538Smav db = dn->dn_bonus; 225288538Smav mutex_enter(&db->db_mtx); 226288538Smav } 227288538Smav rw_exit(&dn->dn_struct_rwlock); 228288538Smav dnode_rele(dn, FTAG); 229288538Smav } 230288538Smav return (db); 231288538Smav} 232288538Smav 233168404Spjd/* 234168404Spjd * Insert an entry into the hash table. 
If there is already an element 235168404Spjd * equal to elem in the hash table, then the already existing element 236168404Spjd * will be returned and the new element will not be inserted. 237168404Spjd * Otherwise returns NULL. 238168404Spjd */ 239168404Spjdstatic dmu_buf_impl_t * 240168404Spjddbuf_hash_insert(dmu_buf_impl_t *db) 241168404Spjd{ 242168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 243219089Spjd objset_t *os = db->db_objset; 244168404Spjd uint64_t obj = db->db.db_object; 245168404Spjd int level = db->db_level; 246168404Spjd uint64_t blkid = db->db_blkid; 247307266Smav uint64_t hv = dbuf_hash(os, obj, level, blkid); 248168404Spjd uint64_t idx = hv & h->hash_table_mask; 249168404Spjd dmu_buf_impl_t *dbf; 250168404Spjd 251168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 252168404Spjd for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 253168404Spjd if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 254168404Spjd mutex_enter(&dbf->db_mtx); 255168404Spjd if (dbf->db_state != DB_EVICTING) { 256168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 257168404Spjd return (dbf); 258168404Spjd } 259168404Spjd mutex_exit(&dbf->db_mtx); 260168404Spjd } 261168404Spjd } 262168404Spjd 263168404Spjd mutex_enter(&db->db_mtx); 264168404Spjd db->db_hash_next = h->hash_table[idx]; 265168404Spjd h->hash_table[idx] = db; 266168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 267271001Sdelphij atomic_inc_64(&dbuf_hash_count); 268168404Spjd 269168404Spjd return (NULL); 270168404Spjd} 271168404Spjd 272168404Spjd/* 273269417Sdelphij * Remove an entry from the hash table. It must be in the EVICTING state. 
274168404Spjd */ 275168404Spjdstatic void 276168404Spjddbuf_hash_remove(dmu_buf_impl_t *db) 277168404Spjd{ 278168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 279307266Smav uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, 280168404Spjd db->db_level, db->db_blkid); 281168404Spjd uint64_t idx = hv & h->hash_table_mask; 282168404Spjd dmu_buf_impl_t *dbf, **dbp; 283168404Spjd 284168404Spjd /* 285269417Sdelphij * We musn't hold db_mtx to maintain lock ordering: 286168404Spjd * DBUF_HASH_MUTEX > db_mtx. 287168404Spjd */ 288168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 289168404Spjd ASSERT(db->db_state == DB_EVICTING); 290168404Spjd ASSERT(!MUTEX_HELD(&db->db_mtx)); 291168404Spjd 292168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 293168404Spjd dbp = &h->hash_table[idx]; 294168404Spjd while ((dbf = *dbp) != db) { 295168404Spjd dbp = &dbf->db_hash_next; 296168404Spjd ASSERT(dbf != NULL); 297168404Spjd } 298168404Spjd *dbp = db->db_hash_next; 299168404Spjd db->db_hash_next = NULL; 300168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 301271001Sdelphij atomic_dec_64(&dbuf_hash_count); 302168404Spjd} 303168404Spjd 304288549Smavtypedef enum { 305288549Smav DBVU_EVICTING, 306288549Smav DBVU_NOT_EVICTING 307288549Smav} dbvu_verify_type_t; 308288549Smav 309168404Spjdstatic void 310288549Smavdbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) 311288549Smav{ 312288549Smav#ifdef ZFS_DEBUG 313288549Smav int64_t holds; 314288549Smav 315288549Smav if (db->db_user == NULL) 316288549Smav return; 317288549Smav 318288549Smav /* Only data blocks support the attachment of user data. */ 319288549Smav ASSERT(db->db_level == 0); 320288549Smav 321288549Smav /* Clients must resolve a dbuf before attaching user data. 
*/ 322288549Smav ASSERT(db->db.db_data != NULL); 323288549Smav ASSERT3U(db->db_state, ==, DB_CACHED); 324288549Smav 325288549Smav holds = refcount_count(&db->db_holds); 326288549Smav if (verify_type == DBVU_EVICTING) { 327288549Smav /* 328288549Smav * Immediate eviction occurs when holds == dirtycnt. 329288549Smav * For normal eviction buffers, holds is zero on 330288549Smav * eviction, except when dbuf_fix_old_data() calls 331288549Smav * dbuf_clear_data(). However, the hold count can grow 332288549Smav * during eviction even though db_mtx is held (see 333288549Smav * dmu_bonus_hold() for an example), so we can only 334288549Smav * test the generic invariant that holds >= dirtycnt. 335288549Smav */ 336288549Smav ASSERT3U(holds, >=, db->db_dirtycnt); 337288549Smav } else { 338290754Smav if (db->db_user_immediate_evict == TRUE) 339288549Smav ASSERT3U(holds, >=, db->db_dirtycnt); 340288549Smav else 341288549Smav ASSERT3U(holds, >, 0); 342288549Smav } 343288549Smav#endif 344288549Smav} 345288549Smav 346288549Smavstatic void 347168404Spjddbuf_evict_user(dmu_buf_impl_t *db) 348168404Spjd{ 349288549Smav dmu_buf_user_t *dbu = db->db_user; 350288549Smav 351168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 352168404Spjd 353288549Smav if (dbu == NULL) 354168404Spjd return; 355168404Spjd 356288549Smav dbuf_verify_user(db, DBVU_EVICTING); 357288549Smav db->db_user = NULL; 358288549Smav 359288549Smav#ifdef ZFS_DEBUG 360288549Smav if (dbu->dbu_clear_on_evict_dbufp != NULL) 361288549Smav *dbu->dbu_clear_on_evict_dbufp = NULL; 362288549Smav#endif 363288549Smav 364288549Smav /* 365288549Smav * Invoke the callback from a taskq to avoid lock order reversals 366288549Smav * and limit stack depth. 
367288549Smav */ 368288549Smav taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, 369288549Smav &dbu->dbu_tqent); 370168404Spjd} 371168404Spjd 372219089Spjdboolean_t 373219089Spjddbuf_is_metadata(dmu_buf_impl_t *db) 374219089Spjd{ 375219089Spjd if (db->db_level > 0) { 376219089Spjd return (B_TRUE); 377219089Spjd } else { 378219089Spjd boolean_t is_metadata; 379219089Spjd 380219089Spjd DB_DNODE_ENTER(db); 381236884Smm is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 382219089Spjd DB_DNODE_EXIT(db); 383219089Spjd 384219089Spjd return (is_metadata); 385219089Spjd } 386219089Spjd} 387219089Spjd 388307266Smav/* 389307266Smav * This function *must* return indices evenly distributed between all 390307266Smav * sublists of the multilist. This is needed due to how the dbuf eviction 391307266Smav * code is laid out; dbuf_evict_thread() assumes dbufs are evenly 392307266Smav * distributed between all sublists and uses this assumption when 393307266Smav * deciding which sublist to evict from and how much to evict from it. 394307266Smav */ 395307266Smavunsigned int 396307266Smavdbuf_cache_multilist_index_func(multilist_t *ml, void *obj) 397168404Spjd{ 398307266Smav dmu_buf_impl_t *db = obj; 399168404Spjd 400307266Smav /* 401307266Smav * The assumption here, is the hash value for a given 402307266Smav * dmu_buf_impl_t will remain constant throughout it's lifetime 403307266Smav * (i.e. it's objset, object, level and blkid fields don't change). 404307266Smav * Thus, we don't need to store the dbuf's sublist index 405307266Smav * on insertion, as this index can be recalculated on removal. 406307266Smav * 407307266Smav * Also, the low order bits of the hash value are thought to be 408307266Smav * distributed evenly. Otherwise, in the case that the multilist 409307266Smav * has a power of two number of sublists, each sublists' usage 410307266Smav * would not be evenly distributed. 
411307266Smav */ 412307266Smav return (dbuf_hash(db->db_objset, db->db.db_object, 413307266Smav db->db_level, db->db_blkid) % 414307266Smav multilist_get_num_sublists(ml)); 415168404Spjd} 416168404Spjd 417307266Smavstatic inline boolean_t 418307266Smavdbuf_cache_above_hiwater(void) 419307266Smav{ 420307266Smav uint64_t dbuf_cache_hiwater_bytes = 421307266Smav (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; 422307266Smav 423307266Smav return (refcount_count(&dbuf_cache_size) > 424307266Smav dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); 425307266Smav} 426307266Smav 427307266Smavstatic inline boolean_t 428307266Smavdbuf_cache_above_lowater(void) 429307266Smav{ 430307266Smav uint64_t dbuf_cache_lowater_bytes = 431307266Smav (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; 432307266Smav 433307266Smav return (refcount_count(&dbuf_cache_size) > 434307266Smav dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); 435307266Smav} 436307266Smav 437307266Smav/* 438307266Smav * Evict the oldest eligible dbuf from the dbuf cache. 439307266Smav */ 440307266Smavstatic void 441307266Smavdbuf_evict_one(void) 442307266Smav{ 443307266Smav int idx = multilist_get_random_index(&dbuf_cache); 444307266Smav multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); 445307266Smav 446307266Smav ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); 447307266Smav 448307266Smav /* 449307266Smav * Set the thread's tsd to indicate that it's processing evictions. 450307266Smav * Once a thread stops evicting from the dbuf cache it will 451307266Smav * reset its tsd to NULL. 
452307266Smav */ 453307266Smav ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); 454307266Smav (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); 455307266Smav 456307266Smav dmu_buf_impl_t *db = multilist_sublist_tail(mls); 457307266Smav while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { 458307266Smav db = multilist_sublist_prev(mls, db); 459307266Smav } 460307266Smav 461307266Smav DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, 462307266Smav multilist_sublist_t *, mls); 463307266Smav 464307266Smav if (db != NULL) { 465307266Smav multilist_sublist_remove(mls, db); 466307266Smav multilist_sublist_unlock(mls); 467307266Smav (void) refcount_remove_many(&dbuf_cache_size, 468307266Smav db->db.db_size, db); 469307266Smav dbuf_destroy(db); 470307266Smav } else { 471307266Smav multilist_sublist_unlock(mls); 472307266Smav } 473307266Smav (void) tsd_set(zfs_dbuf_evict_key, NULL); 474307266Smav} 475307266Smav 476307266Smav/* 477307266Smav * The dbuf evict thread is responsible for aging out dbufs from the 478307266Smav * cache. Once the cache has reached it's maximum size, dbufs are removed 479307266Smav * and destroyed. The eviction thread will continue running until the size 480307266Smav * of the dbuf cache is at or below the maximum size. Once the dbuf is aged 481307266Smav * out of the cache it is destroyed and becomes eligible for arc eviction. 
482307266Smav */ 483307266Smavstatic void 484307266Smavdbuf_evict_thread(void *dummy __unused) 485307266Smav{ 486307266Smav callb_cpr_t cpr; 487307266Smav 488307266Smav CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); 489307266Smav 490307266Smav mutex_enter(&dbuf_evict_lock); 491307266Smav while (!dbuf_evict_thread_exit) { 492307266Smav while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { 493307266Smav CALLB_CPR_SAFE_BEGIN(&cpr); 494307266Smav (void) cv_timedwait_hires(&dbuf_evict_cv, 495307266Smav &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 496307266Smav CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); 497307266Smav } 498307266Smav mutex_exit(&dbuf_evict_lock); 499307266Smav 500307266Smav /* 501307266Smav * Keep evicting as long as we're above the low water mark 502307266Smav * for the cache. We do this without holding the locks to 503307266Smav * minimize lock contention. 504307266Smav */ 505307266Smav while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { 506307266Smav dbuf_evict_one(); 507307266Smav } 508307266Smav 509307266Smav mutex_enter(&dbuf_evict_lock); 510307266Smav } 511307266Smav 512307266Smav dbuf_evict_thread_exit = B_FALSE; 513307266Smav cv_broadcast(&dbuf_evict_cv); 514307266Smav CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ 515307266Smav thread_exit(); 516307266Smav} 517307266Smav 518307266Smav/* 519307266Smav * Wake up the dbuf eviction thread if the dbuf cache is at its max size. 520307266Smav * If the dbuf cache is at its high water mark, then evict a dbuf from the 521307266Smav * dbuf cache using the callers context. 522307266Smav */ 523307266Smavstatic void 524307266Smavdbuf_evict_notify(void) 525307266Smav{ 526307266Smav 527307266Smav /* 528307266Smav * We use thread specific data to track when a thread has 529307266Smav * started processing evictions. 
This allows us to avoid deeply 530307266Smav * nested stacks that would have a call flow similar to this: 531307266Smav * 532307266Smav * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() 533307266Smav * ^ | 534307266Smav * | | 535307266Smav * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ 536307266Smav * 537307266Smav * The dbuf_eviction_thread will always have its tsd set until 538307266Smav * that thread exits. All other threads will only set their tsd 539307266Smav * if they are participating in the eviction process. This only 540307266Smav * happens if the eviction thread is unable to process evictions 541307266Smav * fast enough. To keep the dbuf cache size in check, other threads 542307266Smav * can evict from the dbuf cache directly. Those threads will set 543307266Smav * their tsd values so that we ensure that they only evict one dbuf 544307266Smav * from the dbuf cache. 545307266Smav */ 546307266Smav if (tsd_get(zfs_dbuf_evict_key) != NULL) 547307266Smav return; 548307266Smav 549307266Smav if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { 550307266Smav boolean_t evict_now = B_FALSE; 551307266Smav 552307266Smav mutex_enter(&dbuf_evict_lock); 553307266Smav if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { 554307266Smav evict_now = dbuf_cache_above_hiwater(); 555307266Smav cv_signal(&dbuf_evict_cv); 556307266Smav } 557307266Smav mutex_exit(&dbuf_evict_lock); 558307266Smav 559307266Smav if (evict_now) { 560307266Smav dbuf_evict_one(); 561307266Smav } 562307266Smav } 563307266Smav} 564307266Smav 565168404Spjdvoid 566168404Spjddbuf_init(void) 567168404Spjd{ 568168404Spjd uint64_t hsize = 1ULL << 16; 569168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 570168404Spjd int i; 571168404Spjd 572168404Spjd /* 573168404Spjd * The hash table is big enough to fill all of physical memory 574168404Spjd * with an average 4K block size. The table will take up 575168404Spjd * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
576168404Spjd */ 577168696Spjd while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 578168404Spjd hsize <<= 1; 579168404Spjd 580168404Spjdretry: 581168404Spjd h->hash_table_mask = hsize - 1; 582168404Spjd h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 583168404Spjd if (h->hash_table == NULL) { 584168404Spjd /* XXX - we should really return an error instead of assert */ 585168404Spjd ASSERT(hsize > (1ULL << 10)); 586168404Spjd hsize >>= 1; 587168404Spjd goto retry; 588168404Spjd } 589168404Spjd 590307266Smav dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", 591168404Spjd sizeof (dmu_buf_impl_t), 592168404Spjd 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 593168404Spjd 594168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 595168404Spjd mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 596288549Smav 597288549Smav /* 598307266Smav * Setup the parameters for the dbuf cache. We cap the size of the 599307266Smav * dbuf cache to 1/32nd (default) of the size of the ARC. 600307266Smav */ 601307266Smav dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, 602307266Smav arc_max_bytes() >> dbuf_cache_max_shift); 603307266Smav 604307266Smav /* 605288549Smav * All entries are queued via taskq_dispatch_ent(), so min/maxalloc 606288549Smav * configuration is not required. 
607288549Smav */ 608288549Smav dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); 609307266Smav 610307266Smav multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), 611307266Smav offsetof(dmu_buf_impl_t, db_cache_link), 612307266Smav zfs_arc_num_sublists_per_state, 613307266Smav dbuf_cache_multilist_index_func); 614307266Smav refcount_create(&dbuf_cache_size); 615307266Smav 616307266Smav tsd_create(&zfs_dbuf_evict_key, NULL); 617307266Smav dbuf_evict_thread_exit = B_FALSE; 618307266Smav mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); 619307266Smav cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); 620307266Smav dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, 621307266Smav NULL, 0, &p0, TS_RUN, minclsyspri); 622168404Spjd} 623168404Spjd 624168404Spjdvoid 625168404Spjddbuf_fini(void) 626168404Spjd{ 627168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 628168404Spjd int i; 629168404Spjd 630168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 631168404Spjd mutex_destroy(&h->hash_mutexes[i]); 632168404Spjd kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 633307266Smav kmem_cache_destroy(dbuf_kmem_cache); 634288549Smav taskq_destroy(dbu_evict_taskq); 635307266Smav 636307266Smav mutex_enter(&dbuf_evict_lock); 637307266Smav dbuf_evict_thread_exit = B_TRUE; 638307266Smav while (dbuf_evict_thread_exit) { 639307266Smav cv_signal(&dbuf_evict_cv); 640307266Smav cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); 641307266Smav } 642307266Smav mutex_exit(&dbuf_evict_lock); 643307266Smav tsd_destroy(&zfs_dbuf_evict_key); 644307266Smav 645307266Smav mutex_destroy(&dbuf_evict_lock); 646307266Smav cv_destroy(&dbuf_evict_cv); 647307266Smav 648307266Smav refcount_destroy(&dbuf_cache_size); 649307266Smav multilist_destroy(&dbuf_cache); 650168404Spjd} 651168404Spjd 652168404Spjd/* 653168404Spjd * Other stuff. 
654168404Spjd */ 655168404Spjd 656168404Spjd#ifdef ZFS_DEBUG 657168404Spjdstatic void 658168404Spjddbuf_verify(dmu_buf_impl_t *db) 659168404Spjd{ 660219089Spjd dnode_t *dn; 661219089Spjd dbuf_dirty_record_t *dr; 662168404Spjd 663168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 664168404Spjd 665168404Spjd if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 666168404Spjd return; 667168404Spjd 668168404Spjd ASSERT(db->db_objset != NULL); 669219089Spjd DB_DNODE_ENTER(db); 670219089Spjd dn = DB_DNODE(db); 671168404Spjd if (dn == NULL) { 672168404Spjd ASSERT(db->db_parent == NULL); 673168404Spjd ASSERT(db->db_blkptr == NULL); 674168404Spjd } else { 675168404Spjd ASSERT3U(db->db.db_object, ==, dn->dn_object); 676168404Spjd ASSERT3P(db->db_objset, ==, dn->dn_objset); 677168404Spjd ASSERT3U(db->db_level, <, dn->dn_nlevels); 678219089Spjd ASSERT(db->db_blkid == DMU_BONUS_BLKID || 679219089Spjd db->db_blkid == DMU_SPILL_BLKID || 680269845Sdelphij !avl_is_empty(&dn->dn_dbufs)); 681168404Spjd } 682219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 683168404Spjd ASSERT(dn != NULL); 684185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 685219089Spjd ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 686219089Spjd } else if (db->db_blkid == DMU_SPILL_BLKID) { 687219089Spjd ASSERT(dn != NULL); 688219089Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 689240415Smm ASSERT0(db->db.db_offset); 690168404Spjd } else { 691168404Spjd ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 692168404Spjd } 693168404Spjd 694219089Spjd for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 695219089Spjd ASSERT(dr->dr_dbuf == db); 696219089Spjd 697219089Spjd for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 698219089Spjd ASSERT(dr->dr_dbuf == db); 699219089Spjd 700208047Smm /* 701208047Smm * We can't assert that db_size matches dn_datablksz because it 702208047Smm * can be momentarily different when another thread is doing 703208047Smm * dnode_set_blksz(). 
704208047Smm */ 705208047Smm if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 706219089Spjd dr = db->db_data_pending; 707208047Smm /* 708208047Smm * It should only be modified in syncing context, so 709208047Smm * make sure we only have one copy of the data. 710208047Smm */ 711208047Smm ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 712168404Spjd } 713168404Spjd 714168404Spjd /* verify db->db_blkptr */ 715168404Spjd if (db->db_blkptr) { 716168404Spjd if (db->db_parent == dn->dn_dbuf) { 717168404Spjd /* db is pointed to by the dnode */ 718168404Spjd /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 719209962Smm if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 720168404Spjd ASSERT(db->db_parent == NULL); 721168404Spjd else 722168404Spjd ASSERT(db->db_parent != NULL); 723219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 724219089Spjd ASSERT3P(db->db_blkptr, ==, 725219089Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 726168404Spjd } else { 727168404Spjd /* db is pointed to by an indirect block */ 728168404Spjd int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 729168404Spjd ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 730168404Spjd ASSERT3U(db->db_parent->db.db_object, ==, 731168404Spjd db->db.db_object); 732168404Spjd /* 733168404Spjd * dnode_grow_indblksz() can make this fail if we don't 734168404Spjd * have the struct_rwlock. XXX indblksz no longer 735168404Spjd * grows. safe to do this now? 
736168404Spjd */ 737219089Spjd if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 738168404Spjd ASSERT3P(db->db_blkptr, ==, 739168404Spjd ((blkptr_t *)db->db_parent->db.db_data + 740168404Spjd db->db_blkid % epb)); 741168404Spjd } 742168404Spjd } 743168404Spjd } 744168404Spjd if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 745219089Spjd (db->db_buf == NULL || db->db_buf->b_data) && 746219089Spjd db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 747168404Spjd db->db_state != DB_FILL && !dn->dn_free_txg) { 748168404Spjd /* 749168404Spjd * If the blkptr isn't set but they have nonzero data, 750168404Spjd * it had better be dirty, otherwise we'll lose that 751168404Spjd * data when we evict this buffer. 752304139Savg * 753304139Savg * There is an exception to this rule for indirect blocks; in 754304139Savg * this case, if the indirect block is a hole, we fill in a few 755304139Savg * fields on each of the child blocks (importantly, birth time) 756304139Savg * to prevent hole birth times from being lost when you 757304139Savg * partially fill in a hole. 758168404Spjd */ 759168404Spjd if (db->db_dirtycnt == 0) { 760304139Savg if (db->db_level == 0) { 761304139Savg uint64_t *buf = db->db.db_data; 762304139Savg int i; 763168404Spjd 764304139Savg for (i = 0; i < db->db.db_size >> 3; i++) { 765304139Savg ASSERT(buf[i] == 0); 766304139Savg } 767304139Savg } else { 768304139Savg blkptr_t *bps = db->db.db_data; 769304139Savg ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, 770304139Savg db->db.db_size); 771304139Savg /* 772304139Savg * We want to verify that all the blkptrs in the 773304139Savg * indirect block are holes, but we may have 774304139Savg * automatically set up a few fields for them. 775304139Savg * We iterate through each blkptr and verify 776304139Savg * they only have those fields set. 
				 */
				for (int i = 0;
				    i < db->db.db_size / sizeof (blkptr_t);
				    i++) {
					blkptr_t *bp = &bps[i];
					/* cksum and all DVAs must be unset */
					ASSERT(ZIO_CHECKSUM_IS_ZERO(
					    &bp->blk_cksum));
					ASSERT(
					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[2]));
					ASSERT0(bp->blk_fill);
					ASSERT0(bp->blk_pad[0]);
					ASSERT0(bp->blk_pad[1]);
					ASSERT(!BP_IS_EMBEDDED(bp));
					ASSERT(BP_IS_HOLE(bp));
					ASSERT0(bp->blk_phys_birth);
				}
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

/*
 * Detach the dbuf from its in-core data: run the user eviction hook and
 * NULL out db.db_data.  The caller must hold db_mtx and must already
 * have detached db_buf (asserted below).  On return the dbuf is
 * DB_UNCACHED, except for NOFILL dbufs, which keep their state.
 */
static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	ASSERT3P(db->db_buf, ==, NULL);
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL)
		db->db_state = DB_UNCACHED;
}

/*
 * Attach an ARC buffer to the dbuf and expose its data through
 * db.db_data.  The caller must hold db_mtx and pass a non-NULL buffer.
 */
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		/*
		 * The buffer is shared with other holders (or already
		 * released to the ARC), so hand the caller a private copy.
		 */
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		/*
		 * We are the sole holder: transfer ownership of the dbuf's
		 * own ARC buffer to the caller and detach it from the dbuf.
		 */
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		db->db_buf = NULL;
		dbuf_clear_data(db);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

/*
 * Calculate which level n block references the data at the level 0 offset
 * provided.
 */
uint64_t
dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
		/*
		 * The level n blkid is equal to the level 0 blkid divided by
		 * the number of level 0s in a level n block.
		 *
		 * The level 0 blkid is offset >> datablkshift =
		 * offset / 2^datablkshift.
		 *
		 * The number of level 0s in a level n is the number of block
		 * pointers in an indirect block, raised to the power of level.
		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
		 *
		 * Thus, the level n blkid is: offset /
		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
		 * = offset / 2^(datablkshift + level *
		 *   (indblkshift - SPA_BLKPTRSHIFT))
		 * = offset >> (datablkshift + level *
		 *   (indblkshift - SPA_BLKPTRSHIFT))
		 */
		return (offset >> (dn->dn_datablkshift + level *
		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
	} else {
		/*
		 * NOTE(review): dn_datablkshift == 0 presumably means the
		 * object has a single (possibly non-power-of-2) block --
		 * confirm; only blkid 0 is valid here.
		 */
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

/*
 * ARC read completion callback for dbuf_read_impl().  On success the
 * buffer is installed in the dbuf (DB_CACHED); on I/O error the buffer
 * is destroyed and the dbuf returns to DB_UNCACHED.  If the block was
 * freed while the read was in flight, the buffer is zeroed and cached
 * instead (any error is disregarded).  Wakes waiters on db_changed and
 * drops the hold taken before the read was issued.
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		arc_buf_destroy(buf, db);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

/*
 * Read in the block backing this dbuf.  Called with db_mtx held and the
 * dbuf in DB_UNCACHED state; db_mtx is dropped before returning on every
 * path.  Bonus buffers and holes are satisfied synchronously from
 * in-core state; everything else goes to arc_read() with
 * dbuf_read_done() as the completion callback.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	arc_flags_t aflags = ARC_FLAG_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Bonus blocks live in the dnode phys; copy them in-core. */
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);

		/*
		 * A hole: satisfy the read with zeroed data, no I/O.  For
		 * an indirect hole with a nonzero birth time, stamp each
		 * (zeroed) child bp with size/type/level/birth so hole
		 * birth times are not lost (matching the DEBUG checks in
		 * the verification code above).
		 */
		if (db->db_blkptr != NULL && db->db_level > 0 &&
		    BP_IS_HOLE(db->db_blkptr) &&
		    db->db_blkptr->blk_birth != 0) {
			blkptr_t *bps = db->db.db_data;
			for (int i = 0; i < ((1 <<
			    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
			    i++) {
				blkptr_t *bp = &bps[i];
				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
				    1 << dn->dn_indblkshift);
				BP_SET_LSIZE(bp,
				    BP_GET_LEVEL(db->db_blkptr) == 1 ?
				    dn->dn_datablksz :
				    BP_GET_LSIZE(db->db_blkptr));
				BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
				BP_SET_LEVEL(bp,
				    BP_GET_LEVEL(db->db_blkptr) - 1);
				BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
			}
		}
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_FLAG_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* Hold the dbuf for the callback; dropped in dbuf_read_done(). */
	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
}

/*
 * Bring the dbuf's contents into memory, returning 0 on success or an
 * errno on failure.  NOFILL dbufs always return EIO.  If "zio" is NULL
 * a root zio is created and waited on here; otherwise the read is
 * attached to the caller's zio and the caller must wait on it.  The
 * DB_RF_* flags control locking, prefetch and failure behavior.
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	/*
	 * NOTE(review): dn was already dereferenced above, so the
	 * "dn != NULL" term below is always true and could be dropped.
	 */
	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/*
		 * Skip the wait per the caller's request (DB_RF_NEVERWAIT);
		 * otherwise block until the in-flight read or fill settles.
		 */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
				    db, zio_t *, zio);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

/*
 * Prepare the dbuf to be completely overwritten without reading its old
 * contents: wait out any read or fill in progress, then, if the dbuf is
 * uncached, attach a fresh (uninitialized) ARC buffer and move it to
 * DB_FILL.  NOFILL dbufs just drop their data; cached contents are left
 * in place.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_clear_data(db);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.
 * It makes a copy of
 * buffers, that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	/*
	 * Nothing to do unless the most recent dirty record still
	 * references the dbuf's current data.
	 */
	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and its referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there a no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		db->db_buf = NULL;
		dbuf_clear_data(db);
	}
}

/*
 * Undo a block override (dr_overridden_by, e.g. set up by dmu_sync):
 * free the override block unless it is a hole or a nopwrite, and return
 * the dirty record to the normal write path.  Caller must hold db_mtx.
 * A no-op for bonus buffers and records that are not overridden.
 */
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a
	 * consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if its unreferenced) or clear (if its referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	/* db_search is a stack dummy used purely as an AVL search key. */
	dmu_buf_impl_t db_search;
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
		/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		/* dn_dbufs is sorted; stop at the first block past the range */
		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_destroy(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Decide whether the block backing this dbuf could actually be freed,
 * based on the birth txg of the pending dirty data (if any) or of the
 * on-disk block pointer; holes are never freeable.  The final answer
 * comes from dsl_dataset_block_freeable().
 */
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block don't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}

/*
 * Resize the dbuf's data buffer to "size" bytes, preserving the common
 * prefix and zero-filling any growth.  The dbuf is dirtied in "tx" and,
 * for level-0 buffers, the current dirty record is pointed at the new
 * buffer.  Caller must hold the dnode's struct_rwlock as writer; not
 * legal on bonus buffers.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	arc_buf_destroy(obuf, db);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

/*
 * Release the dbuf's ARC buffer.  Only legal in syncing context, as
 * asserted via dsl_pool_sync_context(); the parent buffer (if any) must
 * already have been released.
 */
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

/*
 * We already have a dirty record for this TXG, and we are being
 * dirtied again.
 */
static void
dbuf_redirty(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this buffer has already been written out,
		 * we now need to reset its state.  Undoing any override
		 * returns the record to the normal write path.
		 */
		dbuf_unoverride(dr);
		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
		    db->db_state != DB_NOFILL) {
			/* Already released on initial dirty, so just thaw. */
			ASSERT(arc_released(db->db_buf));
			arc_buf_thaw(db->db_buf);
		}
	}
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
1484168404Spjd */ 1485308083Smav#ifdef DEBUG 1486308083Smav if (dn->dn_objset->os_dsl_dataset != NULL) { 1487308083Smav rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1488308083Smav RW_READER, FTAG); 1489308083Smav } 1490168404Spjd ASSERT(!dmu_tx_is_syncing(tx) || 1491168404Spjd BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1492209962Smm DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1493209962Smm dn->dn_objset->os_dsl_dataset == NULL); 1494308083Smav if (dn->dn_objset->os_dsl_dataset != NULL) 1495308083Smav rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); 1496308083Smav#endif 1497168404Spjd /* 1498168404Spjd * We make this assert for private objects as well, but after we 1499168404Spjd * check if we're already dirty. They are allowed to re-dirty 1500168404Spjd * in syncing context. 1501168404Spjd */ 1502168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1503168404Spjd dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1504168404Spjd (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1505168404Spjd 1506168404Spjd mutex_enter(&db->db_mtx); 1507168404Spjd /* 1508168404Spjd * XXX make this true for indirects too? The problem is that 1509168404Spjd * transactions created with dmu_tx_create_assigned() from 1510168404Spjd * syncing context don't bother holding ahead. 1511168404Spjd */ 1512168404Spjd ASSERT(db->db_level != 0 || 1513219089Spjd db->db_state == DB_CACHED || db->db_state == DB_FILL || 1514219089Spjd db->db_state == DB_NOFILL); 1515168404Spjd 1516168404Spjd mutex_enter(&dn->dn_mtx); 1517168404Spjd /* 1518168404Spjd * Don't set dirtyctx to SYNC if we're just modifying this as we 1519168404Spjd * initialize the objset. 
1520168404Spjd */ 1521308083Smav if (dn->dn_dirtyctx == DN_UNDIRTIED) { 1522308083Smav if (dn->dn_objset->os_dsl_dataset != NULL) { 1523308083Smav rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1524308083Smav RW_READER, FTAG); 1525308083Smav } 1526308083Smav if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1527308083Smav dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? 1528308083Smav DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1529308083Smav ASSERT(dn->dn_dirtyctx_firstset == NULL); 1530308083Smav dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1531308083Smav } 1532308083Smav if (dn->dn_objset->os_dsl_dataset != NULL) { 1533308083Smav rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1534308083Smav FTAG); 1535308083Smav } 1536168404Spjd } 1537168404Spjd mutex_exit(&dn->dn_mtx); 1538168404Spjd 1539219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) 1540219089Spjd dn->dn_have_spill = B_TRUE; 1541219089Spjd 1542168404Spjd /* 1543168404Spjd * If this buffer is already dirty, we're done. 1544168404Spjd */ 1545168404Spjd drp = &db->db_last_dirty; 1546168404Spjd ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1547168404Spjd db->db.db_object == DMU_META_DNODE_OBJECT); 1548185029Spjd while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1549185029Spjd drp = &dr->dr_next; 1550185029Spjd if (dr && dr->dr_txg == tx->tx_txg) { 1551219089Spjd DB_DNODE_EXIT(db); 1552219089Spjd 1553290750Smav dbuf_redirty(dr); 1554168404Spjd mutex_exit(&db->db_mtx); 1555185029Spjd return (dr); 1556168404Spjd } 1557168404Spjd 1558168404Spjd /* 1559168404Spjd * Only valid if not already dirty. 1560168404Spjd */ 1561209962Smm ASSERT(dn->dn_object == 0 || 1562209962Smm dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1563168404Spjd (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1564168404Spjd 1565168404Spjd ASSERT3U(dn->dn_nlevels, >, db->db_level); 1566168404Spjd ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1567168404Spjd dn->dn_phys->dn_nlevels > db->db_level || 1568168404Spjd dn->dn_next_nlevels[txgoff] > db->db_level || 1569168404Spjd dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1570168404Spjd dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1571168404Spjd 1572168404Spjd /* 1573168404Spjd * We should only be dirtying in syncing context if it's the 1574209962Smm * mos or we're initializing the os or it's a special object. 1575209962Smm * However, we are allowed to dirty in syncing context provided 1576209962Smm * we already dirtied it in open context. Hence we must make 1577209962Smm * this assertion only if we're not already dirty. 1578168404Spjd */ 1579219089Spjd os = dn->dn_objset; 1580308083Smav#ifdef DEBUG 1581308083Smav if (dn->dn_objset->os_dsl_dataset != NULL) 1582308083Smav rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); 1583209962Smm ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1584209962Smm os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1585308083Smav if (dn->dn_objset->os_dsl_dataset != NULL) 1586308083Smav rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); 1587308083Smav#endif 1588168404Spjd ASSERT(db->db.db_size != 0); 1589168404Spjd 1590168404Spjd dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1591168404Spjd 1592219089Spjd if (db->db_blkid != DMU_BONUS_BLKID) { 1593185029Spjd /* 1594185029Spjd * Update the accounting. 1595185029Spjd * Note: we delay "free accounting" until after we drop 1596185029Spjd * the db_mtx. This keeps us from grabbing other locks 1597219089Spjd * (and possibly deadlocking) in bp_get_dsize() while 1598185029Spjd * also holding the db_mtx. 
1599185029Spjd */ 1600185029Spjd dnode_willuse_space(dn, db->db.db_size, tx); 1601185029Spjd do_free_accounting = dbuf_block_freeable(db); 1602185029Spjd } 1603185029Spjd 1604168404Spjd /* 1605168404Spjd * If this buffer is dirty in an old transaction group we need 1606168404Spjd * to make a copy of it so that the changes we make in this 1607168404Spjd * transaction group won't leak out when we sync the older txg. 1608168404Spjd */ 1609168404Spjd dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1610168404Spjd if (db->db_level == 0) { 1611168404Spjd void *data_old = db->db_buf; 1612168404Spjd 1613219089Spjd if (db->db_state != DB_NOFILL) { 1614219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 1615219089Spjd dbuf_fix_old_data(db, tx->tx_txg); 1616219089Spjd data_old = db->db.db_data; 1617219089Spjd } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1618219089Spjd /* 1619219089Spjd * Release the data buffer from the cache so 1620219089Spjd * that we can modify it without impacting 1621219089Spjd * possible other users of this cached data 1622219089Spjd * block. Note that indirect blocks and 1623219089Spjd * private objects are not released until the 1624219089Spjd * syncing state (since they are only modified 1625219089Spjd * then). 
1626219089Spjd */ 1627219089Spjd arc_release(db->db_buf, db); 1628219089Spjd dbuf_fix_old_data(db, tx->tx_txg); 1629219089Spjd data_old = db->db_buf; 1630219089Spjd } 1631219089Spjd ASSERT(data_old != NULL); 1632168404Spjd } 1633168404Spjd dr->dt.dl.dr_data = data_old; 1634168404Spjd } else { 1635168404Spjd mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1636168404Spjd list_create(&dr->dt.di.dr_children, 1637168404Spjd sizeof (dbuf_dirty_record_t), 1638168404Spjd offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1639168404Spjd } 1640260763Savg if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1641260763Savg dr->dr_accounted = db->db.db_size; 1642168404Spjd dr->dr_dbuf = db; 1643168404Spjd dr->dr_txg = tx->tx_txg; 1644168404Spjd dr->dr_next = *drp; 1645168404Spjd *drp = dr; 1646168404Spjd 1647168404Spjd /* 1648168404Spjd * We could have been freed_in_flight between the dbuf_noread 1649168404Spjd * and dbuf_dirty. We win, as though the dbuf_noread() had 1650168404Spjd * happened after the free. 
1651168404Spjd */ 1652219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1653219089Spjd db->db_blkid != DMU_SPILL_BLKID) { 1654168404Spjd mutex_enter(&dn->dn_mtx); 1655265740Sdelphij if (dn->dn_free_ranges[txgoff] != NULL) { 1656265740Sdelphij range_tree_clear(dn->dn_free_ranges[txgoff], 1657265740Sdelphij db->db_blkid, 1); 1658265740Sdelphij } 1659168404Spjd mutex_exit(&dn->dn_mtx); 1660168404Spjd db->db_freed_in_flight = FALSE; 1661168404Spjd } 1662168404Spjd 1663168404Spjd /* 1664168404Spjd * This buffer is now part of this txg 1665168404Spjd */ 1666168404Spjd dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1667168404Spjd db->db_dirtycnt += 1; 1668168404Spjd ASSERT3U(db->db_dirtycnt, <=, 3); 1669168404Spjd 1670168404Spjd mutex_exit(&db->db_mtx); 1671168404Spjd 1672219089Spjd if (db->db_blkid == DMU_BONUS_BLKID || 1673219089Spjd db->db_blkid == DMU_SPILL_BLKID) { 1674168404Spjd mutex_enter(&dn->dn_mtx); 1675168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1676168404Spjd list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1677168404Spjd mutex_exit(&dn->dn_mtx); 1678168404Spjd dnode_setdirty(dn, tx); 1679219089Spjd DB_DNODE_EXIT(db); 1680168404Spjd return (dr); 1681307270Smav } 1682307270Smav 1683307270Smav /* 1684307270Smav * The dn_struct_rwlock prevents db_blkptr from changing 1685307270Smav * due to a write from syncing context completing 1686307270Smav * while we are running, so we want to acquire it before 1687307270Smav * looking at db_blkptr. 1688307270Smav */ 1689307270Smav if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1690307270Smav rw_enter(&dn->dn_struct_rwlock, RW_READER); 1691307270Smav drop_struct_lock = TRUE; 1692307270Smav } 1693307270Smav 1694307270Smav if (do_free_accounting) { 1695185029Spjd blkptr_t *bp = db->db_blkptr; 1696185029Spjd int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 
1697219089Spjd bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1698185029Spjd /* 1699185029Spjd * This is only a guess -- if the dbuf is dirty 1700185029Spjd * in a previous txg, we don't know how much 1701185029Spjd * space it will use on disk yet. We should 1702185029Spjd * really have the struct_rwlock to access 1703185029Spjd * db_blkptr, but since this is just a guess, 1704185029Spjd * it's OK if we get an odd answer. 1705185029Spjd */ 1706219089Spjd ddt_prefetch(os->os_spa, bp); 1707185029Spjd dnode_willuse_space(dn, -willfree, tx); 1708168404Spjd } 1709168404Spjd 1710185029Spjd if (db->db_level == 0) { 1711185029Spjd dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1712185029Spjd ASSERT(dn->dn_maxblkid >= db->db_blkid); 1713185029Spjd } 1714185029Spjd 1715168404Spjd if (db->db_level+1 < dn->dn_nlevels) { 1716168404Spjd dmu_buf_impl_t *parent = db->db_parent; 1717168404Spjd dbuf_dirty_record_t *di; 1718168404Spjd int parent_held = FALSE; 1719168404Spjd 1720168404Spjd if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1721168404Spjd int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1722168404Spjd 1723168404Spjd parent = dbuf_hold_level(dn, db->db_level+1, 1724168404Spjd db->db_blkid >> epbs, FTAG); 1725219089Spjd ASSERT(parent != NULL); 1726168404Spjd parent_held = TRUE; 1727168404Spjd } 1728168404Spjd if (drop_struct_lock) 1729168404Spjd rw_exit(&dn->dn_struct_rwlock); 1730168404Spjd ASSERT3U(db->db_level+1, ==, parent->db_level); 1731168404Spjd di = dbuf_dirty(parent, tx); 1732168404Spjd if (parent_held) 1733168404Spjd dbuf_rele(parent, FTAG); 1734168404Spjd 1735168404Spjd mutex_enter(&db->db_mtx); 1736260763Savg /* 1737260763Savg * Since we've dropped the mutex, it's possible that 1738260763Savg * dbuf_undirty() might have changed this out from under us. 
1739260763Savg */ 1740168404Spjd if (db->db_last_dirty == dr || 1741168404Spjd dn->dn_object == DMU_META_DNODE_OBJECT) { 1742168404Spjd mutex_enter(&di->dt.di.dr_mtx); 1743168404Spjd ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1744168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1745168404Spjd list_insert_tail(&di->dt.di.dr_children, dr); 1746168404Spjd mutex_exit(&di->dt.di.dr_mtx); 1747168404Spjd dr->dr_parent = di; 1748168404Spjd } 1749168404Spjd mutex_exit(&db->db_mtx); 1750168404Spjd } else { 1751168404Spjd ASSERT(db->db_level+1 == dn->dn_nlevels); 1752168404Spjd ASSERT(db->db_blkid < dn->dn_nblkptr); 1753219089Spjd ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1754168404Spjd mutex_enter(&dn->dn_mtx); 1755168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1756168404Spjd list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1757168404Spjd mutex_exit(&dn->dn_mtx); 1758168404Spjd if (drop_struct_lock) 1759168404Spjd rw_exit(&dn->dn_struct_rwlock); 1760168404Spjd } 1761168404Spjd 1762168404Spjd dnode_setdirty(dn, tx); 1763219089Spjd DB_DNODE_EXIT(db); 1764168404Spjd return (dr); 1765168404Spjd} 1766168404Spjd 1767248571Smm/* 1768251629Sdelphij * Undirty a buffer in the transaction group referenced by the given 1769251629Sdelphij * transaction. Return whether this evicted the dbuf. 1770248571Smm */ 1771248571Smmstatic boolean_t 1772168404Spjddbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1773168404Spjd{ 1774219089Spjd dnode_t *dn; 1775168404Spjd uint64_t txg = tx->tx_txg; 1776185029Spjd dbuf_dirty_record_t *dr, **drp; 1777168404Spjd 1778168404Spjd ASSERT(txg != 0); 1779285202Savg 1780285202Savg /* 1781285202Savg * Due to our use of dn_nlevels below, this can only be called 1782285202Savg * in open context, unless we are operating on the MOS. 1783285202Savg * From syncing context, dn_nlevels may be different from the 1784285202Savg * dn_nlevels used when dbuf was dirtied. 
1785285202Savg */ 1786285202Savg ASSERT(db->db_objset == 1787285202Savg dmu_objset_pool(db->db_objset)->dp_meta_objset || 1788285202Savg txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); 1789219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1790248571Smm ASSERT0(db->db_level); 1791248571Smm ASSERT(MUTEX_HELD(&db->db_mtx)); 1792168404Spjd 1793168404Spjd /* 1794168404Spjd * If this buffer is not dirty, we're done. 1795168404Spjd */ 1796185029Spjd for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1797168404Spjd if (dr->dr_txg <= txg) 1798168404Spjd break; 1799248571Smm if (dr == NULL || dr->dr_txg < txg) 1800248571Smm return (B_FALSE); 1801168404Spjd ASSERT(dr->dr_txg == txg); 1802219089Spjd ASSERT(dr->dr_dbuf == db); 1803168404Spjd 1804219089Spjd DB_DNODE_ENTER(db); 1805219089Spjd dn = DB_DNODE(db); 1806219089Spjd 1807168404Spjd dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1808168404Spjd 1809168404Spjd ASSERT(db->db.db_size != 0); 1810168404Spjd 1811285202Savg dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), 1812285202Savg dr->dr_accounted, txg); 1813168404Spjd 1814185029Spjd *drp = dr->dr_next; 1815168404Spjd 1816219636Spjd /* 1817219636Spjd * Note that there are three places in dbuf_dirty() 1818219636Spjd * where this dirty record may be put on a list. 1819219636Spjd * Make sure to do a list_remove corresponding to 1820219636Spjd * every one of those list_insert calls. 
1821219636Spjd */ 1822168404Spjd if (dr->dr_parent) { 1823168404Spjd mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1824168404Spjd list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1825168404Spjd mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1826219636Spjd } else if (db->db_blkid == DMU_SPILL_BLKID || 1827285202Savg db->db_level + 1 == dn->dn_nlevels) { 1828185029Spjd ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1829168404Spjd mutex_enter(&dn->dn_mtx); 1830168404Spjd list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1831168404Spjd mutex_exit(&dn->dn_mtx); 1832168404Spjd } 1833219089Spjd DB_DNODE_EXIT(db); 1834168404Spjd 1835248571Smm if (db->db_state != DB_NOFILL) { 1836248571Smm dbuf_unoverride(dr); 1837168404Spjd 1838168404Spjd ASSERT(db->db_buf != NULL); 1839248571Smm ASSERT(dr->dt.dl.dr_data != NULL); 1840248571Smm if (dr->dt.dl.dr_data != db->db_buf) 1841307266Smav arc_buf_destroy(dr->dt.dl.dr_data, db); 1842168404Spjd } 1843269218Sdelphij 1844168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1845168404Spjd 1846168404Spjd ASSERT(db->db_dirtycnt > 0); 1847168404Spjd db->db_dirtycnt -= 1; 1848168404Spjd 1849168404Spjd if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1850307266Smav ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); 1851307266Smav dbuf_destroy(db); 1852248571Smm return (B_TRUE); 1853168404Spjd } 1854168404Spjd 1855248571Smm return (B_FALSE); 1856168404Spjd} 1857168404Spjd 1858168404Spjdvoid 1859263397Sdelphijdmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) 1860168404Spjd{ 1861263397Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1862185029Spjd int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1863168404Spjd 1864168404Spjd ASSERT(tx->tx_txg != 0); 1865168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1866168404Spjd 1867290750Smav /* 1868290750Smav * Quick check for dirtyness. 
For already dirty blocks, this 1869290750Smav * reduces runtime of this function by >90%, and overall performance 1870290750Smav * by 50% for some workloads (e.g. file deletion with indirect blocks 1871290750Smav * cached). 1872290750Smav */ 1873290750Smav mutex_enter(&db->db_mtx); 1874290750Smav dbuf_dirty_record_t *dr; 1875290750Smav for (dr = db->db_last_dirty; 1876290750Smav dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { 1877290750Smav /* 1878290750Smav * It's possible that it is already dirty but not cached, 1879290750Smav * because there are some calls to dbuf_dirty() that don't 1880290750Smav * go through dmu_buf_will_dirty(). 1881290750Smav */ 1882290750Smav if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { 1883290750Smav /* This dbuf is already dirty and cached. */ 1884290750Smav dbuf_redirty(dr); 1885290750Smav mutex_exit(&db->db_mtx); 1886290750Smav return; 1887290750Smav } 1888290750Smav } 1889290750Smav mutex_exit(&db->db_mtx); 1890290750Smav 1891219089Spjd DB_DNODE_ENTER(db); 1892219089Spjd if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1893168404Spjd rf |= DB_RF_HAVESTRUCT; 1894219089Spjd DB_DNODE_EXIT(db); 1895168404Spjd (void) dbuf_read(db, NULL, rf); 1896168404Spjd (void) dbuf_dirty(db, tx); 1897168404Spjd} 1898168404Spjd 1899168404Spjdvoid 1900219089Spjddmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1901219089Spjd{ 1902219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1903219089Spjd 1904219089Spjd db->db_state = DB_NOFILL; 1905219089Spjd 1906219089Spjd dmu_buf_will_fill(db_fake, tx); 1907219089Spjd} 1908219089Spjd 1909219089Spjdvoid 1910168404Spjddmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1911168404Spjd{ 1912168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1913168404Spjd 1914219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1915168404Spjd ASSERT(tx->tx_txg != 0); 1916168404Spjd ASSERT(db->db_level == 0); 1917168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1918168404Spjd 
1919168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1920168404Spjd dmu_tx_private_ok(tx)); 1921168404Spjd 1922168404Spjd dbuf_noread(db); 1923168404Spjd (void) dbuf_dirty(db, tx); 1924168404Spjd} 1925168404Spjd 1926168404Spjd#pragma weak dmu_buf_fill_done = dbuf_fill_done 1927168404Spjd/* ARGSUSED */ 1928168404Spjdvoid 1929168404Spjddbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1930168404Spjd{ 1931168404Spjd mutex_enter(&db->db_mtx); 1932168404Spjd DBUF_VERIFY(db); 1933168404Spjd 1934168404Spjd if (db->db_state == DB_FILL) { 1935168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 1936219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1937168404Spjd /* we were freed while filling */ 1938168404Spjd /* XXX dbuf_undirty? */ 1939168404Spjd bzero(db->db.db_data, db->db.db_size); 1940168404Spjd db->db_freed_in_flight = FALSE; 1941168404Spjd } 1942168404Spjd db->db_state = DB_CACHED; 1943168404Spjd cv_broadcast(&db->db_changed); 1944168404Spjd } 1945168404Spjd mutex_exit(&db->db_mtx); 1946168404Spjd} 1947168404Spjd 1948268649Sdelphijvoid 1949268649Sdelphijdmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 1950268649Sdelphij bp_embedded_type_t etype, enum zio_compress comp, 1951268649Sdelphij int uncompressed_size, int compressed_size, int byteorder, 1952268649Sdelphij dmu_tx_t *tx) 1953268649Sdelphij{ 1954268649Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 1955268649Sdelphij struct dirty_leaf *dl; 1956268649Sdelphij dmu_object_type_t type; 1957268649Sdelphij 1958288572Smav if (etype == BP_EMBEDDED_TYPE_DATA) { 1959288572Smav ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), 1960288572Smav SPA_FEATURE_EMBEDDED_DATA)); 1961288572Smav } 1962288572Smav 1963268649Sdelphij DB_DNODE_ENTER(db); 1964268649Sdelphij type = DB_DNODE(db)->dn_type; 1965268649Sdelphij DB_DNODE_EXIT(db); 1966268649Sdelphij 1967268649Sdelphij ASSERT0(db->db_level); 1968268649Sdelphij ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1969268649Sdelphij 
1970268649Sdelphij dmu_buf_will_not_fill(dbuf, tx); 1971268649Sdelphij 1972268649Sdelphij ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1973268649Sdelphij dl = &db->db_last_dirty->dt.dl; 1974268649Sdelphij encode_embedded_bp_compressed(&dl->dr_overridden_by, 1975268649Sdelphij data, comp, uncompressed_size, compressed_size); 1976268649Sdelphij BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 1977268649Sdelphij BP_SET_TYPE(&dl->dr_overridden_by, type); 1978268649Sdelphij BP_SET_LEVEL(&dl->dr_overridden_by, 0); 1979268649Sdelphij BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 1980268649Sdelphij 1981268649Sdelphij dl->dr_override_state = DR_OVERRIDDEN; 1982268649Sdelphij dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 1983268649Sdelphij} 1984268649Sdelphij 1985168404Spjd/* 1986209962Smm * Directly assign a provided arc buf to a given dbuf if it's not referenced 1987209962Smm * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 1988209962Smm */ 1989209962Smmvoid 1990209962Smmdbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 1991209962Smm{ 1992209962Smm ASSERT(!refcount_is_zero(&db->db_holds)); 1993219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1994209962Smm ASSERT(db->db_level == 0); 1995209962Smm ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1996209962Smm ASSERT(buf != NULL); 1997209962Smm ASSERT(arc_buf_size(buf) == db->db.db_size); 1998209962Smm ASSERT(tx->tx_txg != 0); 1999209962Smm 2000209962Smm arc_return_buf(buf, db); 2001209962Smm ASSERT(arc_released(buf)); 2002209962Smm 2003209962Smm mutex_enter(&db->db_mtx); 2004209962Smm 2005209962Smm while (db->db_state == DB_READ || db->db_state == DB_FILL) 2006209962Smm cv_wait(&db->db_changed, &db->db_mtx); 2007209962Smm 2008209962Smm ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 2009209962Smm 2010209962Smm if (db->db_state == DB_CACHED && 2011209962Smm refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 2012209962Smm 
mutex_exit(&db->db_mtx); 2013209962Smm (void) dbuf_dirty(db, tx); 2014209962Smm bcopy(buf->b_data, db->db.db_data, db->db.db_size); 2015307266Smav arc_buf_destroy(buf, db); 2016219089Spjd xuio_stat_wbuf_copied(); 2017209962Smm return; 2018209962Smm } 2019209962Smm 2020219089Spjd xuio_stat_wbuf_nocopy(); 2021209962Smm if (db->db_state == DB_CACHED) { 2022209962Smm dbuf_dirty_record_t *dr = db->db_last_dirty; 2023209962Smm 2024209962Smm ASSERT(db->db_buf != NULL); 2025209962Smm if (dr != NULL && dr->dr_txg == tx->tx_txg) { 2026209962Smm ASSERT(dr->dt.dl.dr_data == db->db_buf); 2027209962Smm if (!arc_released(db->db_buf)) { 2028209962Smm ASSERT(dr->dt.dl.dr_override_state == 2029209962Smm DR_OVERRIDDEN); 2030209962Smm arc_release(db->db_buf, db); 2031209962Smm } 2032209962Smm dr->dt.dl.dr_data = buf; 2033307266Smav arc_buf_destroy(db->db_buf, db); 2034209962Smm } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 2035209962Smm arc_release(db->db_buf, db); 2036307266Smav arc_buf_destroy(db->db_buf, db); 2037209962Smm } 2038209962Smm db->db_buf = NULL; 2039209962Smm } 2040209962Smm ASSERT(db->db_buf == NULL); 2041209962Smm dbuf_set_data(db, buf); 2042209962Smm db->db_state = DB_FILL; 2043209962Smm mutex_exit(&db->db_mtx); 2044209962Smm (void) dbuf_dirty(db, tx); 2045263397Sdelphij dmu_buf_fill_done(&db->db, tx); 2046209962Smm} 2047209962Smm 2048168404Spjdvoid 2049307266Smavdbuf_destroy(dmu_buf_impl_t *db) 2050168404Spjd{ 2051219089Spjd dnode_t *dn; 2052168404Spjd dmu_buf_impl_t *parent = db->db_parent; 2053219089Spjd dmu_buf_impl_t *dndb; 2054168404Spjd 2055168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2056168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 2057168404Spjd 2058307266Smav if (db->db_buf != NULL) { 2059307266Smav arc_buf_destroy(db->db_buf, db); 2060307266Smav db->db_buf = NULL; 2061307266Smav } 2062168404Spjd 2063307266Smav if (db->db_blkid == DMU_BONUS_BLKID) { 2064168404Spjd ASSERT(db->db.db_data != NULL); 2065307266Smav 
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 2066307266Smav arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2067168404Spjd db->db_state = DB_UNCACHED; 2068168404Spjd } 2069168404Spjd 2070307266Smav dbuf_clear_data(db); 2071307266Smav 2072307266Smav if (multilist_link_active(&db->db_cache_link)) { 2073307266Smav multilist_remove(&dbuf_cache, db); 2074307266Smav (void) refcount_remove_many(&dbuf_cache_size, 2075307266Smav db->db.db_size, db); 2076307266Smav } 2077307266Smav 2078219089Spjd ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 2079168404Spjd ASSERT(db->db_data_pending == NULL); 2080168404Spjd 2081168404Spjd db->db_state = DB_EVICTING; 2082168404Spjd db->db_blkptr = NULL; 2083168404Spjd 2084307266Smav /* 2085307266Smav * Now that db_state is DB_EVICTING, nobody else can find this via 2086307266Smav * the hash table. We can now drop db_mtx, which allows us to 2087307266Smav * acquire the dn_dbufs_mtx. 2088307266Smav */ 2089307266Smav mutex_exit(&db->db_mtx); 2090307266Smav 2091219089Spjd DB_DNODE_ENTER(db); 2092219089Spjd dn = DB_DNODE(db); 2093219089Spjd dndb = dn->dn_dbuf; 2094307266Smav if (db->db_blkid != DMU_BONUS_BLKID) { 2095307266Smav boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); 2096307266Smav if (needlock) 2097307266Smav mutex_enter(&dn->dn_dbufs_mtx); 2098269845Sdelphij avl_remove(&dn->dn_dbufs, db); 2099271002Sdelphij atomic_dec_32(&dn->dn_dbufs_count); 2100219089Spjd membar_producer(); 2101219089Spjd DB_DNODE_EXIT(db); 2102307266Smav if (needlock) 2103307266Smav mutex_exit(&dn->dn_dbufs_mtx); 2104219089Spjd /* 2105219089Spjd * Decrementing the dbuf count means that the hold corresponding 2106219089Spjd * to the removed dbuf is no longer discounted in dnode_move(), 2107219089Spjd * so the dnode cannot be moved until after we release the hold. 
2108219089Spjd * The membar_producer() ensures visibility of the decremented 2109219089Spjd * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 2110219089Spjd * release any lock. 2111219089Spjd */ 2112168404Spjd dnode_rele(dn, db); 2113219089Spjd db->db_dnode_handle = NULL; 2114307266Smav 2115307266Smav dbuf_hash_remove(db); 2116219089Spjd } else { 2117219089Spjd DB_DNODE_EXIT(db); 2118168404Spjd } 2119168404Spjd 2120307266Smav ASSERT(refcount_is_zero(&db->db_holds)); 2121168404Spjd 2122307266Smav db->db_parent = NULL; 2123168404Spjd 2124307266Smav ASSERT(db->db_buf == NULL); 2125307266Smav ASSERT(db->db.db_data == NULL); 2126307266Smav ASSERT(db->db_hash_next == NULL); 2127307266Smav ASSERT(db->db_blkptr == NULL); 2128307266Smav ASSERT(db->db_data_pending == NULL); 2129307266Smav ASSERT(!multilist_link_active(&db->db_cache_link)); 2130307266Smav 2131307266Smav kmem_cache_free(dbuf_kmem_cache, db); 2132307266Smav arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2133307266Smav 2134168404Spjd /* 2135219089Spjd * If this dbuf is referenced from an indirect dbuf, 2136168404Spjd * decrement the ref count on the indirect dbuf. 2137168404Spjd */ 2138168404Spjd if (parent && parent != dndb) 2139168404Spjd dbuf_rele(parent, db); 2140168404Spjd} 2141168404Spjd 2142288571Smav/* 2143288571Smav * Note: While bpp will always be updated if the function returns success, 2144288571Smav * parentp will not be updated if the dnode does not have dn_dbuf filled in; 2145288571Smav * this happens when the dnode is the meta-dnode, or a userused or groupused 2146288571Smav * object. 
2147288571Smav */ 2148168404Spjdstatic int 2149168404Spjddbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 2150168404Spjd dmu_buf_impl_t **parentp, blkptr_t **bpp) 2151168404Spjd{ 2152168404Spjd int nlevels, epbs; 2153168404Spjd 2154168404Spjd *parentp = NULL; 2155168404Spjd *bpp = NULL; 2156168404Spjd 2157219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 2158168404Spjd 2159219089Spjd if (blkid == DMU_SPILL_BLKID) { 2160219089Spjd mutex_enter(&dn->dn_mtx); 2161219089Spjd if (dn->dn_have_spill && 2162219089Spjd (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 2163219089Spjd *bpp = &dn->dn_phys->dn_spill; 2164219089Spjd else 2165219089Spjd *bpp = NULL; 2166219089Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 2167219089Spjd *parentp = dn->dn_dbuf; 2168219089Spjd mutex_exit(&dn->dn_mtx); 2169219089Spjd return (0); 2170219089Spjd } 2171219089Spjd 2172168404Spjd if (dn->dn_phys->dn_nlevels == 0) 2173168404Spjd nlevels = 1; 2174168404Spjd else 2175168404Spjd nlevels = dn->dn_phys->dn_nlevels; 2176168404Spjd 2177168404Spjd epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 2178168404Spjd 2179168404Spjd ASSERT3U(level * epbs, <, 64); 2180168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2181168404Spjd if (level >= nlevels || 2182168404Spjd (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 2183168404Spjd /* the buffer has no parent yet */ 2184249195Smm return (SET_ERROR(ENOENT)); 2185168404Spjd } else if (level < nlevels-1) { 2186168404Spjd /* this block is referenced from an indirect block */ 2187168404Spjd int err = dbuf_hold_impl(dn, level+1, 2188288571Smav blkid >> epbs, fail_sparse, FALSE, NULL, parentp); 2189168404Spjd if (err) 2190168404Spjd return (err); 2191168404Spjd err = dbuf_read(*parentp, NULL, 2192168404Spjd (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 2193168404Spjd if (err) { 2194168404Spjd dbuf_rele(*parentp, NULL); 2195168404Spjd *parentp = NULL; 2196168404Spjd return (err); 2197168404Spjd } 2198168404Spjd *bpp = ((blkptr_t 
*)(*parentp)->db.db_data) + 2199168404Spjd (blkid & ((1ULL << epbs) - 1)); 2200168404Spjd return (0); 2201168404Spjd } else { 2202168404Spjd /* the block is referenced from the dnode */ 2203168404Spjd ASSERT3U(level, ==, nlevels-1); 2204168404Spjd ASSERT(dn->dn_phys->dn_nblkptr == 0 || 2205168404Spjd blkid < dn->dn_phys->dn_nblkptr); 2206168404Spjd if (dn->dn_dbuf) { 2207168404Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 2208168404Spjd *parentp = dn->dn_dbuf; 2209168404Spjd } 2210168404Spjd *bpp = &dn->dn_phys->dn_blkptr[blkid]; 2211168404Spjd return (0); 2212168404Spjd } 2213168404Spjd} 2214168404Spjd 2215168404Spjdstatic dmu_buf_impl_t * 2216168404Spjddbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 2217168404Spjd dmu_buf_impl_t *parent, blkptr_t *blkptr) 2218168404Spjd{ 2219219089Spjd objset_t *os = dn->dn_objset; 2220168404Spjd dmu_buf_impl_t *db, *odb; 2221168404Spjd 2222168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2223168404Spjd ASSERT(dn->dn_type != DMU_OT_NONE); 2224168404Spjd 2225307266Smav db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); 2226168404Spjd 2227168404Spjd db->db_objset = os; 2228168404Spjd db->db.db_object = dn->dn_object; 2229168404Spjd db->db_level = level; 2230168404Spjd db->db_blkid = blkid; 2231168404Spjd db->db_last_dirty = NULL; 2232168404Spjd db->db_dirtycnt = 0; 2233219089Spjd db->db_dnode_handle = dn->dn_handle; 2234168404Spjd db->db_parent = parent; 2235168404Spjd db->db_blkptr = blkptr; 2236168404Spjd 2237288549Smav db->db_user = NULL; 2238290754Smav db->db_user_immediate_evict = FALSE; 2239290754Smav db->db_freed_in_flight = FALSE; 2240290754Smav db->db_pending_evict = FALSE; 2241168404Spjd 2242219089Spjd if (blkid == DMU_BONUS_BLKID) { 2243168404Spjd ASSERT3P(parent, ==, dn->dn_dbuf); 2244185029Spjd db->db.db_size = DN_MAX_BONUSLEN - 2245185029Spjd (dn->dn_nblkptr-1) * sizeof (blkptr_t); 2246185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 2247219089Spjd db->db.db_offset = DMU_BONUS_BLKID; 2248168404Spjd 
db->db_state = DB_UNCACHED; 2249168404Spjd /* the bonus dbuf is not placed in the hash table */ 2250208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2251168404Spjd return (db); 2252219089Spjd } else if (blkid == DMU_SPILL_BLKID) { 2253219089Spjd db->db.db_size = (blkptr != NULL) ? 2254219089Spjd BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 2255219089Spjd db->db.db_offset = 0; 2256168404Spjd } else { 2257168404Spjd int blocksize = 2258260763Savg db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; 2259168404Spjd db->db.db_size = blocksize; 2260168404Spjd db->db.db_offset = db->db_blkid * blocksize; 2261168404Spjd } 2262168404Spjd 2263168404Spjd /* 2264168404Spjd * Hold the dn_dbufs_mtx while we get the new dbuf 2265168404Spjd * in the hash table *and* added to the dbufs list. 2266168404Spjd * This prevents a possible deadlock with someone 2267168404Spjd * trying to look up this dbuf before its added to the 2268168404Spjd * dn_dbufs list. 2269168404Spjd */ 2270168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 2271168404Spjd db->db_state = DB_EVICTING; 2272168404Spjd if ((odb = dbuf_hash_insert(db)) != NULL) { 2273168404Spjd /* someone else inserted it first */ 2274307266Smav kmem_cache_free(dbuf_kmem_cache, db); 2275168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 2276168404Spjd return (odb); 2277168404Spjd } 2278269845Sdelphij avl_add(&dn->dn_dbufs, db); 2279254753Sdelphij if (db->db_level == 0 && db->db_blkid >= 2280254753Sdelphij dn->dn_unlisted_l0_blkid) 2281254753Sdelphij dn->dn_unlisted_l0_blkid = db->db_blkid + 1; 2282168404Spjd db->db_state = DB_UNCACHED; 2283168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 2284208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2285168404Spjd 2286168404Spjd if (parent && parent != dn->dn_dbuf) 2287168404Spjd dbuf_add_ref(parent, db); 2288168404Spjd 2289168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 2290168404Spjd refcount_count(&dn->dn_holds) > 0); 2291168404Spjd (void) 
refcount_add(&dn->dn_holds, db); 2292271002Sdelphij atomic_inc_32(&dn->dn_dbufs_count); 2293168404Spjd 2294168404Spjd dprintf_dbuf(db, "db=%p\n", db); 2295168404Spjd 2296168404Spjd return (db); 2297168404Spjd} 2298168404Spjd 2299288571Smavtypedef struct dbuf_prefetch_arg { 2300288571Smav spa_t *dpa_spa; /* The spa to issue the prefetch in. */ 2301288571Smav zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ 2302288571Smav int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ 2303288571Smav int dpa_curlevel; /* The current level that we're reading */ 2304307266Smav dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ 2305288571Smav zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ 2306288571Smav zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ 2307288571Smav arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ 2308288571Smav} dbuf_prefetch_arg_t; 2309288571Smav 2310288571Smav/* 2311288571Smav * Actually issue the prefetch read for the block given. 2312288571Smav */ 2313288571Smavstatic void 2314288571Smavdbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) 2315288571Smav{ 2316288571Smav if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 2317288571Smav return; 2318288571Smav 2319288571Smav arc_flags_t aflags = 2320288571Smav dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 2321288571Smav 2322288571Smav ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2323288571Smav ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); 2324288571Smav ASSERT(dpa->dpa_zio != NULL); 2325288571Smav (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, 2326288571Smav dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2327288571Smav &aflags, &dpa->dpa_zb); 2328288571Smav} 2329288571Smav 2330288571Smav/* 2331288571Smav * Called when an indirect block above our prefetch target is read in. 
This 2332288571Smav * will either read in the next indirect block down the tree or issue the actual 2333288571Smav * prefetch if the next block down is our target. 2334288571Smav */ 2335288571Smavstatic void 2336288571Smavdbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) 2337288571Smav{ 2338288571Smav dbuf_prefetch_arg_t *dpa = private; 2339288571Smav 2340288571Smav ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); 2341288571Smav ASSERT3S(dpa->dpa_curlevel, >, 0); 2342307266Smav 2343307266Smav /* 2344307266Smav * The dpa_dnode is only valid if we are called with a NULL 2345307266Smav * zio. This indicates that the arc_read() returned without 2346307266Smav * first calling zio_read() to issue a physical read. Once 2347307266Smav * a physical read is made the dpa_dnode must be invalidated 2348307266Smav * as the locks guarding it may have been dropped. If the 2349307266Smav * dpa_dnode is still valid, then we want to add it to the dbuf 2350307266Smav * cache. To do so, we must hold the dbuf associated with the block 2351307266Smav * we just prefetched, read its contents so that we associate it 2352307266Smav * with an arc_buf_t, and then release it. 
2353307266Smav */ 2354288571Smav if (zio != NULL) { 2355288571Smav ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); 2356307266Smav if (zio->io_flags & ZIO_FLAG_RAW) { 2357307266Smav ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); 2358307266Smav } else { 2359307266Smav ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); 2360307266Smav } 2361288571Smav ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); 2362307266Smav 2363307266Smav dpa->dpa_dnode = NULL; 2364307266Smav } else if (dpa->dpa_dnode != NULL) { 2365307266Smav uint64_t curblkid = dpa->dpa_zb.zb_blkid >> 2366307266Smav (dpa->dpa_epbs * (dpa->dpa_curlevel - 2367307266Smav dpa->dpa_zb.zb_level)); 2368307266Smav dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, 2369307266Smav dpa->dpa_curlevel, curblkid, FTAG); 2370307266Smav (void) dbuf_read(db, NULL, 2371307266Smav DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); 2372307266Smav dbuf_rele(db, FTAG); 2373288571Smav } 2374288571Smav 2375288571Smav dpa->dpa_curlevel--; 2376288571Smav 2377288571Smav uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> 2378288571Smav (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); 2379288571Smav blkptr_t *bp = ((blkptr_t *)abuf->b_data) + 2380288571Smav P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); 2381288571Smav if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { 2382288571Smav kmem_free(dpa, sizeof (*dpa)); 2383288571Smav } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { 2384288571Smav ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); 2385288571Smav dbuf_issue_final_prefetch(dpa, bp); 2386288571Smav kmem_free(dpa, sizeof (*dpa)); 2387288571Smav } else { 2388288571Smav arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2389288571Smav zbookmark_phys_t zb; 2390288571Smav 2391325932Savg /* flag if L2ARC eligible, l2arc_noprefetch then decides */ 2392325932Savg if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) 2393325932Savg iter_aflags |= ARC_FLAG_L2CACHE; 2394325932Savg 2395288571Smav ASSERT3U(dpa->dpa_curlevel, 
==, BP_GET_LEVEL(bp)); 2396288571Smav 2397288571Smav SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, 2398288571Smav dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); 2399288571Smav 2400288571Smav (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2401288571Smav bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, 2402288571Smav ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2403288571Smav &iter_aflags, &zb); 2404288571Smav } 2405307266Smav 2406307266Smav arc_buf_destroy(abuf, private); 2407288571Smav} 2408288571Smav 2409288571Smav/* 2410288571Smav * Issue prefetch reads for the given block on the given level. If the indirect 2411288571Smav * blocks above that block are not in memory, we will read them in 2412288571Smav * asynchronously. As a result, this call never blocks waiting for a read to 2413288571Smav * complete. 2414288571Smav */ 2415168404Spjdvoid 2416288571Smavdbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, 2417288571Smav arc_flags_t aflags) 2418168404Spjd{ 2419288571Smav blkptr_t bp; 2420288571Smav int epbs, nlevels, curlevel; 2421288571Smav uint64_t curblkid; 2422168404Spjd 2423219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 2424168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2425168404Spjd 2426288594Smav if (blkid > dn->dn_maxblkid) 2427288594Smav return; 2428288594Smav 2429168404Spjd if (dnode_block_freed(dn, blkid)) 2430168404Spjd return; 2431168404Spjd 2432288571Smav /* 2433288571Smav * This dnode hasn't been written to disk yet, so there's nothing to 2434288571Smav * prefetch. 
2435288571Smav */ 2436288571Smav nlevels = dn->dn_phys->dn_nlevels; 2437288571Smav if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) 2438288571Smav return; 2439288571Smav 2440288571Smav epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2441288571Smav if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) 2442288571Smav return; 2443288571Smav 2444288571Smav dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, 2445288571Smav level, blkid); 2446288571Smav if (db != NULL) { 2447288571Smav mutex_exit(&db->db_mtx); 2448219089Spjd /* 2449288571Smav * This dbuf already exists. It is either CACHED, or 2450288571Smav * (we assume) about to be read or filled. 2451219089Spjd */ 2452219089Spjd return; 2453168404Spjd } 2454168404Spjd 2455288571Smav /* 2456288571Smav * Find the closest ancestor (indirect block) of the target block 2457288571Smav * that is present in the cache. In this indirect block, we will 2458288571Smav * find the bp that is at curlevel, curblkid. 2459288571Smav */ 2460288571Smav curlevel = level; 2461288571Smav curblkid = blkid; 2462288571Smav while (curlevel < nlevels - 1) { 2463288571Smav int parent_level = curlevel + 1; 2464288571Smav uint64_t parent_blkid = curblkid >> epbs; 2465288571Smav dmu_buf_impl_t *db; 2466168404Spjd 2467288571Smav if (dbuf_hold_impl(dn, parent_level, parent_blkid, 2468288571Smav FALSE, TRUE, FTAG, &db) == 0) { 2469288571Smav blkptr_t *bpp = db->db_buf->b_data; 2470288571Smav bp = bpp[P2PHASE(curblkid, 1 << epbs)]; 2471288571Smav dbuf_rele(db, FTAG); 2472288571Smav break; 2473288571Smav } 2474219089Spjd 2475288571Smav curlevel = parent_level; 2476288571Smav curblkid = parent_blkid; 2477168404Spjd } 2478288571Smav 2479288571Smav if (curlevel == nlevels - 1) { 2480288571Smav /* No cached indirect blocks found. 
*/ 2481288571Smav ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); 2482288571Smav bp = dn->dn_phys->dn_blkptr[curblkid]; 2483288571Smav } 2484288571Smav if (BP_IS_HOLE(&bp)) 2485288571Smav return; 2486288571Smav 2487288571Smav ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); 2488288571Smav 2489288571Smav zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, 2490288571Smav ZIO_FLAG_CANFAIL); 2491288571Smav 2492288571Smav dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); 2493288571Smav dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 2494288571Smav SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2495288571Smav dn->dn_object, level, blkid); 2496288571Smav dpa->dpa_curlevel = curlevel; 2497288571Smav dpa->dpa_prio = prio; 2498288571Smav dpa->dpa_aflags = aflags; 2499288571Smav dpa->dpa_spa = dn->dn_objset->os_spa; 2500307266Smav dpa->dpa_dnode = dn; 2501288571Smav dpa->dpa_epbs = epbs; 2502288571Smav dpa->dpa_zio = pio; 2503288571Smav 2504325932Savg /* flag if L2ARC eligible, l2arc_noprefetch then decides */ 2505325932Savg if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) 2506325932Savg dpa->dpa_aflags |= ARC_FLAG_L2CACHE; 2507325932Savg 2508288571Smav /* 2509288571Smav * If we have the indirect just above us, no need to do the asynchronous 2510288571Smav * prefetch chain; we'll just run the last step ourselves. If we're at 2511288571Smav * a higher level, though, we want to issue the prefetches for all the 2512288571Smav * indirect blocks asynchronously, so we can go on with whatever we were 2513288571Smav * doing. 
2514288571Smav */ 2515288571Smav if (curlevel == level) { 2516288571Smav ASSERT3U(curblkid, ==, blkid); 2517288571Smav dbuf_issue_final_prefetch(dpa, &bp); 2518288571Smav kmem_free(dpa, sizeof (*dpa)); 2519288571Smav } else { 2520288571Smav arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2521288571Smav zbookmark_phys_t zb; 2522288571Smav 2523325932Savg /* flag if L2ARC eligible, l2arc_noprefetch then decides */ 2524325932Savg if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) 2525325932Savg iter_aflags |= ARC_FLAG_L2CACHE; 2526325932Savg 2527288571Smav SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2528288571Smav dn->dn_object, curlevel, curblkid); 2529288571Smav (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2530288571Smav &bp, dbuf_prefetch_indirect_done, dpa, prio, 2531288571Smav ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2532288571Smav &iter_aflags, &zb); 2533288571Smav } 2534288571Smav /* 2535288571Smav * We use pio here instead of dpa_zio since it's possible that 2536288571Smav * dpa may have already been freed. 2537288571Smav */ 2538288571Smav zio_nowait(pio); 2539168404Spjd} 2540168404Spjd 2541168404Spjd/* 2542168404Spjd * Returns with db_holds incremented, and db_mtx not held. 2543168404Spjd * Note: dn_struct_rwlock must be held. 
2544168404Spjd */ 2545168404Spjdint 2546288571Smavdbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, 2547288571Smav boolean_t fail_sparse, boolean_t fail_uncached, 2548168404Spjd void *tag, dmu_buf_impl_t **dbp) 2549168404Spjd{ 2550168404Spjd dmu_buf_impl_t *db, *parent = NULL; 2551168404Spjd 2552219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 2553168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2554168404Spjd ASSERT3U(dn->dn_nlevels, >, level); 2555168404Spjd 2556168404Spjd *dbp = NULL; 2557168404Spjdtop: 2558168404Spjd /* dbuf_find() returns with db_mtx held */ 2559288538Smav db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); 2560168404Spjd 2561168404Spjd if (db == NULL) { 2562168404Spjd blkptr_t *bp = NULL; 2563168404Spjd int err; 2564168404Spjd 2565288571Smav if (fail_uncached) 2566288571Smav return (SET_ERROR(ENOENT)); 2567288571Smav 2568168404Spjd ASSERT3P(parent, ==, NULL); 2569168404Spjd err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 2570168404Spjd if (fail_sparse) { 2571168404Spjd if (err == 0 && bp && BP_IS_HOLE(bp)) 2572249195Smm err = SET_ERROR(ENOENT); 2573168404Spjd if (err) { 2574168404Spjd if (parent) 2575168404Spjd dbuf_rele(parent, NULL); 2576168404Spjd return (err); 2577168404Spjd } 2578168404Spjd } 2579168404Spjd if (err && err != ENOENT) 2580168404Spjd return (err); 2581168404Spjd db = dbuf_create(dn, level, blkid, parent, bp); 2582168404Spjd } 2583168404Spjd 2584288571Smav if (fail_uncached && db->db_state != DB_CACHED) { 2585288571Smav mutex_exit(&db->db_mtx); 2586288571Smav return (SET_ERROR(ENOENT)); 2587288571Smav } 2588288571Smav 2589307266Smav if (db->db_buf != NULL) 2590168404Spjd ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 2591168404Spjd 2592168404Spjd ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 2593168404Spjd 2594168404Spjd /* 2595168404Spjd * If this buffer is currently syncing out, and we are are 2596168404Spjd * still referencing it from db_data, we need to make a copy 
2597168404Spjd * of it in case we decide we want to dirty it again in this txg. 2598168404Spjd */ 2599219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 2600168404Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 2601168404Spjd db->db_state == DB_CACHED && db->db_data_pending) { 2602168404Spjd dbuf_dirty_record_t *dr = db->db_data_pending; 2603168404Spjd 2604168404Spjd if (dr->dt.dl.dr_data == db->db_buf) { 2605168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2606168404Spjd 2607168404Spjd dbuf_set_data(db, 2608307266Smav arc_alloc_buf(dn->dn_objset->os_spa, 2609168404Spjd db->db.db_size, db, type)); 2610168404Spjd bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 2611168404Spjd db->db.db_size); 2612168404Spjd } 2613168404Spjd } 2614168404Spjd 2615307266Smav if (multilist_link_active(&db->db_cache_link)) { 2616307266Smav ASSERT(refcount_is_zero(&db->db_holds)); 2617307266Smav multilist_remove(&dbuf_cache, db); 2618307266Smav (void) refcount_remove_many(&dbuf_cache_size, 2619307266Smav db->db.db_size, db); 2620307266Smav } 2621168404Spjd (void) refcount_add(&db->db_holds, tag); 2622168404Spjd DBUF_VERIFY(db); 2623168404Spjd mutex_exit(&db->db_mtx); 2624168404Spjd 2625168404Spjd /* NOTE: we can't rele the parent until after we drop the db_mtx */ 2626168404Spjd if (parent) 2627168404Spjd dbuf_rele(parent, NULL); 2628168404Spjd 2629219089Spjd ASSERT3P(DB_DNODE(db), ==, dn); 2630168404Spjd ASSERT3U(db->db_blkid, ==, blkid); 2631168404Spjd ASSERT3U(db->db_level, ==, level); 2632168404Spjd *dbp = db; 2633168404Spjd 2634168404Spjd return (0); 2635168404Spjd} 2636168404Spjd 2637168404Spjddmu_buf_impl_t * 2638168404Spjddbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 2639168404Spjd{ 2640288571Smav return (dbuf_hold_level(dn, 0, blkid, tag)); 2641168404Spjd} 2642168404Spjd 2643168404Spjddmu_buf_impl_t * 2644168404Spjddbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 2645168404Spjd{ 2646168404Spjd dmu_buf_impl_t *db; 2647288571Smav 
int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); 2648168404Spjd return (err ? NULL : db); 2649168404Spjd} 2650168404Spjd 2651185029Spjdvoid 2652168404Spjddbuf_create_bonus(dnode_t *dn) 2653168404Spjd{ 2654168404Spjd ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2655168404Spjd 2656168404Spjd ASSERT(dn->dn_bonus == NULL); 2657219089Spjd dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2658168404Spjd} 2659168404Spjd 2660219089Spjdint 2661219089Spjddbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 2662219089Spjd{ 2663219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2664219089Spjd dnode_t *dn; 2665219089Spjd 2666219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 2667249195Smm return (SET_ERROR(ENOTSUP)); 2668219089Spjd if (blksz == 0) 2669219089Spjd blksz = SPA_MINBLOCKSIZE; 2670276081Sdelphij ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 2671276081Sdelphij blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2672219089Spjd 2673219089Spjd DB_DNODE_ENTER(db); 2674219089Spjd dn = DB_DNODE(db); 2675219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2676219089Spjd dbuf_new_size(db, blksz, tx); 2677219089Spjd rw_exit(&dn->dn_struct_rwlock); 2678219089Spjd DB_DNODE_EXIT(db); 2679219089Spjd 2680219089Spjd return (0); 2681219089Spjd} 2682219089Spjd 2683219089Spjdvoid 2684219089Spjddbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2685219089Spjd{ 2686219089Spjd dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2687219089Spjd} 2688219089Spjd 2689168404Spjd#pragma weak dmu_buf_add_ref = dbuf_add_ref 2690168404Spjdvoid 2691168404Spjddbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2692168404Spjd{ 2693168404Spjd int64_t holds = refcount_add(&db->db_holds, tag); 2694307266Smav ASSERT3S(holds, >, 1); 2695168404Spjd} 2696168404Spjd 2697288538Smav#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref 2698288538Smavboolean_t 2699288538Smavdbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, 
uint64_t blkid, 2700288538Smav void *tag) 2701288538Smav{ 2702288538Smav dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2703288538Smav dmu_buf_impl_t *found_db; 2704288538Smav boolean_t result = B_FALSE; 2705288538Smav 2706288538Smav if (db->db_blkid == DMU_BONUS_BLKID) 2707288538Smav found_db = dbuf_find_bonus(os, obj); 2708288538Smav else 2709288538Smav found_db = dbuf_find(os, obj, 0, blkid); 2710288538Smav 2711288538Smav if (found_db != NULL) { 2712288538Smav if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { 2713288538Smav (void) refcount_add(&db->db_holds, tag); 2714288538Smav result = B_TRUE; 2715288538Smav } 2716288538Smav mutex_exit(&db->db_mtx); 2717288538Smav } 2718288538Smav return (result); 2719288538Smav} 2720288538Smav 2721219089Spjd/* 2722219089Spjd * If you call dbuf_rele() you had better not be referencing the dnode handle 2723219089Spjd * unless you have some other direct or indirect hold on the dnode. (An indirect 2724219089Spjd * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2725219089Spjd * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2726219089Spjd * dnode's parent dbuf evicting its dnode handles. 2727219089Spjd */ 2728168404Spjdvoid 2729168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag) 2730168404Spjd{ 2731219089Spjd mutex_enter(&db->db_mtx); 2732219089Spjd dbuf_rele_and_unlock(db, tag); 2733219089Spjd} 2734219089Spjd 2735263397Sdelphijvoid 2736263397Sdelphijdmu_buf_rele(dmu_buf_t *db, void *tag) 2737263397Sdelphij{ 2738263397Sdelphij dbuf_rele((dmu_buf_impl_t *)db, tag); 2739263397Sdelphij} 2740263397Sdelphij 2741219089Spjd/* 2742219089Spjd * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2743219089Spjd * db_dirtycnt and db_holds to be updated atomically. 
2744219089Spjd */ 2745219089Spjdvoid 2746219089Spjddbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2747219089Spjd{ 2748168404Spjd int64_t holds; 2749168404Spjd 2750219089Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2751168404Spjd DBUF_VERIFY(db); 2752168404Spjd 2753219089Spjd /* 2754219089Spjd * Remove the reference to the dbuf before removing its hold on the 2755219089Spjd * dnode so we can guarantee in dnode_move() that a referenced bonus 2756219089Spjd * buffer has a corresponding dnode hold. 2757219089Spjd */ 2758168404Spjd holds = refcount_remove(&db->db_holds, tag); 2759168404Spjd ASSERT(holds >= 0); 2760168404Spjd 2761168404Spjd /* 2762168404Spjd * We can't freeze indirects if there is a possibility that they 2763168404Spjd * may be modified in the current syncing context. 2764168404Spjd */ 2765307266Smav if (db->db_buf != NULL && 2766307266Smav holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { 2767168404Spjd arc_buf_freeze(db->db_buf); 2768307266Smav } 2769168404Spjd 2770168404Spjd if (holds == db->db_dirtycnt && 2771290754Smav db->db_level == 0 && db->db_user_immediate_evict) 2772168404Spjd dbuf_evict_user(db); 2773168404Spjd 2774168404Spjd if (holds == 0) { 2775219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2776288541Smav dnode_t *dn; 2777290754Smav boolean_t evict_dbuf = db->db_pending_evict; 2778219089Spjd 2779219089Spjd /* 2780288541Smav * If the dnode moves here, we cannot cross this 2781288541Smav * barrier until the move completes. 2782219089Spjd */ 2783219089Spjd DB_DNODE_ENTER(db); 2784288541Smav 2785288541Smav dn = DB_DNODE(db); 2786288541Smav atomic_dec_32(&dn->dn_dbufs_count); 2787288541Smav 2788288541Smav /* 2789288541Smav * Decrementing the dbuf count means that the bonus 2790288541Smav * buffer's dnode hold is no longer discounted in 2791288541Smav * dnode_move(). The dnode cannot move until after 2792290754Smav * the dnode_rele() below. 
2793288541Smav */ 2794219089Spjd DB_DNODE_EXIT(db); 2795288541Smav 2796219089Spjd /* 2797288541Smav * Do not reference db after its lock is dropped. 2798288541Smav * Another thread may evict it. 2799219089Spjd */ 2800288541Smav mutex_exit(&db->db_mtx); 2801288541Smav 2802290754Smav if (evict_dbuf) 2803288541Smav dnode_evict_bonus(dn); 2804290754Smav 2805290754Smav dnode_rele(dn, db); 2806168404Spjd } else if (db->db_buf == NULL) { 2807168404Spjd /* 2808168404Spjd * This is a special case: we never associated this 2809168404Spjd * dbuf with any data allocated from the ARC. 2810168404Spjd */ 2811219089Spjd ASSERT(db->db_state == DB_UNCACHED || 2812219089Spjd db->db_state == DB_NOFILL); 2813307266Smav dbuf_destroy(db); 2814168404Spjd } else if (arc_released(db->db_buf)) { 2815168404Spjd /* 2816168404Spjd * This dbuf has anonymous data associated with it. 2817168404Spjd */ 2818307266Smav dbuf_destroy(db); 2819168404Spjd } else { 2820307266Smav boolean_t do_arc_evict = B_FALSE; 2821307266Smav blkptr_t bp; 2822307266Smav spa_t *spa = dmu_objset_spa(db->db_objset); 2823242845Sdelphij 2824307266Smav if (!DBUF_IS_CACHEABLE(db) && 2825307266Smav db->db_blkptr != NULL && 2826307266Smav !BP_IS_HOLE(db->db_blkptr) && 2827307266Smav !BP_IS_EMBEDDED(db->db_blkptr)) { 2828307266Smav do_arc_evict = B_TRUE; 2829307266Smav bp = *db->db_blkptr; 2830307266Smav } 2831307266Smav 2832307266Smav if (!DBUF_IS_CACHEABLE(db) || 2833307266Smav db->db_pending_evict) { 2834307266Smav dbuf_destroy(db); 2835307266Smav } else if (!multilist_link_active(&db->db_cache_link)) { 2836307266Smav multilist_insert(&dbuf_cache, db); 2837307266Smav (void) refcount_add_many(&dbuf_cache_size, 2838307266Smav db->db.db_size, db); 2839185029Spjd mutex_exit(&db->db_mtx); 2840307266Smav 2841307266Smav dbuf_evict_notify(); 2842269417Sdelphij } 2843307266Smav 2844307266Smav if (do_arc_evict) 2845307266Smav arc_freed(spa, &bp); 2846168404Spjd } 2847168404Spjd } else { 2848168404Spjd mutex_exit(&db->db_mtx); 
2849168404Spjd } 2850307266Smav 2851168404Spjd} 2852168404Spjd 2853168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount 2854168404Spjduint64_t 2855168404Spjddbuf_refcount(dmu_buf_impl_t *db) 2856168404Spjd{ 2857168404Spjd return (refcount_count(&db->db_holds)); 2858168404Spjd} 2859168404Spjd 2860168404Spjdvoid * 2861288549Smavdmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, 2862288549Smav dmu_buf_user_t *new_user) 2863168404Spjd{ 2864288549Smav dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2865288549Smav 2866288549Smav mutex_enter(&db->db_mtx); 2867288549Smav dbuf_verify_user(db, DBVU_NOT_EVICTING); 2868288549Smav if (db->db_user == old_user) 2869288549Smav db->db_user = new_user; 2870288549Smav else 2871288549Smav old_user = db->db_user; 2872288549Smav dbuf_verify_user(db, DBVU_NOT_EVICTING); 2873288549Smav mutex_exit(&db->db_mtx); 2874288549Smav 2875288549Smav return (old_user); 2876168404Spjd} 2877168404Spjd 2878168404Spjdvoid * 2879288549Smavdmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2880168404Spjd{ 2881288549Smav return (dmu_buf_replace_user(db_fake, NULL, user)); 2882288549Smav} 2883288549Smav 2884288549Smavvoid * 2885288549Smavdmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2886288549Smav{ 2887168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2888168404Spjd 2889290754Smav db->db_user_immediate_evict = TRUE; 2890288549Smav return (dmu_buf_set_user(db_fake, user)); 2891168404Spjd} 2892168404Spjd 2893168404Spjdvoid * 2894288549Smavdmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2895168404Spjd{ 2896288549Smav return (dmu_buf_replace_user(db_fake, user, NULL)); 2897168404Spjd} 2898168404Spjd 2899168404Spjdvoid * 2900168404Spjddmu_buf_get_user(dmu_buf_t *db_fake) 2901168404Spjd{ 2902168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2903168404Spjd 2904288549Smav dbuf_verify_user(db, DBVU_NOT_EVICTING); 2905288549Smav return (db->db_user); 2906168404Spjd} 2907168404Spjd 
2908288549Smavvoid 2909288549Smavdmu_buf_user_evict_wait() 2910288549Smav{ 2911288549Smav taskq_wait(dbu_evict_taskq); 2912288549Smav} 2913288549Smav 2914209962Smmboolean_t 2915209962Smmdmu_buf_freeable(dmu_buf_t *dbuf) 2916209962Smm{ 2917209962Smm boolean_t res = B_FALSE; 2918209962Smm dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2919209962Smm 2920209962Smm if (db->db_blkptr) 2921209962Smm res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2922219089Spjd db->db_blkptr, db->db_blkptr->blk_birth); 2923209962Smm 2924209962Smm return (res); 2925209962Smm} 2926209962Smm 2927243524Smmblkptr_t * 2928243524Smmdmu_buf_get_blkptr(dmu_buf_t *db) 2929243524Smm{ 2930243524Smm dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2931243524Smm return (dbi->db_blkptr); 2932243524Smm} 2933243524Smm 2934307287Smavobjset_t * 2935307287Smavdmu_buf_get_objset(dmu_buf_t *db) 2936307287Smav{ 2937307287Smav dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2938307287Smav return (dbi->db_objset); 2939307287Smav} 2940307287Smav 2941307292Smavdnode_t * 2942307292Smavdmu_buf_dnode_enter(dmu_buf_t *db) 2943307292Smav{ 2944307292Smav dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2945307292Smav DB_DNODE_ENTER(dbi); 2946307292Smav return (DB_DNODE(dbi)); 2947307292Smav} 2948307292Smav 2949307292Smavvoid 2950307292Smavdmu_buf_dnode_exit(dmu_buf_t *db) 2951307292Smav{ 2952307292Smav dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2953307292Smav DB_DNODE_EXIT(dbi); 2954307292Smav} 2955307292Smav 2956168404Spjdstatic void 2957168404Spjddbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2958168404Spjd{ 2959168404Spjd /* ASSERT(dmu_tx_is_syncing(tx) */ 2960168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2961168404Spjd 2962168404Spjd if (db->db_blkptr != NULL) 2963168404Spjd return; 2964168404Spjd 2965219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2966219089Spjd db->db_blkptr = &dn->dn_phys->dn_spill; 2967219089Spjd BP_ZERO(db->db_blkptr); 2968219089Spjd return; 2969219089Spjd } 2970168404Spjd if (db->db_level 
== dn->dn_phys->dn_nlevels-1) { 2971168404Spjd /* 2972168404Spjd * This buffer was allocated at a time when there was 2973168404Spjd * no available blkptrs from the dnode, or it was 2974168404Spjd * inappropriate to hook it in (i.e., nlevels mis-match). 2975168404Spjd */ 2976168404Spjd ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2977168404Spjd ASSERT(db->db_parent == NULL); 2978168404Spjd db->db_parent = dn->dn_dbuf; 2979168404Spjd db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2980168404Spjd DBUF_VERIFY(db); 2981168404Spjd } else { 2982168404Spjd dmu_buf_impl_t *parent = db->db_parent; 2983168404Spjd int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2984168404Spjd 2985168404Spjd ASSERT(dn->dn_phys->dn_nlevels > 1); 2986168404Spjd if (parent == NULL) { 2987168404Spjd mutex_exit(&db->db_mtx); 2988168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 2989288571Smav parent = dbuf_hold_level(dn, db->db_level + 1, 2990288571Smav db->db_blkid >> epbs, db); 2991168404Spjd rw_exit(&dn->dn_struct_rwlock); 2992168404Spjd mutex_enter(&db->db_mtx); 2993168404Spjd db->db_parent = parent; 2994168404Spjd } 2995168404Spjd db->db_blkptr = (blkptr_t *)parent->db.db_data + 2996168404Spjd (db->db_blkid & ((1ULL << epbs) - 1)); 2997168404Spjd DBUF_VERIFY(db); 2998168404Spjd } 2999168404Spjd} 3000168404Spjd 3001168404Spjdstatic void 3002168404Spjddbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 3003168404Spjd{ 3004168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 3005219089Spjd dnode_t *dn; 3006168404Spjd zio_t *zio; 3007168404Spjd 3008168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 3009168404Spjd 3010168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 3011168404Spjd 3012168404Spjd mutex_enter(&db->db_mtx); 3013168404Spjd 3014168404Spjd ASSERT(db->db_level > 0); 3015168404Spjd DBUF_VERIFY(db); 3016168404Spjd 3017251629Sdelphij /* Read the block if it hasn't been read yet. 
*/ 3018168404Spjd if (db->db_buf == NULL) { 3019168404Spjd mutex_exit(&db->db_mtx); 3020168404Spjd (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 3021168404Spjd mutex_enter(&db->db_mtx); 3022168404Spjd } 3023168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 3024168404Spjd ASSERT(db->db_buf != NULL); 3025168404Spjd 3026219089Spjd DB_DNODE_ENTER(db); 3027219089Spjd dn = DB_DNODE(db); 3028251629Sdelphij /* Indirect block size must match what the dnode thinks it is. */ 3029219089Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 3030168404Spjd dbuf_check_blkptr(dn, db); 3031219089Spjd DB_DNODE_EXIT(db); 3032168404Spjd 3033251629Sdelphij /* Provide the pending dirty record to child dbufs */ 3034168404Spjd db->db_data_pending = dr; 3035168404Spjd 3036168404Spjd mutex_exit(&db->db_mtx); 3037185029Spjd dbuf_write(dr, db->db_buf, tx); 3038168404Spjd 3039168404Spjd zio = dr->dr_zio; 3040168404Spjd mutex_enter(&dr->dt.di.dr_mtx); 3041285202Savg dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); 3042168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3043168404Spjd mutex_exit(&dr->dt.di.dr_mtx); 3044168404Spjd zio_nowait(zio); 3045168404Spjd} 3046168404Spjd 3047168404Spjdstatic void 3048168404Spjddbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 3049168404Spjd{ 3050168404Spjd arc_buf_t **datap = &dr->dt.dl.dr_data; 3051168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 3052219089Spjd dnode_t *dn; 3053219089Spjd objset_t *os; 3054168404Spjd uint64_t txg = tx->tx_txg; 3055168404Spjd 3056168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 3057168404Spjd 3058168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 3059168404Spjd 3060168404Spjd mutex_enter(&db->db_mtx); 3061168404Spjd /* 3062168404Spjd * To be synced, we must be dirtied. But we 3063168404Spjd * might have been freed after the dirty. 
3064168404Spjd */ 3065168404Spjd if (db->db_state == DB_UNCACHED) { 3066168404Spjd /* This buffer has been freed since it was dirtied */ 3067168404Spjd ASSERT(db->db.db_data == NULL); 3068168404Spjd } else if (db->db_state == DB_FILL) { 3069168404Spjd /* This buffer was freed and is now being re-filled */ 3070168404Spjd ASSERT(db->db.db_data != dr->dt.dl.dr_data); 3071168404Spjd } else { 3072219089Spjd ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 3073168404Spjd } 3074168404Spjd DBUF_VERIFY(db); 3075168404Spjd 3076219089Spjd DB_DNODE_ENTER(db); 3077219089Spjd dn = DB_DNODE(db); 3078219089Spjd 3079219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 3080219089Spjd mutex_enter(&dn->dn_mtx); 3081219089Spjd dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 3082219089Spjd mutex_exit(&dn->dn_mtx); 3083219089Spjd } 3084219089Spjd 3085168404Spjd /* 3086168404Spjd * If this is a bonus buffer, simply copy the bonus data into the 3087168404Spjd * dnode. It will be written out when the dnode is synced (and it 3088168404Spjd * will be synced, since it must have been dirty for dbuf_sync to 3089168404Spjd * be called). 
 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		/*
		 * Bonus buffers are never written via zio; their contents
		 * are simply copied into the dnode's on-disk phys.  Copy
		 * the data, unlink and free the dirty record, and drop the
		 * hold that was taken when the record was dirtied.
		 */
		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		/* Unlink this dirty record from the db_last_dirty chain. */
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		if (dr->dr_dbuf->db_level != 0) {
			list_destroy(&dr->dt.di.dr_children);
			mutex_destroy(&dr->dt.di.dr_mtx);
		}
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DMU_OT_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_alloc_buf(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write it seems plausible
		 * that the IO could actually be completed before the nowait
		 * returns. We need to DB_DNODE_EXIT() first in case
		 * zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}

/*
 * Sync all dirty records on "list" for this txg.  "level" is the
 * indirection level the caller expects every (non-bonus, non-spill)
 * dbuf on the list to be at.  A record whose dr_zio is already
 * initialized belongs to the meta-dnode pass and terminates the walk,
 * so the caller can zio_wait() those IOs after all child IOs have been
 * initiated.
 */
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

/*
 * zio "ready" callback for dbuf writes: the block pointer has been
 * allocated and filled in, but the data is not yet on disk.  Update
 * space accounting and the bp's fill count, then publish the new bp
 * into its in-core location (db_blkptr).
 */
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;
	ASSERT3P(db->db_blkptr, !=, NULL);
	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);

	/* Account for the space delta between the old and new bp. */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
		    BP_IS_EMBEDDED(bp));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(bp)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* Count the in-use dnodes stored in this block. */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		/* Indirect block: sum the fill counts of all children. */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += BP_GET_FILL(ibp);
		}
	}
	DB_DNODE_EXIT(db);

	if (!BP_IS_EMBEDDED(bp))
		bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);

	/*
	 * Copy the finished bp back into its in-core location (the parent
	 * indirect's buffer or the dnode phys), under dn_struct_rwlock.
	 *
	 * NOTE(review): dn is dereferenced here after DB_DNODE_EXIT()
	 * above -- confirm the dnode handle cannot be evicted while this
	 * zio is still in flight.
	 */
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	*db->db_blkptr = *bp;
	rw_exit(&dn->dn_struct_rwlock);
}

/* ARGSUSED */
/*
 * This function gets called just prior to running through the compression
 * stage of the zio pipeline. If we're an indirect block comprised of only
 * holes, then we want this indirect to be compressed away to a hole. In
 * order to do that we must zero out any information about the holes that
 * this indirect points to before we try to compress it.
 */
static void
dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp;
	uint64_t i;
	int epbs;

	ASSERT3U(db->db_level, >, 0);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* epbs: log2 of the number of block pointers per indirect block. */
	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

	/* Determine if all our children are holes */
	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
		if (!BP_IS_HOLE(bp))
			break;
	}

	/*
	 * If all the children are holes, then zero them all out so that
	 * we may get compressed away.
	 */
	if (i == 1 << epbs) {
		/* didn't find any non-holes */
		bzero(db->db.db_data, db->db.db_size);
	}
	DB_DNODE_EXIT(db);
}

/*
 * The SPA will call this callback several times for each zio - once
 * for every physical child i/o (zio->io_phys_children times). This
 * allows the DMU to monitor the progress of each logical i/o. For example,
 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
 * block. There may be a long delay before all copies/fragments are completed,
 * so this callback allows us to retire dirty space gradually, as the physical
 * i/os complete.
 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times. Retire one
	 * portion of our dirty space each time we are called. Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
	 */
	delta = dr->dr_accounted / zio->io_phys_children;
	dsl_pool_undirty_space(dp, delta, zio->io_txg);
}

/*
 * zio "done" callback for dbuf writes: the data is now on disk (or the
 * write was a nopwrite/rewrite of an identical bp).  Update dataset
 * block accounting, tear down the dirty record, and drop the hold that
 * was taken when the record was dirtied.
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Unlink the completed dirty record from the db_last_dirty chain. */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			/*
			 * If a private copy was made for this write (see
			 * dbuf_sync_leaf()), free it now.
			 */
			if (dr->dt.dl.dr_data != db->db_buf)
				arc_buf_destroy(dr->dt.dl.dr_data, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}

/* "ready" callback for DB_NOFILL writes (no ARC buffer involved). */
static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}

/* "done" callback for DB_NOFILL writes. */
static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}

/* "ready" callback for writes whose bp was provided by open context. */
static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}

/*
 * "done" callback for overridden writes.  If the final bp differs from
 * the overridden one, free the overridden block and release our data
 * buffer back to the ARC.
 */
static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}

/* Issue I/O to commit a dirty buffer to disk. */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	ASSERT(dmu_tx_is_syncing(tx));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		/* Chain our write under the parent's pending write zio. */
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * We copy the blkptr now (rather than when we instantiate the dirty
	 * record), because its value can change between open context and
	 * syncing context. We do not need to hold dn_struct_rwlock to read
	 * db_blkptr because we are in syncing context.
	 */
	dr->dr_bp_copy = *db->db_blkptr;

	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync() or dmu_buf_write_embedded()).
		 */
		void *contents = (data != NULL) ? data->b_data : NULL;

		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    &dr->dr_bp_copy, contents, db->db.db_size, &zp,
		    dbuf_write_override_ready, NULL, NULL,
		    dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, NULL,
		    dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));

		/*
		 * For indirect blocks, we want to setup the children
		 * ready callback so that we can properly handle an indirect
		 * block that only contains holes.
		 */
		arc_done_func_t *children_ready_cb = NULL;
		if (db->db_level != 0)
			children_ready_cb = dbuf_write_children_ready;

		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
		    &zp, dbuf_write_ready, children_ready_cb,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}