1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23219636Spjd * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24339114Smav * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 25251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27286575Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
28296519Smav * Copyright (c) 2014 Integros [integros.com] 29168404Spjd */ 30168404Spjd 31168404Spjd#include <sys/zfs_context.h> 32168404Spjd#include <sys/dmu.h> 33253821Sdelphij#include <sys/dmu_send.h> 34168404Spjd#include <sys/dmu_impl.h> 35168404Spjd#include <sys/dbuf.h> 36168404Spjd#include <sys/dmu_objset.h> 37168404Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/dsl_dir.h> 39168404Spjd#include <sys/dmu_tx.h> 40168404Spjd#include <sys/spa.h> 41168404Spjd#include <sys/zio.h> 42168404Spjd#include <sys/dmu_zfetch.h> 43219089Spjd#include <sys/sa.h> 44219089Spjd#include <sys/sa_impl.h> 45268075Sdelphij#include <sys/zfeature.h> 46268075Sdelphij#include <sys/blkptr.h> 47264669Sdelphij#include <sys/range_tree.h> 48307265Smav#include <sys/callb.h> 49321610Smav#include <sys/abd.h> 50332525Smav#include <sys/vdev.h> 51332540Smav#include <sys/cityhash.h> 52339109Smav#include <sys/spa_impl.h> 53168404Spjd 54248571Smmstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 55185029Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 56168404Spjd 57286575Smav#ifndef __lint 58286575Smavextern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, 59321527Smav dmu_buf_evict_func_t *evict_func_sync, 60321527Smav dmu_buf_evict_func_t *evict_func_async, 61321527Smav dmu_buf_t **clear_on_evict_dbufp); 62286575Smav#endif /* ! __lint */ 63286575Smav 64168404Spjd/* 65168404Spjd * Global data structures and functions for the dbuf cache. 66168404Spjd */ 67307265Smavstatic kmem_cache_t *dbuf_kmem_cache; 68286575Smavstatic taskq_t *dbu_evict_taskq; 69168404Spjd 70307265Smavstatic kthread_t *dbuf_cache_evict_thread; 71307265Smavstatic kmutex_t dbuf_evict_lock; 72307265Smavstatic kcondvar_t dbuf_evict_cv; 73307265Smavstatic boolean_t dbuf_evict_thread_exit; 74307265Smav 75307265Smav/* 76339109Smav * There are two dbuf caches; each dbuf can only be in one of them at a time. 77339109Smav * 78339109Smav * 1. 
Cache of metadata dbufs, to help make read-heavy administrative commands 79339109Smav * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs 80339109Smav * that represent the metadata that describes filesystems/snapshots/ 81339109Smav * bookmarks/properties/etc. We only evict from this cache when we export a 82339109Smav * pool, to short-circuit as much I/O as possible for all administrative 83339109Smav * commands that need the metadata. There is no eviction policy for this 84339109Smav * cache, because we try to only include types in it which would occupy a 85339109Smav * very small amount of space per object but create a large impact on the 86339109Smav * performance of these commands. Instead, after it reaches a maximum size 87339109Smav * (which should only happen on very small memory systems with a very large 88339109Smav * number of filesystem objects), we stop taking new dbufs into the 89339109Smav * metadata cache, instead putting them in the normal dbuf cache. 90339109Smav * 91339109Smav * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that 92339109Smav * are not currently held but have been recently released. These dbufs 93339109Smav * are not eligible for arc eviction until they are aged out of the cache. 94339109Smav * Dbufs that are aged out of the cache will be immediately destroyed and 95339109Smav * become eligible for arc eviction. 96339109Smav * 97339109Smav * Dbufs are added to these caches once the last hold is released. If a dbuf is 98339109Smav * later accessed and still exists in the dbuf cache, then it will be removed 99339109Smav * from the cache and later re-added to the head of the cache. 100339109Smav * 101339109Smav * If a given dbuf meets the requirements for the metadata cache, it will go 102339109Smav * there, otherwise it will be considered for the generic LRU dbuf cache. 
The 103339109Smav * caches and the refcounts tracking their sizes are stored in an array indexed 104339109Smav * by those caches' matching enum values (from dbuf_cached_state_t). 105307265Smav */ 106339109Smavtypedef struct dbuf_cache { 107339109Smav multilist_t *cache; 108339109Smav refcount_t size; 109339109Smav} dbuf_cache_t; 110339109Smavdbuf_cache_t dbuf_caches[DB_CACHE_MAX]; 111339109Smav 112339109Smav/* Size limits for the caches */ 113332552Smavuint64_t dbuf_cache_max_bytes = 0; 114339109Smavuint64_t dbuf_metadata_cache_max_bytes = 0; 115339109Smav/* Set the default sizes of the caches to log2 fraction of arc size */ 116332552Smavint dbuf_cache_shift = 5; 117339109Smavint dbuf_metadata_cache_shift = 6; 118307265Smav 119307265Smav/* 120339109Smav * For diagnostic purposes, this is incremented whenever we can't add 121339109Smav * something to the metadata cache because it's full, and instead put 122339109Smav * the data in the regular dbuf cache. 123339109Smav */ 124339109Smavuint64_t dbuf_metadata_cache_overflow; 125339109Smav 126339109Smav/* 127339109Smav * The LRU dbuf cache uses a three-stage eviction policy: 128307265Smav * - A low water marker designates when the dbuf eviction thread 129307265Smav * should stop evicting from the dbuf cache. 130307265Smav * - When we reach the maximum size (aka mid water mark), we 131307265Smav * signal the eviction thread to run. 132307265Smav * - The high water mark indicates when the eviction thread 133307265Smav * is unable to keep up with the incoming load and eviction must 134307265Smav * happen in the context of the calling thread. 
135307265Smav * 136307265Smav * The dbuf cache: 137307265Smav * (max size) 138307265Smav * low water mid water hi water 139307265Smav * +----------------------------------------+----------+----------+ 140307265Smav * | | | | 141307265Smav * | | | | 142307265Smav * | | | | 143307265Smav * | | | | 144307265Smav * +----------------------------------------+----------+----------+ 145307265Smav * stop signal evict 146307265Smav * evicting eviction directly 147307265Smav * thread 148307265Smav * 149307265Smav * The high and low water marks indicate the operating range for the eviction 150307265Smav * thread. The low water mark is, by default, 90% of the total size of the 151307265Smav * cache and the high water mark is at 110% (both of these percentages can be 152307265Smav * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, 153307265Smav * respectively). The eviction thread will try to ensure that the cache remains 154307265Smav * within this range by waking up every second and checking if the cache is 155307265Smav * above the low water mark. The thread can also be woken up by callers adding 156307265Smav * elements into the cache if the cache is larger than the mid water (i.e max 157307265Smav * cache size). Once the eviction thread is woken up and eviction is required, 158307265Smav * it will continue evicting buffers until it's able to reduce the cache size 159307265Smav * to the low water mark. If the cache size continues to grow and hits the high 160307265Smav * water mark, then callers adding elments to the cache will begin to evict 161307265Smav * directly from the cache until the cache is no longer above the high water 162307265Smav * mark. 163307265Smav */ 164307265Smav 165307265Smav/* 166307265Smav * The percentage above and below the maximum cache size. 
167307265Smav */ 168307265Smavuint_t dbuf_cache_hiwater_pct = 10; 169307265Smavuint_t dbuf_cache_lowater_pct = 10; 170307265Smav 171330824SmavSYSCTL_DECL(_vfs_zfs); 172330824SmavSYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN, 173330824Smav &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes"); 174339324SmavSYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN, 175339324Smav &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes"); 176332552SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN, 177332552Smav &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC"); 178339324SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN, 179339324Smav &dbuf_metadata_cache_shift, 0, 180339324Smav "dbuf metadata cache size as log2 fraction of ARC"); 181339324SmavSYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD, 182339324Smav &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow"); 183330824SmavSYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN, 184330824Smav &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size"); 185330824SmavSYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN, 186330824Smav &dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size"); 187330824Smav 188168404Spjd/* ARGSUSED */ 189168404Spjdstatic int 190168404Spjddbuf_cons(void *vdb, void *unused, int kmflag) 191168404Spjd{ 192168404Spjd dmu_buf_impl_t *db = vdb; 193168404Spjd bzero(db, sizeof (dmu_buf_impl_t)); 194168404Spjd 195168404Spjd mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 196168404Spjd cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 197307265Smav multilist_link_init(&db->db_cache_link); 198168404Spjd refcount_create(&db->db_holds); 199269229Sdelphij 200168404Spjd return (0); 201168404Spjd} 202168404Spjd 203168404Spjd/* ARGSUSED */ 204168404Spjdstatic void 205168404Spjddbuf_dest(void *vdb, void 
*unused) 206168404Spjd{ 207168404Spjd dmu_buf_impl_t *db = vdb; 208168404Spjd mutex_destroy(&db->db_mtx); 209168404Spjd cv_destroy(&db->db_changed); 210307265Smav ASSERT(!multilist_link_active(&db->db_cache_link)); 211168404Spjd refcount_destroy(&db->db_holds); 212168404Spjd} 213168404Spjd 214168404Spjd/* 215168404Spjd * dbuf hash table routines 216168404Spjd */ 217168404Spjdstatic dbuf_hash_table_t dbuf_hash_table; 218168404Spjd 219168404Spjdstatic uint64_t dbuf_hash_count; 220168404Spjd 221332540Smav/* 222332540Smav * We use Cityhash for this. It's fast, and has good hash properties without 223332540Smav * requiring any large static buffers. 224332540Smav */ 225168404Spjdstatic uint64_t 226168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 227168404Spjd{ 228332540Smav return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); 229168404Spjd} 230168404Spjd 231168404Spjd#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 232168404Spjd ((dbuf)->db.db_object == (obj) && \ 233168404Spjd (dbuf)->db_objset == (os) && \ 234168404Spjd (dbuf)->db_level == (level) && \ 235168404Spjd (dbuf)->db_blkid == (blkid)) 236168404Spjd 237168404Spjddmu_buf_impl_t * 238286541Smavdbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) 239168404Spjd{ 240168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 241307265Smav uint64_t hv = dbuf_hash(os, obj, level, blkid); 242168404Spjd uint64_t idx = hv & h->hash_table_mask; 243168404Spjd dmu_buf_impl_t *db; 244168404Spjd 245168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 246168404Spjd for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 247168404Spjd if (DBUF_EQUAL(db, os, obj, level, blkid)) { 248168404Spjd mutex_enter(&db->db_mtx); 249168404Spjd if (db->db_state != DB_EVICTING) { 250168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 251168404Spjd return (db); 252168404Spjd } 253168404Spjd mutex_exit(&db->db_mtx); 254168404Spjd } 255168404Spjd } 256168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 
257168404Spjd return (NULL); 258168404Spjd} 259168404Spjd 260286541Smavstatic dmu_buf_impl_t * 261286541Smavdbuf_find_bonus(objset_t *os, uint64_t object) 262286541Smav{ 263286541Smav dnode_t *dn; 264286541Smav dmu_buf_impl_t *db = NULL; 265286541Smav 266286541Smav if (dnode_hold(os, object, FTAG, &dn) == 0) { 267286541Smav rw_enter(&dn->dn_struct_rwlock, RW_READER); 268286541Smav if (dn->dn_bonus != NULL) { 269286541Smav db = dn->dn_bonus; 270286541Smav mutex_enter(&db->db_mtx); 271286541Smav } 272286541Smav rw_exit(&dn->dn_struct_rwlock); 273286541Smav dnode_rele(dn, FTAG); 274286541Smav } 275286541Smav return (db); 276286541Smav} 277286541Smav 278168404Spjd/* 279168404Spjd * Insert an entry into the hash table. If there is already an element 280168404Spjd * equal to elem in the hash table, then the already existing element 281168404Spjd * will be returned and the new element will not be inserted. 282168404Spjd * Otherwise returns NULL. 283168404Spjd */ 284168404Spjdstatic dmu_buf_impl_t * 285168404Spjddbuf_hash_insert(dmu_buf_impl_t *db) 286168404Spjd{ 287168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 288219089Spjd objset_t *os = db->db_objset; 289168404Spjd uint64_t obj = db->db.db_object; 290168404Spjd int level = db->db_level; 291168404Spjd uint64_t blkid = db->db_blkid; 292307265Smav uint64_t hv = dbuf_hash(os, obj, level, blkid); 293168404Spjd uint64_t idx = hv & h->hash_table_mask; 294168404Spjd dmu_buf_impl_t *dbf; 295168404Spjd 296168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 297168404Spjd for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 298168404Spjd if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 299168404Spjd mutex_enter(&dbf->db_mtx); 300168404Spjd if (dbf->db_state != DB_EVICTING) { 301168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 302168404Spjd return (dbf); 303168404Spjd } 304168404Spjd mutex_exit(&dbf->db_mtx); 305168404Spjd } 306168404Spjd } 307168404Spjd 308168404Spjd mutex_enter(&db->db_mtx); 309168404Spjd 
db->db_hash_next = h->hash_table[idx]; 310168404Spjd h->hash_table[idx] = db; 311168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 312270247Sdelphij atomic_inc_64(&dbuf_hash_count); 313168404Spjd 314168404Spjd return (NULL); 315168404Spjd} 316168404Spjd 317168404Spjd/* 318268858Sdelphij * Remove an entry from the hash table. It must be in the EVICTING state. 319168404Spjd */ 320168404Spjdstatic void 321168404Spjddbuf_hash_remove(dmu_buf_impl_t *db) 322168404Spjd{ 323168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 324307265Smav uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, 325168404Spjd db->db_level, db->db_blkid); 326168404Spjd uint64_t idx = hv & h->hash_table_mask; 327168404Spjd dmu_buf_impl_t *dbf, **dbp; 328168404Spjd 329168404Spjd /* 330268858Sdelphij * We musn't hold db_mtx to maintain lock ordering: 331168404Spjd * DBUF_HASH_MUTEX > db_mtx. 332168404Spjd */ 333168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 334168404Spjd ASSERT(db->db_state == DB_EVICTING); 335168404Spjd ASSERT(!MUTEX_HELD(&db->db_mtx)); 336168404Spjd 337168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 338168404Spjd dbp = &h->hash_table[idx]; 339168404Spjd while ((dbf = *dbp) != db) { 340168404Spjd dbp = &dbf->db_hash_next; 341168404Spjd ASSERT(dbf != NULL); 342168404Spjd } 343168404Spjd *dbp = db->db_hash_next; 344168404Spjd db->db_hash_next = NULL; 345168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 346270247Sdelphij atomic_dec_64(&dbuf_hash_count); 347168404Spjd} 348168404Spjd 349286575Smavtypedef enum { 350286575Smav DBVU_EVICTING, 351286575Smav DBVU_NOT_EVICTING 352286575Smav} dbvu_verify_type_t; 353286575Smav 354168404Spjdstatic void 355286575Smavdbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) 356286575Smav{ 357286575Smav#ifdef ZFS_DEBUG 358286575Smav int64_t holds; 359286575Smav 360286575Smav if (db->db_user == NULL) 361286575Smav return; 362286575Smav 363286575Smav /* Only data blocks support the attachment of user data. 
*/ 364286575Smav ASSERT(db->db_level == 0); 365286575Smav 366286575Smav /* Clients must resolve a dbuf before attaching user data. */ 367286575Smav ASSERT(db->db.db_data != NULL); 368286575Smav ASSERT3U(db->db_state, ==, DB_CACHED); 369286575Smav 370286575Smav holds = refcount_count(&db->db_holds); 371286575Smav if (verify_type == DBVU_EVICTING) { 372286575Smav /* 373286575Smav * Immediate eviction occurs when holds == dirtycnt. 374286575Smav * For normal eviction buffers, holds is zero on 375286575Smav * eviction, except when dbuf_fix_old_data() calls 376286575Smav * dbuf_clear_data(). However, the hold count can grow 377286575Smav * during eviction even though db_mtx is held (see 378286575Smav * dmu_bonus_hold() for an example), so we can only 379286575Smav * test the generic invariant that holds >= dirtycnt. 380286575Smav */ 381286575Smav ASSERT3U(holds, >=, db->db_dirtycnt); 382286575Smav } else { 383289309Smav if (db->db_user_immediate_evict == TRUE) 384286575Smav ASSERT3U(holds, >=, db->db_dirtycnt); 385286575Smav else 386286575Smav ASSERT3U(holds, >, 0); 387286575Smav } 388286575Smav#endif 389286575Smav} 390286575Smav 391286575Smavstatic void 392168404Spjddbuf_evict_user(dmu_buf_impl_t *db) 393168404Spjd{ 394286575Smav dmu_buf_user_t *dbu = db->db_user; 395286575Smav 396168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 397168404Spjd 398286575Smav if (dbu == NULL) 399168404Spjd return; 400168404Spjd 401286575Smav dbuf_verify_user(db, DBVU_EVICTING); 402286575Smav db->db_user = NULL; 403286575Smav 404286575Smav#ifdef ZFS_DEBUG 405286575Smav if (dbu->dbu_clear_on_evict_dbufp != NULL) 406286575Smav *dbu->dbu_clear_on_evict_dbufp = NULL; 407286575Smav#endif 408286575Smav 409286575Smav /* 410321527Smav * There are two eviction callbacks - one that we call synchronously 411321527Smav * and one that we invoke via a taskq. The async one is useful for 412321527Smav * avoiding lock order reversals and limiting stack depth. 
413321527Smav * 414321527Smav * Note that if we have a sync callback but no async callback, 415321527Smav * it's likely that the sync callback will free the structure 416321527Smav * containing the dbu. In that case we need to take care to not 417321527Smav * dereference dbu after calling the sync evict func. 418286575Smav */ 419321527Smav boolean_t has_async = (dbu->dbu_evict_func_async != NULL); 420321527Smav 421321527Smav if (dbu->dbu_evict_func_sync != NULL) 422321527Smav dbu->dbu_evict_func_sync(dbu); 423321527Smav 424321527Smav if (has_async) { 425321527Smav taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, 426321527Smav dbu, 0, &dbu->dbu_tqent); 427321527Smav } 428168404Spjd} 429168404Spjd 430219089Spjdboolean_t 431219089Spjddbuf_is_metadata(dmu_buf_impl_t *db) 432219089Spjd{ 433219089Spjd if (db->db_level > 0) { 434219089Spjd return (B_TRUE); 435219089Spjd } else { 436219089Spjd boolean_t is_metadata; 437219089Spjd 438219089Spjd DB_DNODE_ENTER(db); 439236884Smm is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 440219089Spjd DB_DNODE_EXIT(db); 441219089Spjd 442219089Spjd return (is_metadata); 443219089Spjd } 444219089Spjd} 445219089Spjd 446307265Smav/* 447339109Smav * This returns whether this dbuf should be stored in the metadata cache, which 448339109Smav * is based on whether it's from one of the dnode types that store data related 449339109Smav * to traversing dataset hierarchies. 
450339109Smav */ 451339109Smavstatic boolean_t 452339109Smavdbuf_include_in_metadata_cache(dmu_buf_impl_t *db) 453339109Smav{ 454339109Smav DB_DNODE_ENTER(db); 455339109Smav dmu_object_type_t type = DB_DNODE(db)->dn_type; 456339109Smav DB_DNODE_EXIT(db); 457339109Smav 458339109Smav /* Check if this dbuf is one of the types we care about */ 459339109Smav if (DMU_OT_IS_METADATA_CACHED(type)) { 460339109Smav /* If we hit this, then we set something up wrong in dmu_ot */ 461339109Smav ASSERT(DMU_OT_IS_METADATA(type)); 462339109Smav 463339109Smav /* 464339109Smav * Sanity check for small-memory systems: don't allocate too 465339109Smav * much memory for this purpose. 466339109Smav */ 467339109Smav if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) > 468339109Smav dbuf_metadata_cache_max_bytes) { 469339109Smav dbuf_metadata_cache_overflow++; 470339109Smav DTRACE_PROBE1(dbuf__metadata__cache__overflow, 471339109Smav dmu_buf_impl_t *, db); 472339109Smav return (B_FALSE); 473339109Smav } 474339109Smav 475339109Smav return (B_TRUE); 476339109Smav } 477339109Smav 478339109Smav return (B_FALSE); 479339109Smav} 480339109Smav 481339109Smav/* 482307265Smav * This function *must* return indices evenly distributed between all 483307265Smav * sublists of the multilist. This is needed due to how the dbuf eviction 484307265Smav * code is laid out; dbuf_evict_thread() assumes dbufs are evenly 485307265Smav * distributed between all sublists and uses this assumption when 486307265Smav * deciding which sublist to evict from and how much to evict from it. 487307265Smav */ 488307265Smavunsigned int 489307265Smavdbuf_cache_multilist_index_func(multilist_t *ml, void *obj) 490168404Spjd{ 491307265Smav dmu_buf_impl_t *db = obj; 492168404Spjd 493307265Smav /* 494307265Smav * The assumption here, is the hash value for a given 495307265Smav * dmu_buf_impl_t will remain constant throughout it's lifetime 496307265Smav * (i.e. it's objset, object, level and blkid fields don't change). 
497307265Smav * Thus, we don't need to store the dbuf's sublist index 498307265Smav * on insertion, as this index can be recalculated on removal. 499307265Smav * 500307265Smav * Also, the low order bits of the hash value are thought to be 501307265Smav * distributed evenly. Otherwise, in the case that the multilist 502307265Smav * has a power of two number of sublists, each sublists' usage 503307265Smav * would not be evenly distributed. 504307265Smav */ 505307265Smav return (dbuf_hash(db->db_objset, db->db.db_object, 506307265Smav db->db_level, db->db_blkid) % 507307265Smav multilist_get_num_sublists(ml)); 508168404Spjd} 509168404Spjd 510307265Smavstatic inline boolean_t 511307265Smavdbuf_cache_above_hiwater(void) 512307265Smav{ 513307265Smav uint64_t dbuf_cache_hiwater_bytes = 514307265Smav (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; 515307265Smav 516339109Smav return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > 517307265Smav dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); 518307265Smav} 519307265Smav 520307265Smavstatic inline boolean_t 521307265Smavdbuf_cache_above_lowater(void) 522307265Smav{ 523307265Smav uint64_t dbuf_cache_lowater_bytes = 524307265Smav (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; 525307265Smav 526339109Smav return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > 527307265Smav dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); 528307265Smav} 529307265Smav 530307265Smav/* 531307265Smav * Evict the oldest eligible dbuf from the dbuf cache. 
532307265Smav */ 533307265Smavstatic void 534307265Smavdbuf_evict_one(void) 535307265Smav{ 536339109Smav int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); 537339109Smav multilist_sublist_t *mls = multilist_sublist_lock( 538339109Smav dbuf_caches[DB_DBUF_CACHE].cache, idx); 539307265Smav 540307265Smav ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); 541307265Smav 542307265Smav dmu_buf_impl_t *db = multilist_sublist_tail(mls); 543307265Smav while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { 544307265Smav db = multilist_sublist_prev(mls, db); 545307265Smav } 546307265Smav 547307265Smav DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, 548307265Smav multilist_sublist_t *, mls); 549307265Smav 550307265Smav if (db != NULL) { 551307265Smav multilist_sublist_remove(mls, db); 552307265Smav multilist_sublist_unlock(mls); 553339109Smav (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size, 554307265Smav db->db.db_size, db); 555339109Smav ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); 556339109Smav db->db_caching_status = DB_NO_CACHE; 557307265Smav dbuf_destroy(db); 558307265Smav } else { 559307265Smav multilist_sublist_unlock(mls); 560307265Smav } 561307265Smav} 562307265Smav 563307265Smav/* 564307265Smav * The dbuf evict thread is responsible for aging out dbufs from the 565307265Smav * cache. Once the cache has reached it's maximum size, dbufs are removed 566307265Smav * and destroyed. The eviction thread will continue running until the size 567307265Smav * of the dbuf cache is at or below the maximum size. Once the dbuf is aged 568307265Smav * out of the cache it is destroyed and becomes eligible for arc eviction. 
569307265Smav */ 570331399Smav/* ARGSUSED */ 571307265Smavstatic void 572331399Smavdbuf_evict_thread(void *unused __unused) 573307265Smav{ 574307265Smav callb_cpr_t cpr; 575307265Smav 576307265Smav CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); 577307265Smav 578307265Smav mutex_enter(&dbuf_evict_lock); 579307265Smav while (!dbuf_evict_thread_exit) { 580307265Smav while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { 581307265Smav CALLB_CPR_SAFE_BEGIN(&cpr); 582307265Smav (void) cv_timedwait_hires(&dbuf_evict_cv, 583307265Smav &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 584307265Smav CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); 585307265Smav } 586307265Smav mutex_exit(&dbuf_evict_lock); 587307265Smav 588307265Smav /* 589307265Smav * Keep evicting as long as we're above the low water mark 590307265Smav * for the cache. We do this without holding the locks to 591307265Smav * minimize lock contention. 592307265Smav */ 593307265Smav while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { 594307265Smav dbuf_evict_one(); 595307265Smav } 596307265Smav 597307265Smav mutex_enter(&dbuf_evict_lock); 598307265Smav } 599307265Smav 600307265Smav dbuf_evict_thread_exit = B_FALSE; 601307265Smav cv_broadcast(&dbuf_evict_cv); 602307265Smav CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ 603307265Smav thread_exit(); 604307265Smav} 605307265Smav 606307265Smav/* 607307265Smav * Wake up the dbuf eviction thread if the dbuf cache is at its max size. 608307265Smav * If the dbuf cache is at its high water mark, then evict a dbuf from the 609307265Smav * dbuf cache using the callers context. 610307265Smav */ 611307265Smavstatic void 612307265Smavdbuf_evict_notify(void) 613307265Smav{ 614307265Smav /* 615321575Smav * We check if we should evict without holding the dbuf_evict_lock, 616321575Smav * because it's OK to occasionally make the wrong decision here, 617321575Smav * and grabbing the lock results in massive lock contention. 
618321575Smav */ 619339109Smav if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > 620339109Smav dbuf_cache_max_bytes) { 621321575Smav if (dbuf_cache_above_hiwater()) 622307265Smav dbuf_evict_one(); 623321575Smav cv_signal(&dbuf_evict_cv); 624307265Smav } 625307265Smav} 626307265Smav 627168404Spjdvoid 628168404Spjddbuf_init(void) 629168404Spjd{ 630168404Spjd uint64_t hsize = 1ULL << 16; 631168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 632168404Spjd int i; 633168404Spjd 634168404Spjd /* 635168404Spjd * The hash table is big enough to fill all of physical memory 636168404Spjd * with an average 4K block size. The table will take up 637168404Spjd * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 638168404Spjd */ 639168696Spjd while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 640168404Spjd hsize <<= 1; 641168404Spjd 642168404Spjdretry: 643168404Spjd h->hash_table_mask = hsize - 1; 644168404Spjd h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 645168404Spjd if (h->hash_table == NULL) { 646168404Spjd /* XXX - we should really return an error instead of assert */ 647168404Spjd ASSERT(hsize > (1ULL << 10)); 648168404Spjd hsize >>= 1; 649168404Spjd goto retry; 650168404Spjd } 651168404Spjd 652307265Smav dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", 653168404Spjd sizeof (dmu_buf_impl_t), 654168404Spjd 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 655168404Spjd 656168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 657168404Spjd mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 658286575Smav 659286575Smav /* 660339109Smav * Setup the parameters for the dbuf caches. We set the sizes of the 661339109Smav * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) 662339109Smav * of the size of the ARC, respectively. If the values are set in 663339109Smav * /etc/system and they're not greater than the size of the ARC, then 664339109Smav * we honor that value. 
665307265Smav */ 666332552Smav if (dbuf_cache_max_bytes == 0 || 667332552Smav dbuf_cache_max_bytes >= arc_max_bytes()) { 668332552Smav dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; 669332552Smav } 670339109Smav if (dbuf_metadata_cache_max_bytes == 0 || 671339109Smav dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { 672339109Smav dbuf_metadata_cache_max_bytes = 673339109Smav arc_max_bytes() >> dbuf_metadata_cache_shift; 674339109Smav } 675307265Smav 676307265Smav /* 677286575Smav * All entries are queued via taskq_dispatch_ent(), so min/maxalloc 678286575Smav * configuration is not required. 679286575Smav */ 680286575Smav dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); 681307265Smav 682339109Smav for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { 683339109Smav dbuf_caches[dcs].cache = 684339109Smav multilist_create(sizeof (dmu_buf_impl_t), 685339109Smav offsetof(dmu_buf_impl_t, db_cache_link), 686339109Smav dbuf_cache_multilist_index_func); 687339109Smav refcount_create(&dbuf_caches[dcs].size); 688339109Smav } 689307265Smav 690307265Smav dbuf_evict_thread_exit = B_FALSE; 691307265Smav mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); 692307265Smav cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); 693307265Smav dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, 694307265Smav NULL, 0, &p0, TS_RUN, minclsyspri); 695168404Spjd} 696168404Spjd 697168404Spjdvoid 698168404Spjddbuf_fini(void) 699168404Spjd{ 700168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 701168404Spjd int i; 702168404Spjd 703168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 704168404Spjd mutex_destroy(&h->hash_mutexes[i]); 705168404Spjd kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 706307265Smav kmem_cache_destroy(dbuf_kmem_cache); 707286575Smav taskq_destroy(dbu_evict_taskq); 708307265Smav 709307265Smav mutex_enter(&dbuf_evict_lock); 710307265Smav dbuf_evict_thread_exit = B_TRUE; 711307265Smav while 
(dbuf_evict_thread_exit) { 712307265Smav cv_signal(&dbuf_evict_cv); 713307265Smav cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); 714307265Smav } 715307265Smav mutex_exit(&dbuf_evict_lock); 716307265Smav 717307265Smav mutex_destroy(&dbuf_evict_lock); 718307265Smav cv_destroy(&dbuf_evict_cv); 719307265Smav 720339109Smav for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { 721339109Smav refcount_destroy(&dbuf_caches[dcs].size); 722339109Smav multilist_destroy(dbuf_caches[dcs].cache); 723339109Smav } 724168404Spjd} 725168404Spjd 726168404Spjd/* 727168404Spjd * Other stuff. 728168404Spjd */ 729168404Spjd 730168404Spjd#ifdef ZFS_DEBUG 731168404Spjdstatic void 732168404Spjddbuf_verify(dmu_buf_impl_t *db) 733168404Spjd{ 734219089Spjd dnode_t *dn; 735219089Spjd dbuf_dirty_record_t *dr; 736168404Spjd 737168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 738168404Spjd 739168404Spjd if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 740168404Spjd return; 741168404Spjd 742168404Spjd ASSERT(db->db_objset != NULL); 743219089Spjd DB_DNODE_ENTER(db); 744219089Spjd dn = DB_DNODE(db); 745168404Spjd if (dn == NULL) { 746168404Spjd ASSERT(db->db_parent == NULL); 747168404Spjd ASSERT(db->db_blkptr == NULL); 748168404Spjd } else { 749168404Spjd ASSERT3U(db->db.db_object, ==, dn->dn_object); 750168404Spjd ASSERT3P(db->db_objset, ==, dn->dn_objset); 751168404Spjd ASSERT3U(db->db_level, <, dn->dn_nlevels); 752219089Spjd ASSERT(db->db_blkid == DMU_BONUS_BLKID || 753219089Spjd db->db_blkid == DMU_SPILL_BLKID || 754269229Sdelphij !avl_is_empty(&dn->dn_dbufs)); 755168404Spjd } 756219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 757168404Spjd ASSERT(dn != NULL); 758185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 759219089Spjd ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 760219089Spjd } else if (db->db_blkid == DMU_SPILL_BLKID) { 761219089Spjd ASSERT(dn != NULL); 762219089Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 763240415Smm ASSERT0(db->db.db_offset); 764168404Spjd } else { 
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 *
		 * There is an exception to this rule for indirect blocks; in
		 * this case, if the indirect block is a hole, we fill in a few
		 * fields on each of the child blocks (importantly, birth time)
		 * to prevent hole birth times from being lost when you
		 * partially fill in a hole.
		 */
		if (db->db_dirtycnt == 0) {
			if (db->db_level == 0) {
				uint64_t *buf = db->db.db_data;
				int i;

				for (i = 0; i < db->db.db_size >> 3; i++) {
					ASSERT(buf[i] == 0);
				}
			} else {
				blkptr_t *bps = db->db.db_data;
				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
				    db->db.db_size);
				/*
				 * We want to verify that all the blkptrs in the
				 * indirect block are holes, but we may have
				 * automatically set up a few fields for them.
				 * We iterate through each blkptr and verify
				 * they only have those fields set.
				 */
				for (int i = 0;
				    i < db->db.db_size / sizeof (blkptr_t);
				    i++) {
					blkptr_t *bp = &bps[i];
					ASSERT(ZIO_CHECKSUM_IS_ZERO(
					    &bp->blk_cksum));
					ASSERT(
					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[2]));
					ASSERT0(bp->blk_fill);
					ASSERT0(bp->blk_pad[0]);
					ASSERT0(bp->blk_pad[1]);
					ASSERT(!BP_IS_EMBEDDED(bp));
					ASSERT(BP_IS_HOLE(bp));
					ASSERT0(bp->blk_phys_birth);
				}
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

/*
 * Detach the dbuf from its data: run the user eviction callback and clear
 * db.db_data.  The ARC buffer itself must already have been released
 * (db_buf == NULL).  Caller must hold db->db_mtx.
 */
static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	ASSERT3P(db->db_buf, ==, NULL);
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL)
		db->db_state = DB_UNCACHED;
}

/*
 * Point the dbuf at a (non-NULL) ARC buffer.  Caller must hold db->db_mtx.
 */
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		/* Buffer is shared or already released: loan a fresh copy. */
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, B_FALSE, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		/* Sole holder: loan the dbuf's own buffer and detach it. */
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		db->db_buf = NULL;
		dbuf_clear_data(db);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

/*
 * Calculate which level n block references the data at the level 0 offset
 * provided.
 */
uint64_t
dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
		/*
		 * The level n blkid is equal to the level 0 blkid divided by
		 * the number of level 0s in a level n block.
		 *
		 * The level 0 blkid is offset >> datablkshift =
		 * offset / 2^datablkshift.
		 *
		 * The number of level 0s in a level n is the number of block
		 * pointers in an indirect block, raised to the power of level.
		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
		 *
		 * Thus, the level n blkid is: offset /
		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
		 * = offset / 2^(datablkshift + level *
		 *   (indblkshift - SPA_BLKPTRSHIFT))
		 * = offset >> (datablkshift + level *
		 *   (indblkshift - SPA_BLKPTRSHIFT))
		 */
		return (offset >> (dn->dn_datablkshift + level *
		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

/*
 * ARC completion callback for dbuf reads issued by dbuf_read_impl().
 * Transitions the dbuf from DB_READ to DB_CACHED (or back to DB_UNCACHED
 * on i/o error) and wakes any waiters, then drops the read hold.
 */
static void
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
    arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (buf == NULL) {
		/* i/o error */
		ASSERT(zio == NULL || zio->io_error != 0);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		db->db_state = DB_UNCACHED;
	} else if (db->db_level == 0 && db->db_freed_in_flight) {
		/* freed in flight */
		ASSERT(zio == NULL || zio->io_error == 0);
		/*
		 * NOTE(review): buf cannot be NULL in this branch (the first
		 * branch handled that case), so this allocation looks
		 * unreachable — presumably a merge artifact; confirm upstream.
		 */
		if (buf == NULL) {
			buf = arc_alloc_buf(db->db_objset->os_spa,
			    db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
		}
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db,
		    buf);
		db->db_state = DB_CACHED;
	} else {
		/* success */
		ASSERT(zio == NULL || zio->io_error == 0);
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL, B_FALSE);
}

/*
 * Issue (or satisfy inline) the read for an uncached dbuf.  Bonus buffers
 * are copied from the dnode; holes and freed blocks are materialized as
 * zero-filled buffers; everything else goes through arc_read() with
 * dbuf_read_done() as the completion callback.  Called with db->db_mtx
 * held and the dnode's struct_rwlock held; drops db_mtx before returning.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	arc_flags_t aflags = ARC_FLAG_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
		    db->db.db_size));
		bzero(db->db.db_data, db->db.db_size);

		/*
		 * For an indirect hole with a birth time, pre-populate each
		 * child blkptr's size/type/level/birth so hole birth times
		 * are not lost (see the matching checks in dbuf_verify()).
		 */
		if (db->db_blkptr != NULL && db->db_level > 0 &&
		    BP_IS_HOLE(db->db_blkptr) &&
		    db->db_blkptr->blk_birth != 0) {
			blkptr_t *bps = db->db.db_data;
			for (int i = 0; i < ((1 <<
			    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
			    i++) {
				blkptr_t *bp = &bps[i];
				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
				    1 << dn->dn_indblkshift);
				BP_SET_LSIZE(bp,
				    BP_GET_LEVEL(db->db_blkptr) == 1 ?
				    dn->dn_datablksz :
				    BP_GET_LSIZE(db->db_blkptr));
				BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
				BP_SET_LEVEL(bp,
				    BP_GET_LEVEL(db->db_blkptr) - 1);
				BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
			}
		}
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_FLAG_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
}

/*
 * This is our just-in-time copy function.  It makes a copy of buffers that
 * have been modified in a previous transaction group before we access them in
 * the current active group.
 *
 * This function is used in three places: when we are dirtying a buffer for the
 * first time in a txg, when we are freeing a range in a dnode that includes
 * this buffer, and when we are accessing a buffer which was received compressed
 * and later referenced in a WRITE_BYREF record.
 *
 * Note that when we are called from dbuf_free_range() we do not put a hold on
 * the buffer, we just traverse the active dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and its referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there a no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = arc_buf_size(db->db_buf);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;
		enum zio_compress compress_type =
		    arc_get_compression(db->db_buf);

		if (compress_type == ZIO_COMPRESS_OFF) {
			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
		} else {
			ASSERT3U(type, ==, ARC_BUFC_DATA);
			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
			    size, arc_buf_lsize(db->db_buf), compress_type);
		}
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		db->db_buf = NULL;
		dbuf_clear_data(db);
	}
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check
	 * db_state because it can't be freed while we have a hold on the
	 * buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		/*
		 * If the arc buf is compressed, we need to decompress it to
		 * read the data. This could happen during the "zfs receive" of
		 * a stream which is compressed and deduplicated.
		 */
		if (db->db_buf != NULL &&
		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
			dbuf_fix_old_data(db,
			    spa_syncing_txg(dmu_objset_spa(db->db_objset)));
			err = arc_decompress(db->db_buf);
			dbuf_set_data(db, db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;
		boolean_t need_wait = B_FALSE;

		/*
		 * If no parent zio was supplied and the block is a real
		 * (non-hole) block, create a root zio so we can wait for
		 * the read below.
		 */
		if (zio == NULL &&
		    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
			need_wait = B_TRUE;
		}
		dbuf_read_impl(db, zio, flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (need_wait)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
				    db, zio_t *, zio);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	return (err);
}

/*
 * Prepare the dbuf to be overwritten without reading its current contents:
 * allocate a fresh buffer if uncached and move to DB_FILL.  Waits out any
 * in-flight read or fill first.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_clear_data(db);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	/*
	 * This assert is valid because dmu_sync() expects to be called by
	 * a zilog's get_data while holding a range lock.  This call only
	 * comes from dbuf_dirty() callers who must also hold a range lock.
	 */
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if its unreferenced) or clear (if its referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t db_search;
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid &&
	    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	/* Stack-local search key used to position in the dn_dbufs AVL tree. */
	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);

	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_destroy(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Resize the dbuf's data buffer to `size`, preserving the overlapping
 * prefix and zeroing any grown tail.  Dirties the buffer in `tx` and
 * accounts the size delta against the objset.  Caller must hold the
 * dnode's struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	arc_buf_destroy(obuf, db);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
	DB_DNODE_EXIT(db);
}

/*
 * Release the dbuf's ARC buffer in syncing context so the block pointer
 * can be rewritten.
 */
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

/*
 * We already have a dirty record for this TXG, and we are being
 * dirtied again.
1494289297Smav */ 1495289297Smavstatic void 1496289297Smavdbuf_redirty(dbuf_dirty_record_t *dr) 1497289297Smav{ 1498289297Smav dmu_buf_impl_t *db = dr->dr_dbuf; 1499289297Smav 1500289297Smav ASSERT(MUTEX_HELD(&db->db_mtx)); 1501289297Smav 1502289297Smav if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1503289297Smav /* 1504289297Smav * If this buffer has already been written out, 1505289297Smav * we now need to reset its state. 1506289297Smav */ 1507289297Smav dbuf_unoverride(dr); 1508289297Smav if (db->db.db_object != DMU_META_DNODE_OBJECT && 1509289297Smav db->db_state != DB_NOFILL) { 1510289297Smav /* Already released on initial dirty, so just thaw. */ 1511289297Smav ASSERT(arc_released(db->db_buf)); 1512289297Smav arc_buf_thaw(db->db_buf); 1513289297Smav } 1514289297Smav } 1515289297Smav} 1516289297Smav 1517168404Spjddbuf_dirty_record_t * 1518168404Spjddbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1519168404Spjd{ 1520219089Spjd dnode_t *dn; 1521219089Spjd objset_t *os; 1522168404Spjd dbuf_dirty_record_t **drp, *dr; 1523168404Spjd int drop_struct_lock = FALSE; 1524168404Spjd int txgoff = tx->tx_txg & TXG_MASK; 1525168404Spjd 1526168404Spjd ASSERT(tx->tx_txg != 0); 1527168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1528168404Spjd DMU_TX_DIRTY_BUF(tx, db); 1529168404Spjd 1530219089Spjd DB_DNODE_ENTER(db); 1531219089Spjd dn = DB_DNODE(db); 1532168404Spjd /* 1533168404Spjd * Shouldn't dirty a regular buffer in syncing context. Private 1534168404Spjd * objects may be dirtied in syncing context, but only if they 1535168404Spjd * were already pre-dirtied in open context. 
1536168404Spjd */ 1537308082Smav#ifdef DEBUG 1538308082Smav if (dn->dn_objset->os_dsl_dataset != NULL) { 1539308082Smav rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1540308082Smav RW_READER, FTAG); 1541308082Smav } 1542168404Spjd ASSERT(!dmu_tx_is_syncing(tx) || 1543168404Spjd BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1544209962Smm DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1545209962Smm dn->dn_objset->os_dsl_dataset == NULL); 1546308082Smav if (dn->dn_objset->os_dsl_dataset != NULL) 1547308082Smav rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); 1548308082Smav#endif 1549168404Spjd /* 1550168404Spjd * We make this assert for private objects as well, but after we 1551168404Spjd * check if we're already dirty. They are allowed to re-dirty 1552168404Spjd * in syncing context. 1553168404Spjd */ 1554168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1555168404Spjd dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1556168404Spjd (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1557168404Spjd 1558168404Spjd mutex_enter(&db->db_mtx); 1559168404Spjd /* 1560168404Spjd * XXX make this true for indirects too? The problem is that 1561168404Spjd * transactions created with dmu_tx_create_assigned() from 1562168404Spjd * syncing context don't bother holding ahead. 1563168404Spjd */ 1564168404Spjd ASSERT(db->db_level != 0 || 1565219089Spjd db->db_state == DB_CACHED || db->db_state == DB_FILL || 1566219089Spjd db->db_state == DB_NOFILL); 1567168404Spjd 1568168404Spjd mutex_enter(&dn->dn_mtx); 1569168404Spjd /* 1570168404Spjd * Don't set dirtyctx to SYNC if we're just modifying this as we 1571168404Spjd * initialize the objset. 
1572168404Spjd */ 1573308082Smav if (dn->dn_dirtyctx == DN_UNDIRTIED) { 1574308082Smav if (dn->dn_objset->os_dsl_dataset != NULL) { 1575308082Smav rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1576308082Smav RW_READER, FTAG); 1577308082Smav } 1578308082Smav if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1579308082Smav dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? 1580308082Smav DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1581308082Smav ASSERT(dn->dn_dirtyctx_firstset == NULL); 1582308082Smav dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1583308082Smav } 1584308082Smav if (dn->dn_objset->os_dsl_dataset != NULL) { 1585308082Smav rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1586308082Smav FTAG); 1587308082Smav } 1588168404Spjd } 1589168404Spjd mutex_exit(&dn->dn_mtx); 1590168404Spjd 1591219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) 1592219089Spjd dn->dn_have_spill = B_TRUE; 1593219089Spjd 1594168404Spjd /* 1595168404Spjd * If this buffer is already dirty, we're done. 1596168404Spjd */ 1597168404Spjd drp = &db->db_last_dirty; 1598168404Spjd ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1599168404Spjd db->db.db_object == DMU_META_DNODE_OBJECT); 1600185029Spjd while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1601185029Spjd drp = &dr->dr_next; 1602185029Spjd if (dr && dr->dr_txg == tx->tx_txg) { 1603219089Spjd DB_DNODE_EXIT(db); 1604219089Spjd 1605289297Smav dbuf_redirty(dr); 1606168404Spjd mutex_exit(&db->db_mtx); 1607185029Spjd return (dr); 1608168404Spjd } 1609168404Spjd 1610168404Spjd /* 1611168404Spjd * Only valid if not already dirty. 1612168404Spjd */ 1613209962Smm ASSERT(dn->dn_object == 0 || 1614209962Smm dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1615168404Spjd (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1616168404Spjd 1617168404Spjd ASSERT3U(dn->dn_nlevels, >, db->db_level); 1618168404Spjd 1619168404Spjd /* 1620168404Spjd * We should only be dirtying in syncing context if it's the 1621209962Smm * mos or we're initializing the os or it's a special object. 1622209962Smm * However, we are allowed to dirty in syncing context provided 1623209962Smm * we already dirtied it in open context. Hence we must make 1624209962Smm * this assertion only if we're not already dirty. 1625168404Spjd */ 1626219089Spjd os = dn->dn_objset; 1627321554Smav VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); 1628308082Smav#ifdef DEBUG 1629308082Smav if (dn->dn_objset->os_dsl_dataset != NULL) 1630308082Smav rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); 1631209962Smm ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1632209962Smm os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1633308082Smav if (dn->dn_objset->os_dsl_dataset != NULL) 1634308082Smav rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); 1635308082Smav#endif 1636168404Spjd ASSERT(db->db.db_size != 0); 1637168404Spjd 1638168404Spjd dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1639168404Spjd 1640219089Spjd if (db->db_blkid != DMU_BONUS_BLKID) { 1641321547Smav dmu_objset_willuse_space(os, db->db.db_size, tx); 1642185029Spjd } 1643185029Spjd 1644168404Spjd /* 1645168404Spjd * If this buffer is dirty in an old transaction group we need 1646168404Spjd * to make a copy of it so that the changes we make in this 1647168404Spjd * transaction group won't leak out when we sync the older txg. 
1648168404Spjd */ 1649168404Spjd dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1650168404Spjd if (db->db_level == 0) { 1651168404Spjd void *data_old = db->db_buf; 1652168404Spjd 1653219089Spjd if (db->db_state != DB_NOFILL) { 1654219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 1655219089Spjd dbuf_fix_old_data(db, tx->tx_txg); 1656219089Spjd data_old = db->db.db_data; 1657219089Spjd } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1658219089Spjd /* 1659219089Spjd * Release the data buffer from the cache so 1660219089Spjd * that we can modify it without impacting 1661219089Spjd * possible other users of this cached data 1662219089Spjd * block. Note that indirect blocks and 1663219089Spjd * private objects are not released until the 1664219089Spjd * syncing state (since they are only modified 1665219089Spjd * then). 1666219089Spjd */ 1667219089Spjd arc_release(db->db_buf, db); 1668219089Spjd dbuf_fix_old_data(db, tx->tx_txg); 1669219089Spjd data_old = db->db_buf; 1670219089Spjd } 1671219089Spjd ASSERT(data_old != NULL); 1672168404Spjd } 1673168404Spjd dr->dt.dl.dr_data = data_old; 1674168404Spjd } else { 1675168404Spjd mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1676168404Spjd list_create(&dr->dt.di.dr_children, 1677168404Spjd sizeof (dbuf_dirty_record_t), 1678168404Spjd offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1679168404Spjd } 1680258632Savg if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1681258632Savg dr->dr_accounted = db->db.db_size; 1682168404Spjd dr->dr_dbuf = db; 1683168404Spjd dr->dr_txg = tx->tx_txg; 1684168404Spjd dr->dr_next = *drp; 1685168404Spjd *drp = dr; 1686168404Spjd 1687168404Spjd /* 1688168404Spjd * We could have been freed_in_flight between the dbuf_noread 1689168404Spjd * and dbuf_dirty. We win, as though the dbuf_noread() had 1690168404Spjd * happened after the free. 
1691168404Spjd */ 1692219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1693219089Spjd db->db_blkid != DMU_SPILL_BLKID) { 1694168404Spjd mutex_enter(&dn->dn_mtx); 1695264669Sdelphij if (dn->dn_free_ranges[txgoff] != NULL) { 1696264669Sdelphij range_tree_clear(dn->dn_free_ranges[txgoff], 1697264669Sdelphij db->db_blkid, 1); 1698264669Sdelphij } 1699168404Spjd mutex_exit(&dn->dn_mtx); 1700168404Spjd db->db_freed_in_flight = FALSE; 1701168404Spjd } 1702168404Spjd 1703168404Spjd /* 1704168404Spjd * This buffer is now part of this txg 1705168404Spjd */ 1706168404Spjd dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1707168404Spjd db->db_dirtycnt += 1; 1708168404Spjd ASSERT3U(db->db_dirtycnt, <=, 3); 1709168404Spjd 1710168404Spjd mutex_exit(&db->db_mtx); 1711168404Spjd 1712219089Spjd if (db->db_blkid == DMU_BONUS_BLKID || 1713219089Spjd db->db_blkid == DMU_SPILL_BLKID) { 1714168404Spjd mutex_enter(&dn->dn_mtx); 1715168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1716168404Spjd list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1717168404Spjd mutex_exit(&dn->dn_mtx); 1718168404Spjd dnode_setdirty(dn, tx); 1719219089Spjd DB_DNODE_EXIT(db); 1720168404Spjd return (dr); 1721307269Smav } 1722307269Smav 1723307269Smav /* 1724307269Smav * The dn_struct_rwlock prevents db_blkptr from changing 1725307269Smav * due to a write from syncing context completing 1726307269Smav * while we are running, so we want to acquire it before 1727307269Smav * looking at db_blkptr. 1728307269Smav */ 1729307269Smav if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1730307269Smav rw_enter(&dn->dn_struct_rwlock, RW_READER); 1731307269Smav drop_struct_lock = TRUE; 1732307269Smav } 1733307269Smav 1734321547Smav /* 1735323750Savg * We need to hold the dn_struct_rwlock to make this assertion, 1736323750Savg * because it protects dn_phys / dn_next_nlevels from changing. 
1737323750Savg */ 1738323750Savg ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1739323750Savg dn->dn_phys->dn_nlevels > db->db_level || 1740323750Savg dn->dn_next_nlevels[txgoff] > db->db_level || 1741323750Savg dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1742323750Savg dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1743323750Savg 1744323750Savg /* 1745321547Smav * If we are overwriting a dedup BP, then unless it is snapshotted, 1746321547Smav * when we get to syncing context we will need to decrement its 1747321547Smav * refcount in the DDT. Prefetch the relevant DDT block so that 1748321547Smav * syncing context won't have to wait for the i/o. 1749321547Smav */ 1750321547Smav ddt_prefetch(os->os_spa, db->db_blkptr); 1751168404Spjd 1752185029Spjd if (db->db_level == 0) { 1753185029Spjd dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1754185029Spjd ASSERT(dn->dn_maxblkid >= db->db_blkid); 1755185029Spjd } 1756185029Spjd 1757168404Spjd if (db->db_level+1 < dn->dn_nlevels) { 1758168404Spjd dmu_buf_impl_t *parent = db->db_parent; 1759168404Spjd dbuf_dirty_record_t *di; 1760168404Spjd int parent_held = FALSE; 1761168404Spjd 1762168404Spjd if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1763168404Spjd int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1764168404Spjd 1765168404Spjd parent = dbuf_hold_level(dn, db->db_level+1, 1766168404Spjd db->db_blkid >> epbs, FTAG); 1767219089Spjd ASSERT(parent != NULL); 1768168404Spjd parent_held = TRUE; 1769168404Spjd } 1770168404Spjd if (drop_struct_lock) 1771168404Spjd rw_exit(&dn->dn_struct_rwlock); 1772168404Spjd ASSERT3U(db->db_level+1, ==, parent->db_level); 1773168404Spjd di = dbuf_dirty(parent, tx); 1774168404Spjd if (parent_held) 1775168404Spjd dbuf_rele(parent, FTAG); 1776168404Spjd 1777168404Spjd mutex_enter(&db->db_mtx); 1778258632Savg /* 1779258632Savg * Since we've dropped the mutex, it's possible that 1780258632Savg * dbuf_undirty() might have 
changed this out from under us. 1781258632Savg */ 1782168404Spjd if (db->db_last_dirty == dr || 1783168404Spjd dn->dn_object == DMU_META_DNODE_OBJECT) { 1784168404Spjd mutex_enter(&di->dt.di.dr_mtx); 1785168404Spjd ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1786168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1787168404Spjd list_insert_tail(&di->dt.di.dr_children, dr); 1788168404Spjd mutex_exit(&di->dt.di.dr_mtx); 1789168404Spjd dr->dr_parent = di; 1790168404Spjd } 1791168404Spjd mutex_exit(&db->db_mtx); 1792168404Spjd } else { 1793168404Spjd ASSERT(db->db_level+1 == dn->dn_nlevels); 1794168404Spjd ASSERT(db->db_blkid < dn->dn_nblkptr); 1795219089Spjd ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1796168404Spjd mutex_enter(&dn->dn_mtx); 1797168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1798168404Spjd list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1799168404Spjd mutex_exit(&dn->dn_mtx); 1800168404Spjd if (drop_struct_lock) 1801168404Spjd rw_exit(&dn->dn_struct_rwlock); 1802168404Spjd } 1803168404Spjd 1804168404Spjd dnode_setdirty(dn, tx); 1805219089Spjd DB_DNODE_EXIT(db); 1806168404Spjd return (dr); 1807168404Spjd} 1808168404Spjd 1809248571Smm/* 1810251629Sdelphij * Undirty a buffer in the transaction group referenced by the given 1811251629Sdelphij * transaction. Return whether this evicted the dbuf. 1812248571Smm */ 1813248571Smmstatic boolean_t 1814168404Spjddbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1815168404Spjd{ 1816219089Spjd dnode_t *dn; 1817168404Spjd uint64_t txg = tx->tx_txg; 1818185029Spjd dbuf_dirty_record_t *dr, **drp; 1819168404Spjd 1820168404Spjd ASSERT(txg != 0); 1821284593Savg 1822284593Savg /* 1823284593Savg * Due to our use of dn_nlevels below, this can only be called 1824284593Savg * in open context, unless we are operating on the MOS. 1825284593Savg * From syncing context, dn_nlevels may be different from the 1826284593Savg * dn_nlevels used when dbuf was dirtied. 
1827284593Savg */ 1828284593Savg ASSERT(db->db_objset == 1829284593Savg dmu_objset_pool(db->db_objset)->dp_meta_objset || 1830284593Savg txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); 1831219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1832248571Smm ASSERT0(db->db_level); 1833248571Smm ASSERT(MUTEX_HELD(&db->db_mtx)); 1834168404Spjd 1835168404Spjd /* 1836168404Spjd * If this buffer is not dirty, we're done. 1837168404Spjd */ 1838185029Spjd for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1839168404Spjd if (dr->dr_txg <= txg) 1840168404Spjd break; 1841248571Smm if (dr == NULL || dr->dr_txg < txg) 1842248571Smm return (B_FALSE); 1843168404Spjd ASSERT(dr->dr_txg == txg); 1844219089Spjd ASSERT(dr->dr_dbuf == db); 1845168404Spjd 1846219089Spjd DB_DNODE_ENTER(db); 1847219089Spjd dn = DB_DNODE(db); 1848219089Spjd 1849168404Spjd dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1850168404Spjd 1851168404Spjd ASSERT(db->db.db_size != 0); 1852168404Spjd 1853284593Savg dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), 1854284593Savg dr->dr_accounted, txg); 1855168404Spjd 1856185029Spjd *drp = dr->dr_next; 1857168404Spjd 1858219636Spjd /* 1859219636Spjd * Note that there are three places in dbuf_dirty() 1860219636Spjd * where this dirty record may be put on a list. 1861219636Spjd * Make sure to do a list_remove corresponding to 1862219636Spjd * every one of those list_insert calls. 
1863219636Spjd */ 1864168404Spjd if (dr->dr_parent) { 1865168404Spjd mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1866168404Spjd list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1867168404Spjd mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1868219636Spjd } else if (db->db_blkid == DMU_SPILL_BLKID || 1869284593Savg db->db_level + 1 == dn->dn_nlevels) { 1870185029Spjd ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1871168404Spjd mutex_enter(&dn->dn_mtx); 1872168404Spjd list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1873168404Spjd mutex_exit(&dn->dn_mtx); 1874168404Spjd } 1875219089Spjd DB_DNODE_EXIT(db); 1876168404Spjd 1877248571Smm if (db->db_state != DB_NOFILL) { 1878248571Smm dbuf_unoverride(dr); 1879168404Spjd 1880168404Spjd ASSERT(db->db_buf != NULL); 1881248571Smm ASSERT(dr->dt.dl.dr_data != NULL); 1882248571Smm if (dr->dt.dl.dr_data != db->db_buf) 1883307265Smav arc_buf_destroy(dr->dt.dl.dr_data, db); 1884168404Spjd } 1885268713Sdelphij 1886168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1887168404Spjd 1888168404Spjd ASSERT(db->db_dirtycnt > 0); 1889168404Spjd db->db_dirtycnt -= 1; 1890168404Spjd 1891168404Spjd if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1892307265Smav ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); 1893307265Smav dbuf_destroy(db); 1894248571Smm return (B_TRUE); 1895168404Spjd } 1896168404Spjd 1897248571Smm return (B_FALSE); 1898168404Spjd} 1899168404Spjd 1900168404Spjdvoid 1901260150Sdelphijdmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) 1902168404Spjd{ 1903260150Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1904185029Spjd int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1905168404Spjd 1906168404Spjd ASSERT(tx->tx_txg != 0); 1907168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1908168404Spjd 1909289297Smav /* 1910289297Smav * Quick check for dirtyness. 
For already dirty blocks, this 1911289297Smav * reduces runtime of this function by >90%, and overall performance 1912289297Smav * by 50% for some workloads (e.g. file deletion with indirect blocks 1913289297Smav * cached). 1914289297Smav */ 1915289297Smav mutex_enter(&db->db_mtx); 1916289297Smav dbuf_dirty_record_t *dr; 1917289297Smav for (dr = db->db_last_dirty; 1918289297Smav dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { 1919289297Smav /* 1920289297Smav * It's possible that it is already dirty but not cached, 1921289297Smav * because there are some calls to dbuf_dirty() that don't 1922289297Smav * go through dmu_buf_will_dirty(). 1923289297Smav */ 1924289297Smav if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { 1925289297Smav /* This dbuf is already dirty and cached. */ 1926289297Smav dbuf_redirty(dr); 1927289297Smav mutex_exit(&db->db_mtx); 1928289297Smav return; 1929289297Smav } 1930289297Smav } 1931289297Smav mutex_exit(&db->db_mtx); 1932289297Smav 1933219089Spjd DB_DNODE_ENTER(db); 1934219089Spjd if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1935168404Spjd rf |= DB_RF_HAVESTRUCT; 1936219089Spjd DB_DNODE_EXIT(db); 1937168404Spjd (void) dbuf_read(db, NULL, rf); 1938168404Spjd (void) dbuf_dirty(db, tx); 1939168404Spjd} 1940168404Spjd 1941168404Spjdvoid 1942219089Spjddmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1943219089Spjd{ 1944219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1945219089Spjd 1946219089Spjd db->db_state = DB_NOFILL; 1947219089Spjd 1948219089Spjd dmu_buf_will_fill(db_fake, tx); 1949219089Spjd} 1950219089Spjd 1951219089Spjdvoid 1952168404Spjddmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1953168404Spjd{ 1954168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1955168404Spjd 1956219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1957168404Spjd ASSERT(tx->tx_txg != 0); 1958168404Spjd ASSERT(db->db_level == 0); 1959168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1960168404Spjd 
1961168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1962168404Spjd dmu_tx_private_ok(tx)); 1963168404Spjd 1964168404Spjd dbuf_noread(db); 1965168404Spjd (void) dbuf_dirty(db, tx); 1966168404Spjd} 1967168404Spjd 1968168404Spjd#pragma weak dmu_buf_fill_done = dbuf_fill_done 1969168404Spjd/* ARGSUSED */ 1970168404Spjdvoid 1971168404Spjddbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1972168404Spjd{ 1973168404Spjd mutex_enter(&db->db_mtx); 1974168404Spjd DBUF_VERIFY(db); 1975168404Spjd 1976168404Spjd if (db->db_state == DB_FILL) { 1977168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 1978219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1979168404Spjd /* we were freed while filling */ 1980168404Spjd /* XXX dbuf_undirty? */ 1981168404Spjd bzero(db->db.db_data, db->db.db_size); 1982168404Spjd db->db_freed_in_flight = FALSE; 1983168404Spjd } 1984168404Spjd db->db_state = DB_CACHED; 1985168404Spjd cv_broadcast(&db->db_changed); 1986168404Spjd } 1987168404Spjd mutex_exit(&db->db_mtx); 1988168404Spjd} 1989168404Spjd 1990268075Sdelphijvoid 1991268075Sdelphijdmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 1992268075Sdelphij bp_embedded_type_t etype, enum zio_compress comp, 1993268075Sdelphij int uncompressed_size, int compressed_size, int byteorder, 1994268075Sdelphij dmu_tx_t *tx) 1995268075Sdelphij{ 1996268075Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 1997268075Sdelphij struct dirty_leaf *dl; 1998268075Sdelphij dmu_object_type_t type; 1999268075Sdelphij 2000286708Smav if (etype == BP_EMBEDDED_TYPE_DATA) { 2001286708Smav ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), 2002286708Smav SPA_FEATURE_EMBEDDED_DATA)); 2003286708Smav } 2004286708Smav 2005268075Sdelphij DB_DNODE_ENTER(db); 2006268075Sdelphij type = DB_DNODE(db)->dn_type; 2007268075Sdelphij DB_DNODE_EXIT(db); 2008268075Sdelphij 2009268075Sdelphij ASSERT0(db->db_level); 2010268075Sdelphij ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2011268075Sdelphij 
2012268075Sdelphij dmu_buf_will_not_fill(dbuf, tx); 2013268075Sdelphij 2014268075Sdelphij ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 2015268075Sdelphij dl = &db->db_last_dirty->dt.dl; 2016268075Sdelphij encode_embedded_bp_compressed(&dl->dr_overridden_by, 2017268075Sdelphij data, comp, uncompressed_size, compressed_size); 2018268075Sdelphij BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 2019268075Sdelphij BP_SET_TYPE(&dl->dr_overridden_by, type); 2020268075Sdelphij BP_SET_LEVEL(&dl->dr_overridden_by, 0); 2021268075Sdelphij BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 2022268075Sdelphij 2023268075Sdelphij dl->dr_override_state = DR_OVERRIDDEN; 2024268075Sdelphij dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 2025268075Sdelphij} 2026268075Sdelphij 2027168404Spjd/* 2028209962Smm * Directly assign a provided arc buf to a given dbuf if it's not referenced 2029209962Smm * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 2030209962Smm */ 2031209962Smmvoid 2032209962Smmdbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 2033209962Smm{ 2034209962Smm ASSERT(!refcount_is_zero(&db->db_holds)); 2035219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2036209962Smm ASSERT(db->db_level == 0); 2037321535Smav ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); 2038209962Smm ASSERT(buf != NULL); 2039321535Smav ASSERT(arc_buf_lsize(buf) == db->db.db_size); 2040209962Smm ASSERT(tx->tx_txg != 0); 2041209962Smm 2042209962Smm arc_return_buf(buf, db); 2043209962Smm ASSERT(arc_released(buf)); 2044209962Smm 2045209962Smm mutex_enter(&db->db_mtx); 2046209962Smm 2047209962Smm while (db->db_state == DB_READ || db->db_state == DB_FILL) 2048209962Smm cv_wait(&db->db_changed, &db->db_mtx); 2049209962Smm 2050209962Smm ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 2051209962Smm 2052209962Smm if (db->db_state == DB_CACHED && 2053209962Smm refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 2054209962Smm 
mutex_exit(&db->db_mtx); 2055209962Smm (void) dbuf_dirty(db, tx); 2056209962Smm bcopy(buf->b_data, db->db.db_data, db->db.db_size); 2057307265Smav arc_buf_destroy(buf, db); 2058219089Spjd xuio_stat_wbuf_copied(); 2059209962Smm return; 2060209962Smm } 2061209962Smm 2062219089Spjd xuio_stat_wbuf_nocopy(); 2063209962Smm if (db->db_state == DB_CACHED) { 2064209962Smm dbuf_dirty_record_t *dr = db->db_last_dirty; 2065209962Smm 2066209962Smm ASSERT(db->db_buf != NULL); 2067209962Smm if (dr != NULL && dr->dr_txg == tx->tx_txg) { 2068209962Smm ASSERT(dr->dt.dl.dr_data == db->db_buf); 2069209962Smm if (!arc_released(db->db_buf)) { 2070209962Smm ASSERT(dr->dt.dl.dr_override_state == 2071209962Smm DR_OVERRIDDEN); 2072209962Smm arc_release(db->db_buf, db); 2073209962Smm } 2074209962Smm dr->dt.dl.dr_data = buf; 2075307265Smav arc_buf_destroy(db->db_buf, db); 2076209962Smm } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 2077209962Smm arc_release(db->db_buf, db); 2078307265Smav arc_buf_destroy(db->db_buf, db); 2079209962Smm } 2080209962Smm db->db_buf = NULL; 2081209962Smm } 2082209962Smm ASSERT(db->db_buf == NULL); 2083209962Smm dbuf_set_data(db, buf); 2084209962Smm db->db_state = DB_FILL; 2085209962Smm mutex_exit(&db->db_mtx); 2086209962Smm (void) dbuf_dirty(db, tx); 2087260150Sdelphij dmu_buf_fill_done(&db->db, tx); 2088209962Smm} 2089209962Smm 2090168404Spjdvoid 2091307265Smavdbuf_destroy(dmu_buf_impl_t *db) 2092168404Spjd{ 2093219089Spjd dnode_t *dn; 2094168404Spjd dmu_buf_impl_t *parent = db->db_parent; 2095219089Spjd dmu_buf_impl_t *dndb; 2096168404Spjd 2097168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2098168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 2099168404Spjd 2100307265Smav if (db->db_buf != NULL) { 2101307265Smav arc_buf_destroy(db->db_buf, db); 2102307265Smav db->db_buf = NULL; 2103307265Smav } 2104168404Spjd 2105307265Smav if (db->db_blkid == DMU_BONUS_BLKID) { 2106168404Spjd ASSERT(db->db.db_data != NULL); 2107307265Smav 
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 2108307265Smav arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2109168404Spjd db->db_state = DB_UNCACHED; 2110168404Spjd } 2111168404Spjd 2112307265Smav dbuf_clear_data(db); 2113307265Smav 2114307265Smav if (multilist_link_active(&db->db_cache_link)) { 2115339109Smav ASSERT(db->db_caching_status == DB_DBUF_CACHE || 2116339109Smav db->db_caching_status == DB_DBUF_METADATA_CACHE); 2117339109Smav 2118339109Smav multilist_remove(dbuf_caches[db->db_caching_status].cache, db); 2119339109Smav (void) refcount_remove_many( 2120339109Smav &dbuf_caches[db->db_caching_status].size, 2121307265Smav db->db.db_size, db); 2122339109Smav 2123339109Smav db->db_caching_status = DB_NO_CACHE; 2124307265Smav } 2125307265Smav 2126219089Spjd ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 2127168404Spjd ASSERT(db->db_data_pending == NULL); 2128168404Spjd 2129168404Spjd db->db_state = DB_EVICTING; 2130168404Spjd db->db_blkptr = NULL; 2131168404Spjd 2132307265Smav /* 2133307265Smav * Now that db_state is DB_EVICTING, nobody else can find this via 2134307265Smav * the hash table. We can now drop db_mtx, which allows us to 2135307265Smav * acquire the dn_dbufs_mtx. 
2136307265Smav */ 2137307265Smav mutex_exit(&db->db_mtx); 2138307265Smav 2139219089Spjd DB_DNODE_ENTER(db); 2140219089Spjd dn = DB_DNODE(db); 2141219089Spjd dndb = dn->dn_dbuf; 2142307265Smav if (db->db_blkid != DMU_BONUS_BLKID) { 2143307265Smav boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); 2144307265Smav if (needlock) 2145307265Smav mutex_enter(&dn->dn_dbufs_mtx); 2146269229Sdelphij avl_remove(&dn->dn_dbufs, db); 2147270248Sdelphij atomic_dec_32(&dn->dn_dbufs_count); 2148219089Spjd membar_producer(); 2149219089Spjd DB_DNODE_EXIT(db); 2150307265Smav if (needlock) 2151307265Smav mutex_exit(&dn->dn_dbufs_mtx); 2152219089Spjd /* 2153219089Spjd * Decrementing the dbuf count means that the hold corresponding 2154219089Spjd * to the removed dbuf is no longer discounted in dnode_move(), 2155219089Spjd * so the dnode cannot be moved until after we release the hold. 2156219089Spjd * The membar_producer() ensures visibility of the decremented 2157219089Spjd * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 2158219089Spjd * release any lock. 
2159219089Spjd */ 2160339140Smav mutex_enter(&dn->dn_mtx); 2161339140Smav dnode_rele_and_unlock(dn, db, B_TRUE); 2162219089Spjd db->db_dnode_handle = NULL; 2163307265Smav 2164307265Smav dbuf_hash_remove(db); 2165219089Spjd } else { 2166219089Spjd DB_DNODE_EXIT(db); 2167168404Spjd } 2168168404Spjd 2169307265Smav ASSERT(refcount_is_zero(&db->db_holds)); 2170168404Spjd 2171307265Smav db->db_parent = NULL; 2172168404Spjd 2173307265Smav ASSERT(db->db_buf == NULL); 2174307265Smav ASSERT(db->db.db_data == NULL); 2175307265Smav ASSERT(db->db_hash_next == NULL); 2176307265Smav ASSERT(db->db_blkptr == NULL); 2177307265Smav ASSERT(db->db_data_pending == NULL); 2178339109Smav ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); 2179307265Smav ASSERT(!multilist_link_active(&db->db_cache_link)); 2180307265Smav 2181307265Smav kmem_cache_free(dbuf_kmem_cache, db); 2182307265Smav arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2183307265Smav 2184168404Spjd /* 2185219089Spjd * If this dbuf is referenced from an indirect dbuf, 2186168404Spjd * decrement the ref count on the indirect dbuf. 2187168404Spjd */ 2188339140Smav if (parent && parent != dndb) { 2189339140Smav mutex_enter(&parent->db_mtx); 2190339140Smav dbuf_rele_and_unlock(parent, db, B_TRUE); 2191339140Smav } 2192168404Spjd} 2193168404Spjd 2194286705Smav/* 2195286705Smav * Note: While bpp will always be updated if the function returns success, 2196286705Smav * parentp will not be updated if the dnode does not have dn_dbuf filled in; 2197286705Smav * this happens when the dnode is the meta-dnode, or a userused or groupused 2198286705Smav * object. 
2199286705Smav */ 2200168404Spjdstatic int 2201168404Spjddbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 2202168404Spjd dmu_buf_impl_t **parentp, blkptr_t **bpp) 2203168404Spjd{ 2204168404Spjd *parentp = NULL; 2205168404Spjd *bpp = NULL; 2206168404Spjd 2207219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 2208168404Spjd 2209219089Spjd if (blkid == DMU_SPILL_BLKID) { 2210219089Spjd mutex_enter(&dn->dn_mtx); 2211219089Spjd if (dn->dn_have_spill && 2212219089Spjd (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 2213219089Spjd *bpp = &dn->dn_phys->dn_spill; 2214219089Spjd else 2215219089Spjd *bpp = NULL; 2216219089Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 2217219089Spjd *parentp = dn->dn_dbuf; 2218219089Spjd mutex_exit(&dn->dn_mtx); 2219219089Spjd return (0); 2220219089Spjd } 2221219089Spjd 2222321541Smav int nlevels = 2223321541Smav (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; 2224321541Smav int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 2225168404Spjd 2226168404Spjd ASSERT3U(level * epbs, <, 64); 2227168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2228321541Smav /* 2229321541Smav * This assertion shouldn't trip as long as the max indirect block size 2230321541Smav * is less than 1M. The reason for this is that up to that point, 2231321541Smav * the number of levels required to address an entire object with blocks 2232321541Smav * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In 2233321541Smav * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 2234321541Smav * (i.e. we can address the entire object), objects will all use at most 2235321541Smav * N-1 levels and the assertion won't overflow. However, once epbs is 2236321541Smav * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be 2237321541Smav * enough to address an entire object, so objects will have 5 levels, 2238321541Smav * but then this assertion will overflow. 
2239321541Smav * 2240321541Smav * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we 2241321541Smav * need to redo this logic to handle overflows. 2242321541Smav */ 2243321541Smav ASSERT(level >= nlevels || 2244321541Smav ((nlevels - level - 1) * epbs) + 2245321541Smav highbit64(dn->dn_phys->dn_nblkptr) <= 64); 2246168404Spjd if (level >= nlevels || 2247321541Smav blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << 2248321541Smav ((nlevels - level - 1) * epbs)) || 2249321541Smav (fail_sparse && 2250321541Smav blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 2251168404Spjd /* the buffer has no parent yet */ 2252249195Smm return (SET_ERROR(ENOENT)); 2253168404Spjd } else if (level < nlevels-1) { 2254168404Spjd /* this block is referenced from an indirect block */ 2255168404Spjd int err = dbuf_hold_impl(dn, level+1, 2256286705Smav blkid >> epbs, fail_sparse, FALSE, NULL, parentp); 2257168404Spjd if (err) 2258168404Spjd return (err); 2259168404Spjd err = dbuf_read(*parentp, NULL, 2260168404Spjd (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 2261168404Spjd if (err) { 2262168404Spjd dbuf_rele(*parentp, NULL); 2263168404Spjd *parentp = NULL; 2264168404Spjd return (err); 2265168404Spjd } 2266168404Spjd *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 2267168404Spjd (blkid & ((1ULL << epbs) - 1)); 2268321541Smav if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) 2269321541Smav ASSERT(BP_IS_HOLE(*bpp)); 2270168404Spjd return (0); 2271168404Spjd } else { 2272168404Spjd /* the block is referenced from the dnode */ 2273168404Spjd ASSERT3U(level, ==, nlevels-1); 2274168404Spjd ASSERT(dn->dn_phys->dn_nblkptr == 0 || 2275168404Spjd blkid < dn->dn_phys->dn_nblkptr); 2276168404Spjd if (dn->dn_dbuf) { 2277168404Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 2278168404Spjd *parentp = dn->dn_dbuf; 2279168404Spjd } 2280168404Spjd *bpp = &dn->dn_phys->dn_blkptr[blkid]; 2281168404Spjd return (0); 2282168404Spjd } 2283168404Spjd} 2284168404Spjd 2285168404Spjdstatic 
dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	/*
	 * Allocate and initialize a new dbuf for (dn, level, blkid) and
	 * insert it into the dbuf hash table and the dnode's dn_dbufs AVL
	 * tree.  If another thread races us and inserts the same dbuf
	 * first, the loser frees its copy and returns the winner's (odb).
	 * Caller must hold dn_struct_rwlock.
	 */
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user = NULL;
	db->db_user_immediate_evict = FALSE;
	db->db_freed_in_flight = FALSE;
	db->db_pending_evict = FALSE;

	if (blkid == DMU_BONUS_BLKID) {
		/* Bonus buffer: sized by what the blkptr array leaves over. */
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		db->db_caching_status = DB_NO_CACHE;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_kmem_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_add(&dn->dn_dbufs, db);

	db->db_state = DB_UNCACHED;
	db->db_caching_status = DB_NO_CACHE;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

/*
 * State carried through the chain of asynchronous indirect-block reads
 * that implement dbuf_prefetch().  Freed by whichever callback finishes
 * (or abandons) the chain.
 */
typedef struct dbuf_prefetch_arg {
	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
	int dpa_epbs;	/* Entries (blkptr_t's) Per Block Shift.
*/ 2373286705Smav int dpa_curlevel; /* The current level that we're reading */ 2374307265Smav dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ 2375286705Smav zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ 2376286705Smav zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ 2377286705Smav arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ 2378286705Smav} dbuf_prefetch_arg_t; 2379286705Smav 2380286705Smav/* 2381286705Smav * Actually issue the prefetch read for the block given. 2382286705Smav */ 2383286705Smavstatic void 2384286705Smavdbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) 2385286705Smav{ 2386286705Smav if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 2387286705Smav return; 2388286705Smav 2389286705Smav arc_flags_t aflags = 2390286705Smav dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 2391286705Smav 2392286705Smav ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2393286705Smav ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); 2394286705Smav ASSERT(dpa->dpa_zio != NULL); 2395286705Smav (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, 2396286705Smav dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2397286705Smav &aflags, &dpa->dpa_zb); 2398286705Smav} 2399286705Smav 2400286705Smav/* 2401286705Smav * Called when an indirect block above our prefetch target is read in. This 2402286705Smav * will either read in the next indirect block down the tree or issue the actual 2403286705Smav * prefetch if the next block down is our target. 
2404286705Smav */ 2405286705Smavstatic void 2406339034Ssefdbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, 2407339034Ssef const blkptr_t *iobp, arc_buf_t *abuf, void *private) 2408286705Smav{ 2409286705Smav dbuf_prefetch_arg_t *dpa = private; 2410286705Smav 2411286705Smav ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); 2412286705Smav ASSERT3S(dpa->dpa_curlevel, >, 0); 2413307265Smav 2414339114Smav if (abuf == NULL) { 2415339114Smav ASSERT(zio == NULL || zio->io_error != 0); 2416339114Smav kmem_free(dpa, sizeof (*dpa)); 2417339114Smav return; 2418339114Smav } 2419339114Smav ASSERT(zio == NULL || zio->io_error == 0); 2420339114Smav 2421307265Smav /* 2422307265Smav * The dpa_dnode is only valid if we are called with a NULL 2423307265Smav * zio. This indicates that the arc_read() returned without 2424307265Smav * first calling zio_read() to issue a physical read. Once 2425307265Smav * a physical read is made the dpa_dnode must be invalidated 2426307265Smav * as the locks guarding it may have been dropped. If the 2427307265Smav * dpa_dnode is still valid, then we want to add it to the dbuf 2428307265Smav * cache. To do so, we must hold the dbuf associated with the block 2429307265Smav * we just prefetched, read its contents so that we associate it 2430307265Smav * with an arc_buf_t, and then release it. 
2431307265Smav */ 2432286705Smav if (zio != NULL) { 2433286705Smav ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); 2434307265Smav if (zio->io_flags & ZIO_FLAG_RAW) { 2435307265Smav ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); 2436307265Smav } else { 2437307265Smav ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); 2438307265Smav } 2439286705Smav ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); 2440307265Smav 2441307265Smav dpa->dpa_dnode = NULL; 2442307265Smav } else if (dpa->dpa_dnode != NULL) { 2443307265Smav uint64_t curblkid = dpa->dpa_zb.zb_blkid >> 2444307265Smav (dpa->dpa_epbs * (dpa->dpa_curlevel - 2445307265Smav dpa->dpa_zb.zb_level)); 2446307265Smav dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, 2447307265Smav dpa->dpa_curlevel, curblkid, FTAG); 2448307265Smav (void) dbuf_read(db, NULL, 2449307265Smav DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); 2450307265Smav dbuf_rele(db, FTAG); 2451286705Smav } 2452286705Smav 2453339034Ssef if (abuf == NULL) { 2454339034Ssef kmem_free(dpa, sizeof(*dpa)); 2455339034Ssef return; 2456339034Ssef } 2457339034Ssef 2458286705Smav dpa->dpa_curlevel--; 2459286705Smav 2460286705Smav uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> 2461286705Smav (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); 2462286705Smav blkptr_t *bp = ((blkptr_t *)abuf->b_data) + 2463286705Smav P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); 2464339034Ssef if (BP_IS_HOLE(bp)) { 2465286705Smav kmem_free(dpa, sizeof (*dpa)); 2466286705Smav } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { 2467286705Smav ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); 2468286705Smav dbuf_issue_final_prefetch(dpa, bp); 2469286705Smav kmem_free(dpa, sizeof (*dpa)); 2470286705Smav } else { 2471286705Smav arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2472286705Smav zbookmark_phys_t zb; 2473286705Smav 2474325931Savg /* flag if L2ARC eligible, l2arc_noprefetch then decides */ 2475325931Savg if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) 
			iter_aflags |= ARC_FLAG_L2CACHE;

		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));

		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);

		/* dpa is passed down; the next callback invocation frees it. */
		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		    &iter_aflags, &zb);
	}

	arc_buf_destroy(abuf, private);
}

/*
 * Issue prefetch reads for the given block on the given level. If the indirect
 * blocks above that block are not in memory, we will read them in
 * asynchronously. As a result, this call never blocks waiting for a read to
 * complete.
 */
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
    arc_flags_t aflags)
{
	blkptr_t bp;
	int epbs, nlevels, curlevel;
	uint64_t curblkid;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (blkid > dn->dn_maxblkid)
		return;

	if (dnode_block_freed(dn, blkid))
		return;

	/*
	 * This dnode hasn't been written to disk yet, so there's nothing to
	 * prefetch.
	 */
	nlevels = dn->dn_phys->dn_nlevels;
	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
		return;

	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
		return;

	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
	    level, blkid);
	if (db != NULL) {
		/* dbuf_find() returns with db_mtx held; we only peeked. */
		mutex_exit(&db->db_mtx);
		/*
		 * This dbuf already exists. It is either CACHED, or
		 * (we assume) about to be read or filled.
		 */
		return;
	}

	/*
	 * Find the closest ancestor (indirect block) of the target block
	 * that is present in the cache. In this indirect block, we will
	 * find the bp that is at curlevel, curblkid.
	 */
	curlevel = level;
	curblkid = blkid;
	while (curlevel < nlevels - 1) {
		int parent_level = curlevel + 1;
		uint64_t parent_blkid = curblkid >> epbs;
		dmu_buf_impl_t *db;

		/* fail_uncached=TRUE: only succeeds for cached ancestors. */
		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
		    FALSE, TRUE, FTAG, &db) == 0) {
			blkptr_t *bpp = db->db_buf->b_data;
			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
			dbuf_rele(db, FTAG);
			break;
		}

		curlevel = parent_level;
		curblkid = parent_blkid;
	}

	if (curlevel == nlevels - 1) {
		/* No cached indirect blocks found. */
		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
		bp = dn->dn_phys->dn_blkptr[curblkid];
	}
	if (BP_IS_HOLE(&bp))
		return;

	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));

	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
	    ZIO_FLAG_CANFAIL);

	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
	    dn->dn_object, level, blkid);
	dpa->dpa_curlevel = curlevel;
	dpa->dpa_prio = prio;
	dpa->dpa_aflags = aflags;
	dpa->dpa_spa = dn->dn_objset->os_spa;
	dpa->dpa_dnode = dn;
	dpa->dpa_epbs = epbs;
	dpa->dpa_zio = pio;

	/* flag if L2ARC eligible, l2arc_noprefetch then decides */
	if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
		dpa->dpa_aflags |= ARC_FLAG_L2CACHE;

	/*
	 * If we have the indirect just above us, no need to do the asynchronous
	 * prefetch chain; we'll just run the last step ourselves. If we're at
	 * a higher level, though, we want to issue the prefetches for all the
	 * indirect blocks asynchronously, so we can go on with whatever we were
	 * doing.
	 */
	if (curlevel == level) {
		ASSERT3U(curblkid, ==, blkid);
		dbuf_issue_final_prefetch(dpa, &bp);
		kmem_free(dpa, sizeof (*dpa));
	} else {
		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
		zbookmark_phys_t zb;

		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
		if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
			iter_aflags |= ARC_FLAG_L2CACHE;

		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
		    dn->dn_object, curlevel, curblkid);
		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		    &iter_aflags, &zb);
	}
	/*
	 * We use pio here instead of dpa_zio since it's possible that
	 * dpa may have already been freed.
	 */
	zio_nowait(pio);
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
    boolean_t fail_sparse, boolean_t fail_uncached,
    void *tag, dmu_buf_impl_t **dbp)
{
	/*
	 * Look up (or create) the dbuf for (dn, level, blkid) and take a
	 * hold on it for 'tag'.  fail_sparse: return ENOENT rather than
	 * instantiating a dbuf over a hole.  fail_uncached: return ENOENT
	 * unless the dbuf already exists and is DB_CACHED.
	 */
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* NOTE(review): no 'goto top' is visible in this chunk; label may be vestigial. */
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		if (fail_uncached)
			return (SET_ERROR(ENOENT));

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (fail_uncached && db->db_state != DB_CACHED) {
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(ENOENT));
	}

	if (db->db_buf != NULL) {
		/* Record the access for ARC's MRU/MFU accounting. */
		arc_buf_access(db->db_buf);
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_alloc_buf(dn->dn_objset->os_spa, db, type,
			    db->db.db_size));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	if (multilist_link_active(&db->db_cache_link)) {
		/* Taking a hold: pull the dbuf out of the unreferenced cache. */
		ASSERT(refcount_is_zero(&db->db_holds));
		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
		    db->db_caching_status == DB_DBUF_METADATA_CACHE);

		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
		(void) refcount_remove_many(
		    &dbuf_caches[db->db_caching_status].size,
		    db->db.db_size, db);

		db->db_caching_status = DB_NO_CACHE;
	}
	(void) refcount_add(&db->db_holds, tag);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	/* Convenience wrapper: hold a level-0 (data) dbuf. */
	return (dbuf_hold_level(dn, 0, blkid, tag));
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	/* Hold a dbuf at the given level; NULL on any error. */
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
	return (err ? NULL : db);
}

void
dbuf_create_bonus(dnode_t *dn)
{
	/* Create the dnode's bonus dbuf; caller holds dn_struct_rwlock as writer. */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}

int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
	/*
	 * Resize a spill block.  Only valid on the spill dbuf; a zero
	 * blksz is rounded up to SPA_MINBLOCKSIZE.
	 */
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	if (db->db_blkid != DMU_SPILL_BLKID)
		return (SET_ERROR(ENOTSUP));
	if (blksz == 0)
		blksz = SPA_MINBLOCKSIZE;
	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dbuf_new_size(db, blksz, tx);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(db);

	return (0);
}

void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	/* Free the dnode's spill block in this transaction. */
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	/* Add a hold to an already-held dbuf (hence holds must exceed 1). */
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT3S(holds, >, 1);
}

#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
boolean_t
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
    void *tag)
{
	/*
	 * Best-effort hold: succeeds only if the hash-table lookup for
	 * (os, obj, blkid) still resolves to this same dbuf and it has
	 * at least one non-dirty hold.  dbuf_find()/dbuf_find_bonus()
	 * return with db_mtx held, released here.
	 */
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dmu_buf_impl_t *found_db;
	boolean_t result = B_FALSE;

	if (db->db_blkid == DMU_BONUS_BLKID)
		found_db = dbuf_find_bonus(os, obj);
	else
		found_db = dbuf_find(os, obj, 0, blkid);

	if (found_db != NULL) {
		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
			(void) refcount_add(&db->db_holds, tag);
			result = B_TRUE;
		}
		mutex_exit(&db->db_mtx);
	}
	return (result);
}

/*
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode. (An indirect
 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
 * dnode's parent dbuf evicting its dnode handles.
2818219089Spjd */ 2819168404Spjdvoid 2820168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag) 2821168404Spjd{ 2822219089Spjd mutex_enter(&db->db_mtx); 2823339140Smav dbuf_rele_and_unlock(db, tag, B_FALSE); 2824219089Spjd} 2825219089Spjd 2826260150Sdelphijvoid 2827260150Sdelphijdmu_buf_rele(dmu_buf_t *db, void *tag) 2828260150Sdelphij{ 2829260150Sdelphij dbuf_rele((dmu_buf_impl_t *)db, tag); 2830260150Sdelphij} 2831260150Sdelphij 2832219089Spjd/* 2833219089Spjd * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2834339140Smav * db_dirtycnt and db_holds to be updated atomically. The 'evicting' 2835339140Smav * argument should be set if we are already in the dbuf-evicting code 2836339140Smav * path, in which case we don't want to recursively evict. This allows us to 2837339140Smav * avoid deeply nested stacks that would have a call flow similar to this: 2838339140Smav * 2839339140Smav * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() 2840339140Smav * ^ | 2841339140Smav * | | 2842339140Smav * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ 2843339140Smav * 2844219089Spjd */ 2845219089Spjdvoid 2846339140Smavdbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) 2847219089Spjd{ 2848168404Spjd int64_t holds; 2849168404Spjd 2850219089Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2851168404Spjd DBUF_VERIFY(db); 2852168404Spjd 2853219089Spjd /* 2854219089Spjd * Remove the reference to the dbuf before removing its hold on the 2855219089Spjd * dnode so we can guarantee in dnode_move() that a referenced bonus 2856219089Spjd * buffer has a corresponding dnode hold. 2857219089Spjd */ 2858168404Spjd holds = refcount_remove(&db->db_holds, tag); 2859168404Spjd ASSERT(holds >= 0); 2860168404Spjd 2861168404Spjd /* 2862168404Spjd * We can't freeze indirects if there is a possibility that they 2863168404Spjd * may be modified in the current syncing context. 
2864168404Spjd */ 2865307265Smav if (db->db_buf != NULL && 2866307265Smav holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { 2867168404Spjd arc_buf_freeze(db->db_buf); 2868307265Smav } 2869168404Spjd 2870168404Spjd if (holds == db->db_dirtycnt && 2871289309Smav db->db_level == 0 && db->db_user_immediate_evict) 2872168404Spjd dbuf_evict_user(db); 2873168404Spjd 2874168404Spjd if (holds == 0) { 2875219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2876286545Smav dnode_t *dn; 2877289309Smav boolean_t evict_dbuf = db->db_pending_evict; 2878219089Spjd 2879219089Spjd /* 2880286545Smav * If the dnode moves here, we cannot cross this 2881286545Smav * barrier until the move completes. 2882219089Spjd */ 2883219089Spjd DB_DNODE_ENTER(db); 2884286545Smav 2885286545Smav dn = DB_DNODE(db); 2886286545Smav atomic_dec_32(&dn->dn_dbufs_count); 2887286545Smav 2888286545Smav /* 2889286545Smav * Decrementing the dbuf count means that the bonus 2890286545Smav * buffer's dnode hold is no longer discounted in 2891286545Smav * dnode_move(). The dnode cannot move until after 2892289309Smav * the dnode_rele() below. 2893286545Smav */ 2894219089Spjd DB_DNODE_EXIT(db); 2895286545Smav 2896219089Spjd /* 2897286545Smav * Do not reference db after its lock is dropped. 2898286545Smav * Another thread may evict it. 2899219089Spjd */ 2900286545Smav mutex_exit(&db->db_mtx); 2901286545Smav 2902289309Smav if (evict_dbuf) 2903286545Smav dnode_evict_bonus(dn); 2904289309Smav 2905289309Smav dnode_rele(dn, db); 2906168404Spjd } else if (db->db_buf == NULL) { 2907168404Spjd /* 2908168404Spjd * This is a special case: we never associated this 2909168404Spjd * dbuf with any data allocated from the ARC. 2910168404Spjd */ 2911219089Spjd ASSERT(db->db_state == DB_UNCACHED || 2912219089Spjd db->db_state == DB_NOFILL); 2913307265Smav dbuf_destroy(db); 2914168404Spjd } else if (arc_released(db->db_buf)) { 2915168404Spjd /* 2916168404Spjd * This dbuf has anonymous data associated with it. 
2917168404Spjd */ 2918307265Smav dbuf_destroy(db); 2919168404Spjd } else { 2920307265Smav boolean_t do_arc_evict = B_FALSE; 2921307265Smav blkptr_t bp; 2922307265Smav spa_t *spa = dmu_objset_spa(db->db_objset); 2923242845Sdelphij 2924307265Smav if (!DBUF_IS_CACHEABLE(db) && 2925307265Smav db->db_blkptr != NULL && 2926307265Smav !BP_IS_HOLE(db->db_blkptr) && 2927307265Smav !BP_IS_EMBEDDED(db->db_blkptr)) { 2928307265Smav do_arc_evict = B_TRUE; 2929307265Smav bp = *db->db_blkptr; 2930307265Smav } 2931307265Smav 2932307265Smav if (!DBUF_IS_CACHEABLE(db) || 2933307265Smav db->db_pending_evict) { 2934307265Smav dbuf_destroy(db); 2935307265Smav } else if (!multilist_link_active(&db->db_cache_link)) { 2936339109Smav ASSERT3U(db->db_caching_status, ==, 2937339109Smav DB_NO_CACHE); 2938339109Smav 2939339109Smav dbuf_cached_state_t dcs = 2940339109Smav dbuf_include_in_metadata_cache(db) ? 2941339109Smav DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; 2942339109Smav db->db_caching_status = dcs; 2943339109Smav 2944339109Smav multilist_insert(dbuf_caches[dcs].cache, db); 2945339109Smav (void) refcount_add_many(&dbuf_caches[dcs].size, 2946307265Smav db->db.db_size, db); 2947185029Spjd mutex_exit(&db->db_mtx); 2948307265Smav 2949339140Smav if (db->db_caching_status == DB_DBUF_CACHE && 2950339140Smav !evicting) { 2951339109Smav dbuf_evict_notify(); 2952339109Smav } 2953268858Sdelphij } 2954307265Smav 2955307265Smav if (do_arc_evict) 2956307265Smav arc_freed(spa, &bp); 2957168404Spjd } 2958168404Spjd } else { 2959168404Spjd mutex_exit(&db->db_mtx); 2960168404Spjd } 2961307265Smav 2962168404Spjd} 2963168404Spjd 2964168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount 2965168404Spjduint64_t 2966168404Spjddbuf_refcount(dmu_buf_impl_t *db) 2967168404Spjd{ 2968168404Spjd return (refcount_count(&db->db_holds)); 2969168404Spjd} 2970168404Spjd 2971168404Spjdvoid * 2972286575Smavdmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, 2973286575Smav dmu_buf_user_t *new_user) 
{
	/*
	 * Atomically swap the dbuf's user record: install new_user only if
	 * the current user is old_user.  Returns the user that was attached
	 * before the call (== old_user on success).
	 */
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	mutex_enter(&db->db_mtx);
	dbuf_verify_user(db, DBVU_NOT_EVICTING);
	if (db->db_user == old_user)
		db->db_user = new_user;
	else
		old_user = db->db_user;
	dbuf_verify_user(db, DBVU_NOT_EVICTING);
	mutex_exit(&db->db_mtx);

	return (old_user);
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
	/* Attach 'user' only if no user is currently set. */
	return (dmu_buf_replace_user(db_fake, NULL, user));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
	/* Same as dmu_buf_set_user(), but request immediate user eviction. */
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_user_immediate_evict = TRUE;
	return (dmu_buf_set_user(db_fake, user));
}

void *
dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
	/* Detach 'user' only if it is the currently-attached user. */
	return (dmu_buf_replace_user(db_fake, user, NULL));
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	/* Return the currently-attached user record, if any. */
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	dbuf_verify_user(db, DBVU_NOT_EVICTING);
	return (db->db_user);
}

void
dmu_buf_user_evict_wait()
{
	/* Wait for all queued user-eviction callbacks to complete. */
	taskq_wait(dbu_evict_taskq);
}

blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_blkptr);
}

objset_t *
dmu_buf_get_objset(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_objset);
}

dnode_t *
dmu_buf_dnode_enter(dmu_buf_t *db)
{
	/* Pin the dnode handle; pair with dmu_buf_dnode_exit(). */
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	DB_DNODE_ENTER(dbi);
	return (DB_DNODE(dbi));
}

void
dmu_buf_dnode_exit(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	DB_DNODE_EXIT(dbi);
}

/*
 * Ensure db->db_blkptr points at this dbuf's slot in its parent (either
 * an indirect block or the dnode's blkptr/spill array), hooking up
 * db_parent along the way if needed.
 */
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			/*
			 * db_mtx must be dropped across dbuf_hold_level()
			 * (which takes dn_struct_rwlock) and re-taken after.
			 */
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			parent = dbuf_hold_level(dn, db->db_level + 1,
			    db->db_blkid >> epbs, db);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

/*
 * Sync a dirty indirect block: make sure it is read in and hooked up,
 * write it, then recursively sync its dirty children one level down.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

/*
 * Sync a dirty level-0 (leaf) block.  (Continues past this chunk.)
 */
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we
	 * might have been freed after the dirty.
3163168404Spjd */ 3164168404Spjd if (db->db_state == DB_UNCACHED) { 3165168404Spjd /* This buffer has been freed since it was dirtied */ 3166168404Spjd ASSERT(db->db.db_data == NULL); 3167168404Spjd } else if (db->db_state == DB_FILL) { 3168168404Spjd /* This buffer was freed and is now being re-filled */ 3169168404Spjd ASSERT(db->db.db_data != dr->dt.dl.dr_data); 3170168404Spjd } else { 3171219089Spjd ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 3172168404Spjd } 3173168404Spjd DBUF_VERIFY(db); 3174168404Spjd 3175219089Spjd DB_DNODE_ENTER(db); 3176219089Spjd dn = DB_DNODE(db); 3177219089Spjd 3178219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 3179219089Spjd mutex_enter(&dn->dn_mtx); 3180219089Spjd dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 3181219089Spjd mutex_exit(&dn->dn_mtx); 3182219089Spjd } 3183219089Spjd 3184168404Spjd /* 3185168404Spjd * If this is a bonus buffer, simply copy the bonus data into the 3186168404Spjd * dnode. It will be written out when the dnode is synced (and it 3187168404Spjd * will be synced, since it must have been dirty for dbuf_sync to 3188168404Spjd * be called). 
3189168404Spjd */ 3190219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 3191168404Spjd dbuf_dirty_record_t **drp; 3192185029Spjd 3193168404Spjd ASSERT(*datap != NULL); 3194240415Smm ASSERT0(db->db_level); 3195168404Spjd ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 3196168404Spjd bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 3197219089Spjd DB_DNODE_EXIT(db); 3198219089Spjd 3199185029Spjd if (*datap != db->db.db_data) { 3200168404Spjd zio_buf_free(*datap, DN_MAX_BONUSLEN); 3201208373Smm arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 3202185029Spjd } 3203168404Spjd db->db_data_pending = NULL; 3204168404Spjd drp = &db->db_last_dirty; 3205168404Spjd while (*drp != dr) 3206168404Spjd drp = &(*drp)->dr_next; 3207185029Spjd ASSERT(dr->dr_next == NULL); 3208219089Spjd ASSERT(dr->dr_dbuf == db); 3209185029Spjd *drp = dr->dr_next; 3210169325Spjd if (dr->dr_dbuf->db_level != 0) { 3211169325Spjd list_destroy(&dr->dt.di.dr_children); 3212169325Spjd mutex_destroy(&dr->dt.di.dr_mtx); 3213169325Spjd } 3214168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3215168404Spjd ASSERT(db->db_dirtycnt > 0); 3216168404Spjd db->db_dirtycnt -= 1; 3217339140Smav dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); 3218168404Spjd return; 3219168404Spjd } 3220168404Spjd 3221219089Spjd os = dn->dn_objset; 3222219089Spjd 3223168404Spjd /* 3224185029Spjd * This function may have dropped the db_mtx lock allowing a dmu_sync 3225185029Spjd * operation to sneak in. As a result, we need to ensure that we 3226185029Spjd * don't check the dr_override_state until we have returned from 3227185029Spjd * dbuf_check_blkptr. 3228185029Spjd */ 3229185029Spjd dbuf_check_blkptr(dn, db); 3230185029Spjd 3231185029Spjd /* 3232219089Spjd * If this buffer is in the middle of an immediate write, 3233168404Spjd * wait for the synchronous IO to complete. 
3234168404Spjd */ 3235168404Spjd while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 3236168404Spjd ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 3237168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 3238168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 3239168404Spjd } 3240168404Spjd 3241219089Spjd if (db->db_state != DB_NOFILL && 3242219089Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 3243208050Smm refcount_count(&db->db_holds) > 1 && 3244219089Spjd dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 3245208050Smm *datap == db->db_buf) { 3246168404Spjd /* 3247208050Smm * If this buffer is currently "in use" (i.e., there 3248208050Smm * are active holds and db_data still references it), 3249208050Smm * then make a copy before we start the write so that 3250208050Smm * any modifications from the open txg will not leak 3251208050Smm * into this write. 3252168404Spjd * 3253208050Smm * NOTE: this copy does not need to be made for 3254208050Smm * objects only modified in the syncing context (e.g. 3255208050Smm * DNONE_DNODE blocks). 
3256168404Spjd */ 3257321535Smav int psize = arc_buf_size(*datap); 3258208050Smm arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 3259321535Smav enum zio_compress compress_type = arc_get_compression(*datap); 3260321535Smav 3261321535Smav if (compress_type == ZIO_COMPRESS_OFF) { 3262321535Smav *datap = arc_alloc_buf(os->os_spa, db, type, psize); 3263321535Smav } else { 3264321535Smav ASSERT3U(type, ==, ARC_BUFC_DATA); 3265321535Smav int lsize = arc_buf_lsize(*datap); 3266321535Smav *datap = arc_alloc_compressed_buf(os->os_spa, db, 3267321535Smav psize, lsize, compress_type); 3268321535Smav } 3269321535Smav bcopy(db->db.db_data, (*datap)->b_data, psize); 3270168404Spjd } 3271168404Spjd db->db_data_pending = dr; 3272168404Spjd 3273168404Spjd mutex_exit(&db->db_mtx); 3274168404Spjd 3275185029Spjd dbuf_write(dr, *datap, tx); 3276168404Spjd 3277168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 3278219089Spjd if (dn->dn_object == DMU_META_DNODE_OBJECT) { 3279168404Spjd list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 3280219089Spjd DB_DNODE_EXIT(db); 3281219089Spjd } else { 3282219089Spjd /* 3283219089Spjd * Although zio_nowait() does not "wait for an IO", it does 3284219089Spjd * initiate the IO. If this is an empty write it seems plausible 3285219089Spjd * that the IO could actually be completed before the nowait 3286219089Spjd * returns. We need to DB_DNODE_EXIT() first in case 3287219089Spjd * zio_nowait() invalidates the dbuf. 
3288219089Spjd */ 3289219089Spjd DB_DNODE_EXIT(db); 3290168404Spjd zio_nowait(dr->dr_zio); 3291219089Spjd } 3292168404Spjd} 3293168404Spjd 3294168404Spjdvoid 3295284593Savgdbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) 3296168404Spjd{ 3297168404Spjd dbuf_dirty_record_t *dr; 3298168404Spjd 3299168404Spjd while (dr = list_head(list)) { 3300168404Spjd if (dr->dr_zio != NULL) { 3301168404Spjd /* 3302168404Spjd * If we find an already initialized zio then we 3303168404Spjd * are processing the meta-dnode, and we have finished. 3304168404Spjd * The dbufs for all dnodes are put back on the list 3305168404Spjd * during processing, so that we can zio_wait() 3306168404Spjd * these IOs after initiating all child IOs. 3307168404Spjd */ 3308168404Spjd ASSERT3U(dr->dr_dbuf->db.db_object, ==, 3309168404Spjd DMU_META_DNODE_OBJECT); 3310168404Spjd break; 3311168404Spjd } 3312284593Savg if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && 3313284593Savg dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { 3314284593Savg VERIFY3U(dr->dr_dbuf->db_level, ==, level); 3315284593Savg } 3316168404Spjd list_remove(list, dr); 3317168404Spjd if (dr->dr_dbuf->db_level > 0) 3318168404Spjd dbuf_sync_indirect(dr, tx); 3319168404Spjd else 3320168404Spjd dbuf_sync_leaf(dr, tx); 3321168404Spjd } 3322168404Spjd} 3323168404Spjd 3324168404Spjd/* ARGSUSED */ 3325168404Spjdstatic void 3326168404Spjddbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 3327168404Spjd{ 3328168404Spjd dmu_buf_impl_t *db = vdb; 3329219089Spjd dnode_t *dn; 3330185029Spjd blkptr_t *bp = zio->io_bp; 3331168404Spjd blkptr_t *bp_orig = &zio->io_bp_orig; 3332219089Spjd spa_t *spa = zio->io_spa; 3333219089Spjd int64_t delta; 3334168404Spjd uint64_t fill = 0; 3335219089Spjd int i; 3336168404Spjd 3337304137Savg ASSERT3P(db->db_blkptr, !=, NULL); 3338304137Savg ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); 3339185029Spjd 3340219089Spjd DB_DNODE_ENTER(db); 3341219089Spjd dn = DB_DNODE(db); 3342219089Spjd delta = bp_get_dsize_sync(spa, 
	    bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
		    BP_IS_EMBEDDED(bp));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(bp)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	/*
	 * Compute the bp's fill count: for a dnode block, the number of
	 * allocated dnodes; for other level-0 blocks, 0 or 1; for an
	 * indirect block, the sum of the children's fill counts.
	 */
	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += BP_GET_FILL(ibp);
		}
	}
	DB_DNODE_EXIT(db);

	if (!BP_IS_EMBEDDED(bp))
		bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);

	/*
	 * Publish the finished bp into the parent's copy under the
	 * struct_rwlock so readers never see a half-updated blkptr.
	 */
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	*db->db_blkptr = *bp;
	rw_exit(&dn->dn_struct_rwlock);
}

/* ARGSUSED */
/*
 * This function gets called just prior to running through the compression
 * stage of the zio pipeline. If we're an indirect block comprised of only
 * holes, then we want this indirect to be compressed away to a hole. In
 * order to do that we must zero out any information about the holes that
 * this indirect points to prior to trying to compress it.
 */
static void
dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp;
	unsigned int epbs, i;

	ASSERT3U(db->db_level, >, 0);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	ASSERT3U(epbs, <, 31);

	/* Determine if all our children are holes */
	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
		if (!BP_IS_HOLE(bp))
			break;
	}

	/*
	 * If all the children are holes, then zero them all out so that
	 * we may get compressed away.
	 */
	if (i == 1 << epbs) {
		/*
		 * We only found holes. Grab the rwlock to prevent
		 * anybody from reading the blocks we're about to
		 * zero out.
		 */
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		bzero(db->db.db_data, db->db.db_size);
		rw_exit(&dn->dn_struct_rwlock);
	}
	DB_DNODE_EXIT(db);
}

/*
 * The SPA will call this callback several times for each zio - once
 * for every physical child i/o (zio->io_phys_children times). This
 * allows the DMU to monitor the progress of each logical i/o. For example,
 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
 * block. There may be a long delay before all copies/fragments are completed,
 * so this callback allows us to retire dirty space gradually, as the physical
 * i/os complete.
 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times.  Retire one
	 * portion of our dirty space each time we are called.  Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
	 */
	delta = dr->dr_accounted / zio->io_phys_children;
	dsl_pool_undirty_space(dp, delta, zio->io_txg);
}

/*
 * Write "done" callback: the block is on disk.  Perform dataset block
 * birth/kill accounting, retire the dirty record, and release the hold
 * taken when the dbuf was dirtied.
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Unlink this dirty record from the dbuf's dirty list. */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			/* Free the private copy made by dbuf_sync_leaf(). */
			if (dr->dt.dl.dr_data != db->db_buf)
				arc_buf_destroy(dr->dt.dl.dr_data, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	/* Wake anyone waiting in dbuf_sync_leaf()'s DR_IN_DMU_SYNC loop. */
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	/* Drop the hold that dbuf_dirty() took for this txg. */
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
}

/* zio-level wrapper for dbuf_write_ready() used by NOFILL writes. */
static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}

/* zio-level wrapper for dbuf_write_done() used by NOFILL writes. */
static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}

/* zio-level "ready" wrapper for overridden (dmu_sync/embedded) writes. */
static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}

/*
 * zio-level "done" wrapper for overridden writes: if the final bp differs
 * from the bp provided in open context, free the stale override bp and
 * release the arc buffer before the normal done processing.
 */
static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);
	dbuf_write_done(zio, NULL, db);

	/* Release the borrowed abd created in dbuf_write(). */
	if (zio->io_abd != NULL)
		abd_put(zio->io_abd);
}

/* Context passed from dbuf_remap_impl() to its per-mapping callback. */
typedef struct dbuf_remap_impl_callback_arg {
	objset_t	*drica_os;
	uint64_t	drica_blk_birth;
	dmu_tx_t	*drica_tx;
} dbuf_remap_impl_callback_arg_t;

/*
 * Per-mapping callback for spa_remap_blkptr(): record that the old
 * (indirect-vdev) segment is now obsolete.
 */
static void
dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
    void *arg)
{
	dbuf_remap_impl_callback_arg_t *drica = arg;
	objset_t *os = drica->drica_os;
	spa_t *spa = dmu_objset_spa(os);
	dmu_tx_t *tx = drica->drica_tx;

	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));

	if (os == spa_meta_objset(spa)) {
		spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
	} else {
		dsl_dataset_block_remapped(dmu_objset_ds(os), vdev,
		    offset,
		    size, drica->drica_blk_birth, tx);
	}
}

/*
 * Remap a single bp through any indirect (removed) vdevs, updating *bp in
 * place if the remap changed it.
 */
static void
dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
{
	blkptr_t bp_copy = *bp;
	spa_t *spa = dmu_objset_spa(dn->dn_objset);
	dbuf_remap_impl_callback_arg_t drica;

	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));

	drica.drica_os = dn->dn_objset;
	drica.drica_blk_birth = bp->blk_birth;
	drica.drica_tx = tx;
	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
	    &drica)) {
		/*
		 * The struct_rwlock prevents dbuf_read_impl() from
		 * dereferencing the BP while we are changing it.  To
		 * avoid lock contention, only grab it when we are actually
		 * changing the BP.
		 */
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		*bp = bp_copy;
		rw_exit(&dn->dn_struct_rwlock);
	}
}

/*
 * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting
 * to remap a copy of every bp in the dbuf.
 */
boolean_t
dbuf_can_remap(const dmu_buf_impl_t *db)
{
	spa_t *spa = dmu_objset_spa(db->db_objset);
	blkptr_t *bp = db->db.db_data;
	boolean_t ret = B_FALSE;

	/* Only indirect blocks hold an array of bps to probe. */
	ASSERT3U(db->db_level, >, 0);
	ASSERT3S(db->db_state, ==, DB_CACHED);

	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
		/* Probe with a copy; NULL callback means "don't commit". */
		blkptr_t bp_copy = bp[i];
		if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
			ret = B_TRUE;
			break;
		}
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (ret);
}

/*
 * Returns true if any of the dnode's direct block pointers would be
 * modified by a remap.  Same probe technique as dbuf_can_remap().
 */
boolean_t
dnode_needs_remap(const dnode_t *dn)
{
	spa_t *spa = dmu_objset_spa(dn->dn_objset);
	boolean_t ret = B_FALSE;

	if (dn->dn_phys->dn_nlevels == 0) {
		return (B_FALSE);
	}

	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
		blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
		if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
			ret = B_TRUE;
			break;
		}
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (ret);
}

/*
 * Remap any existing BP's to concrete vdevs, if possible.
 */
static void
dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(db->db_objset);
	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));

	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
		return;

	if (db->db_level > 0) {
		/* An indirect block: remap each child bp. */
		blkptr_t *bp = db->db.db_data;
		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
			dbuf_remap_impl(dn, &bp[i], tx);
		}
	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		/* A dnode block: remap every bp of every dnode in it. */
		dnode_phys_t *dnp = db->db.db_data;
		ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
		    DMU_OT_DNODE);
		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) {
			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
			}
		}
	}
}


/* Issue I/O to commit a dirty buffer to disk.
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	ASSERT(dmu_tx_is_syncing(tx));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
			dbuf_remap(dn, db, tx);
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * We copy the blkptr now (rather than when we instantiate the dirty
	 * record), because its value can change between open context and
	 * syncing context. We do not need to hold dn_struct_rwlock to read
	 * db_blkptr because we are in syncing context.
	 */
	dr->dr_bp_copy = *db->db_blkptr;

	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync() or dmu_buf_write_embedded()).
		 */
		abd_t *contents = (data != NULL) ?
		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;

		dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
		    contents, db->db.db_size, db->db.db_size, &zp,
		    dbuf_write_override_ready, NULL, NULL,
		    dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		/* A NOFILL write carries no data through the pipeline. */
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, NULL,
		    dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));

		/*
		 * For indirect blocks, we want to setup the children
		 * ready callback so that we can properly handle an indirect
		 * block that only contains holes.
		 */
		arc_write_done_func_t *children_ready_cb = NULL;
		if (db->db_level != 0)
			children_ready_cb = dbuf_write_children_ready;

		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
		    &zp, dbuf_write_ready, children_ready_cb,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}