dbuf.c revision 168696
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22168404Spjd * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 
24168404Spjd */ 25168404Spjd 26168404Spjd#pragma ident "%Z%%M% %I% %E% SMI" 27168404Spjd 28168404Spjd#include <sys/zfs_context.h> 29168404Spjd#include <sys/dmu.h> 30168404Spjd#include <sys/dmu_impl.h> 31168404Spjd#include <sys/dbuf.h> 32168404Spjd#include <sys/dmu_objset.h> 33168404Spjd#include <sys/dsl_dataset.h> 34168404Spjd#include <sys/dsl_dir.h> 35168404Spjd#include <sys/dmu_tx.h> 36168404Spjd#include <sys/spa.h> 37168404Spjd#include <sys/zio.h> 38168404Spjd#include <sys/dmu_zfetch.h> 39168404Spjd 40168404Spjdstatic void dbuf_destroy(dmu_buf_impl_t *db); 41168404Spjdstatic int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 42168404Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, 43168404Spjd int compress, dmu_tx_t *tx); 44168404Spjdstatic arc_done_func_t dbuf_write_ready; 45168404Spjdstatic arc_done_func_t dbuf_write_done; 46168404Spjd 47168404Spjdint zfs_mdcomp_disable = 0; 48168404SpjdSYSCTL_DECL(_vfs_zfs); 49168404SpjdTUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); 50168404SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, 51168404Spjd &zfs_mdcomp_disable, 0, "Disable metadata compression"); 52168404Spjd 53168404Spjd/* 54168404Spjd * Global data structures and functions for the dbuf cache. 
55168404Spjd */ 56168404Spjdstatic kmem_cache_t *dbuf_cache; 57168404Spjd 58168404Spjd/* ARGSUSED */ 59168404Spjdstatic int 60168404Spjddbuf_cons(void *vdb, void *unused, int kmflag) 61168404Spjd{ 62168404Spjd dmu_buf_impl_t *db = vdb; 63168404Spjd bzero(db, sizeof (dmu_buf_impl_t)); 64168404Spjd 65168404Spjd mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 66168404Spjd cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 67168404Spjd refcount_create(&db->db_holds); 68168404Spjd return (0); 69168404Spjd} 70168404Spjd 71168404Spjd/* ARGSUSED */ 72168404Spjdstatic void 73168404Spjddbuf_dest(void *vdb, void *unused) 74168404Spjd{ 75168404Spjd dmu_buf_impl_t *db = vdb; 76168404Spjd mutex_destroy(&db->db_mtx); 77168404Spjd cv_destroy(&db->db_changed); 78168404Spjd refcount_destroy(&db->db_holds); 79168404Spjd} 80168404Spjd 81168404Spjd/* 82168404Spjd * dbuf hash table routines 83168404Spjd */ 84168404Spjdstatic dbuf_hash_table_t dbuf_hash_table; 85168404Spjd 86168404Spjdstatic uint64_t dbuf_hash_count; 87168404Spjd 88168404Spjdstatic uint64_t 89168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 90168404Spjd{ 91168404Spjd uintptr_t osv = (uintptr_t)os; 92168404Spjd uint64_t crc = -1ULL; 93168404Spjd 94168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 95168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 96168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 97168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 98168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 99168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 100168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 101168404Spjd 102168404Spjd crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 103168404Spjd 104168404Spjd return (crc); 105168404Spjd} 106168404Spjd 107168404Spjd#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 108168404Spjd 
109168404Spjd#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 110168404Spjd ((dbuf)->db.db_object == (obj) && \ 111168404Spjd (dbuf)->db_objset == (os) && \ 112168404Spjd (dbuf)->db_level == (level) && \ 113168404Spjd (dbuf)->db_blkid == (blkid)) 114168404Spjd 115168404Spjddmu_buf_impl_t * 116168404Spjddbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 117168404Spjd{ 118168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 119168404Spjd objset_impl_t *os = dn->dn_objset; 120168404Spjd uint64_t obj = dn->dn_object; 121168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 122168404Spjd uint64_t idx = hv & h->hash_table_mask; 123168404Spjd dmu_buf_impl_t *db; 124168404Spjd 125168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 126168404Spjd for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 127168404Spjd if (DBUF_EQUAL(db, os, obj, level, blkid)) { 128168404Spjd mutex_enter(&db->db_mtx); 129168404Spjd if (db->db_state != DB_EVICTING) { 130168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 131168404Spjd return (db); 132168404Spjd } 133168404Spjd mutex_exit(&db->db_mtx); 134168404Spjd } 135168404Spjd } 136168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 137168404Spjd return (NULL); 138168404Spjd} 139168404Spjd 140168404Spjd/* 141168404Spjd * Insert an entry into the hash table. If there is already an element 142168404Spjd * equal to elem in the hash table, then the already existing element 143168404Spjd * will be returned and the new element will not be inserted. 144168404Spjd * Otherwise returns NULL. 
145168404Spjd */ 146168404Spjdstatic dmu_buf_impl_t * 147168404Spjddbuf_hash_insert(dmu_buf_impl_t *db) 148168404Spjd{ 149168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 150168404Spjd objset_impl_t *os = db->db_objset; 151168404Spjd uint64_t obj = db->db.db_object; 152168404Spjd int level = db->db_level; 153168404Spjd uint64_t blkid = db->db_blkid; 154168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 155168404Spjd uint64_t idx = hv & h->hash_table_mask; 156168404Spjd dmu_buf_impl_t *dbf; 157168404Spjd 158168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 159168404Spjd for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 160168404Spjd if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 161168404Spjd mutex_enter(&dbf->db_mtx); 162168404Spjd if (dbf->db_state != DB_EVICTING) { 163168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 164168404Spjd return (dbf); 165168404Spjd } 166168404Spjd mutex_exit(&dbf->db_mtx); 167168404Spjd } 168168404Spjd } 169168404Spjd 170168404Spjd mutex_enter(&db->db_mtx); 171168404Spjd db->db_hash_next = h->hash_table[idx]; 172168404Spjd h->hash_table[idx] = db; 173168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 174168404Spjd atomic_add_64(&dbuf_hash_count, 1); 175168404Spjd 176168404Spjd return (NULL); 177168404Spjd} 178168404Spjd 179168404Spjd/* 180168404Spjd * Remove an entry from the hash table. This operation will 181168404Spjd * fail if there are any existing holds on the db. 182168404Spjd */ 183168404Spjdstatic void 184168404Spjddbuf_hash_remove(dmu_buf_impl_t *db) 185168404Spjd{ 186168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 187168404Spjd uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 188168404Spjd db->db_level, db->db_blkid); 189168404Spjd uint64_t idx = hv & h->hash_table_mask; 190168404Spjd dmu_buf_impl_t *dbf, **dbp; 191168404Spjd 192168404Spjd /* 193168404Spjd * We musn't hold db_mtx to maintin lock ordering: 194168404Spjd * DBUF_HASH_MUTEX > db_mtx. 
195168404Spjd */ 196168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 197168404Spjd ASSERT(db->db_state == DB_EVICTING); 198168404Spjd ASSERT(!MUTEX_HELD(&db->db_mtx)); 199168404Spjd 200168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 201168404Spjd dbp = &h->hash_table[idx]; 202168404Spjd while ((dbf = *dbp) != db) { 203168404Spjd dbp = &dbf->db_hash_next; 204168404Spjd ASSERT(dbf != NULL); 205168404Spjd } 206168404Spjd *dbp = db->db_hash_next; 207168404Spjd db->db_hash_next = NULL; 208168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 209168404Spjd atomic_add_64(&dbuf_hash_count, -1); 210168404Spjd} 211168404Spjd 212168404Spjdstatic arc_evict_func_t dbuf_do_evict; 213168404Spjd 214168404Spjdstatic void 215168404Spjddbuf_evict_user(dmu_buf_impl_t *db) 216168404Spjd{ 217168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 218168404Spjd 219168404Spjd if (db->db_level != 0 || db->db_evict_func == NULL) 220168404Spjd return; 221168404Spjd 222168404Spjd if (db->db_user_data_ptr_ptr) 223168404Spjd *db->db_user_data_ptr_ptr = db->db.db_data; 224168404Spjd db->db_evict_func(&db->db, db->db_user_ptr); 225168404Spjd db->db_user_ptr = NULL; 226168404Spjd db->db_user_data_ptr_ptr = NULL; 227168404Spjd db->db_evict_func = NULL; 228168404Spjd} 229168404Spjd 230168404Spjdvoid 231168404Spjddbuf_evict(dmu_buf_impl_t *db) 232168404Spjd{ 233168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 234168404Spjd ASSERT(db->db_buf == NULL); 235168404Spjd ASSERT(db->db_data_pending == NULL); 236168404Spjd 237168404Spjd dbuf_clear(db); 238168404Spjd dbuf_destroy(db); 239168404Spjd} 240168404Spjd 241168404Spjdvoid 242168404Spjddbuf_init(void) 243168404Spjd{ 244168404Spjd uint64_t hsize = 1ULL << 16; 245168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 246168404Spjd int i; 247168404Spjd 248168404Spjd /* 249168404Spjd * The hash table is big enough to fill all of physical memory 250168404Spjd * with an average 4K block size. The table will take up 251168404Spjd * totalmem*sizeof(void*)/4K (i.e. 
2MB/GB with 8-byte pointers). 252168404Spjd */ 253168696Spjd while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 254168404Spjd hsize <<= 1; 255168404Spjd 256168404Spjdretry: 257168404Spjd h->hash_table_mask = hsize - 1; 258168404Spjd h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 259168404Spjd if (h->hash_table == NULL) { 260168404Spjd /* XXX - we should really return an error instead of assert */ 261168404Spjd ASSERT(hsize > (1ULL << 10)); 262168404Spjd hsize >>= 1; 263168404Spjd goto retry; 264168404Spjd } 265168404Spjd 266168404Spjd dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 267168404Spjd sizeof (dmu_buf_impl_t), 268168404Spjd 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 269168404Spjd 270168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 271168404Spjd mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 272168404Spjd} 273168404Spjd 274168404Spjdvoid 275168404Spjddbuf_fini(void) 276168404Spjd{ 277168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 278168404Spjd int i; 279168404Spjd 280168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 281168404Spjd mutex_destroy(&h->hash_mutexes[i]); 282168404Spjd kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 283168404Spjd kmem_cache_destroy(dbuf_cache); 284168404Spjd} 285168404Spjd 286168404Spjd/* 287168404Spjd * Other stuff. 
288168404Spjd */ 289168404Spjd 290168404Spjd#ifdef ZFS_DEBUG 291168404Spjdstatic void 292168404Spjddbuf_verify(dmu_buf_impl_t *db) 293168404Spjd{ 294168404Spjd dnode_t *dn = db->db_dnode; 295168404Spjd 296168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 297168404Spjd 298168404Spjd if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 299168404Spjd return; 300168404Spjd 301168404Spjd ASSERT(db->db_objset != NULL); 302168404Spjd if (dn == NULL) { 303168404Spjd ASSERT(db->db_parent == NULL); 304168404Spjd ASSERT(db->db_blkptr == NULL); 305168404Spjd } else { 306168404Spjd ASSERT3U(db->db.db_object, ==, dn->dn_object); 307168404Spjd ASSERT3P(db->db_objset, ==, dn->dn_objset); 308168404Spjd ASSERT3U(db->db_level, <, dn->dn_nlevels); 309168404Spjd ASSERT(db->db_blkid == DB_BONUS_BLKID || 310168404Spjd list_head(&dn->dn_dbufs)); 311168404Spjd } 312168404Spjd if (db->db_blkid == DB_BONUS_BLKID) { 313168404Spjd ASSERT(dn != NULL); 314168404Spjd ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); 315168404Spjd ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 316168404Spjd } else { 317168404Spjd ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 318168404Spjd } 319168404Spjd 320168404Spjd if (db->db_level == 0) { 321168404Spjd /* we can be momentarily larger in dnode_set_blksz() */ 322168404Spjd if (db->db_blkid != DB_BONUS_BLKID && dn) { 323168404Spjd ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); 324168404Spjd } 325168404Spjd if (db->db.db_object == DMU_META_DNODE_OBJECT) { 326168404Spjd dbuf_dirty_record_t *dr = db->db_data_pending; 327168404Spjd /* 328168404Spjd * it should only be modified in syncing 329168404Spjd * context, so make sure we only have 330168404Spjd * one copy of the data. 
331168404Spjd */ 332168404Spjd ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 333168404Spjd } 334168404Spjd } 335168404Spjd 336168404Spjd /* verify db->db_blkptr */ 337168404Spjd if (db->db_blkptr) { 338168404Spjd if (db->db_parent == dn->dn_dbuf) { 339168404Spjd /* db is pointed to by the dnode */ 340168404Spjd /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 341168404Spjd if (db->db.db_object == DMU_META_DNODE_OBJECT) 342168404Spjd ASSERT(db->db_parent == NULL); 343168404Spjd else 344168404Spjd ASSERT(db->db_parent != NULL); 345168404Spjd ASSERT3P(db->db_blkptr, ==, 346168404Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 347168404Spjd } else { 348168404Spjd /* db is pointed to by an indirect block */ 349168404Spjd int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 350168404Spjd ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 351168404Spjd ASSERT3U(db->db_parent->db.db_object, ==, 352168404Spjd db->db.db_object); 353168404Spjd /* 354168404Spjd * dnode_grow_indblksz() can make this fail if we don't 355168404Spjd * have the struct_rwlock. XXX indblksz no longer 356168404Spjd * grows. safe to do this now? 357168404Spjd */ 358168404Spjd if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 359168404Spjd ASSERT3P(db->db_blkptr, ==, 360168404Spjd ((blkptr_t *)db->db_parent->db.db_data + 361168404Spjd db->db_blkid % epb)); 362168404Spjd } 363168404Spjd } 364168404Spjd } 365168404Spjd if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 366168404Spjd db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 367168404Spjd db->db_state != DB_FILL && !dn->dn_free_txg) { 368168404Spjd /* 369168404Spjd * If the blkptr isn't set but they have nonzero data, 370168404Spjd * it had better be dirty, otherwise we'll lose that 371168404Spjd * data when we evict this buffer. 
372168404Spjd */ 373168404Spjd if (db->db_dirtycnt == 0) { 374168404Spjd uint64_t *buf = db->db.db_data; 375168404Spjd int i; 376168404Spjd 377168404Spjd for (i = 0; i < db->db.db_size >> 3; i++) { 378168404Spjd ASSERT(buf[i] == 0); 379168404Spjd } 380168404Spjd } 381168404Spjd } 382168404Spjd} 383168404Spjd#endif 384168404Spjd 385168404Spjdstatic void 386168404Spjddbuf_update_data(dmu_buf_impl_t *db) 387168404Spjd{ 388168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 389168404Spjd if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 390168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 391168404Spjd *db->db_user_data_ptr_ptr = db->db.db_data; 392168404Spjd } 393168404Spjd} 394168404Spjd 395168404Spjdstatic void 396168404Spjddbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 397168404Spjd{ 398168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 399168404Spjd ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 400168404Spjd db->db_buf = buf; 401168404Spjd if (buf != NULL) { 402168404Spjd ASSERT(buf->b_data != NULL); 403168404Spjd db->db.db_data = buf->b_data; 404168404Spjd if (!arc_released(buf)) 405168404Spjd arc_set_callback(buf, dbuf_do_evict, db); 406168404Spjd dbuf_update_data(db); 407168404Spjd } else { 408168404Spjd dbuf_evict_user(db); 409168404Spjd db->db.db_data = NULL; 410168404Spjd db->db_state = DB_UNCACHED; 411168404Spjd } 412168404Spjd} 413168404Spjd 414168404Spjduint64_t 415168404Spjddbuf_whichblock(dnode_t *dn, uint64_t offset) 416168404Spjd{ 417168404Spjd if (dn->dn_datablkshift) { 418168404Spjd return (offset >> dn->dn_datablkshift); 419168404Spjd } else { 420168404Spjd ASSERT3U(offset, <, dn->dn_datablksz); 421168404Spjd return (0); 422168404Spjd } 423168404Spjd} 424168404Spjd 425168404Spjdstatic void 426168404Spjddbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 427168404Spjd{ 428168404Spjd dmu_buf_impl_t *db = vdb; 429168404Spjd 430168404Spjd mutex_enter(&db->db_mtx); 431168404Spjd ASSERT3U(db->db_state, ==, DB_READ); 432168404Spjd /* 
433168404Spjd * All reads are synchronous, so we must have a hold on the dbuf 434168404Spjd */ 435168404Spjd ASSERT(refcount_count(&db->db_holds) > 0); 436168404Spjd ASSERT(db->db_buf == NULL); 437168404Spjd ASSERT(db->db.db_data == NULL); 438168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 439168404Spjd /* we were freed in flight; disregard any error */ 440168404Spjd arc_release(buf, db); 441168404Spjd bzero(buf->b_data, db->db.db_size); 442168404Spjd arc_buf_freeze(buf); 443168404Spjd db->db_freed_in_flight = FALSE; 444168404Spjd dbuf_set_data(db, buf); 445168404Spjd db->db_state = DB_CACHED; 446168404Spjd } else if (zio == NULL || zio->io_error == 0) { 447168404Spjd dbuf_set_data(db, buf); 448168404Spjd db->db_state = DB_CACHED; 449168404Spjd } else { 450168404Spjd ASSERT(db->db_blkid != DB_BONUS_BLKID); 451168404Spjd ASSERT3P(db->db_buf, ==, NULL); 452168404Spjd VERIFY(arc_buf_remove_ref(buf, db) == 1); 453168404Spjd db->db_state = DB_UNCACHED; 454168404Spjd } 455168404Spjd cv_broadcast(&db->db_changed); 456168404Spjd mutex_exit(&db->db_mtx); 457168404Spjd dbuf_rele(db, NULL); 458168404Spjd} 459168404Spjd 460168404Spjdstatic void 461168404Spjddbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 462168404Spjd{ 463168404Spjd blkptr_t *bp; 464168404Spjd zbookmark_t zb; 465168404Spjd uint32_t aflags = ARC_NOWAIT; 466168404Spjd 467168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 468168404Spjd /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ 469168404Spjd ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 470168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 471168404Spjd ASSERT(db->db_state == DB_UNCACHED); 472168404Spjd ASSERT(db->db_buf == NULL); 473168404Spjd 474168404Spjd if (db->db_blkid == DB_BONUS_BLKID) { 475168404Spjd ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); 476168404Spjd db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 477168404Spjd if (db->db.db_size < DN_MAX_BONUSLEN) 478168404Spjd bzero(db->db.db_data, DN_MAX_BONUSLEN); 479168404Spjd bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, 480168404Spjd db->db.db_size); 481168404Spjd dbuf_update_data(db); 482168404Spjd db->db_state = DB_CACHED; 483168404Spjd mutex_exit(&db->db_mtx); 484168404Spjd return; 485168404Spjd } 486168404Spjd 487168404Spjd if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) 488168404Spjd bp = NULL; 489168404Spjd else 490168404Spjd bp = db->db_blkptr; 491168404Spjd 492168404Spjd if (bp == NULL) 493168404Spjd dprintf_dbuf(db, "blkptr: %s\n", "NULL"); 494168404Spjd else 495168404Spjd dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); 496168404Spjd 497168404Spjd if (bp == NULL || BP_IS_HOLE(bp)) { 498168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 499168404Spjd 500168404Spjd ASSERT(bp == NULL || BP_IS_HOLE(bp)); 501168404Spjd dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 502168404Spjd db->db.db_size, db, type)); 503168404Spjd bzero(db->db.db_data, db->db.db_size); 504168404Spjd db->db_state = DB_CACHED; 505168404Spjd *flags |= DB_RF_CACHED; 506168404Spjd mutex_exit(&db->db_mtx); 507168404Spjd return; 508168404Spjd } 509168404Spjd 510168404Spjd db->db_state = DB_READ; 511168404Spjd mutex_exit(&db->db_mtx); 512168404Spjd 513168404Spjd zb.zb_objset = db->db_objset->os_dsl_dataset ? 
514168404Spjd db->db_objset->os_dsl_dataset->ds_object : 0; 515168404Spjd zb.zb_object = db->db.db_object; 516168404Spjd zb.zb_level = db->db_level; 517168404Spjd zb.zb_blkid = db->db_blkid; 518168404Spjd 519168404Spjd dbuf_add_ref(db, NULL); 520168404Spjd /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 521168404Spjd ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); 522168404Spjd (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, 523168404Spjd db->db_level > 0 ? byteswap_uint64_array : 524168404Spjd dmu_ot[db->db_dnode->dn_type].ot_byteswap, 525168404Spjd dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 526168404Spjd (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 527168404Spjd &aflags, &zb); 528168404Spjd if (aflags & ARC_CACHED) 529168404Spjd *flags |= DB_RF_CACHED; 530168404Spjd} 531168404Spjd 532168404Spjdint 533168404Spjddbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 534168404Spjd{ 535168404Spjd int err = 0; 536168404Spjd int havepzio = (zio != NULL); 537168404Spjd int prefetch; 538168404Spjd 539168404Spjd /* 540168404Spjd * We don't have to hold the mutex to check db_state because it 541168404Spjd * can't be freed while we have a hold on the buffer. 
542168404Spjd */ 543168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 544168404Spjd 545168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 546168404Spjd rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 547168404Spjd 548168404Spjd prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 549168404Spjd (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; 550168404Spjd 551168404Spjd mutex_enter(&db->db_mtx); 552168404Spjd if (db->db_state == DB_CACHED) { 553168404Spjd mutex_exit(&db->db_mtx); 554168404Spjd if (prefetch) 555168404Spjd dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 556168404Spjd db->db.db_size, TRUE); 557168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 558168404Spjd rw_exit(&db->db_dnode->dn_struct_rwlock); 559168404Spjd } else if (db->db_state == DB_UNCACHED) { 560168404Spjd if (zio == NULL) { 561168404Spjd zio = zio_root(db->db_dnode->dn_objset->os_spa, 562168404Spjd NULL, NULL, ZIO_FLAG_CANFAIL); 563168404Spjd } 564168404Spjd dbuf_read_impl(db, zio, &flags); 565168404Spjd 566168404Spjd /* dbuf_read_impl has dropped db_mtx for us */ 567168404Spjd 568168404Spjd if (prefetch) 569168404Spjd dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 570168404Spjd db->db.db_size, flags & DB_RF_CACHED); 571168404Spjd 572168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 573168404Spjd rw_exit(&db->db_dnode->dn_struct_rwlock); 574168404Spjd 575168404Spjd if (!havepzio) 576168404Spjd err = zio_wait(zio); 577168404Spjd } else { 578168404Spjd mutex_exit(&db->db_mtx); 579168404Spjd if (prefetch) 580168404Spjd dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 581168404Spjd db->db.db_size, TRUE); 582168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 583168404Spjd rw_exit(&db->db_dnode->dn_struct_rwlock); 584168404Spjd 585168404Spjd mutex_enter(&db->db_mtx); 586168404Spjd if ((flags & DB_RF_NEVERWAIT) == 0) { 587168404Spjd while (db->db_state == DB_READ || 588168404Spjd db->db_state == DB_FILL) { 589168404Spjd ASSERT(db->db_state == DB_READ || 
590168404Spjd (flags & DB_RF_HAVESTRUCT) == 0); 591168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 592168404Spjd } 593168404Spjd if (db->db_state == DB_UNCACHED) 594168404Spjd err = EIO; 595168404Spjd } 596168404Spjd mutex_exit(&db->db_mtx); 597168404Spjd } 598168404Spjd 599168404Spjd ASSERT(err || havepzio || db->db_state == DB_CACHED); 600168404Spjd return (err); 601168404Spjd} 602168404Spjd 603168404Spjdstatic void 604168404Spjddbuf_noread(dmu_buf_impl_t *db) 605168404Spjd{ 606168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 607168404Spjd ASSERT(db->db_blkid != DB_BONUS_BLKID); 608168404Spjd mutex_enter(&db->db_mtx); 609168404Spjd while (db->db_state == DB_READ || db->db_state == DB_FILL) 610168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 611168404Spjd if (db->db_state == DB_UNCACHED) { 612168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 613168404Spjd 614168404Spjd ASSERT(db->db_buf == NULL); 615168404Spjd ASSERT(db->db.db_data == NULL); 616168404Spjd dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 617168404Spjd db->db.db_size, db, type)); 618168404Spjd db->db_state = DB_FILL; 619168404Spjd } else { 620168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 621168404Spjd } 622168404Spjd mutex_exit(&db->db_mtx); 623168404Spjd} 624168404Spjd 625168404Spjd/* 626168404Spjd * This is our just-in-time copy function. It makes a copy of 627168404Spjd * buffers, that have been modified in a previous transaction 628168404Spjd * group, before we modify them in the current active group. 629168404Spjd * 630168404Spjd * This function is used in two places: when we are dirtying a 631168404Spjd * buffer for the first time in a txg, and when we are freeing 632168404Spjd * a range in a dnode that includes this buffer. 633168404Spjd * 634168404Spjd * Note that when we are called from dbuf_free_range() we do 635168404Spjd * not put a hold on the buffer, we just traverse the active 636168404Spjd * dbuf list for the dnode. 
637168404Spjd */ 638168404Spjdstatic void 639168404Spjddbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 640168404Spjd{ 641168404Spjd dbuf_dirty_record_t *dr = db->db_last_dirty; 642168404Spjd 643168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 644168404Spjd ASSERT(db->db.db_data != NULL); 645168404Spjd ASSERT(db->db_level == 0); 646168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 647168404Spjd 648168404Spjd if (dr == NULL || 649168404Spjd (dr->dt.dl.dr_data != 650168404Spjd ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 651168404Spjd return; 652168404Spjd 653168404Spjd /* 654168404Spjd * If the last dirty record for this dbuf has not yet synced 655168404Spjd * and its referencing the dbuf data, either: 656168404Spjd * reset the reference to point to a new copy, 657168404Spjd * or (if there a no active holders) 658168404Spjd * just null out the current db_data pointer. 659168404Spjd */ 660168404Spjd ASSERT(dr->dr_txg >= txg - 2); 661168404Spjd if (db->db_blkid == DB_BONUS_BLKID) { 662168404Spjd /* Note that the data bufs here are zio_bufs */ 663168404Spjd dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 664168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 665168404Spjd } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 666168404Spjd int size = db->db.db_size; 667168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 668168404Spjd dr->dt.dl.dr_data = arc_buf_alloc( 669168404Spjd db->db_dnode->dn_objset->os_spa, size, db, type); 670168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 671168404Spjd } else { 672168404Spjd dbuf_set_data(db, NULL); 673168404Spjd } 674168404Spjd} 675168404Spjd 676168404Spjdvoid 677168404Spjddbuf_unoverride(dbuf_dirty_record_t *dr) 678168404Spjd{ 679168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 680168404Spjd uint64_t txg = dr->dr_txg; 681168404Spjd 682168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 683168404Spjd ASSERT(dr->dt.dl.dr_override_state != 
DR_IN_DMU_SYNC); 684168404Spjd ASSERT(db->db_level == 0); 685168404Spjd 686168404Spjd if (db->db_blkid == DB_BONUS_BLKID || 687168404Spjd dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 688168404Spjd return; 689168404Spjd 690168404Spjd /* free this block */ 691168404Spjd if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { 692168404Spjd /* XXX can get silent EIO here */ 693168404Spjd (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, 694168404Spjd txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); 695168404Spjd } 696168404Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 697168404Spjd /* 698168404Spjd * Release the already-written buffer, so we leave it in 699168404Spjd * a consistent dirty state. Note that all callers are 700168404Spjd * modifying the buffer, so they will immediately do 701168404Spjd * another (redundant) arc_release(). Therefore, leave 702168404Spjd * the buf thawed to save the effort of freezing & 703168404Spjd * immediately re-thawing it. 704168404Spjd */ 705168404Spjd arc_release(dr->dt.dl.dr_data, db); 706168404Spjd} 707168404Spjd 708168404Spjdvoid 709168404Spjddbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) 710168404Spjd{ 711168404Spjd dmu_buf_impl_t *db, *db_next; 712168404Spjd uint64_t txg = tx->tx_txg; 713168404Spjd 714168404Spjd dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); 715168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 716168404Spjd for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 717168404Spjd db_next = list_next(&dn->dn_dbufs, db); 718168404Spjd ASSERT(db->db_blkid != DB_BONUS_BLKID); 719168404Spjd if (db->db_level != 0) 720168404Spjd continue; 721168404Spjd dprintf_dbuf(db, "found buf %s\n", ""); 722168404Spjd if (db->db_blkid < blkid || 723168404Spjd db->db_blkid >= blkid+nblks) 724168404Spjd continue; 725168404Spjd 726168404Spjd /* found a level 0 buffer in the range */ 727168404Spjd if (dbuf_undirty(db, tx)) 728168404Spjd continue; 729168404Spjd 730168404Spjd 
		/*
		 * NOTE(review): this is the interior of dbuf_free_range()'s
		 * walk over dn->dn_dbufs; the function header is above this
		 * chunk.  Each dbuf in the freed range is examined under its
		 * own db_mtx and either skipped, evicted, or emptied.
		 */
		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Decide whether dirtying this dbuf will allocate a brand-new on-disk
 * block.  Returns TRUE when no freeable prior version of the block
 * exists (so the write will consume new space), FALSE otherwise.
 * Caller must hold db_mtx.
 */
static int
dbuf_new_block(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/* Don't count meta-objects */
	if (ds == NULL)
		return (FALSE);

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	/* If we have been dirtied since the last snapshot, its not new */
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/*
	 * New if there is no birth txg at all, or if the existing birth
	 * txg is not freeable (e.g. pinned by a snapshot).
	 */
	if (birth_txg)
		return (!dsl_dataset_block_freeable(ds, birth_txg));
	else
		return (TRUE);
}

/*
 * Resize a (non-bonus) dbuf to 'size' bytes in transaction 'tx':
 * dirty it, allocate a new ARC buffer, copy the old contents over,
 * zero any growth, and swap the buffer in under db_mtx.
 * Caller must hold dn_struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		/* point this txg's dirty record at the new buffer */
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(db->db_dnode, size-osize, tx);
}

/*
 * Mark this dbuf dirty in transaction 'tx', creating (or reusing) a
 * dbuf_dirty_record_t and linking it into the dnode's / parent's dirty
 * lists.  Returns the dirty record for this txg.
 */
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 * XXX We may want to prohibit dirtying in syncing context even
	 * if they did pre-dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_objset->os_dsl_dataset == NULL ||
	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));

	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		/* 1-byte tag recording who dirtied first (debug aid) */
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	/*
	 * If this buffer is already dirty, we're done.
	 * Walk the dirty-record list (newest first) to this txg's slot.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while (*drp && (*drp)->dr_txg > tx->tx_txg)
		drp = &(*drp)->dr_next;
	if (*drp && (*drp)->dr_txg == tx->tx_txg) {
		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(*drp);
			if (db->db.db_object != DMU_META_DNODE_OBJECT)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (*drp);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos, a spa os, or we're initializing the os.  However, we are
	 * allowed to dirty in syncing context provided we already
	 * dirtied it in open context.  Hence we must make this
	 * assertion only if we're not already dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    os->os_dsl_dataset == NULL ||
	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
	    !BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		/* level-0 records remember which data buffer to sync */
		void *data_old = db->db_buf;

		if (db->db_blkid == DB_BONUS_BLKID) {
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db.db_data;
		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
			/*
			 * Release the data buffer from the cache so that we
			 * can modify it without impacting possible other users
			 * of this cached data block.  Note that indirect
			 * blocks and private objects are not released until the
			 * syncing state (since they are only modified then).
			 */
			arc_release(db->db_buf, db);
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db_buf;
		}
		ASSERT(data_old != NULL);
		dr->dt.dl.dr_data = data_old;
	} else {
		/* indirect records track their dirty children instead */
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	/* insert at the position found by the walk above */
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	if (db->db_blkid != DB_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 */
		if (!dbuf_new_block(db) && db->db_blkptr) {
			/*
			 * This is only a guess -- if the dbuf is dirty
			 * in a previous txg, we don't know how much
			 * space it will use on disk yet.  We should
			 * really have the struct_rwlock to access
			 * db_blkptr, but since this is just a guess,
			 * it's OK if we get an odd answer.
			 */
			dnode_willuse_space(dn,
			    -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
		}
		dnode_willuse_space(dn, db->db.db_size, tx);
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DB_BONUS_BLKID) {
		/* bonus buffers attach directly to the dnode's dirty list */
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		return (dr);
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		/* recursively dirty our parent indirect and link under it */
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
	 */
	for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg) {
		mutex_exit(&db->db_mtx);
		return (0);
	}
	ASSERT(dr->dr_txg == txg);

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* Make sure we don't toss this buffer at sync phase */
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	/* unlink the dirty record from the dbuf's list */
	db->db_last_dirty = dr->dr_next;

	/* ... and from its parent's (or the dnode's) dirty-children list */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_level+1 == dn->dn_nlevels) {
		ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}

	if (db->db_level == 0) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		/* drop the copy made for this txg, if one was made */
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
	} else {
		ASSERT(db->db_buf != NULL);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		/* XXX - mutex and list destroy? */
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	/* drop the hold that dbuf_dirty() took for this txg */
	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		/* that was the last hold; evict (drops db_mtx) */
		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}

/*
 * Read the dbuf's current contents (if needed) and mark it dirty in 'tx'.
 * XXX the dbuf_read() return value is ignored; see the note in
 * dbuf_new_size() about propagating it to callers.
 */
#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

/*
 * Prepare a level-0 dbuf to be completely overwritten: skip the read
 * (dbuf_noread) and dirty it.  Caller must later call dmu_buf_fill_done().
 */
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object
	    != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

/*
 * Complete a fill started by dmu_buf_will_fill(): transition the dbuf
 * from DB_FILL to DB_CACHED and wake any waiters.  If the range was
 * freed while we were filling, zero the contents instead of keeping them.
 */
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DB_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 * 	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb = dn->dn_dbuf;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		/* bonus data is a plain zio buffer, not an ARC buffer */
		if (db->db_blkid == DB_BONUS_BLKID)
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	/* only safe to unlink from dn_dbufs if the caller holds its mutex */
	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		dnode_rele(dn, db);
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	/* if arc_buf_evict() destroyed the dbuf, db_mtx is already gone */
	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

/*
 * Locate the parent dbuf and block pointer that reference block 'blkid'
 * at 'level' of this dnode.  On success *parentp holds a referenced
 * parent (or NULL if referenced directly from the dnode phys) and *bpp
 * points at the blkptr.  Returns ENOENT if no parent exists yet.
 * Caller must hold dn_struct_rwlock.
 */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	/* epbs: log2 of block pointers per indirect block */
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

/*
 * Allocate and initialize a new dbuf for (dn, level, blkid), insert it
 * into the hash table and the dnode's dbuf list, and return it.  If a
 * racing thread inserted the same dbuf first, free ours and return the
 * existing one.  Caller must hold dn_struct_rwlock.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode = dn;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DB_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = dn->dn_bonuslen;
		db->db.db_offset = DB_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		return (db);
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

/*
 * ARC eviction callback: tear down the dbuf backing this arc buffer.
 * May be entered with or without db_mtx held (see dbuf_clear() comment).
 */
static int
dbuf_do_evict(void *private)
{
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	}
	else {
		/* dbuf_clear() already ran; just free the structure */
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

/*
 * Final teardown of a dbuf: unlink it from the dnode's dbuf list and
 * the hash table (non-bonus only) and return it to the kmem cache.
 * The dbuf must have no holds.
 */
static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DB_BONUS_BLKID) {
		dnode_t *dn = db->db_dnode;

		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (list_link_active(&db->db_link)) {
			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			mutex_exit(&dn->dn_dbufs_mtx);

			dnode_rele(dn, db);
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_dnode = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
}

/*
 * Issue a speculative, best-effort ARC read for level-0 block 'blkid'
 * unless it is freed, already active, or a hole.  Errors are ignored
 * (ZIO_FLAG_CANFAIL).  Caller must hold dn_struct_rwlock.
 */
void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	/* NOTE: assignment-in-condition is intentional (Solaris style) */
	if (db = dbuf_find(dn, 0, blkid)) {
		if (refcount_count(&db->db_holds) > 0) {
			/*
			 * This dbuf is active.  We assume that it is
			 * already CACHED, or else about to be either
			 * read or filled.
			 */
			mutex_exit(&db->db_mtx);
			return;
		}
		mutex_exit(&db->db_mtx);
		db = NULL;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;
			/* 0 == MOS when there is no owning dataset */
			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
			zb.zb_object = dn->dn_object;
			zb.zb_level = 0;
			zb.zb_blkid = blkid;

			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
			    dmu_ot[dn->dn_type].ot_byteswap,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
1549168404Spjd */ 1550168404Spjdint 1551168404Spjddbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1552168404Spjd void *tag, dmu_buf_impl_t **dbp) 1553168404Spjd{ 1554168404Spjd dmu_buf_impl_t *db, *parent = NULL; 1555168404Spjd 1556168404Spjd ASSERT(blkid != DB_BONUS_BLKID); 1557168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1558168404Spjd ASSERT3U(dn->dn_nlevels, >, level); 1559168404Spjd 1560168404Spjd *dbp = NULL; 1561168404Spjdtop: 1562168404Spjd /* dbuf_find() returns with db_mtx held */ 1563168404Spjd db = dbuf_find(dn, level, blkid); 1564168404Spjd 1565168404Spjd if (db == NULL) { 1566168404Spjd blkptr_t *bp = NULL; 1567168404Spjd int err; 1568168404Spjd 1569168404Spjd ASSERT3P(parent, ==, NULL); 1570168404Spjd err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1571168404Spjd if (fail_sparse) { 1572168404Spjd if (err == 0 && bp && BP_IS_HOLE(bp)) 1573168404Spjd err = ENOENT; 1574168404Spjd if (err) { 1575168404Spjd if (parent) 1576168404Spjd dbuf_rele(parent, NULL); 1577168404Spjd return (err); 1578168404Spjd } 1579168404Spjd } 1580168404Spjd if (err && err != ENOENT) 1581168404Spjd return (err); 1582168404Spjd db = dbuf_create(dn, level, blkid, parent, bp); 1583168404Spjd } 1584168404Spjd 1585168404Spjd if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1586168404Spjd arc_buf_add_ref(db->db_buf, db); 1587168404Spjd if (db->db_buf->b_data == NULL) { 1588168404Spjd dbuf_clear(db); 1589168404Spjd if (parent) { 1590168404Spjd dbuf_rele(parent, NULL); 1591168404Spjd parent = NULL; 1592168404Spjd } 1593168404Spjd goto top; 1594168404Spjd } 1595168404Spjd ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1596168404Spjd } 1597168404Spjd 1598168404Spjd ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1599168404Spjd 1600168404Spjd /* 1601168404Spjd * If this buffer is currently syncing out, and we are are 1602168404Spjd * still referencing it from db_data, we need to make a copy 1603168404Spjd * of it in 
case we decide we want to dirty it again in this txg. 1604168404Spjd */ 1605168404Spjd if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 1606168404Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 1607168404Spjd db->db_state == DB_CACHED && db->db_data_pending) { 1608168404Spjd dbuf_dirty_record_t *dr = db->db_data_pending; 1609168404Spjd 1610168404Spjd if (dr->dt.dl.dr_data == db->db_buf) { 1611168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1612168404Spjd 1613168404Spjd dbuf_set_data(db, 1614168404Spjd arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 1615168404Spjd db->db.db_size, db, type)); 1616168404Spjd bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1617168404Spjd db->db.db_size); 1618168404Spjd } 1619168404Spjd } 1620168404Spjd 1621168404Spjd (void) refcount_add(&db->db_holds, tag); 1622168404Spjd dbuf_update_data(db); 1623168404Spjd DBUF_VERIFY(db); 1624168404Spjd mutex_exit(&db->db_mtx); 1625168404Spjd 1626168404Spjd /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1627168404Spjd if (parent) 1628168404Spjd dbuf_rele(parent, NULL); 1629168404Spjd 1630168404Spjd ASSERT3P(db->db_dnode, ==, dn); 1631168404Spjd ASSERT3U(db->db_blkid, ==, blkid); 1632168404Spjd ASSERT3U(db->db_level, ==, level); 1633168404Spjd *dbp = db; 1634168404Spjd 1635168404Spjd return (0); 1636168404Spjd} 1637168404Spjd 1638168404Spjddmu_buf_impl_t * 1639168404Spjddbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1640168404Spjd{ 1641168404Spjd dmu_buf_impl_t *db; 1642168404Spjd int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1643168404Spjd return (err ? NULL : db); 1644168404Spjd} 1645168404Spjd 1646168404Spjddmu_buf_impl_t * 1647168404Spjddbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1648168404Spjd{ 1649168404Spjd dmu_buf_impl_t *db; 1650168404Spjd int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1651168404Spjd return (err ? 
NULL : db); 1652168404Spjd} 1653168404Spjd 1654168404Spjddmu_buf_impl_t * 1655168404Spjddbuf_create_bonus(dnode_t *dn) 1656168404Spjd{ 1657168404Spjd dmu_buf_impl_t *db = dn->dn_bonus; 1658168404Spjd 1659168404Spjd ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1660168404Spjd 1661168404Spjd ASSERT(dn->dn_bonus == NULL); 1662168404Spjd db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); 1663168404Spjd return (db); 1664168404Spjd} 1665168404Spjd 1666168404Spjd#pragma weak dmu_buf_add_ref = dbuf_add_ref 1667168404Spjdvoid 1668168404Spjddbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1669168404Spjd{ 1670168404Spjd int64_t holds = refcount_add(&db->db_holds, tag); 1671168404Spjd ASSERT(holds > 1); 1672168404Spjd} 1673168404Spjd 1674168404Spjd#pragma weak dmu_buf_rele = dbuf_rele 1675168404Spjdvoid 1676168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag) 1677168404Spjd{ 1678168404Spjd int64_t holds; 1679168404Spjd 1680168404Spjd mutex_enter(&db->db_mtx); 1681168404Spjd DBUF_VERIFY(db); 1682168404Spjd 1683168404Spjd holds = refcount_remove(&db->db_holds, tag); 1684168404Spjd ASSERT(holds >= 0); 1685168404Spjd 1686168404Spjd /* 1687168404Spjd * We can't freeze indirects if there is a possibility that they 1688168404Spjd * may be modified in the current syncing context. 1689168404Spjd */ 1690168404Spjd if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 1691168404Spjd arc_buf_freeze(db->db_buf); 1692168404Spjd 1693168404Spjd if (holds == db->db_dirtycnt && 1694168404Spjd db->db_level == 0 && db->db_immediate_evict) 1695168404Spjd dbuf_evict_user(db); 1696168404Spjd 1697168404Spjd if (holds == 0) { 1698168404Spjd if (db->db_blkid == DB_BONUS_BLKID) { 1699168404Spjd mutex_exit(&db->db_mtx); 1700168404Spjd dnode_rele(db->db_dnode, db); 1701168404Spjd } else if (db->db_buf == NULL) { 1702168404Spjd /* 1703168404Spjd * This is a special case: we never associated this 1704168404Spjd * dbuf with any data allocated from the ARC. 
1705168404Spjd */ 1706168404Spjd ASSERT3U(db->db_state, ==, DB_UNCACHED); 1707168404Spjd dbuf_evict(db); 1708168404Spjd } else if (arc_released(db->db_buf)) { 1709168404Spjd arc_buf_t *buf = db->db_buf; 1710168404Spjd /* 1711168404Spjd * This dbuf has anonymous data associated with it. 1712168404Spjd */ 1713168404Spjd dbuf_set_data(db, NULL); 1714168404Spjd VERIFY(arc_buf_remove_ref(buf, db) == 1); 1715168404Spjd dbuf_evict(db); 1716168404Spjd } else { 1717168404Spjd VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); 1718168404Spjd mutex_exit(&db->db_mtx); 1719168404Spjd } 1720168404Spjd } else { 1721168404Spjd mutex_exit(&db->db_mtx); 1722168404Spjd } 1723168404Spjd} 1724168404Spjd 1725168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount 1726168404Spjduint64_t 1727168404Spjddbuf_refcount(dmu_buf_impl_t *db) 1728168404Spjd{ 1729168404Spjd return (refcount_count(&db->db_holds)); 1730168404Spjd} 1731168404Spjd 1732168404Spjdvoid * 1733168404Spjddmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1734168404Spjd dmu_buf_evict_func_t *evict_func) 1735168404Spjd{ 1736168404Spjd return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1737168404Spjd user_data_ptr_ptr, evict_func)); 1738168404Spjd} 1739168404Spjd 1740168404Spjdvoid * 1741168404Spjddmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1742168404Spjd dmu_buf_evict_func_t *evict_func) 1743168404Spjd{ 1744168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1745168404Spjd 1746168404Spjd db->db_immediate_evict = TRUE; 1747168404Spjd return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1748168404Spjd user_data_ptr_ptr, evict_func)); 1749168404Spjd} 1750168404Spjd 1751168404Spjdvoid * 1752168404Spjddmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 1753168404Spjd void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 1754168404Spjd{ 1755168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1756168404Spjd ASSERT(db->db_level 
== 0); 1757168404Spjd 1758168404Spjd ASSERT((user_ptr == NULL) == (evict_func == NULL)); 1759168404Spjd 1760168404Spjd mutex_enter(&db->db_mtx); 1761168404Spjd 1762168404Spjd if (db->db_user_ptr == old_user_ptr) { 1763168404Spjd db->db_user_ptr = user_ptr; 1764168404Spjd db->db_user_data_ptr_ptr = user_data_ptr_ptr; 1765168404Spjd db->db_evict_func = evict_func; 1766168404Spjd 1767168404Spjd dbuf_update_data(db); 1768168404Spjd } else { 1769168404Spjd old_user_ptr = db->db_user_ptr; 1770168404Spjd } 1771168404Spjd 1772168404Spjd mutex_exit(&db->db_mtx); 1773168404Spjd return (old_user_ptr); 1774168404Spjd} 1775168404Spjd 1776168404Spjdvoid * 1777168404Spjddmu_buf_get_user(dmu_buf_t *db_fake) 1778168404Spjd{ 1779168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1780168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1781168404Spjd 1782168404Spjd return (db->db_user_ptr); 1783168404Spjd} 1784168404Spjd 1785168404Spjdstatic void 1786168404Spjddbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 1787168404Spjd{ 1788168404Spjd /* ASSERT(dmu_tx_is_syncing(tx) */ 1789168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 1790168404Spjd 1791168404Spjd if (db->db_blkptr != NULL) 1792168404Spjd return; 1793168404Spjd 1794168404Spjd if (db->db_level == dn->dn_phys->dn_nlevels-1) { 1795168404Spjd /* 1796168404Spjd * This buffer was allocated at a time when there was 1797168404Spjd * no available blkptrs from the dnode, or it was 1798168404Spjd * inappropriate to hook it in (i.e., nlevels mis-match). 
1799168404Spjd */ 1800168404Spjd ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 1801168404Spjd ASSERT(db->db_parent == NULL); 1802168404Spjd db->db_parent = dn->dn_dbuf; 1803168404Spjd db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 1804168404Spjd DBUF_VERIFY(db); 1805168404Spjd } else { 1806168404Spjd dmu_buf_impl_t *parent = db->db_parent; 1807168404Spjd int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 1808168404Spjd 1809168404Spjd ASSERT(dn->dn_phys->dn_nlevels > 1); 1810168404Spjd if (parent == NULL) { 1811168404Spjd mutex_exit(&db->db_mtx); 1812168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 1813168404Spjd (void) dbuf_hold_impl(dn, db->db_level+1, 1814168404Spjd db->db_blkid >> epbs, FALSE, db, &parent); 1815168404Spjd rw_exit(&dn->dn_struct_rwlock); 1816168404Spjd mutex_enter(&db->db_mtx); 1817168404Spjd db->db_parent = parent; 1818168404Spjd } 1819168404Spjd db->db_blkptr = (blkptr_t *)parent->db.db_data + 1820168404Spjd (db->db_blkid & ((1ULL << epbs) - 1)); 1821168404Spjd DBUF_VERIFY(db); 1822168404Spjd } 1823168404Spjd} 1824168404Spjd 1825168404Spjdstatic void 1826168404Spjddbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 1827168404Spjd{ 1828168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 1829168404Spjd dnode_t *dn = db->db_dnode; 1830168404Spjd zio_t *zio; 1831168404Spjd 1832168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 1833168404Spjd 1834168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 1835168404Spjd 1836168404Spjd mutex_enter(&db->db_mtx); 1837168404Spjd 1838168404Spjd ASSERT(db->db_level > 0); 1839168404Spjd DBUF_VERIFY(db); 1840168404Spjd 1841168404Spjd if (db->db_buf == NULL) { 1842168404Spjd mutex_exit(&db->db_mtx); 1843168404Spjd (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 1844168404Spjd mutex_enter(&db->db_mtx); 1845168404Spjd } 1846168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 1847168404Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 1848168404Spjd ASSERT(db->db_buf 
!= NULL); 1849168404Spjd 1850168404Spjd dbuf_check_blkptr(dn, db); 1851168404Spjd 1852168404Spjd db->db_data_pending = dr; 1853168404Spjd 1854168404Spjd arc_release(db->db_buf, db); 1855168404Spjd mutex_exit(&db->db_mtx); 1856168404Spjd 1857168404Spjd /* 1858168404Spjd * XXX -- we should design a compression algorithm 1859168404Spjd * that specializes in arrays of bps. 1860168404Spjd */ 1861168404Spjd dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, 1862168404Spjd zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); 1863168404Spjd 1864168404Spjd zio = dr->dr_zio; 1865168404Spjd mutex_enter(&dr->dt.di.dr_mtx); 1866168404Spjd dbuf_sync_list(&dr->dt.di.dr_children, tx); 1867168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 1868168404Spjd mutex_exit(&dr->dt.di.dr_mtx); 1869168404Spjd zio_nowait(zio); 1870168404Spjd} 1871168404Spjd 1872168404Spjdstatic void 1873168404Spjddbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 1874168404Spjd{ 1875168404Spjd arc_buf_t **datap = &dr->dt.dl.dr_data; 1876168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 1877168404Spjd dnode_t *dn = db->db_dnode; 1878168404Spjd objset_impl_t *os = dn->dn_objset; 1879168404Spjd uint64_t txg = tx->tx_txg; 1880168404Spjd int checksum, compress; 1881168404Spjd int blksz; 1882168404Spjd 1883168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 1884168404Spjd 1885168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 1886168404Spjd 1887168404Spjd mutex_enter(&db->db_mtx); 1888168404Spjd /* 1889168404Spjd * To be synced, we must be dirtied. But we 1890168404Spjd * might have been freed after the dirty. 
1891168404Spjd */ 1892168404Spjd if (db->db_state == DB_UNCACHED) { 1893168404Spjd /* This buffer has been freed since it was dirtied */ 1894168404Spjd ASSERT(db->db.db_data == NULL); 1895168404Spjd } else if (db->db_state == DB_FILL) { 1896168404Spjd /* This buffer was freed and is now being re-filled */ 1897168404Spjd ASSERT(db->db.db_data != dr->dt.dl.dr_data); 1898168404Spjd } else { 1899168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 1900168404Spjd } 1901168404Spjd DBUF_VERIFY(db); 1902168404Spjd 1903168404Spjd /* 1904168404Spjd * If this is a bonus buffer, simply copy the bonus data into the 1905168404Spjd * dnode. It will be written out when the dnode is synced (and it 1906168404Spjd * will be synced, since it must have been dirty for dbuf_sync to 1907168404Spjd * be called). 1908168404Spjd */ 1909168404Spjd if (db->db_blkid == DB_BONUS_BLKID) { 1910168404Spjd dbuf_dirty_record_t **drp; 1911168404Spjd /* 1912168404Spjd * Use dn_phys->dn_bonuslen since db.db_size is the length 1913168404Spjd * of the bonus buffer in the open transaction rather than 1914168404Spjd * the syncing transaction. 
1915168404Spjd */ 1916168404Spjd ASSERT(*datap != NULL); 1917168404Spjd ASSERT3U(db->db_level, ==, 0); 1918168404Spjd ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 1919168404Spjd bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 1920168404Spjd if (*datap != db->db.db_data) 1921168404Spjd zio_buf_free(*datap, DN_MAX_BONUSLEN); 1922168404Spjd db->db_data_pending = NULL; 1923168404Spjd drp = &db->db_last_dirty; 1924168404Spjd while (*drp != dr) 1925168404Spjd drp = &(*drp)->dr_next; 1926168404Spjd ASSERT((*drp)->dr_next == NULL); 1927168404Spjd *drp = NULL; 1928168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1929168404Spjd ASSERT(db->db_dirtycnt > 0); 1930168404Spjd db->db_dirtycnt -= 1; 1931168404Spjd mutex_exit(&db->db_mtx); 1932168404Spjd dbuf_rele(db, (void *)(uintptr_t)txg); 1933168404Spjd return; 1934168404Spjd } 1935168404Spjd 1936168404Spjd /* 1937168404Spjd * If this buffer is in the middle of an immdiate write, 1938168404Spjd * wait for the synchronous IO to complete. 1939168404Spjd */ 1940168404Spjd while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 1941168404Spjd ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 1942168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 1943168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 1944168404Spjd } 1945168404Spjd 1946168404Spjd dbuf_check_blkptr(dn, db); 1947168404Spjd 1948168404Spjd /* 1949168404Spjd * If this dbuf has already been written out via an immediate write, 1950168404Spjd * just complete the write by copying over the new block pointer and 1951168404Spjd * updating the accounting via the write-completion functions. 
1952168404Spjd */ 1953168404Spjd if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 1954168404Spjd zio_t zio_fake; 1955168404Spjd 1956168404Spjd zio_fake.io_private = &db; 1957168404Spjd zio_fake.io_error = 0; 1958168404Spjd zio_fake.io_bp = db->db_blkptr; 1959168404Spjd zio_fake.io_bp_orig = *db->db_blkptr; 1960168404Spjd zio_fake.io_txg = txg; 1961168404Spjd 1962168404Spjd *db->db_blkptr = dr->dt.dl.dr_overridden_by; 1963168404Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1964168404Spjd db->db_data_pending = dr; 1965168404Spjd dr->dr_zio = &zio_fake; 1966168404Spjd mutex_exit(&db->db_mtx); 1967168404Spjd 1968168404Spjd if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) 1969168404Spjd dsl_dataset_block_kill(os->os_dsl_dataset, 1970168404Spjd &zio_fake.io_bp_orig, dn->dn_zio, tx); 1971168404Spjd 1972168404Spjd dbuf_write_ready(&zio_fake, db->db_buf, db); 1973168404Spjd dbuf_write_done(&zio_fake, db->db_buf, db); 1974168404Spjd 1975168404Spjd return; 1976168404Spjd } 1977168404Spjd 1978168404Spjd blksz = arc_buf_size(*datap); 1979168404Spjd 1980168404Spjd if (dn->dn_object != DMU_META_DNODE_OBJECT) { 1981168404Spjd /* 1982168404Spjd * If this buffer is currently "in use" (i.e., there are 1983168404Spjd * active holds and db_data still references it), then make 1984168404Spjd * a copy before we start the write so that any modifications 1985168404Spjd * from the open txg will not leak into this write. 1986168404Spjd * 1987168404Spjd * NOTE: this copy does not need to be made for objects only 1988168404Spjd * modified in the syncing context (e.g. DNONE_DNODE blocks). 
1989168404Spjd */ 1990168404Spjd if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) { 1991168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1992168404Spjd *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 1993168404Spjd bcopy(db->db.db_data, (*datap)->b_data, blksz); 1994168404Spjd } 1995168404Spjd } else { 1996168404Spjd /* 1997168404Spjd * Private object buffers are released here rather 1998168404Spjd * than in dbuf_dirty() since they are only modified 1999168404Spjd * in the syncing context and we don't want the 2000168404Spjd * overhead of making multiple copies of the data. 2001168404Spjd */ 2002168404Spjd arc_release(db->db_buf, db); 2003168404Spjd } 2004168404Spjd 2005168404Spjd ASSERT(*datap != NULL); 2006168404Spjd db->db_data_pending = dr; 2007168404Spjd 2008168404Spjd mutex_exit(&db->db_mtx); 2009168404Spjd 2010168404Spjd /* 2011168404Spjd * Allow dnode settings to override objset settings, 2012168404Spjd * except for metadata checksums. 2013168404Spjd */ 2014168404Spjd if (dmu_ot[dn->dn_type].ot_metadata) { 2015168404Spjd checksum = os->os_md_checksum; 2016168404Spjd compress = zio_compress_select(dn->dn_compress, 2017168404Spjd os->os_md_compress); 2018168404Spjd } else { 2019168404Spjd checksum = zio_checksum_select(dn->dn_checksum, 2020168404Spjd os->os_checksum); 2021168404Spjd compress = zio_compress_select(dn->dn_compress, 2022168404Spjd os->os_compress); 2023168404Spjd } 2024168404Spjd 2025168404Spjd dbuf_write(dr, *datap, checksum, compress, tx); 2026168404Spjd 2027168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 2028168404Spjd if (dn->dn_object == DMU_META_DNODE_OBJECT) 2029168404Spjd list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2030168404Spjd else 2031168404Spjd zio_nowait(dr->dr_zio); 2032168404Spjd} 2033168404Spjd 2034168404Spjdvoid 2035168404Spjddbuf_sync_list(list_t *list, dmu_tx_t *tx) 2036168404Spjd{ 2037168404Spjd dbuf_dirty_record_t *dr; 2038168404Spjd 2039168404Spjd while (dr = 
list_head(list)) { 2040168404Spjd if (dr->dr_zio != NULL) { 2041168404Spjd /* 2042168404Spjd * If we find an already initialized zio then we 2043168404Spjd * are processing the meta-dnode, and we have finished. 2044168404Spjd * The dbufs for all dnodes are put back on the list 2045168404Spjd * during processing, so that we can zio_wait() 2046168404Spjd * these IOs after initiating all child IOs. 2047168404Spjd */ 2048168404Spjd ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2049168404Spjd DMU_META_DNODE_OBJECT); 2050168404Spjd break; 2051168404Spjd } 2052168404Spjd list_remove(list, dr); 2053168404Spjd if (dr->dr_dbuf->db_level > 0) 2054168404Spjd dbuf_sync_indirect(dr, tx); 2055168404Spjd else 2056168404Spjd dbuf_sync_leaf(dr, tx); 2057168404Spjd } 2058168404Spjd} 2059168404Spjd 2060168404Spjdstatic void 2061168404Spjddbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, 2062168404Spjd int compress, dmu_tx_t *tx) 2063168404Spjd{ 2064168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2065168404Spjd dnode_t *dn = db->db_dnode; 2066168404Spjd objset_impl_t *os = dn->dn_objset; 2067168404Spjd dmu_buf_impl_t *parent = db->db_parent; 2068168404Spjd uint64_t txg = tx->tx_txg; 2069168404Spjd zbookmark_t zb; 2070168404Spjd zio_t *zio; 2071168404Spjd int zio_flags; 2072168404Spjd 2073168404Spjd if (parent != dn->dn_dbuf) { 2074168404Spjd ASSERT(parent && parent->db_data_pending); 2075168404Spjd ASSERT(db->db_level == parent->db_level-1); 2076168404Spjd ASSERT(arc_released(parent->db_buf)); 2077168404Spjd zio = parent->db_data_pending->dr_zio; 2078168404Spjd } else { 2079168404Spjd ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); 2080168404Spjd ASSERT3P(db->db_blkptr, ==, 2081168404Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 2082168404Spjd zio = dn->dn_zio; 2083168404Spjd } 2084168404Spjd 2085168404Spjd ASSERT(db->db_level == 0 || data == db->db_buf); 2086168404Spjd ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2087168404Spjd ASSERT(zio); 2088168404Spjd 2089168404Spjd 
zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; 2090168404Spjd zb.zb_object = db->db.db_object; 2091168404Spjd zb.zb_level = db->db_level; 2092168404Spjd zb.zb_blkid = db->db_blkid; 2093168404Spjd 2094168404Spjd zio_flags = ZIO_FLAG_MUSTSUCCEED; 2095168404Spjd if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) 2096168404Spjd zio_flags |= ZIO_FLAG_METADATA; 2097168404Spjd if (BP_IS_OLDER(db->db_blkptr, txg)) 2098168404Spjd dsl_dataset_block_kill( 2099168404Spjd os->os_dsl_dataset, db->db_blkptr, zio, tx); 2100168404Spjd 2101168404Spjd dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, 2102168404Spjd dmu_get_replication_level(os, &zb, dn->dn_type), txg, 2103168404Spjd db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, 2104168404Spjd ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); 2105168404Spjd} 2106168404Spjd 2107168404Spjd/* ARGSUSED */ 2108168404Spjdstatic void 2109168404Spjddbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2110168404Spjd{ 2111168404Spjd dmu_buf_impl_t *db = vdb; 2112168404Spjd dnode_t *dn = db->db_dnode; 2113168404Spjd objset_impl_t *os = dn->dn_objset; 2114168404Spjd blkptr_t *bp_orig = &zio->io_bp_orig; 2115168404Spjd uint64_t fill = 0; 2116168404Spjd int old_size, new_size, i; 2117168404Spjd 2118168404Spjd dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); 2119168404Spjd 2120168404Spjd old_size = bp_get_dasize(os->os_spa, bp_orig); 2121168404Spjd new_size = bp_get_dasize(os->os_spa, zio->io_bp); 2122168404Spjd 2123168404Spjd dnode_diduse_space(dn, new_size-old_size); 2124168404Spjd 2125168404Spjd if (BP_IS_HOLE(zio->io_bp)) { 2126168404Spjd dsl_dataset_t *ds = os->os_dsl_dataset; 2127168404Spjd dmu_tx_t *tx = os->os_synctx; 2128168404Spjd 2129168404Spjd if (bp_orig->blk_birth == tx->tx_txg) 2130168404Spjd dsl_dataset_block_kill(ds, bp_orig, NULL, tx); 2131168404Spjd ASSERT3U(db->db_blkptr->blk_fill, ==, 0); 2132168404Spjd return; 2133168404Spjd } 2134168404Spjd 2135168404Spjd 
mutex_enter(&db->db_mtx); 2136168404Spjd 2137168404Spjd if (db->db_level == 0) { 2138168404Spjd mutex_enter(&dn->dn_mtx); 2139168404Spjd if (db->db_blkid > dn->dn_phys->dn_maxblkid) 2140168404Spjd dn->dn_phys->dn_maxblkid = db->db_blkid; 2141168404Spjd mutex_exit(&dn->dn_mtx); 2142168404Spjd 2143168404Spjd if (dn->dn_type == DMU_OT_DNODE) { 2144168404Spjd dnode_phys_t *dnp = db->db.db_data; 2145168404Spjd for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2146168404Spjd i--, dnp++) { 2147168404Spjd if (dnp->dn_type != DMU_OT_NONE) 2148168404Spjd fill++; 2149168404Spjd } 2150168404Spjd } else { 2151168404Spjd fill = 1; 2152168404Spjd } 2153168404Spjd } else { 2154168404Spjd blkptr_t *bp = db->db.db_data; 2155168404Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2156168404Spjd for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { 2157168404Spjd if (BP_IS_HOLE(bp)) 2158168404Spjd continue; 2159168404Spjd ASSERT3U(BP_GET_LSIZE(bp), ==, 2160168404Spjd db->db_level == 1 ? 
dn->dn_datablksz : 2161168404Spjd (1<<dn->dn_phys->dn_indblkshift)); 2162168404Spjd fill += bp->blk_fill; 2163168404Spjd } 2164168404Spjd } 2165168404Spjd 2166168404Spjd db->db_blkptr->blk_fill = fill; 2167168404Spjd BP_SET_TYPE(db->db_blkptr, dn->dn_type); 2168168404Spjd BP_SET_LEVEL(db->db_blkptr, db->db_level); 2169168404Spjd 2170168404Spjd mutex_exit(&db->db_mtx); 2171168404Spjd 2172168404Spjd /* We must do this after we've set the bp's type and level */ 2173168404Spjd if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { 2174168404Spjd dsl_dataset_t *ds = os->os_dsl_dataset; 2175168404Spjd dmu_tx_t *tx = os->os_synctx; 2176168404Spjd 2177168404Spjd if (bp_orig->blk_birth == tx->tx_txg) 2178168404Spjd dsl_dataset_block_kill(ds, bp_orig, NULL, tx); 2179168404Spjd dsl_dataset_block_born(ds, zio->io_bp, tx); 2180168404Spjd } 2181168404Spjd} 2182168404Spjd 2183168404Spjd/* ARGSUSED */ 2184168404Spjdstatic void 2185168404Spjddbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2186168404Spjd{ 2187168404Spjd dmu_buf_impl_t *db = vdb; 2188168404Spjd uint64_t txg = zio->io_txg; 2189168404Spjd dbuf_dirty_record_t **drp, *dr; 2190168404Spjd 2191168404Spjd ASSERT3U(zio->io_error, ==, 0); 2192168404Spjd 2193168404Spjd mutex_enter(&db->db_mtx); 2194168404Spjd 2195168404Spjd drp = &db->db_last_dirty; 2196168404Spjd while (*drp != db->db_data_pending) 2197168404Spjd drp = &(*drp)->dr_next; 2198168404Spjd ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); 2199168404Spjd ASSERT((*drp)->dr_txg == txg); 2200168404Spjd ASSERT((*drp)->dr_next == NULL); 2201168404Spjd dr = *drp; 2202168404Spjd *drp = NULL; 2203168404Spjd 2204168404Spjd if (db->db_level == 0) { 2205168404Spjd ASSERT(db->db_blkid != DB_BONUS_BLKID); 2206168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2207168404Spjd 2208168404Spjd if (dr->dt.dl.dr_data != db->db_buf) 2209168404Spjd VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); 2210168404Spjd else if 
(!BP_IS_HOLE(db->db_blkptr)) 2211168404Spjd arc_set_callback(db->db_buf, dbuf_do_evict, db); 2212168404Spjd else 2213168404Spjd ASSERT(arc_released(db->db_buf)); 2214168404Spjd } else { 2215168404Spjd dnode_t *dn = db->db_dnode; 2216168404Spjd 2217168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2218168404Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2219168404Spjd if (!BP_IS_HOLE(db->db_blkptr)) { 2220168404Spjd int epbs = 2221168404Spjd dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2222168404Spjd ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2223168404Spjd db->db.db_size); 2224168404Spjd ASSERT3U(dn->dn_phys->dn_maxblkid 2225168404Spjd >> (db->db_level * epbs), >=, db->db_blkid); 2226168404Spjd arc_set_callback(db->db_buf, dbuf_do_evict, db); 2227168404Spjd } 2228168404Spjd } 2229168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2230168404Spjd 2231168404Spjd cv_broadcast(&db->db_changed); 2232168404Spjd ASSERT(db->db_dirtycnt > 0); 2233168404Spjd db->db_dirtycnt -= 1; 2234168404Spjd db->db_data_pending = NULL; 2235168404Spjd mutex_exit(&db->db_mtx); 2236168404Spjd 2237168404Spjd dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", ""); 2238168404Spjd 2239168404Spjd dbuf_rele(db, (void *)(uintptr_t)txg); 2240168404Spjd} 2241