Cross Reference: /freebsd-11-stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c

Deleted Added

sdiff udiff text old ( 308082 ) new ( 321524 )

full compact

dbuf.c (308082)	dbuf.c (321524)
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 / 21/ 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 */ 30 31#include <sys/zfs_context.h> 32#include <sys/dmu.h> 33#include <sys/dmu_send.h> 34#include <sys/dmu_impl.h> 35#include <sys/dbuf.h> 36#include <sys/dmu_objset.h> 37#include <sys/dsl_dataset.h> 38#include <sys/dsl_dir.h> 39#include <sys/dmu_tx.h> 40#include <sys/spa.h> 41#include <sys/zio.h> 42#include <sys/dmu_zfetch.h> 43#include <sys/sa.h> 44#include <sys/sa_impl.h> 45#include <sys/zfeature.h> 46#include <sys/blkptr.h> 47#include <sys/range_tree.h> 48#include <sys/callb.h> 49 50uint_t zfs_dbuf_evict_key; 51	1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 
6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 / 21/ 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 */ 30 31#include <sys/zfs_context.h> 32#include <sys/dmu.h> 33#include <sys/dmu_send.h> 34#include <sys/dmu_impl.h> 35#include <sys/dbuf.h> 36#include <sys/dmu_objset.h> 37#include <sys/dsl_dataset.h> 38#include <sys/dsl_dir.h> 39#include <sys/dmu_tx.h> 40#include <sys/spa.h> 41#include <sys/zio.h> 42#include <sys/dmu_zfetch.h> 43#include <sys/sa.h> 44#include <sys/sa_impl.h> 45#include <sys/zfeature.h> 46#include <sys/blkptr.h> 47#include <sys/range_tree.h> 48#include <sys/callb.h> 49 50uint_t zfs_dbuf_evict_key; 51
52/* 53 * Number of times that zfs_free_range() took the slow path while doing 54 * a zfs receive. A nonzero value indicates a potential performance problem. 55 */ 56uint64_t zfs_free_range_recv_miss; 57
58static boolean_t dbuf_undirty(dmu_buf_impl_t db, dmu_tx_t tx); 59static void dbuf_write(dbuf_dirty_record_t dr, arc_buf_t data, dmu_tx_t tx); 60 61#ifndef __lint 62extern inline void dmu_buf_init_user(dmu_buf_user_t dbu, 63 dmu_buf_evict_func_t evict_func, dmu_buf_t clear_on_evict_dbufp); 64#endif / ! __lint / 65 66/ 67 * Global data structures and functions for the dbuf cache. 68 / 69static kmem_cache_t dbuf_kmem_cache; 70static taskq_t dbu_evict_taskq; 71 72static kthread_t dbuf_cache_evict_thread; 73static kmutex_t dbuf_evict_lock; 74static kcondvar_t dbuf_evict_cv; 75static boolean_t dbuf_evict_thread_exit; 76 77/* 78 * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that 79 * are not currently held but have been recently released. These dbufs 80 * are not eligible for arc eviction until they are aged out of the cache. 81 * Dbufs are added to the dbuf cache once the last hold is released. If a 82 * dbuf is later accessed and still exists in the dbuf cache, then it will 83 * be removed from the cache and later re-added to the head of the cache. 84 * Dbufs that are aged out of the cache will be immediately destroyed and 85 * become eligible for arc eviction. 86 / 87static multilist_t dbuf_cache; 88static refcount_t dbuf_cache_size; 89uint64_t dbuf_cache_max_bytes = 100 1024 * 1024; 90 91/* Cap the size of the dbuf cache to log2 fraction of arc size. / 92int dbuf_cache_max_shift = 5; 93 94/ 95 * The dbuf cache uses a three-stage eviction policy: 96 * - A low water marker designates when the dbuf eviction thread 97 * should stop evicting from the dbuf cache. 98 * - When we reach the maximum size (aka mid water mark), we 99 * signal the eviction thread to run. 100 * - The high water mark indicates when the eviction thread 101 * is unable to keep up with the incoming load and eviction must 102 * happen in the context of the calling thread. 
103 * 104 * The dbuf cache: 105 * (max size) 106 * low water mid water hi water 107 * +----------------------------------------+----------+----------+ 108 * \| \| \| \| 109 * \| \| \| \| 110 * \| \| \| \| 111 * \| \| \| \| 112 * +----------------------------------------+----------+----------+ 113 * stop signal evict 114 * evicting eviction directly 115 * thread 116 * 117 * The high and low water marks indicate the operating range for the eviction 118 * thread. The low water mark is, by default, 90% of the total size of the 119 * cache and the high water mark is at 110% (both of these percentages can be 120 * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, 121 * respectively). The eviction thread will try to ensure that the cache remains 122 * within this range by waking up every second and checking if the cache is 123 * above the low water mark. The thread can also be woken up by callers adding 124 * elements into the cache if the cache is larger than the mid water (i.e. max 125 * cache size). Once the eviction thread is woken up and eviction is required, 126 * it will continue evicting buffers until it's able to reduce the cache size 127 * to the low water mark. If the cache size continues to grow and hits the high 128 * water mark, then callers adding elements to the cache will begin to evict 129 * directly from the cache until the cache is no longer above the high water 130 * mark. 131 / 132* 133/* 134 * The percentage above and below the maximum cache size. 
135 / 136uint_t dbuf_cache_hiwater_pct = 10; 137uint_t dbuf_cache_lowater_pct = 10; 138* 139/* ARGSUSED / 140static int 141dbuf_cons(void vdb, void unused, int kmflag) 142{ 143* dmu_buf_impl_t db = vdb; 144* bzero(db, sizeof (dmu_buf_impl_t)); 145 146 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 147 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 148 multilist_link_init(&db->db_cache_link); 149 refcount_create(&db->db_holds); 150 151 return (0); 152} 153 154/* ARGSUSED / 155static void 156dbuf_dest(void vdb, void unused) 157{ 158* dmu_buf_impl_t db = vdb; 159* mutex_destroy(&db->db_mtx); 160 cv_destroy(&db->db_changed); 161 ASSERT(!multilist_link_active(&db->db_cache_link)); 162 refcount_destroy(&db->db_holds); 163} 164 165/* 166 * dbuf hash table routines 167 / 168static dbuf_hash_table_t dbuf_hash_table; 169* 170static uint64_t dbuf_hash_count; 171 172static uint64_t 173dbuf_hash(void os, uint64_t obj, uint8_t lvl, uint64_t blkid) 174{ 175* uintptr_t osv = (uintptr_t)os; 176 uint64_t crc = -1ULL; 177 178 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 179 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 180 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 181 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 182 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 183 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 184 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 185 186 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 187 188 return (crc); 189} 190 191#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 192 ((dbuf)->db.db_object == (obj) && \ 193 (dbuf)->db_objset == (os) && \ 194 (dbuf)->db_level == (level) && \ 195 (dbuf)->db_blkid == (blkid)) 196 197dmu_buf_impl_t * 198dbuf_find(objset_t os, uint64_t obj, uint8_t level, uint64_t blkid) 199{ 200* dbuf_hash_table_t h = &dbuf_hash_table; 201* uint64_t hv = dbuf_hash(os, obj, level, blkid); 202 uint64_t idx = hv & 
h->hash_table_mask; 203 dmu_buf_impl_t db; 204* 205 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 206 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 207 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 208 mutex_enter(&db->db_mtx); 209 if (db->db_state != DB_EVICTING) { 210 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 211 return (db); 212 } 213 mutex_exit(&db->db_mtx); 214 } 215 } 216 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 217 return (NULL); 218} 219 220static dmu_buf_impl_t * 221dbuf_find_bonus(objset_t os, uint64_t object) 222{ 223* dnode_t dn; 224* dmu_buf_impl_t db = NULL; 225* 226 if (dnode_hold(os, object, FTAG, &dn) == 0) { 227 rw_enter(&dn->dn_struct_rwlock, RW_READER); 228 if (dn->dn_bonus != NULL) { 229 db = dn->dn_bonus; 230 mutex_enter(&db->db_mtx); 231 } 232 rw_exit(&dn->dn_struct_rwlock); 233 dnode_rele(dn, FTAG); 234 } 235 return (db); 236} 237 238/* 239 * Insert an entry into the hash table. If there is already an element 240 * equal to elem in the hash table, then the already existing element 241 * will be returned and the new element will not be inserted. 242 * Otherwise returns NULL. 
243 / 244static dmu_buf_impl_t 245dbuf_hash_insert(dmu_buf_impl_t db) 246{ 247* dbuf_hash_table_t h = &dbuf_hash_table; 248* objset_t os = db->db_objset; 249* uint64_t obj = db->db.db_object; 250 int level = db->db_level; 251 uint64_t blkid = db->db_blkid; 252 uint64_t hv = dbuf_hash(os, obj, level, blkid); 253 uint64_t idx = hv & h->hash_table_mask; 254 dmu_buf_impl_t dbf; 255* 256 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 257 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 258 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 259 mutex_enter(&dbf->db_mtx); 260 if (dbf->db_state != DB_EVICTING) { 261 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 262 return (dbf); 263 } 264 mutex_exit(&dbf->db_mtx); 265 } 266 } 267 268 mutex_enter(&db->db_mtx); 269 db->db_hash_next = h->hash_table[idx]; 270 h->hash_table[idx] = db; 271 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 272 atomic_inc_64(&dbuf_hash_count); 273 274 return (NULL); 275} 276 277/* 278 * Remove an entry from the hash table. It must be in the EVICTING state. 279 / 280static void 281dbuf_hash_remove(dmu_buf_impl_t db) 282{ 283 dbuf_hash_table_t h = &dbuf_hash_table; 284* uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, 285 db->db_level, db->db_blkid); 286 uint64_t idx = hv & h->hash_table_mask; 287 dmu_buf_impl_t dbf, dbp; 288* 289 /* 290 * We musn't hold db_mtx to maintain lock ordering: 291 * DBUF_HASH_MUTEX > db_mtx. 
292 / 293* ASSERT(refcount_is_zero(&db->db_holds)); 294 ASSERT(db->db_state == DB_EVICTING); 295 ASSERT(!MUTEX_HELD(&db->db_mtx)); 296 297 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 298 dbp = &h->hash_table[idx]; 299 while ((dbf = dbp) != db) { 300* dbp = &dbf->db_hash_next; 301 ASSERT(dbf != NULL); 302 } 303 dbp = db->db_hash_next; 304* db->db_hash_next = NULL; 305 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 306 atomic_dec_64(&dbuf_hash_count); 307} 308 309typedef enum { 310 DBVU_EVICTING, 311 DBVU_NOT_EVICTING 312} dbvu_verify_type_t; 313 314static void 315dbuf_verify_user(dmu_buf_impl_t db, dbvu_verify_type_t verify_type) 316{ 317#ifdef ZFS_DEBUG 318* int64_t holds; 319 320 if (db->db_user == NULL) 321 return; 322 323 /* Only data blocks support the attachment of user data. / 324* ASSERT(db->db_level == 0); 325 326 /* Clients must resolve a dbuf before attaching user data. / 327* ASSERT(db->db.db_data != NULL); 328 ASSERT3U(db->db_state, ==, DB_CACHED); 329 330 holds = refcount_count(&db->db_holds); 331 if (verify_type == DBVU_EVICTING) { 332 /* 333 * Immediate eviction occurs when holds == dirtycnt. 334 * For normal eviction buffers, holds is zero on 335 * eviction, except when dbuf_fix_old_data() calls 336 * dbuf_clear_data(). However, the hold count can grow 337 * during eviction even though db_mtx is held (see 338 * dmu_bonus_hold() for an example), so we can only 339 * test the generic invariant that holds >= dirtycnt. 
340 / 341* ASSERT3U(holds, >=, db->db_dirtycnt); 342 } else { 343 if (db->db_user_immediate_evict == TRUE) 344 ASSERT3U(holds, >=, db->db_dirtycnt); 345 else 346 ASSERT3U(holds, >, 0); 347 } 348#endif 349} 350 351static void 352dbuf_evict_user(dmu_buf_impl_t db) 353{ 354* dmu_buf_user_t dbu = db->db_user; 355* 356 ASSERT(MUTEX_HELD(&db->db_mtx)); 357 358 if (dbu == NULL) 359 return; 360 361 dbuf_verify_user(db, DBVU_EVICTING); 362 db->db_user = NULL; 363 364#ifdef ZFS_DEBUG 365 if (dbu->dbu_clear_on_evict_dbufp != NULL) 366 dbu->dbu_clear_on_evict_dbufp = NULL; 367#endif 368* 369 /* 370 * Invoke the callback from a taskq to avoid lock order reversals 371 * and limit stack depth. 372 / 373* taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, 374 &dbu->dbu_tqent); 375} 376 377boolean_t 378dbuf_is_metadata(dmu_buf_impl_t db) 379{ 380* if (db->db_level > 0) { 381 return (B_TRUE); 382 } else { 383 boolean_t is_metadata; 384 385 DB_DNODE_ENTER(db); 386 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 387 DB_DNODE_EXIT(db); 388 389 return (is_metadata); 390 } 391} 392 393/* 394 * This function must return indices evenly distributed between all 395 * sublists of the multilist. This is needed due to how the dbuf eviction 396 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly 397 * distributed between all sublists and uses this assumption when 398 * deciding which sublist to evict from and how much to evict from it. 399 / 400unsigned int 401dbuf_cache_multilist_index_func(multilist_t ml, void obj) 402{ 403* dmu_buf_impl_t db = obj; 404* 405 /* 406 * The assumption here, is the hash value for a given 407 * dmu_buf_impl_t will remain constant throughout it's lifetime 408 * (i.e. it's objset, object, level and blkid fields don't change). 409 * Thus, we don't need to store the dbuf's sublist index 410 * on insertion, as this index can be recalculated on removal. 
411 * 412 * Also, the low order bits of the hash value are thought to be 413 * distributed evenly. Otherwise, in the case that the multilist 414 * has a power of two number of sublists, each sublists' usage 415 * would not be evenly distributed. 416 / 417* return (dbuf_hash(db->db_objset, db->db.db_object, 418 db->db_level, db->db_blkid) % 419 multilist_get_num_sublists(ml)); 420} 421 422static inline boolean_t 423dbuf_cache_above_hiwater(void) 424{ 425 uint64_t dbuf_cache_hiwater_bytes = 426 (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; 427 428 return (refcount_count(&dbuf_cache_size) > 429 dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); 430} 431 432static inline boolean_t 433dbuf_cache_above_lowater(void) 434{ 435 uint64_t dbuf_cache_lowater_bytes = 436 (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; 437 438 return (refcount_count(&dbuf_cache_size) > 439 dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); 440} 441 442/* 443 * Evict the oldest eligible dbuf from the dbuf cache. 444 / 445static void 446dbuf_evict_one(void) 447{ 448* int idx = multilist_get_random_index(&dbuf_cache); 449 multilist_sublist_t mls = multilist_sublist_lock(&dbuf_cache, idx); 450* 451 ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); 452 453 /* 454 * Set the thread's tsd to indicate that it's processing evictions. 455 * Once a thread stops evicting from the dbuf cache it will 456 * reset its tsd to NULL. 
457 / 458* ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); 459 (void) tsd_set(zfs_dbuf_evict_key, (void )B_TRUE); 460* 461 dmu_buf_impl_t db = multilist_sublist_tail(mls); 462* while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { 463 db = multilist_sublist_prev(mls, db); 464 } 465 466 DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t , db, 467* multilist_sublist_t , mls); 468* 469 if (db != NULL) { 470 multilist_sublist_remove(mls, db); 471 multilist_sublist_unlock(mls); 472 (void) refcount_remove_many(&dbuf_cache_size, 473 db->db.db_size, db); 474 dbuf_destroy(db); 475 } else { 476 multilist_sublist_unlock(mls); 477 } 478 (void) tsd_set(zfs_dbuf_evict_key, NULL); 479} 480 481/* 482 * The dbuf evict thread is responsible for aging out dbufs from the 483 * cache. Once the cache has reached it's maximum size, dbufs are removed 484 * and destroyed. The eviction thread will continue running until the size 485 * of the dbuf cache is at or below the maximum size. Once the dbuf is aged 486 * out of the cache it is destroyed and becomes eligible for arc eviction. 487 / 488static void 489dbuf_evict_thread(void dummy __unused) 490{ 491 callb_cpr_t cpr; 492 493 CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); 494 495 mutex_enter(&dbuf_evict_lock); 496 while (!dbuf_evict_thread_exit) { 497 while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { 498 CALLB_CPR_SAFE_BEGIN(&cpr); 499 (void) cv_timedwait_hires(&dbuf_evict_cv, 500 &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 501 CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); 502 } 503 mutex_exit(&dbuf_evict_lock); 504 505 /* 506 * Keep evicting as long as we're above the low water mark 507 * for the cache. We do this without holding the locks to 508 * minimize lock contention. 
509 / 510* while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { 511 dbuf_evict_one(); 512 } 513 514 mutex_enter(&dbuf_evict_lock); 515 } 516 517 dbuf_evict_thread_exit = B_FALSE; 518 cv_broadcast(&dbuf_evict_cv); 519 CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock / 520* thread_exit(); 521} 522 523/* 524 * Wake up the dbuf eviction thread if the dbuf cache is at its max size. 525 * If the dbuf cache is at its high water mark, then evict a dbuf from the 526 * dbuf cache using the callers context. 527 / 528static void 529dbuf_evict_notify(void) 530{ 531* 532 /* 533 * We use thread specific data to track when a thread has 534 * started processing evictions. This allows us to avoid deeply 535 * nested stacks that would have a call flow similar to this: 536 * 537 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() 538 * ^ \| 539 * \| \| 540 * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ 541 * 542 * The dbuf_eviction_thread will always have its tsd set until 543 * that thread exits. All other threads will only set their tsd 544 * if they are participating in the eviction process. This only 545 * happens if the eviction thread is unable to process evictions 546 * fast enough. To keep the dbuf cache size in check, other threads 547 * can evict from the dbuf cache directly. Those threads will set 548 * their tsd values so that we ensure that they only evict one dbuf 549 * from the dbuf cache. 
550 / 551* if (tsd_get(zfs_dbuf_evict_key) != NULL) 552 return; 553 554 if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { 555 boolean_t evict_now = B_FALSE; 556 557 mutex_enter(&dbuf_evict_lock); 558 if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { 559 evict_now = dbuf_cache_above_hiwater(); 560 cv_signal(&dbuf_evict_cv); 561 } 562 mutex_exit(&dbuf_evict_lock); 563 564 if (evict_now) { 565 dbuf_evict_one(); 566 } 567 } 568} 569 570void 571dbuf_init(void) 572{ 573 uint64_t hsize = 1ULL << 16; 574 dbuf_hash_table_t h = &dbuf_hash_table; 575* int i; 576 577 /* 578 * The hash table is big enough to fill all of physical memory 579 * with an average 4K block size. The table will take up 580 * totalmemsizeof(void)/4K (i.e. 2MB/GB with 8-byte pointers). 581 / 582* while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 583 hsize <<= 1; 584 585retry: 586 h->hash_table_mask = hsize - 1; 587 h->hash_table = kmem_zalloc(hsize * sizeof (void ), KM_NOSLEEP); 588* if (h->hash_table == NULL) { 589 /* XXX - we should really return an error instead of assert / 590* ASSERT(hsize > (1ULL << 10)); 591 hsize >>= 1; 592 goto retry; 593 } 594 595 dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", 596 sizeof (dmu_buf_impl_t), 597 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 598 599 for (i = 0; i < DBUF_MUTEXES; i++) 600 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 601 602 /* 603 * Setup the parameters for the dbuf cache. We cap the size of the 604 * dbuf cache to 1/32nd (default) of the size of the ARC. 605 / 606* dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, 607 arc_max_bytes() >> dbuf_cache_max_shift); 608 609 /* 610 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc 611 * configuration is not required. 
612 / 613* dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); 614 615 multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), 616 offsetof(dmu_buf_impl_t, db_cache_link), 617 zfs_arc_num_sublists_per_state, 618 dbuf_cache_multilist_index_func); 619 refcount_create(&dbuf_cache_size); 620 621 tsd_create(&zfs_dbuf_evict_key, NULL); 622 dbuf_evict_thread_exit = B_FALSE; 623 mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); 624 cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); 625 dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, 626 NULL, 0, &p0, TS_RUN, minclsyspri); 627} 628 629void 630dbuf_fini(void) 631{ 632 dbuf_hash_table_t h = &dbuf_hash_table; 633* int i; 634 635 for (i = 0; i < DBUF_MUTEXES; i++) 636 mutex_destroy(&h->hash_mutexes[i]); 637 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void )); 638* kmem_cache_destroy(dbuf_kmem_cache); 639 taskq_destroy(dbu_evict_taskq); 640 641 mutex_enter(&dbuf_evict_lock); 642 dbuf_evict_thread_exit = B_TRUE; 643 while (dbuf_evict_thread_exit) { 644 cv_signal(&dbuf_evict_cv); 645 cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); 646 } 647 mutex_exit(&dbuf_evict_lock); 648 tsd_destroy(&zfs_dbuf_evict_key); 649 650 mutex_destroy(&dbuf_evict_lock); 651 cv_destroy(&dbuf_evict_cv); 652 653 refcount_destroy(&dbuf_cache_size); 654 multilist_destroy(&dbuf_cache); 655} 656 657/* 658 * Other stuff. 
659 / 660* 661#ifdef ZFS_DEBUG 662static void 663dbuf_verify(dmu_buf_impl_t db) 664{ 665* dnode_t dn; 666* dbuf_dirty_record_t dr; 667* 668 ASSERT(MUTEX_HELD(&db->db_mtx)); 669 670 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 671 return; 672 673 ASSERT(db->db_objset != NULL); 674 DB_DNODE_ENTER(db); 675 dn = DB_DNODE(db); 676 if (dn == NULL) { 677 ASSERT(db->db_parent == NULL); 678 ASSERT(db->db_blkptr == NULL); 679 } else { 680 ASSERT3U(db->db.db_object, ==, dn->dn_object); 681 ASSERT3P(db->db_objset, ==, dn->dn_objset); 682 ASSERT3U(db->db_level, <, dn->dn_nlevels); 683 ASSERT(db->db_blkid == DMU_BONUS_BLKID \|\| 684 db->db_blkid == DMU_SPILL_BLKID \|\| 685 !avl_is_empty(&dn->dn_dbufs)); 686 } 687 if (db->db_blkid == DMU_BONUS_BLKID) { 688 ASSERT(dn != NULL); 689 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 690 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 691 } else if (db->db_blkid == DMU_SPILL_BLKID) { 692 ASSERT(dn != NULL); 693 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 694 ASSERT0(db->db.db_offset); 695 } else { 696 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 697 } 698 699 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 700 ASSERT(dr->dr_dbuf == db); 701 702 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 703 ASSERT(dr->dr_dbuf == db); 704 705 /* 706 * We can't assert that db_size matches dn_datablksz because it 707 * can be momentarily different when another thread is doing 708 * dnode_set_blksz(). 709 / 710* if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 711 dr = db->db_data_pending; 712 /* 713 * It should only be modified in syncing context, so 714 * make sure we only have one copy of the data. 
715 / 716* ASSERT(dr == NULL \|\| dr->dt.dl.dr_data == db->db_buf); 717 } 718 719 /* verify db->db_blkptr / 720* if (db->db_blkptr) { 721 if (db->db_parent == dn->dn_dbuf) { 722 /* db is pointed to by the dnode / 723* /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); / 724* if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 725 ASSERT(db->db_parent == NULL); 726 else 727 ASSERT(db->db_parent != NULL); 728 if (db->db_blkid != DMU_SPILL_BLKID) 729 ASSERT3P(db->db_blkptr, ==, 730 &dn->dn_phys->dn_blkptr[db->db_blkid]); 731 } else { 732 /* db is pointed to by an indirect block / 733* int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 734 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 735 ASSERT3U(db->db_parent->db.db_object, ==, 736 db->db.db_object); 737 /* 738 * dnode_grow_indblksz() can make this fail if we don't 739 * have the struct_rwlock. XXX indblksz no longer 740 * grows. safe to do this now? 741 / 742* if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 743 ASSERT3P(db->db_blkptr, ==, 744 ((blkptr_t )db->db_parent->db.db_data + 745* db->db_blkid % epb)); 746 } 747 } 748 } 749 if ((db->db_blkptr == NULL \|\| BP_IS_HOLE(db->db_blkptr)) && 750 (db->db_buf == NULL \|\| db->db_buf->b_data) && 751 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 752 db->db_state != DB_FILL && !dn->dn_free_txg) { 753 /* 754 * If the blkptr isn't set but they have nonzero data, 755 * it had better be dirty, otherwise we'll lose that 756 * data when we evict this buffer. 757 * 758 * There is an exception to this rule for indirect blocks; in 759 * this case, if the indirect block is a hole, we fill in a few 760 * fields on each of the child blocks (importantly, birth time) 761 * to prevent hole birth times from being lost when you 762 * partially fill in a hole. 
763 / 764* if (db->db_dirtycnt == 0) { 765 if (db->db_level == 0) { 766 uint64_t buf = db->db.db_data; 767* int i; 768 769 for (i = 0; i < db->db.db_size >> 3; i++) { 770 ASSERT(buf[i] == 0); 771 } 772 } else { 773 blkptr_t bps = db->db.db_data; 774* ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, 775 db->db.db_size); 776 /* 777 * We want to verify that all the blkptrs in the 778 * indirect block are holes, but we may have 779 * automatically set up a few fields for them. 780 * We iterate through each blkptr and verify 781 * they only have those fields set. 782 / 783* for (int i = 0; 784 i < db->db.db_size / sizeof (blkptr_t); 785 i++) { 786 blkptr_t bp = &bps[i]; 787* ASSERT(ZIO_CHECKSUM_IS_ZERO( 788 &bp->blk_cksum)); 789 ASSERT( 790 DVA_IS_EMPTY(&bp->blk_dva[0]) && 791 DVA_IS_EMPTY(&bp->blk_dva[1]) && 792 DVA_IS_EMPTY(&bp->blk_dva[2])); 793 ASSERT0(bp->blk_fill); 794 ASSERT0(bp->blk_pad[0]); 795 ASSERT0(bp->blk_pad[1]); 796 ASSERT(!BP_IS_EMBEDDED(bp)); 797 ASSERT(BP_IS_HOLE(bp)); 798 ASSERT0(bp->blk_phys_birth); 799 } 800 } 801 } 802 } 803 DB_DNODE_EXIT(db); 804} 805#endif 806 807static void 808dbuf_clear_data(dmu_buf_impl_t db) 809{ 810* ASSERT(MUTEX_HELD(&db->db_mtx)); 811 dbuf_evict_user(db); 812 ASSERT3P(db->db_buf, ==, NULL); 813 db->db.db_data = NULL; 814 if (db->db_state != DB_NOFILL) 815 db->db_state = DB_UNCACHED; 816} 817 818static void 819dbuf_set_data(dmu_buf_impl_t db, arc_buf_t buf) 820{ 821 ASSERT(MUTEX_HELD(&db->db_mtx)); 822 ASSERT(buf != NULL); 823 824 db->db_buf = buf; 825 ASSERT(buf->b_data != NULL); 826 db->db.db_data = buf->b_data; 827} 828 829/* 830 * Loan out an arc_buf for read. Return the loaned arc_buf. 
831 / 832arc_buf_t 833dbuf_loan_arcbuf(dmu_buf_impl_t db) 834{ 835* arc_buf_t abuf; 836* 837 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 838 mutex_enter(&db->db_mtx); 839 if (arc_released(db->db_buf) \|\| refcount_count(&db->db_holds) > 1) { 840 int blksz = db->db.db_size; 841 spa_t spa = db->db_objset->os_spa; 842* 843 mutex_exit(&db->db_mtx); 844 abuf = arc_loan_buf(spa, blksz); 845 bcopy(db->db.db_data, abuf->b_data, blksz); 846 } else { 847 abuf = db->db_buf; 848 arc_loan_inuse_buf(abuf, db); 849 db->db_buf = NULL; 850 dbuf_clear_data(db); 851 mutex_exit(&db->db_mtx); 852 } 853 return (abuf); 854} 855 856/* 857 * Calculate which level n block references the data at the level 0 offset 858 * provided. 859 / 860uint64_t 861dbuf_whichblock(dnode_t dn, int64_t level, uint64_t offset) 862{ 863 if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { 864 /* 865 * The level n blkid is equal to the level 0 blkid divided by 866 * the number of level 0s in a level n block. 867 * 868 * The level 0 blkid is offset >> datablkshift = 869 * offset / 2^datablkshift. 870 * 871 * The number of level 0s in a level n is the number of block 872 * pointers in an indirect block, raised to the power of level. 873 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = 874 * 2^(level(indblkshift - SPA_BLKPTRSHIFT)). 
875* * 876 * Thus, the level n blkid is: offset / 877 * ((2^datablkshift)(2^(level(indblkshift - SPA_BLKPTRSHIFT))) 878 * = offset / 2^(datablkshift + level * 879 * (indblkshift - SPA_BLKPTRSHIFT)) 880 * = offset >> (datablkshift + level * 881 * (indblkshift - SPA_BLKPTRSHIFT)) 882 / 883* return (offset >> (dn->dn_datablkshift + level * 884 (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); 885 } else { 886 ASSERT3U(offset, <, dn->dn_datablksz); 887 return (0); 888 } 889} 890 891static void 892dbuf_read_done(zio_t zio, arc_buf_t buf, void vdb) 893{ 894* dmu_buf_impl_t db = vdb; 895* 896 mutex_enter(&db->db_mtx); 897 ASSERT3U(db->db_state, ==, DB_READ); 898 /* 899 * All reads are synchronous, so we must have a hold on the dbuf 900 / 901* ASSERT(refcount_count(&db->db_holds) > 0); 902 ASSERT(db->db_buf == NULL); 903 ASSERT(db->db.db_data == NULL); 904 if (db->db_level == 0 && db->db_freed_in_flight) { 905 /* we were freed in flight; disregard any error / 906* arc_release(buf, db); 907 bzero(buf->b_data, db->db.db_size); 908 arc_buf_freeze(buf); 909 db->db_freed_in_flight = FALSE; 910 dbuf_set_data(db, buf); 911 db->db_state = DB_CACHED; 912 } else if (zio == NULL \|\| zio->io_error == 0) { 913 dbuf_set_data(db, buf); 914 db->db_state = DB_CACHED; 915 } else { 916 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 917 ASSERT3P(db->db_buf, ==, NULL); 918 arc_buf_destroy(buf, db); 919 db->db_state = DB_UNCACHED; 920 } 921 cv_broadcast(&db->db_changed); 922 dbuf_rele_and_unlock(db, NULL); 923} 924 925static void 926dbuf_read_impl(dmu_buf_impl_t db, zio_t zio, uint32_t flags) 927{ 928 dnode_t dn; 929* zbookmark_phys_t zb; 930 arc_flags_t aflags = ARC_FLAG_NOWAIT; 931 932 DB_DNODE_ENTER(db); 933 dn = DB_DNODE(db); 934 ASSERT(!refcount_is_zero(&db->db_holds)); 935 /* We need the struct_rwlock to prevent db_blkptr from changing. 
/ 936* ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 937 ASSERT(MUTEX_HELD(&db->db_mtx)); 938 ASSERT(db->db_state == DB_UNCACHED); 939 ASSERT(db->db_buf == NULL); 940 941 if (db->db_blkid == DMU_BONUS_BLKID) { 942 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 943 944 ASSERT3U(bonuslen, <=, db->db.db_size); 945 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 946 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 947 if (bonuslen < DN_MAX_BONUSLEN) 948 bzero(db->db.db_data, DN_MAX_BONUSLEN); 949 if (bonuslen) 950 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 951 DB_DNODE_EXIT(db); 952 db->db_state = DB_CACHED; 953 mutex_exit(&db->db_mtx); 954 return; 955 } 956 957 /* 958 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 959 * processes the delete record and clears the bp while we are waiting 960 * for the dn_mtx (resulting in a "no" from block_freed). 961 / 962* if (db->db_blkptr == NULL \|\| BP_IS_HOLE(db->db_blkptr) \|\| 963 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) \|\| 964 BP_IS_HOLE(db->db_blkptr)))) { 965 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 966 967 dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, 968 db->db.db_size, db, type)); 969 bzero(db->db.db_data, db->db.db_size); 970 971 if (db->db_blkptr != NULL && db->db_level > 0 && 972 BP_IS_HOLE(db->db_blkptr) && 973 db->db_blkptr->blk_birth != 0) { 974 blkptr_t bps = db->db.db_data; 975* for (int i = 0; i < ((1 << 976 DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); 977 i++) { 978 blkptr_t bp = &bps[i]; 979* ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 980 1 << dn->dn_indblkshift); 981 BP_SET_LSIZE(bp, 982 BP_GET_LEVEL(db->db_blkptr) == 1 ? 
983 dn->dn_datablksz : 984 BP_GET_LSIZE(db->db_blkptr)); 985 BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); 986 BP_SET_LEVEL(bp, 987 BP_GET_LEVEL(db->db_blkptr) - 1); 988 BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); 989 } 990 } 991 DB_DNODE_EXIT(db); 992 db->db_state = DB_CACHED; 993 mutex_exit(&db->db_mtx); 994 return; 995 } 996 997 DB_DNODE_EXIT(db); 998 999 db->db_state = DB_READ; 1000 mutex_exit(&db->db_mtx); 1001 1002 if (DBUF_IS_L2CACHEABLE(db)) 1003 aflags \|= ARC_FLAG_L2CACHE; 1004 1005 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 1006 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 1007 db->db.db_object, db->db_level, db->db_blkid); 1008 1009 dbuf_add_ref(db, NULL); 1010 1011 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 1012 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 1013 (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 1014 &aflags, &zb); 1015} 1016 1017int 1018dbuf_read(dmu_buf_impl_t db, zio_t zio, uint32_t flags) 1019{ 1020 int err = 0; 1021 boolean_t havepzio = (zio != NULL); 1022 boolean_t prefetch; 1023 dnode_t dn; 1024* 1025 /* 1026 * We don't have to hold the mutex to check db_state because it 1027 * can't be freed while we have a hold on the buffer. 
1028 / 1029* ASSERT(!refcount_is_zero(&db->db_holds)); 1030 1031 if (db->db_state == DB_NOFILL) 1032 return (SET_ERROR(EIO)); 1033 1034 DB_DNODE_ENTER(db); 1035 dn = DB_DNODE(db); 1036 if ((flags & DB_RF_HAVESTRUCT) == 0) 1037 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1038 1039 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1040 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 1041 DBUF_IS_CACHEABLE(db); 1042 1043 mutex_enter(&db->db_mtx); 1044 if (db->db_state == DB_CACHED) { 1045 mutex_exit(&db->db_mtx); 1046 if (prefetch) 1047 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1048 if ((flags & DB_RF_HAVESTRUCT) == 0) 1049 rw_exit(&dn->dn_struct_rwlock); 1050 DB_DNODE_EXIT(db); 1051 } else if (db->db_state == DB_UNCACHED) { 1052 spa_t spa = dn->dn_objset->os_spa; 1053* 1054 if (zio == NULL) 1055 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 1056 dbuf_read_impl(db, zio, flags); 1057 1058 /* dbuf_read_impl has dropped db_mtx for us / 1059* 1060 if (prefetch) 1061 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1062 1063 if ((flags & DB_RF_HAVESTRUCT) == 0) 1064 rw_exit(&dn->dn_struct_rwlock); 1065 DB_DNODE_EXIT(db); 1066 1067 if (!havepzio) 1068 err = zio_wait(zio); 1069 } else { 1070 /* 1071 * Another reader came in while the dbuf was in flight 1072 * between UNCACHED and CACHED. Either a writer will finish 1073 * writing the buffer (sending the dbuf to CACHED) or the 1074 * first reader's request will reach the read_done callback 1075 * and send the dbuf to CACHED. Otherwise, a failure 1076 * occurred and the dbuf went to UNCACHED. 1077 / 1078* mutex_exit(&db->db_mtx); 1079 if (prefetch) 1080 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1081 if ((flags & DB_RF_HAVESTRUCT) == 0) 1082 rw_exit(&dn->dn_struct_rwlock); 1083 DB_DNODE_EXIT(db); 1084 1085 /* Skip the wait per the caller's request. 
/ 1086* mutex_enter(&db->db_mtx); 1087 if ((flags & DB_RF_NEVERWAIT) == 0) { 1088 while (db->db_state == DB_READ \|\| 1089 db->db_state == DB_FILL) { 1090 ASSERT(db->db_state == DB_READ \|\| 1091 (flags & DB_RF_HAVESTRUCT) == 0); 1092 DTRACE_PROBE2(blocked__read, dmu_buf_impl_t , 1093* db, zio_t , zio); 1094* cv_wait(&db->db_changed, &db->db_mtx); 1095 } 1096 if (db->db_state == DB_UNCACHED) 1097 err = SET_ERROR(EIO); 1098 } 1099 mutex_exit(&db->db_mtx); 1100 } 1101 1102 ASSERT(err \|\| havepzio \|\| db->db_state == DB_CACHED); 1103 return (err); 1104} 1105 1106static void 1107dbuf_noread(dmu_buf_impl_t db) 1108{ 1109* ASSERT(!refcount_is_zero(&db->db_holds)); 1110 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1111 mutex_enter(&db->db_mtx); 1112 while (db->db_state == DB_READ \|\| db->db_state == DB_FILL) 1113 cv_wait(&db->db_changed, &db->db_mtx); 1114 if (db->db_state == DB_UNCACHED) { 1115 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1116 spa_t spa = db->db_objset->os_spa; 1117* 1118 ASSERT(db->db_buf == NULL); 1119 ASSERT(db->db.db_data == NULL); 1120 dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); 1121 db->db_state = DB_FILL; 1122 } else if (db->db_state == DB_NOFILL) { 1123 dbuf_clear_data(db); 1124 } else { 1125 ASSERT3U(db->db_state, ==, DB_CACHED); 1126 } 1127 mutex_exit(&db->db_mtx); 1128} 1129 1130/* 1131 * This is our just-in-time copy function. It makes a copy of 1132 * buffers, that have been modified in a previous transaction 1133 * group, before we modify them in the current active group. 1134 * 1135 * This function is used in two places: when we are dirtying a 1136 * buffer for the first time in a txg, and when we are freeing 1137 * a range in a dnode that includes this buffer. 1138 * 1139 * Note that when we are called from dbuf_free_range() we do 1140 * not put a hold on the buffer, we just traverse the active 1141 * dbuf list for the dnode. 
1142 / 1143static void 1144dbuf_fix_old_data(dmu_buf_impl_t db, uint64_t txg) 1145{ 1146 dbuf_dirty_record_t dr = db->db_last_dirty; 1147* 1148 ASSERT(MUTEX_HELD(&db->db_mtx)); 1149 ASSERT(db->db.db_data != NULL); 1150 ASSERT(db->db_level == 0); 1151 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 1152 1153 if (dr == NULL \|\| 1154 (dr->dt.dl.dr_data != 1155 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 1156 return; 1157 1158 /* 1159 * If the last dirty record for this dbuf has not yet synced 1160 * and its referencing the dbuf data, either: 1161 * reset the reference to point to a new copy, 1162 * or (if there a no active holders) 1163 * just null out the current db_data pointer. 1164 / 1165* ASSERT(dr->dr_txg >= txg - 2); 1166 if (db->db_blkid == DMU_BONUS_BLKID) { 1167 /* Note that the data bufs here are zio_bufs / 1168* dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 1169 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1170 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 1171 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1172 int size = db->db.db_size; 1173 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1174 spa_t spa = db->db_objset->os_spa; 1175* 1176 dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); 1177 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 1178 } else { 1179 db->db_buf = NULL; 1180 dbuf_clear_data(db); 1181 } 1182} 1183 1184void 1185dbuf_unoverride(dbuf_dirty_record_t dr) 1186{ 1187* dmu_buf_impl_t db = dr->dr_dbuf; 1188* blkptr_t bp = &dr->dt.dl.dr_overridden_by; 1189* uint64_t txg = dr->dr_txg; 1190 1191 ASSERT(MUTEX_HELD(&db->db_mtx)); 1192 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 1193 ASSERT(db->db_level == 0); 1194 1195 if (db->db_blkid == DMU_BONUS_BLKID \|\| 1196 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 1197 return; 1198 1199 ASSERT(db->db_data_pending != dr); 1200 1201 /* free this block / 1202* if (!BP_IS_HOLE(bp) && 
!dr->dt.dl.dr_nopwrite) 1203 zio_free(db->db_objset->os_spa, txg, bp); 1204 1205 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1206 dr->dt.dl.dr_nopwrite = B_FALSE; 1207 1208 /* 1209 * Release the already-written buffer, so we leave it in 1210 * a consistent dirty state. Note that all callers are 1211 * modifying the buffer, so they will immediately do 1212 * another (redundant) arc_release(). Therefore, leave 1213 * the buf thawed to save the effort of freezing & 1214 * immediately re-thawing it. 1215 / 1216* arc_release(dr->dt.dl.dr_data, db); 1217} 1218 1219/* 1220 * Evict (if its unreferenced) or clear (if its referenced) any level-0 1221 * data blocks in the free range, so that any future readers will find 1222 * empty blocks.	52static boolean_t dbuf_undirty(dmu_buf_impl_t db, dmu_tx_t tx); 53static void dbuf_write(dbuf_dirty_record_t dr, arc_buf_t data, dmu_tx_t tx); 54 55#ifndef __lint 56extern inline void dmu_buf_init_user(dmu_buf_user_t dbu, 57 dmu_buf_evict_func_t evict_func, dmu_buf_t clear_on_evict_dbufp); 58#endif / ! __lint / 59 60/ 61 * Global data structures and functions for the dbuf cache. 62 / 63static kmem_cache_t dbuf_kmem_cache; 64static taskq_t dbu_evict_taskq; 65 66static kthread_t dbuf_cache_evict_thread; 67static kmutex_t dbuf_evict_lock; 68static kcondvar_t dbuf_evict_cv; 69static boolean_t dbuf_evict_thread_exit; 70 71/* 72 * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that 73 * are not currently held but have been recently released. These dbufs 74 * are not eligible for arc eviction until they are aged out of the cache. 75 * Dbufs are added to the dbuf cache once the last hold is released. If a 76 * dbuf is later accessed and still exists in the dbuf cache, then it will 77 * be removed from the cache and later re-added to the head of the cache. 78 * Dbufs that are aged out of the cache will be immediately destroyed and 79 * become eligible for arc eviction. 
80 / 81static multilist_t dbuf_cache; 82static refcount_t dbuf_cache_size; 83uint64_t dbuf_cache_max_bytes = 100 1024 * 1024; 84 85/* Cap the size of the dbuf cache to log2 fraction of arc size. / 86int dbuf_cache_max_shift = 5; 87 88/ 89 * The dbuf cache uses a three-stage eviction policy: 90 * - A low water marker designates when the dbuf eviction thread 91 * should stop evicting from the dbuf cache. 92 * - When we reach the maximum size (aka mid water mark), we 93 * signal the eviction thread to run. 94 * - The high water mark indicates when the eviction thread 95 * is unable to keep up with the incoming load and eviction must 96 * happen in the context of the calling thread. 97 * 98 * The dbuf cache: 99 * (max size) 100 * low water mid water hi water 101 * +----------------------------------------+----------+----------+ 102 * \| \| \| \| 103 * \| \| \| \| 104 * \| \| \| \| 105 * \| \| \| \| 106 * +----------------------------------------+----------+----------+ 107 * stop signal evict 108 * evicting eviction directly 109 * thread 110 * 111 * The high and low water marks indicate the operating range for the eviction 112 * thread. The low water mark is, by default, 90% of the total size of the 113 * cache and the high water mark is at 110% (both of these percentages can be 114 * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, 115 * respectively). The eviction thread will try to ensure that the cache remains 116 * within this range by waking up every second and checking if the cache is 117 * above the low water mark. The thread can also be woken up by callers adding 118 * elements into the cache if the cache is larger than the mid water (i.e max 119 * cache size). Once the eviction thread is woken up and eviction is required, 120 * it will continue evicting buffers until it's able to reduce the cache size 121 * to the low water mark. 
If the cache size continues to grow and hits the high 122 * water mark, then callers adding elments to the cache will begin to evict 123 * directly from the cache until the cache is no longer above the high water 124 * mark. 125 / 126* 127/* 128 * The percentage above and below the maximum cache size. 129 / 130uint_t dbuf_cache_hiwater_pct = 10; 131uint_t dbuf_cache_lowater_pct = 10; 132* 133/* ARGSUSED / 134static int 135dbuf_cons(void vdb, void unused, int kmflag) 136{ 137* dmu_buf_impl_t db = vdb; 138* bzero(db, sizeof (dmu_buf_impl_t)); 139 140 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 141 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 142 multilist_link_init(&db->db_cache_link); 143 refcount_create(&db->db_holds); 144 145 return (0); 146} 147 148/* ARGSUSED / 149static void 150dbuf_dest(void vdb, void unused) 151{ 152* dmu_buf_impl_t db = vdb; 153* mutex_destroy(&db->db_mtx); 154 cv_destroy(&db->db_changed); 155 ASSERT(!multilist_link_active(&db->db_cache_link)); 156 refcount_destroy(&db->db_holds); 157} 158 159/* 160 * dbuf hash table routines 161 / 162static dbuf_hash_table_t dbuf_hash_table; 163* 164static uint64_t dbuf_hash_count; 165 166static uint64_t 167dbuf_hash(void os, uint64_t obj, uint8_t lvl, uint64_t blkid) 168{ 169* uintptr_t osv = (uintptr_t)os; 170 uint64_t crc = -1ULL; 171 172 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 173 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 174 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 175 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 176 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 177 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 178 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 179 180 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 181 182 return (crc); 183} 184 185#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 186 ((dbuf)->db.db_object == (obj) && \ 187 (dbuf)->db_objset == (os) && \ 188 
(dbuf)->db_level == (level) && \ 189 (dbuf)->db_blkid == (blkid)) 190 191dmu_buf_impl_t * 192dbuf_find(objset_t os, uint64_t obj, uint8_t level, uint64_t blkid) 193{ 194* dbuf_hash_table_t h = &dbuf_hash_table; 195* uint64_t hv = dbuf_hash(os, obj, level, blkid); 196 uint64_t idx = hv & h->hash_table_mask; 197 dmu_buf_impl_t db; 198* 199 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 200 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 201 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 202 mutex_enter(&db->db_mtx); 203 if (db->db_state != DB_EVICTING) { 204 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 205 return (db); 206 } 207 mutex_exit(&db->db_mtx); 208 } 209 } 210 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 211 return (NULL); 212} 213 214static dmu_buf_impl_t * 215dbuf_find_bonus(objset_t os, uint64_t object) 216{ 217* dnode_t dn; 218* dmu_buf_impl_t db = NULL; 219* 220 if (dnode_hold(os, object, FTAG, &dn) == 0) { 221 rw_enter(&dn->dn_struct_rwlock, RW_READER); 222 if (dn->dn_bonus != NULL) { 223 db = dn->dn_bonus; 224 mutex_enter(&db->db_mtx); 225 } 226 rw_exit(&dn->dn_struct_rwlock); 227 dnode_rele(dn, FTAG); 228 } 229 return (db); 230} 231 232/* 233 * Insert an entry into the hash table. If there is already an element 234 * equal to elem in the hash table, then the already existing element 235 * will be returned and the new element will not be inserted. 236 * Otherwise returns NULL. 
237 / 238static dmu_buf_impl_t 239dbuf_hash_insert(dmu_buf_impl_t db) 240{ 241* dbuf_hash_table_t h = &dbuf_hash_table; 242* objset_t os = db->db_objset; 243* uint64_t obj = db->db.db_object; 244 int level = db->db_level; 245 uint64_t blkid = db->db_blkid; 246 uint64_t hv = dbuf_hash(os, obj, level, blkid); 247 uint64_t idx = hv & h->hash_table_mask; 248 dmu_buf_impl_t dbf; 249* 250 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 251 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 252 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 253 mutex_enter(&dbf->db_mtx); 254 if (dbf->db_state != DB_EVICTING) { 255 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 256 return (dbf); 257 } 258 mutex_exit(&dbf->db_mtx); 259 } 260 } 261 262 mutex_enter(&db->db_mtx); 263 db->db_hash_next = h->hash_table[idx]; 264 h->hash_table[idx] = db; 265 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 266 atomic_inc_64(&dbuf_hash_count); 267 268 return (NULL); 269} 270 271/* 272 * Remove an entry from the hash table. It must be in the EVICTING state. 273 / 274static void 275dbuf_hash_remove(dmu_buf_impl_t db) 276{ 277 dbuf_hash_table_t h = &dbuf_hash_table; 278* uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, 279 db->db_level, db->db_blkid); 280 uint64_t idx = hv & h->hash_table_mask; 281 dmu_buf_impl_t dbf, dbp; 282* 283 /* 284 * We musn't hold db_mtx to maintain lock ordering: 285 * DBUF_HASH_MUTEX > db_mtx. 
286 / 287* ASSERT(refcount_is_zero(&db->db_holds)); 288 ASSERT(db->db_state == DB_EVICTING); 289 ASSERT(!MUTEX_HELD(&db->db_mtx)); 290 291 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 292 dbp = &h->hash_table[idx]; 293 while ((dbf = dbp) != db) { 294* dbp = &dbf->db_hash_next; 295 ASSERT(dbf != NULL); 296 } 297 dbp = db->db_hash_next; 298* db->db_hash_next = NULL; 299 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 300 atomic_dec_64(&dbuf_hash_count); 301} 302 303typedef enum { 304 DBVU_EVICTING, 305 DBVU_NOT_EVICTING 306} dbvu_verify_type_t; 307 308static void 309dbuf_verify_user(dmu_buf_impl_t db, dbvu_verify_type_t verify_type) 310{ 311#ifdef ZFS_DEBUG 312* int64_t holds; 313 314 if (db->db_user == NULL) 315 return; 316 317 /* Only data blocks support the attachment of user data. / 318* ASSERT(db->db_level == 0); 319 320 /* Clients must resolve a dbuf before attaching user data. / 321* ASSERT(db->db.db_data != NULL); 322 ASSERT3U(db->db_state, ==, DB_CACHED); 323 324 holds = refcount_count(&db->db_holds); 325 if (verify_type == DBVU_EVICTING) { 326 /* 327 * Immediate eviction occurs when holds == dirtycnt. 328 * For normal eviction buffers, holds is zero on 329 * eviction, except when dbuf_fix_old_data() calls 330 * dbuf_clear_data(). However, the hold count can grow 331 * during eviction even though db_mtx is held (see 332 * dmu_bonus_hold() for an example), so we can only 333 * test the generic invariant that holds >= dirtycnt. 
334 / 335* ASSERT3U(holds, >=, db->db_dirtycnt); 336 } else { 337 if (db->db_user_immediate_evict == TRUE) 338 ASSERT3U(holds, >=, db->db_dirtycnt); 339 else 340 ASSERT3U(holds, >, 0); 341 } 342#endif 343} 344 345static void 346dbuf_evict_user(dmu_buf_impl_t db) 347{ 348* dmu_buf_user_t dbu = db->db_user; 349* 350 ASSERT(MUTEX_HELD(&db->db_mtx)); 351 352 if (dbu == NULL) 353 return; 354 355 dbuf_verify_user(db, DBVU_EVICTING); 356 db->db_user = NULL; 357 358#ifdef ZFS_DEBUG 359 if (dbu->dbu_clear_on_evict_dbufp != NULL) 360 dbu->dbu_clear_on_evict_dbufp = NULL; 361#endif 362* 363 /* 364 * Invoke the callback from a taskq to avoid lock order reversals 365 * and limit stack depth. 366 / 367* taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, 368 &dbu->dbu_tqent); 369} 370 371boolean_t 372dbuf_is_metadata(dmu_buf_impl_t db) 373{ 374* if (db->db_level > 0) { 375 return (B_TRUE); 376 } else { 377 boolean_t is_metadata; 378 379 DB_DNODE_ENTER(db); 380 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 381 DB_DNODE_EXIT(db); 382 383 return (is_metadata); 384 } 385} 386 387/* 388 * This function must return indices evenly distributed between all 389 * sublists of the multilist. This is needed due to how the dbuf eviction 390 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly 391 * distributed between all sublists and uses this assumption when 392 * deciding which sublist to evict from and how much to evict from it. 393 / 394unsigned int 395dbuf_cache_multilist_index_func(multilist_t ml, void obj) 396{ 397* dmu_buf_impl_t db = obj; 398* 399 /* 400 * The assumption here, is the hash value for a given 401 * dmu_buf_impl_t will remain constant throughout it's lifetime 402 * (i.e. it's objset, object, level and blkid fields don't change). 403 * Thus, we don't need to store the dbuf's sublist index 404 * on insertion, as this index can be recalculated on removal. 
405 * 406 * Also, the low order bits of the hash value are thought to be 407 * distributed evenly. Otherwise, in the case that the multilist 408 * has a power of two number of sublists, each sublists' usage 409 * would not be evenly distributed. 410 / 411* return (dbuf_hash(db->db_objset, db->db.db_object, 412 db->db_level, db->db_blkid) % 413 multilist_get_num_sublists(ml)); 414} 415 416static inline boolean_t 417dbuf_cache_above_hiwater(void) 418{ 419 uint64_t dbuf_cache_hiwater_bytes = 420 (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; 421 422 return (refcount_count(&dbuf_cache_size) > 423 dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); 424} 425 426static inline boolean_t 427dbuf_cache_above_lowater(void) 428{ 429 uint64_t dbuf_cache_lowater_bytes = 430 (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; 431 432 return (refcount_count(&dbuf_cache_size) > 433 dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); 434} 435 436/* 437 * Evict the oldest eligible dbuf from the dbuf cache. 438 / 439static void 440dbuf_evict_one(void) 441{ 442* int idx = multilist_get_random_index(&dbuf_cache); 443 multilist_sublist_t mls = multilist_sublist_lock(&dbuf_cache, idx); 444* 445 ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); 446 447 /* 448 * Set the thread's tsd to indicate that it's processing evictions. 449 * Once a thread stops evicting from the dbuf cache it will 450 * reset its tsd to NULL. 
451 / 452* ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); 453 (void) tsd_set(zfs_dbuf_evict_key, (void )B_TRUE); 454* 455 dmu_buf_impl_t db = multilist_sublist_tail(mls); 456* while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { 457 db = multilist_sublist_prev(mls, db); 458 } 459 460 DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t , db, 461* multilist_sublist_t , mls); 462* 463 if (db != NULL) { 464 multilist_sublist_remove(mls, db); 465 multilist_sublist_unlock(mls); 466 (void) refcount_remove_many(&dbuf_cache_size, 467 db->db.db_size, db); 468 dbuf_destroy(db); 469 } else { 470 multilist_sublist_unlock(mls); 471 } 472 (void) tsd_set(zfs_dbuf_evict_key, NULL); 473} 474 475/* 476 * The dbuf evict thread is responsible for aging out dbufs from the 477 * cache. Once the cache has reached it's maximum size, dbufs are removed 478 * and destroyed. The eviction thread will continue running until the size 479 * of the dbuf cache is at or below the maximum size. Once the dbuf is aged 480 * out of the cache it is destroyed and becomes eligible for arc eviction. 481 / 482static void 483dbuf_evict_thread(void dummy __unused) 484{ 485 callb_cpr_t cpr; 486 487 CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); 488 489 mutex_enter(&dbuf_evict_lock); 490 while (!dbuf_evict_thread_exit) { 491 while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { 492 CALLB_CPR_SAFE_BEGIN(&cpr); 493 (void) cv_timedwait_hires(&dbuf_evict_cv, 494 &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 495 CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); 496 } 497 mutex_exit(&dbuf_evict_lock); 498 499 /* 500 * Keep evicting as long as we're above the low water mark 501 * for the cache. We do this without holding the locks to 502 * minimize lock contention. 
503 / 504* while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { 505 dbuf_evict_one(); 506 } 507 508 mutex_enter(&dbuf_evict_lock); 509 } 510 511 dbuf_evict_thread_exit = B_FALSE; 512 cv_broadcast(&dbuf_evict_cv); 513 CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock / 514* thread_exit(); 515} 516 517/* 518 * Wake up the dbuf eviction thread if the dbuf cache is at its max size. 519 * If the dbuf cache is at its high water mark, then evict a dbuf from the 520 * dbuf cache using the callers context. 521 / 522static void 523dbuf_evict_notify(void) 524{ 525* 526 /* 527 * We use thread specific data to track when a thread has 528 * started processing evictions. This allows us to avoid deeply 529 * nested stacks that would have a call flow similar to this: 530 * 531 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() 532 * ^ \| 533 * \| \| 534 * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ 535 * 536 * The dbuf_eviction_thread will always have its tsd set until 537 * that thread exits. All other threads will only set their tsd 538 * if they are participating in the eviction process. This only 539 * happens if the eviction thread is unable to process evictions 540 * fast enough. To keep the dbuf cache size in check, other threads 541 * can evict from the dbuf cache directly. Those threads will set 542 * their tsd values so that we ensure that they only evict one dbuf 543 * from the dbuf cache. 
544 / 545* if (tsd_get(zfs_dbuf_evict_key) != NULL) 546 return; 547 548 if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { 549 boolean_t evict_now = B_FALSE; 550 551 mutex_enter(&dbuf_evict_lock); 552 if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { 553 evict_now = dbuf_cache_above_hiwater(); 554 cv_signal(&dbuf_evict_cv); 555 } 556 mutex_exit(&dbuf_evict_lock); 557 558 if (evict_now) { 559 dbuf_evict_one(); 560 } 561 } 562} 563 564void 565dbuf_init(void) 566{ 567 uint64_t hsize = 1ULL << 16; 568 dbuf_hash_table_t h = &dbuf_hash_table; 569* int i; 570 571 /* 572 * The hash table is big enough to fill all of physical memory 573 * with an average 4K block size. The table will take up 574 * totalmemsizeof(void)/4K (i.e. 2MB/GB with 8-byte pointers). 575 / 576* while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 577 hsize <<= 1; 578 579retry: 580 h->hash_table_mask = hsize - 1; 581 h->hash_table = kmem_zalloc(hsize * sizeof (void ), KM_NOSLEEP); 582* if (h->hash_table == NULL) { 583 /* XXX - we should really return an error instead of assert / 584* ASSERT(hsize > (1ULL << 10)); 585 hsize >>= 1; 586 goto retry; 587 } 588 589 dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", 590 sizeof (dmu_buf_impl_t), 591 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 592 593 for (i = 0; i < DBUF_MUTEXES; i++) 594 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 595 596 /* 597 * Setup the parameters for the dbuf cache. We cap the size of the 598 * dbuf cache to 1/32nd (default) of the size of the ARC. 599 / 600* dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, 601 arc_max_bytes() >> dbuf_cache_max_shift); 602 603 /* 604 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc 605 * configuration is not required. 
606 / 607* dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); 608 609 multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), 610 offsetof(dmu_buf_impl_t, db_cache_link), 611 zfs_arc_num_sublists_per_state, 612 dbuf_cache_multilist_index_func); 613 refcount_create(&dbuf_cache_size); 614 615 tsd_create(&zfs_dbuf_evict_key, NULL); 616 dbuf_evict_thread_exit = B_FALSE; 617 mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); 618 cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); 619 dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, 620 NULL, 0, &p0, TS_RUN, minclsyspri); 621} 622 623void 624dbuf_fini(void) 625{ 626 dbuf_hash_table_t h = &dbuf_hash_table; 627* int i; 628 629 for (i = 0; i < DBUF_MUTEXES; i++) 630 mutex_destroy(&h->hash_mutexes[i]); 631 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void )); 632* kmem_cache_destroy(dbuf_kmem_cache); 633 taskq_destroy(dbu_evict_taskq); 634 635 mutex_enter(&dbuf_evict_lock); 636 dbuf_evict_thread_exit = B_TRUE; 637 while (dbuf_evict_thread_exit) { 638 cv_signal(&dbuf_evict_cv); 639 cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); 640 } 641 mutex_exit(&dbuf_evict_lock); 642 tsd_destroy(&zfs_dbuf_evict_key); 643 644 mutex_destroy(&dbuf_evict_lock); 645 cv_destroy(&dbuf_evict_cv); 646 647 refcount_destroy(&dbuf_cache_size); 648 multilist_destroy(&dbuf_cache); 649} 650 651/* 652 * Other stuff. 
653 / 654* 655#ifdef ZFS_DEBUG 656static void 657dbuf_verify(dmu_buf_impl_t db) 658{ 659* dnode_t dn; 660* dbuf_dirty_record_t dr; 661* 662 ASSERT(MUTEX_HELD(&db->db_mtx)); 663 664 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 665 return; 666 667 ASSERT(db->db_objset != NULL); 668 DB_DNODE_ENTER(db); 669 dn = DB_DNODE(db); 670 if (dn == NULL) { 671 ASSERT(db->db_parent == NULL); 672 ASSERT(db->db_blkptr == NULL); 673 } else { 674 ASSERT3U(db->db.db_object, ==, dn->dn_object); 675 ASSERT3P(db->db_objset, ==, dn->dn_objset); 676 ASSERT3U(db->db_level, <, dn->dn_nlevels); 677 ASSERT(db->db_blkid == DMU_BONUS_BLKID \|\| 678 db->db_blkid == DMU_SPILL_BLKID \|\| 679 !avl_is_empty(&dn->dn_dbufs)); 680 } 681 if (db->db_blkid == DMU_BONUS_BLKID) { 682 ASSERT(dn != NULL); 683 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 684 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 685 } else if (db->db_blkid == DMU_SPILL_BLKID) { 686 ASSERT(dn != NULL); 687 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 688 ASSERT0(db->db.db_offset); 689 } else { 690 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 691 } 692 693 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 694 ASSERT(dr->dr_dbuf == db); 695 696 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 697 ASSERT(dr->dr_dbuf == db); 698 699 /* 700 * We can't assert that db_size matches dn_datablksz because it 701 * can be momentarily different when another thread is doing 702 * dnode_set_blksz(). 703 / 704* if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 705 dr = db->db_data_pending; 706 /* 707 * It should only be modified in syncing context, so 708 * make sure we only have one copy of the data. 
709 / 710* ASSERT(dr == NULL \|\| dr->dt.dl.dr_data == db->db_buf); 711 } 712 713 /* verify db->db_blkptr / 714* if (db->db_blkptr) { 715 if (db->db_parent == dn->dn_dbuf) { 716 /* db is pointed to by the dnode / 717* /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); / 718* if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 719 ASSERT(db->db_parent == NULL); 720 else 721 ASSERT(db->db_parent != NULL); 722 if (db->db_blkid != DMU_SPILL_BLKID) 723 ASSERT3P(db->db_blkptr, ==, 724 &dn->dn_phys->dn_blkptr[db->db_blkid]); 725 } else { 726 /* db is pointed to by an indirect block / 727* int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 728 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 729 ASSERT3U(db->db_parent->db.db_object, ==, 730 db->db.db_object); 731 /* 732 * dnode_grow_indblksz() can make this fail if we don't 733 * have the struct_rwlock. XXX indblksz no longer 734 * grows. safe to do this now? 735 / 736* if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 737 ASSERT3P(db->db_blkptr, ==, 738 ((blkptr_t )db->db_parent->db.db_data + 739* db->db_blkid % epb)); 740 } 741 } 742 } 743 if ((db->db_blkptr == NULL \|\| BP_IS_HOLE(db->db_blkptr)) && 744 (db->db_buf == NULL \|\| db->db_buf->b_data) && 745 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 746 db->db_state != DB_FILL && !dn->dn_free_txg) { 747 /* 748 * If the blkptr isn't set but they have nonzero data, 749 * it had better be dirty, otherwise we'll lose that 750 * data when we evict this buffer. 751 * 752 * There is an exception to this rule for indirect blocks; in 753 * this case, if the indirect block is a hole, we fill in a few 754 * fields on each of the child blocks (importantly, birth time) 755 * to prevent hole birth times from being lost when you 756 * partially fill in a hole. 
757 / 758* if (db->db_dirtycnt == 0) { 759 if (db->db_level == 0) { 760 uint64_t buf = db->db.db_data; 761* int i; 762 763 for (i = 0; i < db->db.db_size >> 3; i++) { 764 ASSERT(buf[i] == 0); 765 } 766 } else { 767 blkptr_t bps = db->db.db_data; 768* ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, 769 db->db.db_size); 770 /* 771 * We want to verify that all the blkptrs in the 772 * indirect block are holes, but we may have 773 * automatically set up a few fields for them. 774 * We iterate through each blkptr and verify 775 * they only have those fields set. 776 / 777* for (int i = 0; 778 i < db->db.db_size / sizeof (blkptr_t); 779 i++) { 780 blkptr_t bp = &bps[i]; 781* ASSERT(ZIO_CHECKSUM_IS_ZERO( 782 &bp->blk_cksum)); 783 ASSERT( 784 DVA_IS_EMPTY(&bp->blk_dva[0]) && 785 DVA_IS_EMPTY(&bp->blk_dva[1]) && 786 DVA_IS_EMPTY(&bp->blk_dva[2])); 787 ASSERT0(bp->blk_fill); 788 ASSERT0(bp->blk_pad[0]); 789 ASSERT0(bp->blk_pad[1]); 790 ASSERT(!BP_IS_EMBEDDED(bp)); 791 ASSERT(BP_IS_HOLE(bp)); 792 ASSERT0(bp->blk_phys_birth); 793 } 794 } 795 } 796 } 797 DB_DNODE_EXIT(db); 798} 799#endif 800 801static void 802dbuf_clear_data(dmu_buf_impl_t db) 803{ 804* ASSERT(MUTEX_HELD(&db->db_mtx)); 805 dbuf_evict_user(db); 806 ASSERT3P(db->db_buf, ==, NULL); 807 db->db.db_data = NULL; 808 if (db->db_state != DB_NOFILL) 809 db->db_state = DB_UNCACHED; 810} 811 812static void 813dbuf_set_data(dmu_buf_impl_t db, arc_buf_t buf) 814{ 815 ASSERT(MUTEX_HELD(&db->db_mtx)); 816 ASSERT(buf != NULL); 817 818 db->db_buf = buf; 819 ASSERT(buf->b_data != NULL); 820 db->db.db_data = buf->b_data; 821} 822 823/* 824 * Loan out an arc_buf for read. Return the loaned arc_buf. 
825 / 826arc_buf_t 827dbuf_loan_arcbuf(dmu_buf_impl_t db) 828{ 829* arc_buf_t abuf; 830* 831 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 832 mutex_enter(&db->db_mtx); 833 if (arc_released(db->db_buf) \|\| refcount_count(&db->db_holds) > 1) { 834 int blksz = db->db.db_size; 835 spa_t spa = db->db_objset->os_spa; 836* 837 mutex_exit(&db->db_mtx); 838 abuf = arc_loan_buf(spa, blksz); 839 bcopy(db->db.db_data, abuf->b_data, blksz); 840 } else { 841 abuf = db->db_buf; 842 arc_loan_inuse_buf(abuf, db); 843 db->db_buf = NULL; 844 dbuf_clear_data(db); 845 mutex_exit(&db->db_mtx); 846 } 847 return (abuf); 848} 849 850/* 851 * Calculate which level n block references the data at the level 0 offset 852 * provided. 853 / 854uint64_t 855dbuf_whichblock(dnode_t dn, int64_t level, uint64_t offset) 856{ 857 if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { 858 /* 859 * The level n blkid is equal to the level 0 blkid divided by 860 * the number of level 0s in a level n block. 861 * 862 * The level 0 blkid is offset >> datablkshift = 863 * offset / 2^datablkshift. 864 * 865 * The number of level 0s in a level n is the number of block 866 * pointers in an indirect block, raised to the power of level. 867 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = 868 * 2^(level(indblkshift - SPA_BLKPTRSHIFT)). 
869* * 870 * Thus, the level n blkid is: offset / 871 * ((2^datablkshift)(2^(level(indblkshift - SPA_BLKPTRSHIFT))) 872 * = offset / 2^(datablkshift + level * 873 * (indblkshift - SPA_BLKPTRSHIFT)) 874 * = offset >> (datablkshift + level * 875 * (indblkshift - SPA_BLKPTRSHIFT)) 876 / 877* return (offset >> (dn->dn_datablkshift + level * 878 (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); 879 } else { 880 ASSERT3U(offset, <, dn->dn_datablksz); 881 return (0); 882 } 883} 884 885static void 886dbuf_read_done(zio_t zio, arc_buf_t buf, void vdb) 887{ 888* dmu_buf_impl_t db = vdb; 889* 890 mutex_enter(&db->db_mtx); 891 ASSERT3U(db->db_state, ==, DB_READ); 892 /* 893 * All reads are synchronous, so we must have a hold on the dbuf 894 / 895* ASSERT(refcount_count(&db->db_holds) > 0); 896 ASSERT(db->db_buf == NULL); 897 ASSERT(db->db.db_data == NULL); 898 if (db->db_level == 0 && db->db_freed_in_flight) { 899 /* we were freed in flight; disregard any error / 900* arc_release(buf, db); 901 bzero(buf->b_data, db->db.db_size); 902 arc_buf_freeze(buf); 903 db->db_freed_in_flight = FALSE; 904 dbuf_set_data(db, buf); 905 db->db_state = DB_CACHED; 906 } else if (zio == NULL \|\| zio->io_error == 0) { 907 dbuf_set_data(db, buf); 908 db->db_state = DB_CACHED; 909 } else { 910 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 911 ASSERT3P(db->db_buf, ==, NULL); 912 arc_buf_destroy(buf, db); 913 db->db_state = DB_UNCACHED; 914 } 915 cv_broadcast(&db->db_changed); 916 dbuf_rele_and_unlock(db, NULL); 917} 918 919static void 920dbuf_read_impl(dmu_buf_impl_t db, zio_t zio, uint32_t flags) 921{ 922 dnode_t dn; 923* zbookmark_phys_t zb; 924 arc_flags_t aflags = ARC_FLAG_NOWAIT; 925 926 DB_DNODE_ENTER(db); 927 dn = DB_DNODE(db); 928 ASSERT(!refcount_is_zero(&db->db_holds)); 929 /* We need the struct_rwlock to prevent db_blkptr from changing. 
/ 930* ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 931 ASSERT(MUTEX_HELD(&db->db_mtx)); 932 ASSERT(db->db_state == DB_UNCACHED); 933 ASSERT(db->db_buf == NULL); 934 935 if (db->db_blkid == DMU_BONUS_BLKID) { 936 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 937 938 ASSERT3U(bonuslen, <=, db->db.db_size); 939 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 940 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 941 if (bonuslen < DN_MAX_BONUSLEN) 942 bzero(db->db.db_data, DN_MAX_BONUSLEN); 943 if (bonuslen) 944 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 945 DB_DNODE_EXIT(db); 946 db->db_state = DB_CACHED; 947 mutex_exit(&db->db_mtx); 948 return; 949 } 950 951 /* 952 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 953 * processes the delete record and clears the bp while we are waiting 954 * for the dn_mtx (resulting in a "no" from block_freed). 955 / 956* if (db->db_blkptr == NULL \|\| BP_IS_HOLE(db->db_blkptr) \|\| 957 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) \|\| 958 BP_IS_HOLE(db->db_blkptr)))) { 959 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 960 961 dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, 962 db->db.db_size, db, type)); 963 bzero(db->db.db_data, db->db.db_size); 964 965 if (db->db_blkptr != NULL && db->db_level > 0 && 966 BP_IS_HOLE(db->db_blkptr) && 967 db->db_blkptr->blk_birth != 0) { 968 blkptr_t bps = db->db.db_data; 969* for (int i = 0; i < ((1 << 970 DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); 971 i++) { 972 blkptr_t bp = &bps[i]; 973* ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 974 1 << dn->dn_indblkshift); 975 BP_SET_LSIZE(bp, 976 BP_GET_LEVEL(db->db_blkptr) == 1 ? 
977 dn->dn_datablksz : 978 BP_GET_LSIZE(db->db_blkptr)); 979 BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); 980 BP_SET_LEVEL(bp, 981 BP_GET_LEVEL(db->db_blkptr) - 1); 982 BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); 983 } 984 } 985 DB_DNODE_EXIT(db); 986 db->db_state = DB_CACHED; 987 mutex_exit(&db->db_mtx); 988 return; 989 } 990 991 DB_DNODE_EXIT(db); 992 993 db->db_state = DB_READ; 994 mutex_exit(&db->db_mtx); 995 996 if (DBUF_IS_L2CACHEABLE(db)) 997 aflags \|= ARC_FLAG_L2CACHE; 998 999 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 1000 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 1001 db->db.db_object, db->db_level, db->db_blkid); 1002 1003 dbuf_add_ref(db, NULL); 1004 1005 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 1006 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 1007 (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 1008 &aflags, &zb); 1009} 1010 1011int 1012dbuf_read(dmu_buf_impl_t db, zio_t zio, uint32_t flags) 1013{ 1014 int err = 0; 1015 boolean_t havepzio = (zio != NULL); 1016 boolean_t prefetch; 1017 dnode_t dn; 1018* 1019 /* 1020 * We don't have to hold the mutex to check db_state because it 1021 * can't be freed while we have a hold on the buffer. 
1022 / 1023* ASSERT(!refcount_is_zero(&db->db_holds)); 1024 1025 if (db->db_state == DB_NOFILL) 1026 return (SET_ERROR(EIO)); 1027 1028 DB_DNODE_ENTER(db); 1029 dn = DB_DNODE(db); 1030 if ((flags & DB_RF_HAVESTRUCT) == 0) 1031 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1032 1033 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1034 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 1035 DBUF_IS_CACHEABLE(db); 1036 1037 mutex_enter(&db->db_mtx); 1038 if (db->db_state == DB_CACHED) { 1039 mutex_exit(&db->db_mtx); 1040 if (prefetch) 1041 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1042 if ((flags & DB_RF_HAVESTRUCT) == 0) 1043 rw_exit(&dn->dn_struct_rwlock); 1044 DB_DNODE_EXIT(db); 1045 } else if (db->db_state == DB_UNCACHED) { 1046 spa_t spa = dn->dn_objset->os_spa; 1047* 1048 if (zio == NULL) 1049 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 1050 dbuf_read_impl(db, zio, flags); 1051 1052 /* dbuf_read_impl has dropped db_mtx for us / 1053* 1054 if (prefetch) 1055 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1056 1057 if ((flags & DB_RF_HAVESTRUCT) == 0) 1058 rw_exit(&dn->dn_struct_rwlock); 1059 DB_DNODE_EXIT(db); 1060 1061 if (!havepzio) 1062 err = zio_wait(zio); 1063 } else { 1064 /* 1065 * Another reader came in while the dbuf was in flight 1066 * between UNCACHED and CACHED. Either a writer will finish 1067 * writing the buffer (sending the dbuf to CACHED) or the 1068 * first reader's request will reach the read_done callback 1069 * and send the dbuf to CACHED. Otherwise, a failure 1070 * occurred and the dbuf went to UNCACHED. 1071 / 1072* mutex_exit(&db->db_mtx); 1073 if (prefetch) 1074 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1075 if ((flags & DB_RF_HAVESTRUCT) == 0) 1076 rw_exit(&dn->dn_struct_rwlock); 1077 DB_DNODE_EXIT(db); 1078 1079 /* Skip the wait per the caller's request. 
/ 1080* mutex_enter(&db->db_mtx); 1081 if ((flags & DB_RF_NEVERWAIT) == 0) { 1082 while (db->db_state == DB_READ \|\| 1083 db->db_state == DB_FILL) { 1084 ASSERT(db->db_state == DB_READ \|\| 1085 (flags & DB_RF_HAVESTRUCT) == 0); 1086 DTRACE_PROBE2(blocked__read, dmu_buf_impl_t , 1087* db, zio_t , zio); 1088* cv_wait(&db->db_changed, &db->db_mtx); 1089 } 1090 if (db->db_state == DB_UNCACHED) 1091 err = SET_ERROR(EIO); 1092 } 1093 mutex_exit(&db->db_mtx); 1094 } 1095 1096 ASSERT(err \|\| havepzio \|\| db->db_state == DB_CACHED); 1097 return (err); 1098} 1099 1100static void 1101dbuf_noread(dmu_buf_impl_t db) 1102{ 1103* ASSERT(!refcount_is_zero(&db->db_holds)); 1104 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1105 mutex_enter(&db->db_mtx); 1106 while (db->db_state == DB_READ \|\| db->db_state == DB_FILL) 1107 cv_wait(&db->db_changed, &db->db_mtx); 1108 if (db->db_state == DB_UNCACHED) { 1109 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1110 spa_t spa = db->db_objset->os_spa; 1111* 1112 ASSERT(db->db_buf == NULL); 1113 ASSERT(db->db.db_data == NULL); 1114 dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); 1115 db->db_state = DB_FILL; 1116 } else if (db->db_state == DB_NOFILL) { 1117 dbuf_clear_data(db); 1118 } else { 1119 ASSERT3U(db->db_state, ==, DB_CACHED); 1120 } 1121 mutex_exit(&db->db_mtx); 1122} 1123 1124/* 1125 * This is our just-in-time copy function. It makes a copy of 1126 * buffers, that have been modified in a previous transaction 1127 * group, before we modify them in the current active group. 1128 * 1129 * This function is used in two places: when we are dirtying a 1130 * buffer for the first time in a txg, and when we are freeing 1131 * a range in a dnode that includes this buffer. 1132 * 1133 * Note that when we are called from dbuf_free_range() we do 1134 * not put a hold on the buffer, we just traverse the active 1135 * dbuf list for the dnode. 
1136 / 1137static void 1138dbuf_fix_old_data(dmu_buf_impl_t db, uint64_t txg) 1139{ 1140 dbuf_dirty_record_t dr = db->db_last_dirty; 1141* 1142 ASSERT(MUTEX_HELD(&db->db_mtx)); 1143 ASSERT(db->db.db_data != NULL); 1144 ASSERT(db->db_level == 0); 1145 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 1146 1147 if (dr == NULL \|\| 1148 (dr->dt.dl.dr_data != 1149 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 1150 return; 1151 1152 /* 1153 * If the last dirty record for this dbuf has not yet synced 1154 * and its referencing the dbuf data, either: 1155 * reset the reference to point to a new copy, 1156 * or (if there a no active holders) 1157 * just null out the current db_data pointer. 1158 / 1159* ASSERT(dr->dr_txg >= txg - 2); 1160 if (db->db_blkid == DMU_BONUS_BLKID) { 1161 /* Note that the data bufs here are zio_bufs / 1162* dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 1163 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1164 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 1165 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1166 int size = db->db.db_size; 1167 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1168 spa_t spa = db->db_objset->os_spa; 1169* 1170 dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); 1171 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 1172 } else { 1173 db->db_buf = NULL; 1174 dbuf_clear_data(db); 1175 } 1176} 1177 1178void 1179dbuf_unoverride(dbuf_dirty_record_t dr) 1180{ 1181* dmu_buf_impl_t db = dr->dr_dbuf; 1182* blkptr_t bp = &dr->dt.dl.dr_overridden_by; 1183* uint64_t txg = dr->dr_txg; 1184 1185 ASSERT(MUTEX_HELD(&db->db_mtx)); 1186 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 1187 ASSERT(db->db_level == 0); 1188 1189 if (db->db_blkid == DMU_BONUS_BLKID \|\| 1190 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 1191 return; 1192 1193 ASSERT(db->db_data_pending != dr); 1194 1195 /* free this block / 1196* if (!BP_IS_HOLE(bp) && 
!dr->dt.dl.dr_nopwrite) 1197 zio_free(db->db_objset->os_spa, txg, bp); 1198 1199 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1200 dr->dt.dl.dr_nopwrite = B_FALSE; 1201 1202 /* 1203 * Release the already-written buffer, so we leave it in 1204 * a consistent dirty state. Note that all callers are 1205 * modifying the buffer, so they will immediately do 1206 * another (redundant) arc_release(). Therefore, leave 1207 * the buf thawed to save the effort of freezing & 1208 * immediately re-thawing it. 1209 / 1210* arc_release(dr->dt.dl.dr_data, db); 1211} 1212 1213/* 1214 * Evict (if its unreferenced) or clear (if its referenced) any level-0 1215 * data blocks in the free range, so that any future readers will find 1216 * empty blocks.
1223 * 1224 * This is a no-op if the dataset is in the middle of an incremental 1225 * receive; see comment below for details.
1226 / 1227void 1228dbuf_free_range(dnode_t dn, uint64_t start_blkid, uint64_t end_blkid, 1229 dmu_tx_t tx) 1230{ 1231* dmu_buf_impl_t db_search; 1232 dmu_buf_impl_t db, db_next; 1233 uint64_t txg = tx->tx_txg; 1234 avl_index_t where;	1217 / 1218void 1219dbuf_free_range(dnode_t dn, uint64_t start_blkid, uint64_t end_blkid, 1220 dmu_tx_t tx) 1221{ 1222* dmu_buf_impl_t db_search; 1223 dmu_buf_impl_t db, db_next; 1224 uint64_t txg = tx->tx_txg; 1225 avl_index_t where;
1235 boolean_t freespill = 1236 (start_blkid == DMU_SPILL_BLKID \|\| end_blkid == DMU_SPILL_BLKID);
1237	1226
1238 if (end_blkid > dn->dn_maxblkid && !freespill)	1227 if (end_blkid > dn->dn_maxblkid && 1228 !(start_blkid == DMU_SPILL_BLKID \|\| end_blkid == DMU_SPILL_BLKID))
1239 end_blkid = dn->dn_maxblkid; 1240 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); 1241 1242 db_search.db_level = 0; 1243 db_search.db_blkid = start_blkid; 1244 db_search.db_state = DB_SEARCH; 1245 1246 mutex_enter(&dn->dn_dbufs_mtx);	1229 end_blkid = dn->dn_maxblkid; 1230 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); 1231 1232 db_search.db_level = 0; 1233 db_search.db_blkid = start_blkid; 1234 db_search.db_state = DB_SEARCH; 1235 1236 mutex_enter(&dn->dn_dbufs_mtx);
1247 if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) { 1248 /* There can't be any dbufs in this range; no need to search. / 1249#ifdef DEBUG 1250* db = avl_find(&dn->dn_dbufs, &db_search, &where); 1251 ASSERT3P(db, ==, NULL); 1252 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 1253 ASSERT(db == NULL \|\| db->db_level > 0); 1254#endif 1255 mutex_exit(&dn->dn_dbufs_mtx); 1256 return; 1257 } else if (dmu_objset_is_receiving(dn->dn_objset)) { 1258 /* 1259 * If we are receiving, we expect there to be no dbufs in 1260 * the range to be freed, because receive modifies each 1261 * block at most once, and in offset order. If this is 1262 * not the case, it can lead to performance problems, 1263 * so note that we unexpectedly took the slow path. 1264 / 1265* atomic_inc_64(&zfs_free_range_recv_miss); 1266 } 1267
1268 db = avl_find(&dn->dn_dbufs, &db_search, &where); 1269 ASSERT3P(db, ==, NULL);	1237 db = avl_find(&dn->dn_dbufs, &db_search, &where); 1238 ASSERT3P(db, ==, NULL);
	1239
1270 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 1271 1272 for (; db != NULL; db = db_next) { 1273 db_next = AVL_NEXT(&dn->dn_dbufs, db); 1274 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1275 1276 if (db->db_level != 0 \|\| db->db_blkid > end_blkid) { 1277 break; 1278 } 1279 ASSERT3U(db->db_blkid, >=, start_blkid); 1280 1281 /* found a level 0 buffer in the range / 1282* mutex_enter(&db->db_mtx); 1283 if (dbuf_undirty(db, tx)) { 1284 /* mutex has been dropped and dbuf destroyed / 1285* continue; 1286 } 1287 1288 if (db->db_state == DB_UNCACHED \|\| 1289 db->db_state == DB_NOFILL \|\| 1290 db->db_state == DB_EVICTING) { 1291 ASSERT(db->db.db_data == NULL); 1292 mutex_exit(&db->db_mtx); 1293 continue; 1294 } 1295 if (db->db_state == DB_READ \|\| db->db_state == DB_FILL) { 1296 /* will be handled in dbuf_read_done or dbuf_rele / 1297* db->db_freed_in_flight = TRUE; 1298 mutex_exit(&db->db_mtx); 1299 continue; 1300 } 1301 if (refcount_count(&db->db_holds) == 0) { 1302 ASSERT(db->db_buf); 1303 dbuf_destroy(db); 1304 continue; 1305 } 1306 /* The dbuf is referenced / 1307* 1308 if (db->db_last_dirty != NULL) { 1309 dbuf_dirty_record_t dr = db->db_last_dirty; 1310* 1311 if (dr->dr_txg == txg) { 1312 /* 1313 * This buffer is "in-use", re-adjust the file 1314 * size to reflect that this buffer may 1315 * contain new data when we sync. 1316 / 1317* if (db->db_blkid != DMU_SPILL_BLKID && 1318 db->db_blkid > dn->dn_maxblkid) 1319 dn->dn_maxblkid = db->db_blkid; 1320 dbuf_unoverride(dr); 1321 } else { 1322 /* 1323 * This dbuf is not dirty in the open context. 1324 * Either uncache it (if its not referenced in 1325 * the open context) or reset its contents to 1326 * empty. 
1327 / 1328* dbuf_fix_old_data(db, txg); 1329 } 1330 } 1331 /* clear the contents if its cached / 1332* if (db->db_state == DB_CACHED) { 1333 ASSERT(db->db.db_data != NULL); 1334 arc_release(db->db_buf, db); 1335 bzero(db->db.db_data, db->db.db_size); 1336 arc_buf_freeze(db->db_buf); 1337 } 1338 1339 mutex_exit(&db->db_mtx); 1340 } 1341 mutex_exit(&dn->dn_dbufs_mtx); 1342} 1343 1344static int 1345dbuf_block_freeable(dmu_buf_impl_t db) 1346{ 1347* dsl_dataset_t ds = db->db_objset->os_dsl_dataset; 1348* uint64_t birth_txg = 0; 1349 1350 /* 1351 * We don't need any locking to protect db_blkptr: 1352 * If it's syncing, then db_last_dirty will be set 1353 * so we'll ignore db_blkptr. 1354 * 1355 * This logic ensures that only block births for 1356 * filled blocks are considered. 1357 / 1358* ASSERT(MUTEX_HELD(&db->db_mtx)); 1359 if (db->db_last_dirty && (db->db_blkptr == NULL \|\| 1360 !BP_IS_HOLE(db->db_blkptr))) { 1361 birth_txg = db->db_last_dirty->dr_txg; 1362 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { 1363 birth_txg = db->db_blkptr->blk_birth; 1364 } 1365 1366 /* 1367 * If this block don't exist or is in a snapshot, it can't be freed. 1368 * Don't pass the bp to dsl_dataset_block_freeable() since we 1369 * are holding the db_mtx lock and might deadlock if we are 1370 * prefetching a dedup-ed block. 1371 / 1372* if (birth_txg != 0) 1373 return (ds == NULL \|\| 1374 dsl_dataset_block_freeable(ds, NULL, birth_txg)); 1375 else 1376 return (B_FALSE); 1377} 1378 1379void 1380dbuf_new_size(dmu_buf_impl_t db, int size, dmu_tx_t tx) 1381{ 1382 arc_buf_t buf, obuf; 1383 int osize = db->db.db_size; 1384 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1385 dnode_t dn; 1386* 1387 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1388 1389 DB_DNODE_ENTER(db); 1390 dn = DB_DNODE(db); 1391 1392 /* XXX does this func really need the lock? 
/ 1393* ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1394 1395 /* 1396 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held 1397 * is OK, because there can be no other references to the db 1398 * when we are changing its size, so no concurrent DB_FILL can 1399 * be happening. 1400 / 1401* /* 1402 * XXX we should be doing a dbuf_read, checking the return 1403 * value and returning that up to our callers 1404 / 1405* dmu_buf_will_dirty(&db->db, tx); 1406 1407 /* create the data buffer for the new block / 1408* buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); 1409 1410 /* copy old block data to the new block / 1411* obuf = db->db_buf; 1412 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 1413 /* zero the remainder / 1414* if (size > osize) 1415 bzero((uint8_t )buf->b_data + osize, size - osize); 1416* 1417 mutex_enter(&db->db_mtx); 1418 dbuf_set_data(db, buf); 1419 arc_buf_destroy(obuf, db); 1420 db->db.db_size = size; 1421 1422 if (db->db_level == 0) { 1423 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1424 db->db_last_dirty->dt.dl.dr_data = buf; 1425 } 1426 mutex_exit(&db->db_mtx); 1427 1428 dnode_willuse_space(dn, size-osize, tx); 1429 DB_DNODE_EXIT(db); 1430} 1431 1432void 1433dbuf_release_bp(dmu_buf_impl_t db) 1434{ 1435* objset_t os = db->db_objset; 1436* 1437 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 1438 ASSERT(arc_released(os->os_phys_buf) \|\| 1439 list_link_active(&os->os_dsl_dataset->ds_synced_link)); 1440 ASSERT(db->db_parent == NULL \|\| arc_released(db->db_parent->db_buf)); 1441 1442 (void) arc_release(db->db_buf, db); 1443} 1444 1445/* 1446 * We already have a dirty record for this TXG, and we are being 1447 * dirtied again. 
1448 / 1449static void 1450dbuf_redirty(dbuf_dirty_record_t dr) 1451{ 1452 dmu_buf_impl_t db = dr->dr_dbuf; 1453* 1454 ASSERT(MUTEX_HELD(&db->db_mtx)); 1455 1456 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1457 /* 1458 * If this buffer has already been written out, 1459 * we now need to reset its state. 1460 / 1461* dbuf_unoverride(dr); 1462 if (db->db.db_object != DMU_META_DNODE_OBJECT && 1463 db->db_state != DB_NOFILL) { 1464 /* Already released on initial dirty, so just thaw. / 1465* ASSERT(arc_released(db->db_buf)); 1466 arc_buf_thaw(db->db_buf); 1467 } 1468 } 1469} 1470 1471dbuf_dirty_record_t * 1472dbuf_dirty(dmu_buf_impl_t db, dmu_tx_t tx) 1473{ 1474 dnode_t dn; 1475* objset_t os; 1476* dbuf_dirty_record_t *drp, dr; 1477 int drop_struct_lock = FALSE; 1478 boolean_t do_free_accounting = B_FALSE; 1479 int txgoff = tx->tx_txg & TXG_MASK; 1480 1481 ASSERT(tx->tx_txg != 0); 1482 ASSERT(!refcount_is_zero(&db->db_holds)); 1483 DMU_TX_DIRTY_BUF(tx, db); 1484 1485 DB_DNODE_ENTER(db); 1486 dn = DB_DNODE(db); 1487 /* 1488 * Shouldn't dirty a regular buffer in syncing context. Private 1489 * objects may be dirtied in syncing context, but only if they 1490 * were already pre-dirtied in open context. 1491 / 1492#ifdef DEBUG 1493* if (dn->dn_objset->os_dsl_dataset != NULL) { 1494 rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1495 RW_READER, FTAG); 1496 } 1497 ASSERT(!dmu_tx_is_syncing(tx) \|\| 1498 BP_IS_HOLE(dn->dn_objset->os_rootbp) \|\| 1499 DMU_OBJECT_IS_SPECIAL(dn->dn_object) \|\| 1500 dn->dn_objset->os_dsl_dataset == NULL); 1501 if (dn->dn_objset->os_dsl_dataset != NULL) 1502 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); 1503#endif 1504 /* 1505 * We make this assert for private objects as well, but after we 1506 * check if we're already dirty. They are allowed to re-dirty 1507 * in syncing context. 
1508 / 1509* ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT \|\| 1510 dn->dn_dirtyctx == DN_UNDIRTIED \|\| dn->dn_dirtyctx == 1511 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1512 1513 mutex_enter(&db->db_mtx); 1514 /* 1515 * XXX make this true for indirects too? The problem is that 1516 * transactions created with dmu_tx_create_assigned() from 1517 * syncing context don't bother holding ahead. 1518 / 1519* ASSERT(db->db_level != 0 \|\| 1520 db->db_state == DB_CACHED \|\| db->db_state == DB_FILL \|\| 1521 db->db_state == DB_NOFILL); 1522 1523 mutex_enter(&dn->dn_mtx); 1524 /* 1525 * Don't set dirtyctx to SYNC if we're just modifying this as we 1526 * initialize the objset. 1527 / 1528* if (dn->dn_dirtyctx == DN_UNDIRTIED) { 1529 if (dn->dn_objset->os_dsl_dataset != NULL) { 1530 rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1531 RW_READER, FTAG); 1532 } 1533 if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1534 dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? 1535 DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1536 ASSERT(dn->dn_dirtyctx_firstset == NULL); 1537 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1538 } 1539 if (dn->dn_objset->os_dsl_dataset != NULL) { 1540 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1541 FTAG); 1542 } 1543 } 1544 mutex_exit(&dn->dn_mtx); 1545 1546 if (db->db_blkid == DMU_SPILL_BLKID) 1547 dn->dn_have_spill = B_TRUE; 1548 1549 /* 1550 * If this buffer is already dirty, we're done. 1551 / 1552* drp = &db->db_last_dirty; 1553 ASSERT(drp == NULL \|\| (drp)->dr_txg <= tx->tx_txg \|\| 1554 db->db.db_object == DMU_META_DNODE_OBJECT); 1555 while ((dr = drp) != NULL && dr->dr_txg > tx->tx_txg) 1556* drp = &dr->dr_next; 1557 if (dr && dr->dr_txg == tx->tx_txg) { 1558 DB_DNODE_EXIT(db); 1559 1560 dbuf_redirty(dr); 1561 mutex_exit(&db->db_mtx); 1562 return (dr); 1563 } 1564 1565 /* 1566 * Only valid if not already dirty. 
1567 / 1568* ASSERT(dn->dn_object == 0 \|\| 1569 dn->dn_dirtyctx == DN_UNDIRTIED \|\| dn->dn_dirtyctx == 1570 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1571 1572 ASSERT3U(dn->dn_nlevels, >, db->db_level); 1573 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) \|\| 1574 dn->dn_phys->dn_nlevels > db->db_level \|\| 1575 dn->dn_next_nlevels[txgoff] > db->db_level \|\| 1576 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level \|\| 1577 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1578 1579 /* 1580 * We should only be dirtying in syncing context if it's the 1581 * mos or we're initializing the os or it's a special object. 1582 * However, we are allowed to dirty in syncing context provided 1583 * we already dirtied it in open context. Hence we must make 1584 * this assertion only if we're not already dirty. 1585 / 1586* os = dn->dn_objset; 1587#ifdef DEBUG 1588 if (dn->dn_objset->os_dsl_dataset != NULL) 1589 rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); 1590 ASSERT(!dmu_tx_is_syncing(tx) \|\| DMU_OBJECT_IS_SPECIAL(dn->dn_object) \|\| 1591 os->os_dsl_dataset == NULL \|\| BP_IS_HOLE(os->os_rootbp)); 1592 if (dn->dn_objset->os_dsl_dataset != NULL) 1593 rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); 1594#endif 1595 ASSERT(db->db.db_size != 0); 1596 1597 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1598 1599 if (db->db_blkid != DMU_BONUS_BLKID) { 1600 /* 1601 * Update the accounting. 1602 * Note: we delay "free accounting" until after we drop 1603 * the db_mtx. This keeps us from grabbing other locks 1604 * (and possibly deadlocking) in bp_get_dsize() while 1605 * also holding the db_mtx. 
1606 / 1607* dnode_willuse_space(dn, db->db.db_size, tx); 1608 do_free_accounting = dbuf_block_freeable(db); 1609 } 1610 1611 /* 1612 * If this buffer is dirty in an old transaction group we need 1613 * to make a copy of it so that the changes we make in this 1614 * transaction group won't leak out when we sync the older txg. 1615 / 1616* dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1617 if (db->db_level == 0) { 1618 void data_old = db->db_buf; 1619* 1620 if (db->db_state != DB_NOFILL) { 1621 if (db->db_blkid == DMU_BONUS_BLKID) { 1622 dbuf_fix_old_data(db, tx->tx_txg); 1623 data_old = db->db.db_data; 1624 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1625 /* 1626 * Release the data buffer from the cache so 1627 * that we can modify it without impacting 1628 * possible other users of this cached data 1629 * block. Note that indirect blocks and 1630 * private objects are not released until the 1631 * syncing state (since they are only modified 1632 * then). 1633 / 1634* arc_release(db->db_buf, db); 1635 dbuf_fix_old_data(db, tx->tx_txg); 1636 data_old = db->db_buf; 1637 } 1638 ASSERT(data_old != NULL); 1639 } 1640 dr->dt.dl.dr_data = data_old; 1641 } else { 1642 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1643 list_create(&dr->dt.di.dr_children, 1644 sizeof (dbuf_dirty_record_t), 1645 offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1646 } 1647 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1648 dr->dr_accounted = db->db.db_size; 1649 dr->dr_dbuf = db; 1650 dr->dr_txg = tx->tx_txg; 1651 dr->dr_next = drp; 1652* drp = dr; 1653* 1654 /* 1655 * We could have been freed_in_flight between the dbuf_noread 1656 * and dbuf_dirty. We win, as though the dbuf_noread() had 1657 * happened after the free. 
1658 / 1659* if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1660 db->db_blkid != DMU_SPILL_BLKID) { 1661 mutex_enter(&dn->dn_mtx); 1662 if (dn->dn_free_ranges[txgoff] != NULL) { 1663 range_tree_clear(dn->dn_free_ranges[txgoff], 1664 db->db_blkid, 1); 1665 } 1666 mutex_exit(&dn->dn_mtx); 1667 db->db_freed_in_flight = FALSE; 1668 } 1669 1670 /* 1671 * This buffer is now part of this txg 1672 / 1673* dbuf_add_ref(db, (void )(uintptr_t)tx->tx_txg); 1674* db->db_dirtycnt += 1; 1675 ASSERT3U(db->db_dirtycnt, <=, 3); 1676 1677 mutex_exit(&db->db_mtx); 1678 1679 if (db->db_blkid == DMU_BONUS_BLKID \|\| 1680 db->db_blkid == DMU_SPILL_BLKID) { 1681 mutex_enter(&dn->dn_mtx); 1682 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1683 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1684 mutex_exit(&dn->dn_mtx); 1685 dnode_setdirty(dn, tx); 1686 DB_DNODE_EXIT(db); 1687 return (dr); 1688 } 1689 1690 /* 1691 * The dn_struct_rwlock prevents db_blkptr from changing 1692 * due to a write from syncing context completing 1693 * while we are running, so we want to acquire it before 1694 * looking at db_blkptr. 1695 / 1696* if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1697 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1698 drop_struct_lock = TRUE; 1699 } 1700 1701 if (do_free_accounting) { 1702 blkptr_t bp = db->db_blkptr; 1703* int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1704 bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1705 /* 1706 * This is only a guess -- if the dbuf is dirty 1707 * in a previous txg, we don't know how much 1708 * space it will use on disk yet. We should 1709 * really have the struct_rwlock to access 1710 * db_blkptr, but since this is just a guess, 1711 * it's OK if we get an odd answer. 
1712 / 1713* ddt_prefetch(os->os_spa, bp); 1714 dnode_willuse_space(dn, -willfree, tx); 1715 } 1716 1717 if (db->db_level == 0) { 1718 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1719 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1720 } 1721 1722 if (db->db_level+1 < dn->dn_nlevels) { 1723 dmu_buf_impl_t parent = db->db_parent; 1724* dbuf_dirty_record_t di; 1725* int parent_held = FALSE; 1726 1727 if (db->db_parent == NULL \|\| db->db_parent == dn->dn_dbuf) { 1728 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1729 1730 parent = dbuf_hold_level(dn, db->db_level+1, 1731 db->db_blkid >> epbs, FTAG); 1732 ASSERT(parent != NULL); 1733 parent_held = TRUE; 1734 } 1735 if (drop_struct_lock) 1736 rw_exit(&dn->dn_struct_rwlock); 1737 ASSERT3U(db->db_level+1, ==, parent->db_level); 1738 di = dbuf_dirty(parent, tx); 1739 if (parent_held) 1740 dbuf_rele(parent, FTAG); 1741 1742 mutex_enter(&db->db_mtx); 1743 /* 1744 * Since we've dropped the mutex, it's possible that 1745 * dbuf_undirty() might have changed this out from under us. 
1746 / 1747* if (db->db_last_dirty == dr \|\| 1748 dn->dn_object == DMU_META_DNODE_OBJECT) { 1749 mutex_enter(&di->dt.di.dr_mtx); 1750 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1751 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1752 list_insert_tail(&di->dt.di.dr_children, dr); 1753 mutex_exit(&di->dt.di.dr_mtx); 1754 dr->dr_parent = di; 1755 } 1756 mutex_exit(&db->db_mtx); 1757 } else { 1758 ASSERT(db->db_level+1 == dn->dn_nlevels); 1759 ASSERT(db->db_blkid < dn->dn_nblkptr); 1760 ASSERT(db->db_parent == NULL \|\| db->db_parent == dn->dn_dbuf); 1761 mutex_enter(&dn->dn_mtx); 1762 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1763 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1764 mutex_exit(&dn->dn_mtx); 1765 if (drop_struct_lock) 1766 rw_exit(&dn->dn_struct_rwlock); 1767 } 1768 1769 dnode_setdirty(dn, tx); 1770 DB_DNODE_EXIT(db); 1771 return (dr); 1772} 1773 1774/* 1775 * Undirty a buffer in the transaction group referenced by the given 1776 * transaction. Return whether this evicted the dbuf. 1777 / 1778static boolean_t 1779dbuf_undirty(dmu_buf_impl_t db, dmu_tx_t tx) 1780{ 1781* dnode_t dn; 1782* uint64_t txg = tx->tx_txg; 1783 dbuf_dirty_record_t dr, drp; 1784* 1785 ASSERT(txg != 0); 1786 1787 /* 1788 * Due to our use of dn_nlevels below, this can only be called 1789 * in open context, unless we are operating on the MOS. 1790 * From syncing context, dn_nlevels may be different from the 1791 * dn_nlevels used when dbuf was dirtied. 1792 / 1793* ASSERT(db->db_objset == 1794 dmu_objset_pool(db->db_objset)->dp_meta_objset \|\| 1795 txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); 1796 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1797 ASSERT0(db->db_level); 1798 ASSERT(MUTEX_HELD(&db->db_mtx)); 1799 1800 /* 1801 * If this buffer is not dirty, we're done. 
1802 / 1803* for (drp = &db->db_last_dirty; (dr = drp) != NULL; drp = &dr->dr_next) 1804* if (dr->dr_txg <= txg) 1805 break; 1806 if (dr == NULL \|\| dr->dr_txg < txg) 1807 return (B_FALSE); 1808 ASSERT(dr->dr_txg == txg); 1809 ASSERT(dr->dr_dbuf == db); 1810 1811 DB_DNODE_ENTER(db); 1812 dn = DB_DNODE(db); 1813 1814 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1815 1816 ASSERT(db->db.db_size != 0); 1817 1818 dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), 1819 dr->dr_accounted, txg); 1820 1821 drp = dr->dr_next; 1822* 1823 /* 1824 * Note that there are three places in dbuf_dirty() 1825 * where this dirty record may be put on a list. 1826 * Make sure to do a list_remove corresponding to 1827 * every one of those list_insert calls. 1828 / 1829* if (dr->dr_parent) { 1830 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1831 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1832 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1833 } else if (db->db_blkid == DMU_SPILL_BLKID \|\| 1834 db->db_level + 1 == dn->dn_nlevels) { 1835 ASSERT(db->db_blkptr == NULL \|\| db->db_parent == dn->dn_dbuf); 1836 mutex_enter(&dn->dn_mtx); 1837 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1838 mutex_exit(&dn->dn_mtx); 1839 } 1840 DB_DNODE_EXIT(db); 1841 1842 if (db->db_state != DB_NOFILL) { 1843 dbuf_unoverride(dr); 1844 1845 ASSERT(db->db_buf != NULL); 1846 ASSERT(dr->dt.dl.dr_data != NULL); 1847 if (dr->dt.dl.dr_data != db->db_buf) 1848 arc_buf_destroy(dr->dt.dl.dr_data, db); 1849 } 1850 1851 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1852 1853 ASSERT(db->db_dirtycnt > 0); 1854 db->db_dirtycnt -= 1; 1855 1856 if (refcount_remove(&db->db_holds, (void )(uintptr_t)txg) == 0) { 1857* ASSERT(db->db_state == DB_NOFILL \|\| arc_released(db->db_buf)); 1858 dbuf_destroy(db); 1859 return (B_TRUE); 1860 } 1861 1862 return (B_FALSE); 1863} 1864 1865void 1866dmu_buf_will_dirty(dmu_buf_t db_fake, dmu_tx_t tx) 1867{ 1868 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 1869 
int rf = DB_RF_MUST_SUCCEED \| DB_RF_NOPREFETCH; 1870 1871 ASSERT(tx->tx_txg != 0); 1872 ASSERT(!refcount_is_zero(&db->db_holds)); 1873 1874 /* 1875 * Quick check for dirtyness. For already dirty blocks, this 1876 * reduces runtime of this function by >90%, and overall performance 1877 * by 50% for some workloads (e.g. file deletion with indirect blocks 1878 * cached). 1879 / 1880* mutex_enter(&db->db_mtx); 1881 dbuf_dirty_record_t dr; 1882* for (dr = db->db_last_dirty; 1883 dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { 1884 /* 1885 * It's possible that it is already dirty but not cached, 1886 * because there are some calls to dbuf_dirty() that don't 1887 * go through dmu_buf_will_dirty(). 1888 / 1889* if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { 1890 /* This dbuf is already dirty and cached. / 1891* dbuf_redirty(dr); 1892 mutex_exit(&db->db_mtx); 1893 return; 1894 } 1895 } 1896 mutex_exit(&db->db_mtx); 1897 1898 DB_DNODE_ENTER(db); 1899 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1900 rf \|= DB_RF_HAVESTRUCT; 1901 DB_DNODE_EXIT(db); 1902 (void) dbuf_read(db, NULL, rf); 1903 (void) dbuf_dirty(db, tx); 1904} 1905 1906void 1907dmu_buf_will_not_fill(dmu_buf_t db_fake, dmu_tx_t tx) 1908{ 1909 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 1910 1911 db->db_state = DB_NOFILL; 1912 1913 dmu_buf_will_fill(db_fake, tx); 1914} 1915 1916void 1917dmu_buf_will_fill(dmu_buf_t db_fake, dmu_tx_t tx) 1918{ 1919 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 1920 1921 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1922 ASSERT(tx->tx_txg != 0); 1923 ASSERT(db->db_level == 0); 1924 ASSERT(!refcount_is_zero(&db->db_holds)); 1925 1926 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT \|\| 1927 dmu_tx_private_ok(tx)); 1928 1929 dbuf_noread(db); 1930 (void) dbuf_dirty(db, tx); 1931} 1932 1933#pragma weak dmu_buf_fill_done = dbuf_fill_done 1934/* ARGSUSED / 1935void 1936dbuf_fill_done(dmu_buf_impl_t db, dmu_tx_t tx) 1937{ 1938* mutex_enter(&db->db_mtx); 1939 
DBUF_VERIFY(db); 1940 1941 if (db->db_state == DB_FILL) { 1942 if (db->db_level == 0 && db->db_freed_in_flight) { 1943 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1944 /* we were freed while filling / 1945* /* XXX dbuf_undirty? / 1946* bzero(db->db.db_data, db->db.db_size); 1947 db->db_freed_in_flight = FALSE; 1948 } 1949 db->db_state = DB_CACHED; 1950 cv_broadcast(&db->db_changed); 1951 } 1952 mutex_exit(&db->db_mtx); 1953} 1954 1955void 1956dmu_buf_write_embedded(dmu_buf_t dbuf, void data, 1957 bp_embedded_type_t etype, enum zio_compress comp, 1958 int uncompressed_size, int compressed_size, int byteorder, 1959 dmu_tx_t tx) 1960{ 1961* dmu_buf_impl_t db = (dmu_buf_impl_t )dbuf; 1962 struct dirty_leaf dl; 1963* dmu_object_type_t type; 1964 1965 if (etype == BP_EMBEDDED_TYPE_DATA) { 1966 ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), 1967 SPA_FEATURE_EMBEDDED_DATA)); 1968 } 1969 1970 DB_DNODE_ENTER(db); 1971 type = DB_DNODE(db)->dn_type; 1972 DB_DNODE_EXIT(db); 1973 1974 ASSERT0(db->db_level); 1975 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1976 1977 dmu_buf_will_not_fill(dbuf, tx); 1978 1979 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1980 dl = &db->db_last_dirty->dt.dl; 1981 encode_embedded_bp_compressed(&dl->dr_overridden_by, 1982 data, comp, uncompressed_size, compressed_size); 1983 BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 1984 BP_SET_TYPE(&dl->dr_overridden_by, type); 1985 BP_SET_LEVEL(&dl->dr_overridden_by, 0); 1986 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 1987 1988 dl->dr_override_state = DR_OVERRIDDEN; 1989 dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 1990} 1991 1992/* 1993 * Directly assign a provided arc buf to a given dbuf if it's not referenced 1994 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
1995 / 1996void 1997dbuf_assign_arcbuf(dmu_buf_impl_t db, arc_buf_t buf, dmu_tx_t tx) 1998{ 1999 ASSERT(!refcount_is_zero(&db->db_holds)); 2000 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2001 ASSERT(db->db_level == 0); 2002 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 2003 ASSERT(buf != NULL); 2004 ASSERT(arc_buf_size(buf) == db->db.db_size); 2005 ASSERT(tx->tx_txg != 0); 2006 2007 arc_return_buf(buf, db); 2008 ASSERT(arc_released(buf)); 2009 2010 mutex_enter(&db->db_mtx); 2011 2012 while (db->db_state == DB_READ \|\| db->db_state == DB_FILL) 2013 cv_wait(&db->db_changed, &db->db_mtx); 2014 2015 ASSERT(db->db_state == DB_CACHED \|\| db->db_state == DB_UNCACHED); 2016 2017 if (db->db_state == DB_CACHED && 2018 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 2019 mutex_exit(&db->db_mtx); 2020 (void) dbuf_dirty(db, tx); 2021 bcopy(buf->b_data, db->db.db_data, db->db.db_size); 2022 arc_buf_destroy(buf, db); 2023 xuio_stat_wbuf_copied(); 2024 return; 2025 } 2026 2027 xuio_stat_wbuf_nocopy(); 2028 if (db->db_state == DB_CACHED) { 2029 dbuf_dirty_record_t dr = db->db_last_dirty; 2030* 2031 ASSERT(db->db_buf != NULL); 2032 if (dr != NULL && dr->dr_txg == tx->tx_txg) { 2033 ASSERT(dr->dt.dl.dr_data == db->db_buf); 2034 if (!arc_released(db->db_buf)) { 2035 ASSERT(dr->dt.dl.dr_override_state == 2036 DR_OVERRIDDEN); 2037 arc_release(db->db_buf, db); 2038 } 2039 dr->dt.dl.dr_data = buf; 2040 arc_buf_destroy(db->db_buf, db); 2041 } else if (dr == NULL \|\| dr->dt.dl.dr_data != db->db_buf) { 2042 arc_release(db->db_buf, db); 2043 arc_buf_destroy(db->db_buf, db); 2044 } 2045 db->db_buf = NULL; 2046 } 2047 ASSERT(db->db_buf == NULL); 2048 dbuf_set_data(db, buf); 2049 db->db_state = DB_FILL; 2050 mutex_exit(&db->db_mtx); 2051 (void) dbuf_dirty(db, tx); 2052 dmu_buf_fill_done(&db->db, tx); 2053} 2054 2055void 2056dbuf_destroy(dmu_buf_impl_t db) 2057{ 2058* dnode_t dn; 2059* dmu_buf_impl_t parent = db->db_parent; 2060* dmu_buf_impl_t dndb; 2061* 2062 
ASSERT(MUTEX_HELD(&db->db_mtx)); 2063 ASSERT(refcount_is_zero(&db->db_holds)); 2064 2065 if (db->db_buf != NULL) { 2066 arc_buf_destroy(db->db_buf, db); 2067 db->db_buf = NULL; 2068 } 2069 2070 if (db->db_blkid == DMU_BONUS_BLKID) { 2071 ASSERT(db->db.db_data != NULL); 2072 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 2073 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2074 db->db_state = DB_UNCACHED; 2075 } 2076 2077 dbuf_clear_data(db); 2078 2079 if (multilist_link_active(&db->db_cache_link)) { 2080 multilist_remove(&dbuf_cache, db); 2081 (void) refcount_remove_many(&dbuf_cache_size, 2082 db->db.db_size, db); 2083 } 2084 2085 ASSERT(db->db_state == DB_UNCACHED \|\| db->db_state == DB_NOFILL); 2086 ASSERT(db->db_data_pending == NULL); 2087 2088 db->db_state = DB_EVICTING; 2089 db->db_blkptr = NULL; 2090 2091 /* 2092 * Now that db_state is DB_EVICTING, nobody else can find this via 2093 * the hash table. We can now drop db_mtx, which allows us to 2094 * acquire the dn_dbufs_mtx. 2095 / 2096* mutex_exit(&db->db_mtx); 2097 2098 DB_DNODE_ENTER(db); 2099 dn = DB_DNODE(db); 2100 dndb = dn->dn_dbuf; 2101 if (db->db_blkid != DMU_BONUS_BLKID) { 2102 boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); 2103 if (needlock) 2104 mutex_enter(&dn->dn_dbufs_mtx); 2105 avl_remove(&dn->dn_dbufs, db); 2106 atomic_dec_32(&dn->dn_dbufs_count); 2107 membar_producer(); 2108 DB_DNODE_EXIT(db); 2109 if (needlock) 2110 mutex_exit(&dn->dn_dbufs_mtx); 2111 /* 2112 * Decrementing the dbuf count means that the hold corresponding 2113 * to the removed dbuf is no longer discounted in dnode_move(), 2114 * so the dnode cannot be moved until after we release the hold. 2115 * The membar_producer() ensures visibility of the decremented 2116 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 2117 * release any lock. 
2118 / 2119* dnode_rele(dn, db); 2120 db->db_dnode_handle = NULL; 2121 2122 dbuf_hash_remove(db); 2123 } else { 2124 DB_DNODE_EXIT(db); 2125 } 2126 2127 ASSERT(refcount_is_zero(&db->db_holds)); 2128 2129 db->db_parent = NULL; 2130 2131 ASSERT(db->db_buf == NULL); 2132 ASSERT(db->db.db_data == NULL); 2133 ASSERT(db->db_hash_next == NULL); 2134 ASSERT(db->db_blkptr == NULL); 2135 ASSERT(db->db_data_pending == NULL); 2136 ASSERT(!multilist_link_active(&db->db_cache_link)); 2137 2138 kmem_cache_free(dbuf_kmem_cache, db); 2139 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2140 2141 /* 2142 * If this dbuf is referenced from an indirect dbuf, 2143 * decrement the ref count on the indirect dbuf. 2144 / 2145* if (parent && parent != dndb) 2146 dbuf_rele(parent, db); 2147} 2148 2149/* 2150 * Note: While bpp will always be updated if the function returns success, 2151 * parentp will not be updated if the dnode does not have dn_dbuf filled in; 2152 * this happens when the dnode is the meta-dnode, or a userused or groupused 2153 * object. 
2154 / 2155static int 2156dbuf_findbp(dnode_t dn, int level, uint64_t blkid, int fail_sparse, 2157 dmu_buf_impl_t parentp, blkptr_t bpp) 2158{ 2159 int nlevels, epbs; 2160 2161 parentp = NULL; 2162* bpp = NULL; 2163* 2164 ASSERT(blkid != DMU_BONUS_BLKID); 2165 2166 if (blkid == DMU_SPILL_BLKID) { 2167 mutex_enter(&dn->dn_mtx); 2168 if (dn->dn_have_spill && 2169 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 2170 bpp = &dn->dn_phys->dn_spill; 2171* else 2172 bpp = NULL; 2173* dbuf_add_ref(dn->dn_dbuf, NULL); 2174 parentp = dn->dn_dbuf; 2175* mutex_exit(&dn->dn_mtx); 2176 return (0); 2177 } 2178 2179 if (dn->dn_phys->dn_nlevels == 0) 2180 nlevels = 1; 2181 else 2182 nlevels = dn->dn_phys->dn_nlevels; 2183 2184 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 2185 2186 ASSERT3U(level * epbs, <, 64); 2187 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2188 if (level >= nlevels \|\| 2189 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 2190 /* the buffer has no parent yet / 2191* return (SET_ERROR(ENOENT)); 2192 } else if (level < nlevels-1) { 2193 /* this block is referenced from an indirect block / 2194* int err = dbuf_hold_impl(dn, level+1, 2195 blkid >> epbs, fail_sparse, FALSE, NULL, parentp); 2196 if (err) 2197 return (err); 2198 err = dbuf_read(parentp, NULL, 2199* (DB_RF_HAVESTRUCT \| DB_RF_NOPREFETCH \| DB_RF_CANFAIL)); 2200 if (err) { 2201 dbuf_rele(parentp, NULL); 2202* parentp = NULL; 2203* return (err); 2204 } 2205 bpp = ((blkptr_t )(parentp)->db.db_data) + 2206* (blkid & ((1ULL << epbs) - 1)); 2207 return (0); 2208 } else { 2209 /* the block is referenced from the dnode / 2210* ASSERT3U(level, ==, nlevels-1); 2211 ASSERT(dn->dn_phys->dn_nblkptr == 0 \|\| 2212 blkid < dn->dn_phys->dn_nblkptr); 2213 if (dn->dn_dbuf) { 2214 dbuf_add_ref(dn->dn_dbuf, NULL); 2215 parentp = dn->dn_dbuf; 2216* } 2217 bpp = &dn->dn_phys->dn_blkptr[blkid]; 2218* return (0); 2219 } 2220} 2221 2222static dmu_buf_impl_t * 2223dbuf_create(dnode_t dn, uint8_t level, uint64_t 
blkid, 2224* dmu_buf_impl_t parent, blkptr_t blkptr) 2225{ 2226 objset_t os = dn->dn_objset; 2227* dmu_buf_impl_t db, odb; 2228 2229 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2230 ASSERT(dn->dn_type != DMU_OT_NONE); 2231 2232 db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); 2233 2234 db->db_objset = os; 2235 db->db.db_object = dn->dn_object; 2236 db->db_level = level; 2237 db->db_blkid = blkid; 2238 db->db_last_dirty = NULL; 2239 db->db_dirtycnt = 0; 2240 db->db_dnode_handle = dn->dn_handle; 2241 db->db_parent = parent; 2242 db->db_blkptr = blkptr; 2243 2244 db->db_user = NULL; 2245 db->db_user_immediate_evict = FALSE; 2246 db->db_freed_in_flight = FALSE; 2247 db->db_pending_evict = FALSE; 2248 2249 if (blkid == DMU_BONUS_BLKID) { 2250 ASSERT3P(parent, ==, dn->dn_dbuf); 2251 db->db.db_size = DN_MAX_BONUSLEN - 2252 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 2253 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 2254 db->db.db_offset = DMU_BONUS_BLKID; 2255 db->db_state = DB_UNCACHED; 2256 /* the bonus dbuf is not placed in the hash table / 2257* arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2258 return (db); 2259 } else if (blkid == DMU_SPILL_BLKID) { 2260 db->db.db_size = (blkptr != NULL) ? 2261 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 2262 db->db.db_offset = 0; 2263 } else { 2264 int blocksize = 2265 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; 2266 db->db.db_size = blocksize; 2267 db->db.db_offset = db->db_blkid * blocksize; 2268 } 2269 2270 /* 2271 * Hold the dn_dbufs_mtx while we get the new dbuf 2272 * in the hash table and added to the dbufs list. 2273 * This prevents a possible deadlock with someone 2274 * trying to look up this dbuf before its added to the 2275 * dn_dbufs list. 
2276 / 2277* mutex_enter(&dn->dn_dbufs_mtx); 2278 db->db_state = DB_EVICTING; 2279 if ((odb = dbuf_hash_insert(db)) != NULL) { 2280 /* someone else inserted it first / 2281* kmem_cache_free(dbuf_kmem_cache, db); 2282 mutex_exit(&dn->dn_dbufs_mtx); 2283 return (odb); 2284 } 2285 avl_add(&dn->dn_dbufs, db);	1240 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 1241 1242 for (; db != NULL; db = db_next) { 1243 db_next = AVL_NEXT(&dn->dn_dbufs, db); 1244 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1245 1246 if (db->db_level != 0 \|\| db->db_blkid > end_blkid) { 1247 break; 1248 } 1249 ASSERT3U(db->db_blkid, >=, start_blkid); 1250 1251 /* found a level 0 buffer in the range / 1252* mutex_enter(&db->db_mtx); 1253 if (dbuf_undirty(db, tx)) { 1254 /* mutex has been dropped and dbuf destroyed / 1255* continue; 1256 } 1257 1258 if (db->db_state == DB_UNCACHED \|\| 1259 db->db_state == DB_NOFILL \|\| 1260 db->db_state == DB_EVICTING) { 1261 ASSERT(db->db.db_data == NULL); 1262 mutex_exit(&db->db_mtx); 1263 continue; 1264 } 1265 if (db->db_state == DB_READ \|\| db->db_state == DB_FILL) { 1266 /* will be handled in dbuf_read_done or dbuf_rele / 1267* db->db_freed_in_flight = TRUE; 1268 mutex_exit(&db->db_mtx); 1269 continue; 1270 } 1271 if (refcount_count(&db->db_holds) == 0) { 1272 ASSERT(db->db_buf); 1273 dbuf_destroy(db); 1274 continue; 1275 } 1276 /* The dbuf is referenced / 1277* 1278 if (db->db_last_dirty != NULL) { 1279 dbuf_dirty_record_t dr = db->db_last_dirty; 1280* 1281 if (dr->dr_txg == txg) { 1282 /* 1283 * This buffer is "in-use", re-adjust the file 1284 * size to reflect that this buffer may 1285 * contain new data when we sync. 1286 / 1287* if (db->db_blkid != DMU_SPILL_BLKID && 1288 db->db_blkid > dn->dn_maxblkid) 1289 dn->dn_maxblkid = db->db_blkid; 1290 dbuf_unoverride(dr); 1291 } else { 1292 /* 1293 * This dbuf is not dirty in the open context. 1294 * Either uncache it (if its not referenced in 1295 * the open context) or reset its contents to 1296 * empty. 
1297 / 1298* dbuf_fix_old_data(db, txg); 1299 } 1300 } 1301 /* clear the contents if its cached / 1302* if (db->db_state == DB_CACHED) { 1303 ASSERT(db->db.db_data != NULL); 1304 arc_release(db->db_buf, db); 1305 bzero(db->db.db_data, db->db.db_size); 1306 arc_buf_freeze(db->db_buf); 1307 } 1308 1309 mutex_exit(&db->db_mtx); 1310 } 1311 mutex_exit(&dn->dn_dbufs_mtx); 1312} 1313 1314static int 1315dbuf_block_freeable(dmu_buf_impl_t db) 1316{ 1317* dsl_dataset_t ds = db->db_objset->os_dsl_dataset; 1318* uint64_t birth_txg = 0; 1319 1320 /* 1321 * We don't need any locking to protect db_blkptr: 1322 * If it's syncing, then db_last_dirty will be set 1323 * so we'll ignore db_blkptr. 1324 * 1325 * This logic ensures that only block births for 1326 * filled blocks are considered. 1327 / 1328* ASSERT(MUTEX_HELD(&db->db_mtx)); 1329 if (db->db_last_dirty && (db->db_blkptr == NULL \|\| 1330 !BP_IS_HOLE(db->db_blkptr))) { 1331 birth_txg = db->db_last_dirty->dr_txg; 1332 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { 1333 birth_txg = db->db_blkptr->blk_birth; 1334 } 1335 1336 /* 1337 * If this block don't exist or is in a snapshot, it can't be freed. 1338 * Don't pass the bp to dsl_dataset_block_freeable() since we 1339 * are holding the db_mtx lock and might deadlock if we are 1340 * prefetching a dedup-ed block. 1341 / 1342* if (birth_txg != 0) 1343 return (ds == NULL \|\| 1344 dsl_dataset_block_freeable(ds, NULL, birth_txg)); 1345 else 1346 return (B_FALSE); 1347} 1348 1349void 1350dbuf_new_size(dmu_buf_impl_t db, int size, dmu_tx_t tx) 1351{ 1352 arc_buf_t buf, obuf; 1353 int osize = db->db.db_size; 1354 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1355 dnode_t dn; 1356* 1357 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1358 1359 DB_DNODE_ENTER(db); 1360 dn = DB_DNODE(db); 1361 1362 /* XXX does this func really need the lock? 
/ 1363* ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1364 1365 /* 1366 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held 1367 * is OK, because there can be no other references to the db 1368 * when we are changing its size, so no concurrent DB_FILL can 1369 * be happening. 1370 / 1371* /* 1372 * XXX we should be doing a dbuf_read, checking the return 1373 * value and returning that up to our callers 1374 / 1375* dmu_buf_will_dirty(&db->db, tx); 1376 1377 /* create the data buffer for the new block / 1378* buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); 1379 1380 /* copy old block data to the new block / 1381* obuf = db->db_buf; 1382 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 1383 /* zero the remainder / 1384* if (size > osize) 1385 bzero((uint8_t )buf->b_data + osize, size - osize); 1386* 1387 mutex_enter(&db->db_mtx); 1388 dbuf_set_data(db, buf); 1389 arc_buf_destroy(obuf, db); 1390 db->db.db_size = size; 1391 1392 if (db->db_level == 0) { 1393 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1394 db->db_last_dirty->dt.dl.dr_data = buf; 1395 } 1396 mutex_exit(&db->db_mtx); 1397 1398 dnode_willuse_space(dn, size-osize, tx); 1399 DB_DNODE_EXIT(db); 1400} 1401 1402void 1403dbuf_release_bp(dmu_buf_impl_t db) 1404{ 1405* objset_t os = db->db_objset; 1406* 1407 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 1408 ASSERT(arc_released(os->os_phys_buf) \|\| 1409 list_link_active(&os->os_dsl_dataset->ds_synced_link)); 1410 ASSERT(db->db_parent == NULL \|\| arc_released(db->db_parent->db_buf)); 1411 1412 (void) arc_release(db->db_buf, db); 1413} 1414 1415/* 1416 * We already have a dirty record for this TXG, and we are being 1417 * dirtied again. 
1418 / 1419static void 1420dbuf_redirty(dbuf_dirty_record_t dr) 1421{ 1422 dmu_buf_impl_t db = dr->dr_dbuf; 1423* 1424 ASSERT(MUTEX_HELD(&db->db_mtx)); 1425 1426 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1427 /* 1428 * If this buffer has already been written out, 1429 * we now need to reset its state. 1430 / 1431* dbuf_unoverride(dr); 1432 if (db->db.db_object != DMU_META_DNODE_OBJECT && 1433 db->db_state != DB_NOFILL) { 1434 /* Already released on initial dirty, so just thaw. / 1435* ASSERT(arc_released(db->db_buf)); 1436 arc_buf_thaw(db->db_buf); 1437 } 1438 } 1439} 1440 1441dbuf_dirty_record_t * 1442dbuf_dirty(dmu_buf_impl_t db, dmu_tx_t tx) 1443{ 1444 dnode_t dn; 1445* objset_t os; 1446* dbuf_dirty_record_t *drp, dr; 1447 int drop_struct_lock = FALSE; 1448 boolean_t do_free_accounting = B_FALSE; 1449 int txgoff = tx->tx_txg & TXG_MASK; 1450 1451 ASSERT(tx->tx_txg != 0); 1452 ASSERT(!refcount_is_zero(&db->db_holds)); 1453 DMU_TX_DIRTY_BUF(tx, db); 1454 1455 DB_DNODE_ENTER(db); 1456 dn = DB_DNODE(db); 1457 /* 1458 * Shouldn't dirty a regular buffer in syncing context. Private 1459 * objects may be dirtied in syncing context, but only if they 1460 * were already pre-dirtied in open context. 1461 / 1462#ifdef DEBUG 1463* if (dn->dn_objset->os_dsl_dataset != NULL) { 1464 rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1465 RW_READER, FTAG); 1466 } 1467 ASSERT(!dmu_tx_is_syncing(tx) \|\| 1468 BP_IS_HOLE(dn->dn_objset->os_rootbp) \|\| 1469 DMU_OBJECT_IS_SPECIAL(dn->dn_object) \|\| 1470 dn->dn_objset->os_dsl_dataset == NULL); 1471 if (dn->dn_objset->os_dsl_dataset != NULL) 1472 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); 1473#endif 1474 /* 1475 * We make this assert for private objects as well, but after we 1476 * check if we're already dirty. They are allowed to re-dirty 1477 * in syncing context. 
1478 / 1479* ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT \|\| 1480 dn->dn_dirtyctx == DN_UNDIRTIED \|\| dn->dn_dirtyctx == 1481 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1482 1483 mutex_enter(&db->db_mtx); 1484 /* 1485 * XXX make this true for indirects too? The problem is that 1486 * transactions created with dmu_tx_create_assigned() from 1487 * syncing context don't bother holding ahead. 1488 / 1489* ASSERT(db->db_level != 0 \|\| 1490 db->db_state == DB_CACHED \|\| db->db_state == DB_FILL \|\| 1491 db->db_state == DB_NOFILL); 1492 1493 mutex_enter(&dn->dn_mtx); 1494 /* 1495 * Don't set dirtyctx to SYNC if we're just modifying this as we 1496 * initialize the objset. 1497 / 1498* if (dn->dn_dirtyctx == DN_UNDIRTIED) { 1499 if (dn->dn_objset->os_dsl_dataset != NULL) { 1500 rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1501 RW_READER, FTAG); 1502 } 1503 if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1504 dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? 1505 DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1506 ASSERT(dn->dn_dirtyctx_firstset == NULL); 1507 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1508 } 1509 if (dn->dn_objset->os_dsl_dataset != NULL) { 1510 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1511 FTAG); 1512 } 1513 } 1514 mutex_exit(&dn->dn_mtx); 1515 1516 if (db->db_blkid == DMU_SPILL_BLKID) 1517 dn->dn_have_spill = B_TRUE; 1518 1519 /* 1520 * If this buffer is already dirty, we're done. 1521 / 1522* drp = &db->db_last_dirty; 1523 ASSERT(drp == NULL \|\| (drp)->dr_txg <= tx->tx_txg \|\| 1524 db->db.db_object == DMU_META_DNODE_OBJECT); 1525 while ((dr = drp) != NULL && dr->dr_txg > tx->tx_txg) 1526* drp = &dr->dr_next; 1527 if (dr && dr->dr_txg == tx->tx_txg) { 1528 DB_DNODE_EXIT(db); 1529 1530 dbuf_redirty(dr); 1531 mutex_exit(&db->db_mtx); 1532 return (dr); 1533 } 1534 1535 /* 1536 * Only valid if not already dirty. 
1537 / 1538* ASSERT(dn->dn_object == 0 \|\| 1539 dn->dn_dirtyctx == DN_UNDIRTIED \|\| dn->dn_dirtyctx == 1540 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1541 1542 ASSERT3U(dn->dn_nlevels, >, db->db_level); 1543 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) \|\| 1544 dn->dn_phys->dn_nlevels > db->db_level \|\| 1545 dn->dn_next_nlevels[txgoff] > db->db_level \|\| 1546 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level \|\| 1547 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1548 1549 /* 1550 * We should only be dirtying in syncing context if it's the 1551 * mos or we're initializing the os or it's a special object. 1552 * However, we are allowed to dirty in syncing context provided 1553 * we already dirtied it in open context. Hence we must make 1554 * this assertion only if we're not already dirty. 1555 / 1556* os = dn->dn_objset; 1557#ifdef DEBUG 1558 if (dn->dn_objset->os_dsl_dataset != NULL) 1559 rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); 1560 ASSERT(!dmu_tx_is_syncing(tx) \|\| DMU_OBJECT_IS_SPECIAL(dn->dn_object) \|\| 1561 os->os_dsl_dataset == NULL \|\| BP_IS_HOLE(os->os_rootbp)); 1562 if (dn->dn_objset->os_dsl_dataset != NULL) 1563 rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); 1564#endif 1565 ASSERT(db->db.db_size != 0); 1566 1567 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1568 1569 if (db->db_blkid != DMU_BONUS_BLKID) { 1570 /* 1571 * Update the accounting. 1572 * Note: we delay "free accounting" until after we drop 1573 * the db_mtx. This keeps us from grabbing other locks 1574 * (and possibly deadlocking) in bp_get_dsize() while 1575 * also holding the db_mtx. 
1576 / 1577* dnode_willuse_space(dn, db->db.db_size, tx); 1578 do_free_accounting = dbuf_block_freeable(db); 1579 } 1580 1581 /* 1582 * If this buffer is dirty in an old transaction group we need 1583 * to make a copy of it so that the changes we make in this 1584 * transaction group won't leak out when we sync the older txg. 1585 / 1586* dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1587 if (db->db_level == 0) { 1588 void data_old = db->db_buf; 1589* 1590 if (db->db_state != DB_NOFILL) { 1591 if (db->db_blkid == DMU_BONUS_BLKID) { 1592 dbuf_fix_old_data(db, tx->tx_txg); 1593 data_old = db->db.db_data; 1594 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1595 /* 1596 * Release the data buffer from the cache so 1597 * that we can modify it without impacting 1598 * possible other users of this cached data 1599 * block. Note that indirect blocks and 1600 * private objects are not released until the 1601 * syncing state (since they are only modified 1602 * then). 1603 / 1604* arc_release(db->db_buf, db); 1605 dbuf_fix_old_data(db, tx->tx_txg); 1606 data_old = db->db_buf; 1607 } 1608 ASSERT(data_old != NULL); 1609 } 1610 dr->dt.dl.dr_data = data_old; 1611 } else { 1612 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1613 list_create(&dr->dt.di.dr_children, 1614 sizeof (dbuf_dirty_record_t), 1615 offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1616 } 1617 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1618 dr->dr_accounted = db->db.db_size; 1619 dr->dr_dbuf = db; 1620 dr->dr_txg = tx->tx_txg; 1621 dr->dr_next = drp; 1622* drp = dr; 1623* 1624 /* 1625 * We could have been freed_in_flight between the dbuf_noread 1626 * and dbuf_dirty. We win, as though the dbuf_noread() had 1627 * happened after the free. 
1628 / 1629* if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1630 db->db_blkid != DMU_SPILL_BLKID) { 1631 mutex_enter(&dn->dn_mtx); 1632 if (dn->dn_free_ranges[txgoff] != NULL) { 1633 range_tree_clear(dn->dn_free_ranges[txgoff], 1634 db->db_blkid, 1); 1635 } 1636 mutex_exit(&dn->dn_mtx); 1637 db->db_freed_in_flight = FALSE; 1638 } 1639 1640 /* 1641 * This buffer is now part of this txg 1642 / 1643* dbuf_add_ref(db, (void )(uintptr_t)tx->tx_txg); 1644* db->db_dirtycnt += 1; 1645 ASSERT3U(db->db_dirtycnt, <=, 3); 1646 1647 mutex_exit(&db->db_mtx); 1648 1649 if (db->db_blkid == DMU_BONUS_BLKID \|\| 1650 db->db_blkid == DMU_SPILL_BLKID) { 1651 mutex_enter(&dn->dn_mtx); 1652 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1653 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1654 mutex_exit(&dn->dn_mtx); 1655 dnode_setdirty(dn, tx); 1656 DB_DNODE_EXIT(db); 1657 return (dr); 1658 } 1659 1660 /* 1661 * The dn_struct_rwlock prevents db_blkptr from changing 1662 * due to a write from syncing context completing 1663 * while we are running, so we want to acquire it before 1664 * looking at db_blkptr. 1665 / 1666* if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1667 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1668 drop_struct_lock = TRUE; 1669 } 1670 1671 if (do_free_accounting) { 1672 blkptr_t bp = db->db_blkptr; 1673* int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1674 bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1675 /* 1676 * This is only a guess -- if the dbuf is dirty 1677 * in a previous txg, we don't know how much 1678 * space it will use on disk yet. We should 1679 * really have the struct_rwlock to access 1680 * db_blkptr, but since this is just a guess, 1681 * it's OK if we get an odd answer. 
1682 / 1683* ddt_prefetch(os->os_spa, bp); 1684 dnode_willuse_space(dn, -willfree, tx); 1685 } 1686 1687 if (db->db_level == 0) { 1688 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1689 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1690 } 1691 1692 if (db->db_level+1 < dn->dn_nlevels) { 1693 dmu_buf_impl_t parent = db->db_parent; 1694* dbuf_dirty_record_t di; 1695* int parent_held = FALSE; 1696 1697 if (db->db_parent == NULL \|\| db->db_parent == dn->dn_dbuf) { 1698 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1699 1700 parent = dbuf_hold_level(dn, db->db_level+1, 1701 db->db_blkid >> epbs, FTAG); 1702 ASSERT(parent != NULL); 1703 parent_held = TRUE; 1704 } 1705 if (drop_struct_lock) 1706 rw_exit(&dn->dn_struct_rwlock); 1707 ASSERT3U(db->db_level+1, ==, parent->db_level); 1708 di = dbuf_dirty(parent, tx); 1709 if (parent_held) 1710 dbuf_rele(parent, FTAG); 1711 1712 mutex_enter(&db->db_mtx); 1713 /* 1714 * Since we've dropped the mutex, it's possible that 1715 * dbuf_undirty() might have changed this out from under us. 
1716 / 1717* if (db->db_last_dirty == dr \|\| 1718 dn->dn_object == DMU_META_DNODE_OBJECT) { 1719 mutex_enter(&di->dt.di.dr_mtx); 1720 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1721 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1722 list_insert_tail(&di->dt.di.dr_children, dr); 1723 mutex_exit(&di->dt.di.dr_mtx); 1724 dr->dr_parent = di; 1725 } 1726 mutex_exit(&db->db_mtx); 1727 } else { 1728 ASSERT(db->db_level+1 == dn->dn_nlevels); 1729 ASSERT(db->db_blkid < dn->dn_nblkptr); 1730 ASSERT(db->db_parent == NULL \|\| db->db_parent == dn->dn_dbuf); 1731 mutex_enter(&dn->dn_mtx); 1732 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1733 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1734 mutex_exit(&dn->dn_mtx); 1735 if (drop_struct_lock) 1736 rw_exit(&dn->dn_struct_rwlock); 1737 } 1738 1739 dnode_setdirty(dn, tx); 1740 DB_DNODE_EXIT(db); 1741 return (dr); 1742} 1743 1744/* 1745 * Undirty a buffer in the transaction group referenced by the given 1746 * transaction. Return whether this evicted the dbuf. 1747 / 1748static boolean_t 1749dbuf_undirty(dmu_buf_impl_t db, dmu_tx_t tx) 1750{ 1751* dnode_t dn; 1752* uint64_t txg = tx->tx_txg; 1753 dbuf_dirty_record_t dr, drp; 1754* 1755 ASSERT(txg != 0); 1756 1757 /* 1758 * Due to our use of dn_nlevels below, this can only be called 1759 * in open context, unless we are operating on the MOS. 1760 * From syncing context, dn_nlevels may be different from the 1761 * dn_nlevels used when dbuf was dirtied. 1762 / 1763* ASSERT(db->db_objset == 1764 dmu_objset_pool(db->db_objset)->dp_meta_objset \|\| 1765 txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); 1766 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1767 ASSERT0(db->db_level); 1768 ASSERT(MUTEX_HELD(&db->db_mtx)); 1769 1770 /* 1771 * If this buffer is not dirty, we're done. 
1772 / 1773* for (drp = &db->db_last_dirty; (dr = drp) != NULL; drp = &dr->dr_next) 1774* if (dr->dr_txg <= txg) 1775 break; 1776 if (dr == NULL \|\| dr->dr_txg < txg) 1777 return (B_FALSE); 1778 ASSERT(dr->dr_txg == txg); 1779 ASSERT(dr->dr_dbuf == db); 1780 1781 DB_DNODE_ENTER(db); 1782 dn = DB_DNODE(db); 1783 1784 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1785 1786 ASSERT(db->db.db_size != 0); 1787 1788 dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), 1789 dr->dr_accounted, txg); 1790 1791 drp = dr->dr_next; 1792* 1793 /* 1794 * Note that there are three places in dbuf_dirty() 1795 * where this dirty record may be put on a list. 1796 * Make sure to do a list_remove corresponding to 1797 * every one of those list_insert calls. 1798 / 1799* if (dr->dr_parent) { 1800 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1801 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1802 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1803 } else if (db->db_blkid == DMU_SPILL_BLKID \|\| 1804 db->db_level + 1 == dn->dn_nlevels) { 1805 ASSERT(db->db_blkptr == NULL \|\| db->db_parent == dn->dn_dbuf); 1806 mutex_enter(&dn->dn_mtx); 1807 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1808 mutex_exit(&dn->dn_mtx); 1809 } 1810 DB_DNODE_EXIT(db); 1811 1812 if (db->db_state != DB_NOFILL) { 1813 dbuf_unoverride(dr); 1814 1815 ASSERT(db->db_buf != NULL); 1816 ASSERT(dr->dt.dl.dr_data != NULL); 1817 if (dr->dt.dl.dr_data != db->db_buf) 1818 arc_buf_destroy(dr->dt.dl.dr_data, db); 1819 } 1820 1821 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1822 1823 ASSERT(db->db_dirtycnt > 0); 1824 db->db_dirtycnt -= 1; 1825 1826 if (refcount_remove(&db->db_holds, (void )(uintptr_t)txg) == 0) { 1827* ASSERT(db->db_state == DB_NOFILL \|\| arc_released(db->db_buf)); 1828 dbuf_destroy(db); 1829 return (B_TRUE); 1830 } 1831 1832 return (B_FALSE); 1833} 1834 1835void 1836dmu_buf_will_dirty(dmu_buf_t db_fake, dmu_tx_t tx) 1837{ 1838 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 1839 
int rf = DB_RF_MUST_SUCCEED \| DB_RF_NOPREFETCH; 1840 1841 ASSERT(tx->tx_txg != 0); 1842 ASSERT(!refcount_is_zero(&db->db_holds)); 1843 1844 /* 1845 * Quick check for dirtyness. For already dirty blocks, this 1846 * reduces runtime of this function by >90%, and overall performance 1847 * by 50% for some workloads (e.g. file deletion with indirect blocks 1848 * cached). 1849 / 1850* mutex_enter(&db->db_mtx); 1851 dbuf_dirty_record_t dr; 1852* for (dr = db->db_last_dirty; 1853 dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { 1854 /* 1855 * It's possible that it is already dirty but not cached, 1856 * because there are some calls to dbuf_dirty() that don't 1857 * go through dmu_buf_will_dirty(). 1858 / 1859* if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { 1860 /* This dbuf is already dirty and cached. / 1861* dbuf_redirty(dr); 1862 mutex_exit(&db->db_mtx); 1863 return; 1864 } 1865 } 1866 mutex_exit(&db->db_mtx); 1867 1868 DB_DNODE_ENTER(db); 1869 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1870 rf \|= DB_RF_HAVESTRUCT; 1871 DB_DNODE_EXIT(db); 1872 (void) dbuf_read(db, NULL, rf); 1873 (void) dbuf_dirty(db, tx); 1874} 1875 1876void 1877dmu_buf_will_not_fill(dmu_buf_t db_fake, dmu_tx_t tx) 1878{ 1879 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 1880 1881 db->db_state = DB_NOFILL; 1882 1883 dmu_buf_will_fill(db_fake, tx); 1884} 1885 1886void 1887dmu_buf_will_fill(dmu_buf_t db_fake, dmu_tx_t tx) 1888{ 1889 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 1890 1891 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1892 ASSERT(tx->tx_txg != 0); 1893 ASSERT(db->db_level == 0); 1894 ASSERT(!refcount_is_zero(&db->db_holds)); 1895 1896 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT \|\| 1897 dmu_tx_private_ok(tx)); 1898 1899 dbuf_noread(db); 1900 (void) dbuf_dirty(db, tx); 1901} 1902 1903#pragma weak dmu_buf_fill_done = dbuf_fill_done 1904/* ARGSUSED / 1905void 1906dbuf_fill_done(dmu_buf_impl_t db, dmu_tx_t tx) 1907{ 1908* mutex_enter(&db->db_mtx); 1909 
DBUF_VERIFY(db); 1910 1911 if (db->db_state == DB_FILL) { 1912 if (db->db_level == 0 && db->db_freed_in_flight) { 1913 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1914 /* we were freed while filling / 1915* /* XXX dbuf_undirty? / 1916* bzero(db->db.db_data, db->db.db_size); 1917 db->db_freed_in_flight = FALSE; 1918 } 1919 db->db_state = DB_CACHED; 1920 cv_broadcast(&db->db_changed); 1921 } 1922 mutex_exit(&db->db_mtx); 1923} 1924 1925void 1926dmu_buf_write_embedded(dmu_buf_t dbuf, void data, 1927 bp_embedded_type_t etype, enum zio_compress comp, 1928 int uncompressed_size, int compressed_size, int byteorder, 1929 dmu_tx_t tx) 1930{ 1931* dmu_buf_impl_t db = (dmu_buf_impl_t )dbuf; 1932 struct dirty_leaf dl; 1933* dmu_object_type_t type; 1934 1935 if (etype == BP_EMBEDDED_TYPE_DATA) { 1936 ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), 1937 SPA_FEATURE_EMBEDDED_DATA)); 1938 } 1939 1940 DB_DNODE_ENTER(db); 1941 type = DB_DNODE(db)->dn_type; 1942 DB_DNODE_EXIT(db); 1943 1944 ASSERT0(db->db_level); 1945 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1946 1947 dmu_buf_will_not_fill(dbuf, tx); 1948 1949 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1950 dl = &db->db_last_dirty->dt.dl; 1951 encode_embedded_bp_compressed(&dl->dr_overridden_by, 1952 data, comp, uncompressed_size, compressed_size); 1953 BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 1954 BP_SET_TYPE(&dl->dr_overridden_by, type); 1955 BP_SET_LEVEL(&dl->dr_overridden_by, 0); 1956 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 1957 1958 dl->dr_override_state = DR_OVERRIDDEN; 1959 dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 1960} 1961 1962/* 1963 * Directly assign a provided arc buf to a given dbuf if it's not referenced 1964 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
1965 / 1966void 1967dbuf_assign_arcbuf(dmu_buf_impl_t db, arc_buf_t buf, dmu_tx_t tx) 1968{ 1969 ASSERT(!refcount_is_zero(&db->db_holds)); 1970 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1971 ASSERT(db->db_level == 0); 1972 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1973 ASSERT(buf != NULL); 1974 ASSERT(arc_buf_size(buf) == db->db.db_size); 1975 ASSERT(tx->tx_txg != 0); 1976 1977 arc_return_buf(buf, db); 1978 ASSERT(arc_released(buf)); 1979 1980 mutex_enter(&db->db_mtx); 1981 1982 while (db->db_state == DB_READ \|\| db->db_state == DB_FILL) 1983 cv_wait(&db->db_changed, &db->db_mtx); 1984 1985 ASSERT(db->db_state == DB_CACHED \|\| db->db_state == DB_UNCACHED); 1986 1987 if (db->db_state == DB_CACHED && 1988 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 1989 mutex_exit(&db->db_mtx); 1990 (void) dbuf_dirty(db, tx); 1991 bcopy(buf->b_data, db->db.db_data, db->db.db_size); 1992 arc_buf_destroy(buf, db); 1993 xuio_stat_wbuf_copied(); 1994 return; 1995 } 1996 1997 xuio_stat_wbuf_nocopy(); 1998 if (db->db_state == DB_CACHED) { 1999 dbuf_dirty_record_t dr = db->db_last_dirty; 2000* 2001 ASSERT(db->db_buf != NULL); 2002 if (dr != NULL && dr->dr_txg == tx->tx_txg) { 2003 ASSERT(dr->dt.dl.dr_data == db->db_buf); 2004 if (!arc_released(db->db_buf)) { 2005 ASSERT(dr->dt.dl.dr_override_state == 2006 DR_OVERRIDDEN); 2007 arc_release(db->db_buf, db); 2008 } 2009 dr->dt.dl.dr_data = buf; 2010 arc_buf_destroy(db->db_buf, db); 2011 } else if (dr == NULL \|\| dr->dt.dl.dr_data != db->db_buf) { 2012 arc_release(db->db_buf, db); 2013 arc_buf_destroy(db->db_buf, db); 2014 } 2015 db->db_buf = NULL; 2016 } 2017 ASSERT(db->db_buf == NULL); 2018 dbuf_set_data(db, buf); 2019 db->db_state = DB_FILL; 2020 mutex_exit(&db->db_mtx); 2021 (void) dbuf_dirty(db, tx); 2022 dmu_buf_fill_done(&db->db, tx); 2023} 2024 2025void 2026dbuf_destroy(dmu_buf_impl_t db) 2027{ 2028* dnode_t dn; 2029* dmu_buf_impl_t parent = db->db_parent; 2030* dmu_buf_impl_t dndb; 2031* 2032 
ASSERT(MUTEX_HELD(&db->db_mtx)); 2033 ASSERT(refcount_is_zero(&db->db_holds)); 2034 2035 if (db->db_buf != NULL) { 2036 arc_buf_destroy(db->db_buf, db); 2037 db->db_buf = NULL; 2038 } 2039 2040 if (db->db_blkid == DMU_BONUS_BLKID) { 2041 ASSERT(db->db.db_data != NULL); 2042 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 2043 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2044 db->db_state = DB_UNCACHED; 2045 } 2046 2047 dbuf_clear_data(db); 2048 2049 if (multilist_link_active(&db->db_cache_link)) { 2050 multilist_remove(&dbuf_cache, db); 2051 (void) refcount_remove_many(&dbuf_cache_size, 2052 db->db.db_size, db); 2053 } 2054 2055 ASSERT(db->db_state == DB_UNCACHED \|\| db->db_state == DB_NOFILL); 2056 ASSERT(db->db_data_pending == NULL); 2057 2058 db->db_state = DB_EVICTING; 2059 db->db_blkptr = NULL; 2060 2061 /* 2062 * Now that db_state is DB_EVICTING, nobody else can find this via 2063 * the hash table. We can now drop db_mtx, which allows us to 2064 * acquire the dn_dbufs_mtx. 2065 / 2066* mutex_exit(&db->db_mtx); 2067 2068 DB_DNODE_ENTER(db); 2069 dn = DB_DNODE(db); 2070 dndb = dn->dn_dbuf; 2071 if (db->db_blkid != DMU_BONUS_BLKID) { 2072 boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); 2073 if (needlock) 2074 mutex_enter(&dn->dn_dbufs_mtx); 2075 avl_remove(&dn->dn_dbufs, db); 2076 atomic_dec_32(&dn->dn_dbufs_count); 2077 membar_producer(); 2078 DB_DNODE_EXIT(db); 2079 if (needlock) 2080 mutex_exit(&dn->dn_dbufs_mtx); 2081 /* 2082 * Decrementing the dbuf count means that the hold corresponding 2083 * to the removed dbuf is no longer discounted in dnode_move(), 2084 * so the dnode cannot be moved until after we release the hold. 2085 * The membar_producer() ensures visibility of the decremented 2086 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 2087 * release any lock. 
2088 / 2089* dnode_rele(dn, db); 2090 db->db_dnode_handle = NULL; 2091 2092 dbuf_hash_remove(db); 2093 } else { 2094 DB_DNODE_EXIT(db); 2095 } 2096 2097 ASSERT(refcount_is_zero(&db->db_holds)); 2098 2099 db->db_parent = NULL; 2100 2101 ASSERT(db->db_buf == NULL); 2102 ASSERT(db->db.db_data == NULL); 2103 ASSERT(db->db_hash_next == NULL); 2104 ASSERT(db->db_blkptr == NULL); 2105 ASSERT(db->db_data_pending == NULL); 2106 ASSERT(!multilist_link_active(&db->db_cache_link)); 2107 2108 kmem_cache_free(dbuf_kmem_cache, db); 2109 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2110 2111 /* 2112 * If this dbuf is referenced from an indirect dbuf, 2113 * decrement the ref count on the indirect dbuf. 2114 / 2115* if (parent && parent != dndb) 2116 dbuf_rele(parent, db); 2117} 2118 2119/* 2120 * Note: While bpp will always be updated if the function returns success, 2121 * parentp will not be updated if the dnode does not have dn_dbuf filled in; 2122 * this happens when the dnode is the meta-dnode, or a userused or groupused 2123 * object. 
2124 / 2125static int 2126dbuf_findbp(dnode_t dn, int level, uint64_t blkid, int fail_sparse, 2127 dmu_buf_impl_t parentp, blkptr_t bpp) 2128{ 2129 int nlevels, epbs; 2130 2131 parentp = NULL; 2132* bpp = NULL; 2133* 2134 ASSERT(blkid != DMU_BONUS_BLKID); 2135 2136 if (blkid == DMU_SPILL_BLKID) { 2137 mutex_enter(&dn->dn_mtx); 2138 if (dn->dn_have_spill && 2139 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 2140 bpp = &dn->dn_phys->dn_spill; 2141* else 2142 bpp = NULL; 2143* dbuf_add_ref(dn->dn_dbuf, NULL); 2144 parentp = dn->dn_dbuf; 2145* mutex_exit(&dn->dn_mtx); 2146 return (0); 2147 } 2148 2149 if (dn->dn_phys->dn_nlevels == 0) 2150 nlevels = 1; 2151 else 2152 nlevels = dn->dn_phys->dn_nlevels; 2153 2154 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 2155 2156 ASSERT3U(level * epbs, <, 64); 2157 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2158 if (level >= nlevels \|\| 2159 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 2160 /* the buffer has no parent yet / 2161* return (SET_ERROR(ENOENT)); 2162 } else if (level < nlevels-1) { 2163 /* this block is referenced from an indirect block / 2164* int err = dbuf_hold_impl(dn, level+1, 2165 blkid >> epbs, fail_sparse, FALSE, NULL, parentp); 2166 if (err) 2167 return (err); 2168 err = dbuf_read(parentp, NULL, 2169* (DB_RF_HAVESTRUCT \| DB_RF_NOPREFETCH \| DB_RF_CANFAIL)); 2170 if (err) { 2171 dbuf_rele(parentp, NULL); 2172* parentp = NULL; 2173* return (err); 2174 } 2175 bpp = ((blkptr_t )(parentp)->db.db_data) + 2176* (blkid & ((1ULL << epbs) - 1)); 2177 return (0); 2178 } else { 2179 /* the block is referenced from the dnode / 2180* ASSERT3U(level, ==, nlevels-1); 2181 ASSERT(dn->dn_phys->dn_nblkptr == 0 \|\| 2182 blkid < dn->dn_phys->dn_nblkptr); 2183 if (dn->dn_dbuf) { 2184 dbuf_add_ref(dn->dn_dbuf, NULL); 2185 parentp = dn->dn_dbuf; 2186* } 2187 bpp = &dn->dn_phys->dn_blkptr[blkid]; 2188* return (0); 2189 } 2190} 2191 2192static dmu_buf_impl_t * 2193dbuf_create(dnode_t dn, uint8_t level, uint64_t 
blkid, 2194* dmu_buf_impl_t parent, blkptr_t blkptr) 2195{ 2196 objset_t os = dn->dn_objset; 2197* dmu_buf_impl_t db, odb; 2198 2199 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2200 ASSERT(dn->dn_type != DMU_OT_NONE); 2201 2202 db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); 2203 2204 db->db_objset = os; 2205 db->db.db_object = dn->dn_object; 2206 db->db_level = level; 2207 db->db_blkid = blkid; 2208 db->db_last_dirty = NULL; 2209 db->db_dirtycnt = 0; 2210 db->db_dnode_handle = dn->dn_handle; 2211 db->db_parent = parent; 2212 db->db_blkptr = blkptr; 2213 2214 db->db_user = NULL; 2215 db->db_user_immediate_evict = FALSE; 2216 db->db_freed_in_flight = FALSE; 2217 db->db_pending_evict = FALSE; 2218 2219 if (blkid == DMU_BONUS_BLKID) { 2220 ASSERT3P(parent, ==, dn->dn_dbuf); 2221 db->db.db_size = DN_MAX_BONUSLEN - 2222 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 2223 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 2224 db->db.db_offset = DMU_BONUS_BLKID; 2225 db->db_state = DB_UNCACHED; 2226 /* the bonus dbuf is not placed in the hash table / 2227* arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2228 return (db); 2229 } else if (blkid == DMU_SPILL_BLKID) { 2230 db->db.db_size = (blkptr != NULL) ? 2231 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 2232 db->db.db_offset = 0; 2233 } else { 2234 int blocksize = 2235 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; 2236 db->db.db_size = blocksize; 2237 db->db.db_offset = db->db_blkid * blocksize; 2238 } 2239 2240 /* 2241 * Hold the dn_dbufs_mtx while we get the new dbuf 2242 * in the hash table and added to the dbufs list. 2243 * This prevents a possible deadlock with someone 2244 * trying to look up this dbuf before its added to the 2245 * dn_dbufs list. 
2246 / 2247* mutex_enter(&dn->dn_dbufs_mtx); 2248 db->db_state = DB_EVICTING; 2249 if ((odb = dbuf_hash_insert(db)) != NULL) { 2250 /* someone else inserted it first / 2251* kmem_cache_free(dbuf_kmem_cache, db); 2252 mutex_exit(&dn->dn_dbufs_mtx); 2253 return (odb); 2254 } 2255 avl_add(&dn->dn_dbufs, db);
2286 if (db->db_level == 0 && db->db_blkid >= 2287 dn->dn_unlisted_l0_blkid) 2288 dn->dn_unlisted_l0_blkid = db->db_blkid + 1;	2256
2289 db->db_state = DB_UNCACHED; 2290 mutex_exit(&dn->dn_dbufs_mtx); 2291 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2292 2293 if (parent && parent != dn->dn_dbuf) 2294 dbuf_add_ref(parent, db); 2295 2296 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT \|\| 2297 refcount_count(&dn->dn_holds) > 0); 2298 (void) refcount_add(&dn->dn_holds, db); 2299 atomic_inc_32(&dn->dn_dbufs_count); 2300 2301 dprintf_dbuf(db, "db=%p\n", db); 2302 2303 return (db); 2304} 2305 2306typedef struct dbuf_prefetch_arg { 2307 spa_t dpa_spa; / The spa to issue the prefetch in. / 2308* zbookmark_phys_t dpa_zb; /* The target block to prefetch. / 2309* int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. / 2310* int dpa_curlevel; /* The current level that we're reading / 2311* dnode_t dpa_dnode; / The dnode associated with the prefetch / 2312* zio_priority_t dpa_prio; /* The priority I/Os should be issued at. / 2313* zio_t dpa_zio; / The parent zio_t for all prefetches. / 2314* arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. / 2315} dbuf_prefetch_arg_t; 2316* 2317/* 2318 * Actually issue the prefetch read for the block given. 2319 / 2320static void 2321dbuf_issue_final_prefetch(dbuf_prefetch_arg_t dpa, blkptr_t bp) 2322{ 2323* if (BP_IS_HOLE(bp) \|\| BP_IS_EMBEDDED(bp)) 2324 return; 2325 2326 arc_flags_t aflags = 2327 dpa->dpa_aflags \| ARC_FLAG_NOWAIT \| ARC_FLAG_PREFETCH; 2328 2329 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2330 ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); 2331 ASSERT(dpa->dpa_zio != NULL); 2332 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, 2333 dpa->dpa_prio, ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE, 2334 &aflags, &dpa->dpa_zb); 2335} 2336 2337/* 2338 * Called when an indirect block above our prefetch target is read in. This 2339 * will either read in the next indirect block down the tree or issue the actual 2340 * prefetch if the next block down is our target. 
2341 / 2342static void 2343dbuf_prefetch_indirect_done(zio_t zio, arc_buf_t abuf, void private) 2344{ 2345 dbuf_prefetch_arg_t dpa = private; 2346* 2347 ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); 2348 ASSERT3S(dpa->dpa_curlevel, >, 0); 2349 2350 /* 2351 * The dpa_dnode is only valid if we are called with a NULL 2352 * zio. This indicates that the arc_read() returned without 2353 * first calling zio_read() to issue a physical read. Once 2354 * a physical read is made the dpa_dnode must be invalidated 2355 * as the locks guarding it may have been dropped. If the 2356 * dpa_dnode is still valid, then we want to add it to the dbuf 2357 * cache. To do so, we must hold the dbuf associated with the block 2358 * we just prefetched, read its contents so that we associate it 2359 * with an arc_buf_t, and then release it. 2360 / 2361* if (zio != NULL) { 2362 ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); 2363 if (zio->io_flags & ZIO_FLAG_RAW) { 2364 ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); 2365 } else { 2366 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); 2367 } 2368 ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); 2369 2370 dpa->dpa_dnode = NULL; 2371 } else if (dpa->dpa_dnode != NULL) { 2372 uint64_t curblkid = dpa->dpa_zb.zb_blkid >> 2373 (dpa->dpa_epbs * (dpa->dpa_curlevel - 2374 dpa->dpa_zb.zb_level)); 2375 dmu_buf_impl_t db = dbuf_hold_level(dpa->dpa_dnode, 2376* dpa->dpa_curlevel, curblkid, FTAG); 2377 (void) dbuf_read(db, NULL, 2378 DB_RF_MUST_SUCCEED \| DB_RF_NOPREFETCH \| DB_RF_HAVESTRUCT); 2379 dbuf_rele(db, FTAG); 2380 } 2381 2382 dpa->dpa_curlevel--; 2383 2384 uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> 2385 (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); 2386 blkptr_t bp = ((blkptr_t )abuf->b_data) + 2387 P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); 2388 if (BP_IS_HOLE(bp) \|\| (zio != NULL && zio->io_error != 0)) { 2389 kmem_free(dpa, sizeof (dpa)); 2390* } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { 2391 
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); 2392 dbuf_issue_final_prefetch(dpa, bp); 2393 kmem_free(dpa, sizeof (dpa)); 2394* } else { 2395 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2396 zbookmark_phys_t zb; 2397 2398 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2399 2400 SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, 2401 dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); 2402 2403 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2404 bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, 2405 ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE, 2406 &iter_aflags, &zb); 2407 } 2408 2409 arc_buf_destroy(abuf, private); 2410} 2411 2412/* 2413 * Issue prefetch reads for the given block on the given level. If the indirect 2414 * blocks above that block are not in memory, we will read them in 2415 * asynchronously. As a result, this call never blocks waiting for a read to 2416 * complete. 2417 / 2418void 2419dbuf_prefetch(dnode_t dn, int64_t level, uint64_t blkid, zio_priority_t prio, 2420 arc_flags_t aflags) 2421{ 2422 blkptr_t bp; 2423 int epbs, nlevels, curlevel; 2424 uint64_t curblkid; 2425 2426 ASSERT(blkid != DMU_BONUS_BLKID); 2427 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2428 2429 if (blkid > dn->dn_maxblkid) 2430 return; 2431 2432 if (dnode_block_freed(dn, blkid)) 2433 return; 2434 2435 /* 2436 * This dnode hasn't been written to disk yet, so there's nothing to 2437 * prefetch. 2438 / 2439* nlevels = dn->dn_phys->dn_nlevels; 2440 if (level >= nlevels \|\| dn->dn_phys->dn_nblkptr == 0) 2441 return; 2442 2443 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2444 if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) 2445 return; 2446 2447 dmu_buf_impl_t db = dbuf_find(dn->dn_objset, dn->dn_object, 2448* level, blkid); 2449 if (db != NULL) { 2450 mutex_exit(&db->db_mtx); 2451 /* 2452 * This dbuf already exists. It is either CACHED, or 2453 * (we assume) about to be read or filled. 
2454 / 2455* return; 2456 } 2457 2458 /* 2459 * Find the closest ancestor (indirect block) of the target block 2460 * that is present in the cache. In this indirect block, we will 2461 * find the bp that is at curlevel, curblkid. 2462 / 2463* curlevel = level; 2464 curblkid = blkid; 2465 while (curlevel < nlevels - 1) { 2466 int parent_level = curlevel + 1; 2467 uint64_t parent_blkid = curblkid >> epbs; 2468 dmu_buf_impl_t db; 2469* 2470 if (dbuf_hold_impl(dn, parent_level, parent_blkid, 2471 FALSE, TRUE, FTAG, &db) == 0) { 2472 blkptr_t bpp = db->db_buf->b_data; 2473* bp = bpp[P2PHASE(curblkid, 1 << epbs)]; 2474 dbuf_rele(db, FTAG); 2475 break; 2476 } 2477 2478 curlevel = parent_level; 2479 curblkid = parent_blkid; 2480 } 2481 2482 if (curlevel == nlevels - 1) { 2483 /* No cached indirect blocks found. / 2484* ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); 2485 bp = dn->dn_phys->dn_blkptr[curblkid]; 2486 } 2487 if (BP_IS_HOLE(&bp)) 2488 return; 2489 2490 ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); 2491 2492 zio_t pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, 2493* ZIO_FLAG_CANFAIL); 2494 2495 dbuf_prefetch_arg_t dpa = kmem_zalloc(sizeof (dpa), KM_SLEEP); 2496 dsl_dataset_t ds = dn->dn_objset->os_dsl_dataset; 2497* SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2498 dn->dn_object, level, blkid); 2499 dpa->dpa_curlevel = curlevel; 2500 dpa->dpa_prio = prio; 2501 dpa->dpa_aflags = aflags; 2502 dpa->dpa_spa = dn->dn_objset->os_spa; 2503 dpa->dpa_dnode = dn; 2504 dpa->dpa_epbs = epbs; 2505 dpa->dpa_zio = pio; 2506 2507 /* 2508 * If we have the indirect just above us, no need to do the asynchronous 2509 * prefetch chain; we'll just run the last step ourselves. If we're at 2510 * a higher level, though, we want to issue the prefetches for all the 2511 * indirect blocks asynchronously, so we can go on with whatever we were 2512 * doing. 
2513 / 2514* if (curlevel == level) { 2515 ASSERT3U(curblkid, ==, blkid); 2516 dbuf_issue_final_prefetch(dpa, &bp); 2517 kmem_free(dpa, sizeof (dpa)); 2518* } else { 2519 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2520 zbookmark_phys_t zb; 2521 2522 SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2523 dn->dn_object, curlevel, curblkid); 2524 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2525 &bp, dbuf_prefetch_indirect_done, dpa, prio, 2526 ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE, 2527 &iter_aflags, &zb); 2528 } 2529 /* 2530 * We use pio here instead of dpa_zio since it's possible that 2531 * dpa may have already been freed. 2532 / 2533* zio_nowait(pio); 2534} 2535 2536/* 2537 * Returns with db_holds incremented, and db_mtx not held. 2538 * Note: dn_struct_rwlock must be held. 2539 / 2540int 2541dbuf_hold_impl(dnode_t dn, uint8_t level, uint64_t blkid, 2542 boolean_t fail_sparse, boolean_t fail_uncached, 2543 void tag, dmu_buf_impl_t dbp) 2544{ 2545* dmu_buf_impl_t db, parent = NULL; 2546 2547 ASSERT(blkid != DMU_BONUS_BLKID); 2548 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2549 ASSERT3U(dn->dn_nlevels, >, level); 2550 2551 dbp = NULL; 2552top: 2553* /* dbuf_find() returns with db_mtx held / 2554* db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); 2555 2556 if (db == NULL) { 2557 blkptr_t bp = NULL; 2558* int err; 2559 2560 if (fail_uncached) 2561 return (SET_ERROR(ENOENT)); 2562 2563 ASSERT3P(parent, ==, NULL); 2564 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 2565 if (fail_sparse) { 2566 if (err == 0 && bp && BP_IS_HOLE(bp)) 2567 err = SET_ERROR(ENOENT); 2568 if (err) { 2569 if (parent) 2570 dbuf_rele(parent, NULL); 2571 return (err); 2572 } 2573 } 2574 if (err && err != ENOENT) 2575 return (err); 2576 db = dbuf_create(dn, level, blkid, parent, bp); 2577 } 2578 2579 if (fail_uncached && db->db_state != DB_CACHED) { 2580 mutex_exit(&db->db_mtx); 2581 return (SET_ERROR(ENOENT)); 2582 } 2583 2584 if (db->db_buf != NULL) 
2585 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 2586 2587 ASSERT(db->db_buf == NULL \|\| arc_referenced(db->db_buf)); 2588 2589 /* 2590 * If this buffer is currently syncing out, and we are are 2591 * still referencing it from db_data, we need to make a copy 2592 * of it in case we decide we want to dirty it again in this txg. 2593 / 2594* if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 2595 dn->dn_object != DMU_META_DNODE_OBJECT && 2596 db->db_state == DB_CACHED && db->db_data_pending) { 2597 dbuf_dirty_record_t dr = db->db_data_pending; 2598* 2599 if (dr->dt.dl.dr_data == db->db_buf) { 2600 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2601 2602 dbuf_set_data(db, 2603 arc_alloc_buf(dn->dn_objset->os_spa, 2604 db->db.db_size, db, type)); 2605 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 2606 db->db.db_size); 2607 } 2608 } 2609 2610 if (multilist_link_active(&db->db_cache_link)) { 2611 ASSERT(refcount_is_zero(&db->db_holds)); 2612 multilist_remove(&dbuf_cache, db); 2613 (void) refcount_remove_many(&dbuf_cache_size, 2614 db->db.db_size, db); 2615 } 2616 (void) refcount_add(&db->db_holds, tag); 2617 DBUF_VERIFY(db); 2618 mutex_exit(&db->db_mtx); 2619 2620 /* NOTE: we can't rele the parent until after we drop the db_mtx / 2621* if (parent) 2622 dbuf_rele(parent, NULL); 2623 2624 ASSERT3P(DB_DNODE(db), ==, dn); 2625 ASSERT3U(db->db_blkid, ==, blkid); 2626 ASSERT3U(db->db_level, ==, level); 2627 dbp = db; 2628* 2629 return (0); 2630} 2631 2632dmu_buf_impl_t * 2633dbuf_hold(dnode_t dn, uint64_t blkid, void tag) 2634{ 2635 return (dbuf_hold_level(dn, 0, blkid, tag)); 2636} 2637 2638dmu_buf_impl_t * 2639dbuf_hold_level(dnode_t dn, int level, uint64_t blkid, void tag) 2640{ 2641 dmu_buf_impl_t db; 2642* int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); 2643 return (err ? 
NULL : db); 2644} 2645 2646void 2647dbuf_create_bonus(dnode_t dn) 2648{ 2649* ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2650 2651 ASSERT(dn->dn_bonus == NULL); 2652 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2653} 2654 2655int 2656dbuf_spill_set_blksz(dmu_buf_t db_fake, uint64_t blksz, dmu_tx_t tx) 2657{ 2658 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2659 dnode_t dn; 2660* 2661 if (db->db_blkid != DMU_SPILL_BLKID) 2662 return (SET_ERROR(ENOTSUP)); 2663 if (blksz == 0) 2664 blksz = SPA_MINBLOCKSIZE; 2665 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 2666 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2667 2668 DB_DNODE_ENTER(db); 2669 dn = DB_DNODE(db); 2670 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2671 dbuf_new_size(db, blksz, tx); 2672 rw_exit(&dn->dn_struct_rwlock); 2673 DB_DNODE_EXIT(db); 2674 2675 return (0); 2676} 2677 2678void 2679dbuf_rm_spill(dnode_t dn, dmu_tx_t tx) 2680{ 2681 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2682} 2683 2684#pragma weak dmu_buf_add_ref = dbuf_add_ref 2685void 2686dbuf_add_ref(dmu_buf_impl_t db, void tag) 2687{ 2688 int64_t holds = refcount_add(&db->db_holds, tag); 2689 ASSERT3S(holds, >, 1); 2690} 2691 2692#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref 2693boolean_t 2694dbuf_try_add_ref(dmu_buf_t db_fake, objset_t os, uint64_t obj, uint64_t blkid, 2695 void tag) 2696{ 2697* dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2698 dmu_buf_impl_t found_db; 2699* boolean_t result = B_FALSE; 2700 2701 if (db->db_blkid == DMU_BONUS_BLKID) 2702 found_db = dbuf_find_bonus(os, obj); 2703 else 2704 found_db = dbuf_find(os, obj, 0, blkid); 2705 2706 if (found_db != NULL) { 2707 if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { 2708 (void) refcount_add(&db->db_holds, tag); 2709 result = B_TRUE; 2710 } 2711 mutex_exit(&db->db_mtx); 2712 } 2713 return (result); 2714} 2715 2716/* 2717 * If you call dbuf_rele() you had better not be referencing the dnode 
handle 2718 * unless you have some other direct or indirect hold on the dnode. (An indirect 2719 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2720 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2721 * dnode's parent dbuf evicting its dnode handles. 2722 / 2723void 2724dbuf_rele(dmu_buf_impl_t db, void tag) 2725{ 2726* mutex_enter(&db->db_mtx); 2727 dbuf_rele_and_unlock(db, tag); 2728} 2729 2730void 2731dmu_buf_rele(dmu_buf_t db, void tag) 2732{ 2733 dbuf_rele((dmu_buf_impl_t )db, tag); 2734} 2735* 2736/* 2737 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2738 * db_dirtycnt and db_holds to be updated atomically. 2739 / 2740void 2741dbuf_rele_and_unlock(dmu_buf_impl_t db, void tag) 2742{ 2743* int64_t holds; 2744 2745 ASSERT(MUTEX_HELD(&db->db_mtx)); 2746 DBUF_VERIFY(db); 2747 2748 /* 2749 * Remove the reference to the dbuf before removing its hold on the 2750 * dnode so we can guarantee in dnode_move() that a referenced bonus 2751 * buffer has a corresponding dnode hold. 2752 / 2753* holds = refcount_remove(&db->db_holds, tag); 2754 ASSERT(holds >= 0); 2755 2756 /* 2757 * We can't freeze indirects if there is a possibility that they 2758 * may be modified in the current syncing context. 2759 / 2760* if (db->db_buf != NULL && 2761 holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { 2762 arc_buf_freeze(db->db_buf); 2763 } 2764 2765 if (holds == db->db_dirtycnt && 2766 db->db_level == 0 && db->db_user_immediate_evict) 2767 dbuf_evict_user(db); 2768 2769 if (holds == 0) { 2770 if (db->db_blkid == DMU_BONUS_BLKID) { 2771 dnode_t dn; 2772* boolean_t evict_dbuf = db->db_pending_evict; 2773 2774 /* 2775 * If the dnode moves here, we cannot cross this 2776 * barrier until the move completes. 
2777 / 2778* DB_DNODE_ENTER(db); 2779 2780 dn = DB_DNODE(db); 2781 atomic_dec_32(&dn->dn_dbufs_count); 2782 2783 /* 2784 * Decrementing the dbuf count means that the bonus 2785 * buffer's dnode hold is no longer discounted in 2786 * dnode_move(). The dnode cannot move until after 2787 * the dnode_rele() below. 2788 / 2789* DB_DNODE_EXIT(db); 2790 2791 /* 2792 * Do not reference db after its lock is dropped. 2793 * Another thread may evict it. 2794 / 2795* mutex_exit(&db->db_mtx); 2796 2797 if (evict_dbuf) 2798 dnode_evict_bonus(dn); 2799 2800 dnode_rele(dn, db); 2801 } else if (db->db_buf == NULL) { 2802 /* 2803 * This is a special case: we never associated this 2804 * dbuf with any data allocated from the ARC. 2805 / 2806* ASSERT(db->db_state == DB_UNCACHED \|\| 2807 db->db_state == DB_NOFILL); 2808 dbuf_destroy(db); 2809 } else if (arc_released(db->db_buf)) { 2810 /* 2811 * This dbuf has anonymous data associated with it. 2812 / 2813* dbuf_destroy(db); 2814 } else { 2815 boolean_t do_arc_evict = B_FALSE; 2816 blkptr_t bp; 2817 spa_t spa = dmu_objset_spa(db->db_objset); 2818* 2819 if (!DBUF_IS_CACHEABLE(db) && 2820 db->db_blkptr != NULL && 2821 !BP_IS_HOLE(db->db_blkptr) && 2822 !BP_IS_EMBEDDED(db->db_blkptr)) { 2823 do_arc_evict = B_TRUE; 2824 bp = db->db_blkptr; 2825* } 2826 2827 if (!DBUF_IS_CACHEABLE(db) \|\| 2828 db->db_pending_evict) { 2829 dbuf_destroy(db); 2830 } else if (!multilist_link_active(&db->db_cache_link)) { 2831 multilist_insert(&dbuf_cache, db); 2832 (void) refcount_add_many(&dbuf_cache_size, 2833 db->db.db_size, db); 2834 mutex_exit(&db->db_mtx); 2835 2836 dbuf_evict_notify(); 2837 } 2838 2839 if (do_arc_evict) 2840 arc_freed(spa, &bp); 2841 } 2842 } else { 2843 mutex_exit(&db->db_mtx); 2844 } 2845 2846} 2847 2848#pragma weak dmu_buf_refcount = dbuf_refcount 2849uint64_t 2850dbuf_refcount(dmu_buf_impl_t db) 2851{ 2852* return (refcount_count(&db->db_holds)); 2853} 2854 2855void * 2856dmu_buf_replace_user(dmu_buf_t db_fake, dmu_buf_user_t 
old_user, 2857 dmu_buf_user_t new_user) 2858{ 2859* dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2860 2861 mutex_enter(&db->db_mtx); 2862 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2863 if (db->db_user == old_user) 2864 db->db_user = new_user; 2865 else 2866 old_user = db->db_user; 2867 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2868 mutex_exit(&db->db_mtx); 2869 2870 return (old_user); 2871} 2872 2873void * 2874dmu_buf_set_user(dmu_buf_t db_fake, dmu_buf_user_t user) 2875{ 2876 return (dmu_buf_replace_user(db_fake, NULL, user)); 2877} 2878 2879void * 2880dmu_buf_set_user_ie(dmu_buf_t db_fake, dmu_buf_user_t user) 2881{ 2882 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2883 2884 db->db_user_immediate_evict = TRUE; 2885 return (dmu_buf_set_user(db_fake, user)); 2886} 2887 2888void * 2889dmu_buf_remove_user(dmu_buf_t db_fake, dmu_buf_user_t user) 2890{ 2891 return (dmu_buf_replace_user(db_fake, user, NULL)); 2892} 2893 2894void * 2895dmu_buf_get_user(dmu_buf_t db_fake) 2896{ 2897* dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2898 2899 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2900 return (db->db_user); 2901} 2902 2903void 2904dmu_buf_user_evict_wait() 2905{ 2906 taskq_wait(dbu_evict_taskq); 2907} 2908 2909boolean_t 2910dmu_buf_freeable(dmu_buf_t dbuf) 2911{ 2912* boolean_t res = B_FALSE; 2913 dmu_buf_impl_t db = (dmu_buf_impl_t )dbuf; 2914 2915 if (db->db_blkptr) 2916 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2917 db->db_blkptr, db->db_blkptr->blk_birth); 2918 2919 return (res); 2920} 2921 2922blkptr_t * 2923dmu_buf_get_blkptr(dmu_buf_t db) 2924{ 2925* dmu_buf_impl_t dbi = (dmu_buf_impl_t )db; 2926 return (dbi->db_blkptr); 2927} 2928 2929objset_t * 2930dmu_buf_get_objset(dmu_buf_t db) 2931{ 2932* dmu_buf_impl_t dbi = (dmu_buf_impl_t )db; 2933 return (dbi->db_objset); 2934} 2935 2936dnode_t * 2937dmu_buf_dnode_enter(dmu_buf_t db) 2938{ 2939* dmu_buf_impl_t dbi = (dmu_buf_impl_t )db; 2940 DB_DNODE_ENTER(dbi); 2941 return (DB_DNODE(dbi)); 2942} 2943 
2944void 2945dmu_buf_dnode_exit(dmu_buf_t db) 2946{ 2947* dmu_buf_impl_t dbi = (dmu_buf_impl_t )db; 2948 DB_DNODE_EXIT(dbi); 2949} 2950 2951static void 2952dbuf_check_blkptr(dnode_t dn, dmu_buf_impl_t db) 2953{ 2954 /* ASSERT(dmu_tx_is_syncing(tx) / 2955* ASSERT(MUTEX_HELD(&db->db_mtx)); 2956 2957 if (db->db_blkptr != NULL) 2958 return; 2959 2960 if (db->db_blkid == DMU_SPILL_BLKID) { 2961 db->db_blkptr = &dn->dn_phys->dn_spill; 2962 BP_ZERO(db->db_blkptr); 2963 return; 2964 } 2965 if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2966 /* 2967 * This buffer was allocated at a time when there was 2968 * no available blkptrs from the dnode, or it was 2969 * inappropriate to hook it in (i.e., nlevels mis-match). 2970 / 2971* ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2972 ASSERT(db->db_parent == NULL); 2973 db->db_parent = dn->dn_dbuf; 2974 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2975 DBUF_VERIFY(db); 2976 } else { 2977 dmu_buf_impl_t parent = db->db_parent; 2978* int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2979 2980 ASSERT(dn->dn_phys->dn_nlevels > 1); 2981 if (parent == NULL) { 2982 mutex_exit(&db->db_mtx); 2983 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2984 parent = dbuf_hold_level(dn, db->db_level + 1, 2985 db->db_blkid >> epbs, db); 2986 rw_exit(&dn->dn_struct_rwlock); 2987 mutex_enter(&db->db_mtx); 2988 db->db_parent = parent; 2989 } 2990 db->db_blkptr = (blkptr_t )parent->db.db_data + 2991* (db->db_blkid & ((1ULL << epbs) - 1)); 2992 DBUF_VERIFY(db); 2993 } 2994} 2995 2996static void 2997dbuf_sync_indirect(dbuf_dirty_record_t dr, dmu_tx_t tx) 2998{ 2999 dmu_buf_impl_t db = dr->dr_dbuf; 3000* dnode_t dn; 3001* zio_t zio; 3002* 3003 ASSERT(dmu_tx_is_syncing(tx)); 3004 3005 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 3006 3007 mutex_enter(&db->db_mtx); 3008 3009 ASSERT(db->db_level > 0); 3010 DBUF_VERIFY(db); 3011 3012 /* Read the block if it hasn't been read yet. 
/ 3013* if (db->db_buf == NULL) { 3014 mutex_exit(&db->db_mtx); 3015 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 3016 mutex_enter(&db->db_mtx); 3017 } 3018 ASSERT3U(db->db_state, ==, DB_CACHED); 3019 ASSERT(db->db_buf != NULL); 3020 3021 DB_DNODE_ENTER(db); 3022 dn = DB_DNODE(db); 3023 /* Indirect block size must match what the dnode thinks it is. / 3024* ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 3025 dbuf_check_blkptr(dn, db); 3026 DB_DNODE_EXIT(db); 3027 3028 /* Provide the pending dirty record to child dbufs / 3029* db->db_data_pending = dr; 3030 3031 mutex_exit(&db->db_mtx); 3032 dbuf_write(dr, db->db_buf, tx); 3033 3034 zio = dr->dr_zio; 3035 mutex_enter(&dr->dt.di.dr_mtx); 3036 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); 3037 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3038 mutex_exit(&dr->dt.di.dr_mtx); 3039 zio_nowait(zio); 3040} 3041 3042static void 3043dbuf_sync_leaf(dbuf_dirty_record_t dr, dmu_tx_t tx) 3044{ 3045 arc_buf_t *datap = &dr->dt.dl.dr_data; 3046* dmu_buf_impl_t db = dr->dr_dbuf; 3047* dnode_t dn; 3048* objset_t os; 3049* uint64_t txg = tx->tx_txg; 3050 3051 ASSERT(dmu_tx_is_syncing(tx)); 3052 3053 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 3054 3055 mutex_enter(&db->db_mtx); 3056 /* 3057 * To be synced, we must be dirtied. But we 3058 * might have been freed after the dirty. 
3059 / 3060* if (db->db_state == DB_UNCACHED) { 3061 /* This buffer has been freed since it was dirtied / 3062* ASSERT(db->db.db_data == NULL); 3063 } else if (db->db_state == DB_FILL) { 3064 /* This buffer was freed and is now being re-filled / 3065* ASSERT(db->db.db_data != dr->dt.dl.dr_data); 3066 } else { 3067 ASSERT(db->db_state == DB_CACHED \|\| db->db_state == DB_NOFILL); 3068 } 3069 DBUF_VERIFY(db); 3070 3071 DB_DNODE_ENTER(db); 3072 dn = DB_DNODE(db); 3073 3074 if (db->db_blkid == DMU_SPILL_BLKID) { 3075 mutex_enter(&dn->dn_mtx); 3076 dn->dn_phys->dn_flags \|= DNODE_FLAG_SPILL_BLKPTR; 3077 mutex_exit(&dn->dn_mtx); 3078 } 3079 3080 /* 3081 * If this is a bonus buffer, simply copy the bonus data into the 3082 * dnode. It will be written out when the dnode is synced (and it 3083 * will be synced, since it must have been dirty for dbuf_sync to 3084 * be called). 3085 / 3086* if (db->db_blkid == DMU_BONUS_BLKID) { 3087 dbuf_dirty_record_t *drp; 3088* 3089 ASSERT(datap != NULL); 3090* ASSERT0(db->db_level); 3091 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 3092 bcopy(datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 3093* DB_DNODE_EXIT(db); 3094 3095 if (datap != db->db.db_data) { 3096* zio_buf_free(datap, DN_MAX_BONUSLEN); 3097* arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 3098 } 3099 db->db_data_pending = NULL; 3100 drp = &db->db_last_dirty; 3101 while (drp != dr) 3102* drp = &(drp)->dr_next; 3103* ASSERT(dr->dr_next == NULL); 3104 ASSERT(dr->dr_dbuf == db); 3105 drp = dr->dr_next; 3106* if (dr->dr_dbuf->db_level != 0) { 3107 list_destroy(&dr->dt.di.dr_children); 3108 mutex_destroy(&dr->dt.di.dr_mtx); 3109 } 3110 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3111 ASSERT(db->db_dirtycnt > 0); 3112 db->db_dirtycnt -= 1; 3113 dbuf_rele_and_unlock(db, (void )(uintptr_t)txg); 3114* return; 3115 } 3116 3117 os = dn->dn_objset; 3118 3119 /* 3120 * This function may have dropped the db_mtx lock allowing a dmu_sync 3121 * operation to sneak in. 
As a result, we need to ensure that we 3122 * don't check the dr_override_state until we have returned from 3123 * dbuf_check_blkptr. 3124 / 3125* dbuf_check_blkptr(dn, db); 3126 3127 /* 3128 * If this buffer is in the middle of an immediate write, 3129 * wait for the synchronous IO to complete. 3130 / 3131* while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 3132 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 3133 cv_wait(&db->db_changed, &db->db_mtx); 3134 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 3135 } 3136 3137 if (db->db_state != DB_NOFILL && 3138 dn->dn_object != DMU_META_DNODE_OBJECT && 3139 refcount_count(&db->db_holds) > 1 && 3140 dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 3141 datap == db->db_buf) { 3142* /* 3143 * If this buffer is currently "in use" (i.e., there 3144 * are active holds and db_data still references it), 3145 * then make a copy before we start the write so that 3146 * any modifications from the open txg will not leak 3147 * into this write. 3148 * 3149 * NOTE: this copy does not need to be made for 3150 * objects only modified in the syncing context (e.g. 3151 * DNONE_DNODE blocks). 3152 / 3153* int blksz = arc_buf_size(datap); 3154* arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 3155 datap = arc_alloc_buf(os->os_spa, blksz, db, type); 3156* bcopy(db->db.db_data, (datap)->b_data, blksz); 3157* } 3158 db->db_data_pending = dr; 3159 3160 mutex_exit(&db->db_mtx); 3161 3162 dbuf_write(dr, datap, tx); 3163* 3164 ASSERT(!list_link_active(&dr->dr_dirty_node)); 3165 if (dn->dn_object == DMU_META_DNODE_OBJECT) { 3166 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 3167 DB_DNODE_EXIT(db); 3168 } else { 3169 /* 3170 * Although zio_nowait() does not "wait for an IO", it does 3171 * initiate the IO. If this is an empty write it seems plausible 3172 * that the IO could actually be completed before the nowait 3173 * returns. We need to DB_DNODE_EXIT() first in case 3174 * zio_nowait() invalidates the dbuf. 
3175 / 3176* DB_DNODE_EXIT(db); 3177 zio_nowait(dr->dr_zio); 3178 } 3179} 3180 3181void 3182dbuf_sync_list(list_t list, int level, dmu_tx_t tx) 3183{ 3184 dbuf_dirty_record_t dr; 3185* 3186 while (dr = list_head(list)) { 3187 if (dr->dr_zio != NULL) { 3188 /* 3189 * If we find an already initialized zio then we 3190 * are processing the meta-dnode, and we have finished. 3191 * The dbufs for all dnodes are put back on the list 3192 * during processing, so that we can zio_wait() 3193 * these IOs after initiating all child IOs. 3194 / 3195* ASSERT3U(dr->dr_dbuf->db.db_object, ==, 3196 DMU_META_DNODE_OBJECT); 3197 break; 3198 } 3199 if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && 3200 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { 3201 VERIFY3U(dr->dr_dbuf->db_level, ==, level); 3202 } 3203 list_remove(list, dr); 3204 if (dr->dr_dbuf->db_level > 0) 3205 dbuf_sync_indirect(dr, tx); 3206 else 3207 dbuf_sync_leaf(dr, tx); 3208 } 3209} 3210 3211/* ARGSUSED / 3212static void 3213dbuf_write_ready(zio_t zio, arc_buf_t buf, void vdb) 3214{ 3215 dmu_buf_impl_t db = vdb; 3216* dnode_t dn; 3217* blkptr_t bp = zio->io_bp; 3218* blkptr_t bp_orig = &zio->io_bp_orig; 3219* spa_t spa = zio->io_spa; 3220* int64_t delta; 3221 uint64_t fill = 0; 3222 int i; 3223 3224 ASSERT3P(db->db_blkptr, !=, NULL); 3225 ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); 3226 3227 DB_DNODE_ENTER(db); 3228 dn = DB_DNODE(db); 3229 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 3230 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 3231 zio->io_prev_space_delta = delta; 3232 3233 if (bp->blk_birth != 0) { 3234 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 3235 BP_GET_TYPE(bp) == dn->dn_type) \|\| 3236 (db->db_blkid == DMU_SPILL_BLKID && 3237 BP_GET_TYPE(bp) == dn->dn_bonustype) \|\| 3238 BP_IS_EMBEDDED(bp)); 3239 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 3240 } 3241 3242 mutex_enter(&db->db_mtx); 3243 3244#ifdef ZFS_DEBUG 3245 if (db->db_blkid == DMU_SPILL_BLKID) { 3246 
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 3247 ASSERT(!(BP_IS_HOLE(bp)) && 3248 db->db_blkptr == &dn->dn_phys->dn_spill); 3249 } 3250#endif 3251 3252 if (db->db_level == 0) { 3253 mutex_enter(&dn->dn_mtx); 3254 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 3255 db->db_blkid != DMU_SPILL_BLKID) 3256 dn->dn_phys->dn_maxblkid = db->db_blkid; 3257 mutex_exit(&dn->dn_mtx); 3258 3259 if (dn->dn_type == DMU_OT_DNODE) { 3260 dnode_phys_t dnp = db->db.db_data; 3261* for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 3262 i--, dnp++) { 3263 if (dnp->dn_type != DMU_OT_NONE) 3264 fill++; 3265 } 3266 } else { 3267 if (BP_IS_HOLE(bp)) { 3268 fill = 0; 3269 } else { 3270 fill = 1; 3271 } 3272 } 3273 } else { 3274 blkptr_t ibp = db->db.db_data; 3275* ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 3276 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 3277 if (BP_IS_HOLE(ibp)) 3278 continue; 3279 fill += BP_GET_FILL(ibp); 3280 } 3281 } 3282 DB_DNODE_EXIT(db); 3283 3284 if (!BP_IS_EMBEDDED(bp)) 3285 bp->blk_fill = fill; 3286 3287 mutex_exit(&db->db_mtx); 3288 3289 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 3290 db->db_blkptr = bp; 3291 rw_exit(&dn->dn_struct_rwlock); 3292} 3293 3294/* ARGSUSED / 3295/ 3296 * This function gets called just prior to running through the compression 3297 * stage of the zio pipeline. If we're an indirect block comprised of only 3298 * holes, then we want this indirect to be compressed away to a hole. In 3299 * order to do that we must zero out any information about the holes that 3300 * this indirect points to prior to before we try to compress it. 
3301 / 3302static void 3303dbuf_write_children_ready(zio_t zio, arc_buf_t buf, void vdb) 3304{ 3305 dmu_buf_impl_t db = vdb; 3306* dnode_t dn; 3307* blkptr_t bp; 3308* uint64_t i; 3309 int epbs; 3310 3311 ASSERT3U(db->db_level, >, 0); 3312 DB_DNODE_ENTER(db); 3313 dn = DB_DNODE(db); 3314 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 3315 3316 /* Determine if all our children are holes / 3317* for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { 3318 if (!BP_IS_HOLE(bp)) 3319 break; 3320 } 3321 3322 /* 3323 * If all the children are holes, then zero them all out so that 3324 * we may get compressed away. 3325 / 3326* if (i == 1 << epbs) { 3327 /* didn't find any non-holes / 3328* bzero(db->db.db_data, db->db.db_size); 3329 } 3330 DB_DNODE_EXIT(db); 3331} 3332 3333/* 3334 * The SPA will call this callback several times for each zio - once 3335 * for every physical child i/o (zio->io_phys_children times). This 3336 * allows the DMU to monitor the progress of each logical i/o. For example, 3337 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 3338 * block. There may be a long delay before all copies/fragments are completed, 3339 * so this callback allows us to retire dirty space gradually, as the physical 3340 * i/os complete. 3341 / 3342/ ARGSUSED / 3343static void 3344dbuf_write_physdone(zio_t zio, arc_buf_t buf, void arg) 3345{ 3346 dmu_buf_impl_t db = arg; 3347* objset_t os = db->db_objset; 3348* dsl_pool_t dp = dmu_objset_pool(os); 3349* dbuf_dirty_record_t dr; 3350* int delta = 0; 3351 3352 dr = db->db_data_pending; 3353 ASSERT3U(dr->dr_txg, ==, zio->io_txg); 3354 3355 /* 3356 * The callback will be called io_phys_children times. Retire one 3357 * portion of our dirty space each time we are called. Any rounding 3358 * error will be cleaned up by dsl_pool_sync()'s call to 3359 * dsl_pool_undirty_space(). 
3360 / 3361* delta = dr->dr_accounted / zio->io_phys_children; 3362 dsl_pool_undirty_space(dp, delta, zio->io_txg); 3363} 3364 3365/* ARGSUSED / 3366static void 3367dbuf_write_done(zio_t zio, arc_buf_t buf, void vdb) 3368{ 3369 dmu_buf_impl_t db = vdb; 3370* blkptr_t bp_orig = &zio->io_bp_orig; 3371* blkptr_t bp = db->db_blkptr; 3372* objset_t os = db->db_objset; 3373* dmu_tx_t tx = os->os_synctx; 3374* dbuf_dirty_record_t *drp, dr; 3375 3376 ASSERT0(zio->io_error); 3377 ASSERT(db->db_blkptr == bp); 3378 3379 /* 3380 * For nopwrites and rewrites we ensure that the bp matches our 3381 * original and bypass all the accounting. 3382 / 3383* if (zio->io_flags & (ZIO_FLAG_IO_REWRITE \| ZIO_FLAG_NOPWRITE)) { 3384 ASSERT(BP_EQUAL(bp, bp_orig)); 3385 } else { 3386 dsl_dataset_t ds = os->os_dsl_dataset; 3387* (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 3388 dsl_dataset_block_born(ds, bp, tx); 3389 } 3390 3391 mutex_enter(&db->db_mtx); 3392 3393 DBUF_VERIFY(db); 3394 3395 drp = &db->db_last_dirty; 3396 while ((dr = drp) != db->db_data_pending) 3397* drp = &dr->dr_next; 3398 ASSERT(!list_link_active(&dr->dr_dirty_node)); 3399 ASSERT(dr->dr_dbuf == db); 3400 ASSERT(dr->dr_next == NULL); 3401 drp = dr->dr_next; 3402* 3403#ifdef ZFS_DEBUG 3404 if (db->db_blkid == DMU_SPILL_BLKID) { 3405 dnode_t dn; 3406* 3407 DB_DNODE_ENTER(db); 3408 dn = DB_DNODE(db); 3409 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 3410 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 3411 db->db_blkptr == &dn->dn_phys->dn_spill); 3412 DB_DNODE_EXIT(db); 3413 } 3414#endif 3415 3416 if (db->db_level == 0) { 3417 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 3418 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 3419 if (db->db_state != DB_NOFILL) { 3420 if (dr->dt.dl.dr_data != db->db_buf) 3421 arc_buf_destroy(dr->dt.dl.dr_data, db); 3422 } 3423 } else { 3424 dnode_t dn; 3425* 3426 DB_DNODE_ENTER(db); 3427 dn = DB_DNODE(db); 3428 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3429 
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 3430 if (!BP_IS_HOLE(db->db_blkptr)) { 3431 int epbs = 3432 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 3433 ASSERT3U(db->db_blkid, <=, 3434 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 3435 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 3436 db->db.db_size); 3437 } 3438 DB_DNODE_EXIT(db); 3439 mutex_destroy(&dr->dt.di.dr_mtx); 3440 list_destroy(&dr->dt.di.dr_children); 3441 } 3442 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3443 3444 cv_broadcast(&db->db_changed); 3445 ASSERT(db->db_dirtycnt > 0); 3446 db->db_dirtycnt -= 1; 3447 db->db_data_pending = NULL; 3448 dbuf_rele_and_unlock(db, (void )(uintptr_t)tx->tx_txg); 3449} 3450* 3451static void 3452dbuf_write_nofill_ready(zio_t zio) 3453{ 3454* dbuf_write_ready(zio, NULL, zio->io_private); 3455} 3456 3457static void 3458dbuf_write_nofill_done(zio_t zio) 3459{ 3460* dbuf_write_done(zio, NULL, zio->io_private); 3461} 3462 3463static void 3464dbuf_write_override_ready(zio_t zio) 3465{ 3466* dbuf_dirty_record_t dr = zio->io_private; 3467* dmu_buf_impl_t db = dr->dr_dbuf; 3468* 3469 dbuf_write_ready(zio, NULL, db); 3470} 3471 3472static void 3473dbuf_write_override_done(zio_t zio) 3474{ 3475* dbuf_dirty_record_t dr = zio->io_private; 3476* dmu_buf_impl_t db = dr->dr_dbuf; 3477* blkptr_t obp = &dr->dt.dl.dr_overridden_by; 3478* 3479 mutex_enter(&db->db_mtx); 3480 if (!BP_EQUAL(zio->io_bp, obp)) { 3481 if (!BP_IS_HOLE(obp)) 3482 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 3483 arc_release(dr->dt.dl.dr_data, db); 3484 } 3485 mutex_exit(&db->db_mtx); 3486 3487 dbuf_write_done(zio, NULL, db); 3488} 3489 3490/* Issue I/O to commit a dirty buffer to disk. 
/ 3491static void 3492dbuf_write(dbuf_dirty_record_t dr, arc_buf_t data, dmu_tx_t tx) 3493{ 3494 dmu_buf_impl_t db = dr->dr_dbuf; 3495* dnode_t dn; 3496* objset_t os; 3497* dmu_buf_impl_t parent = db->db_parent; 3498* uint64_t txg = tx->tx_txg; 3499 zbookmark_phys_t zb; 3500 zio_prop_t zp; 3501 zio_t zio; 3502* int wp_flag = 0; 3503 3504 ASSERT(dmu_tx_is_syncing(tx)); 3505 3506 DB_DNODE_ENTER(db); 3507 dn = DB_DNODE(db); 3508 os = dn->dn_objset; 3509 3510 if (db->db_state != DB_NOFILL) { 3511 if (db->db_level > 0 \|\| dn->dn_type == DMU_OT_DNODE) { 3512 /* 3513 * Private object buffers are released here rather 3514 * than in dbuf_dirty() since they are only modified 3515 * in the syncing context and we don't want the 3516 * overhead of making multiple copies of the data. 3517 / 3518* if (BP_IS_HOLE(db->db_blkptr)) { 3519 arc_buf_thaw(data); 3520 } else { 3521 dbuf_release_bp(db); 3522 } 3523 } 3524 } 3525 3526 if (parent != dn->dn_dbuf) { 3527 /* Our parent is an indirect block. / 3528* /* We have a dirty parent that has been scheduled for write. / 3529* ASSERT(parent && parent->db_data_pending); 3530 /* Our parent's buffer is one level closer to the dnode. / 3531* ASSERT(db->db_level == parent->db_level-1); 3532 /* 3533 * We're about to modify our parent's db_data by modifying 3534 * our block pointer, so the parent must be released. 3535 / 3536* ASSERT(arc_released(parent->db_buf)); 3537 zio = parent->db_data_pending->dr_zio; 3538 } else { 3539 /* Our parent is the dnode itself. / 3540* ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 3541 db->db_blkid != DMU_SPILL_BLKID) \|\| 3542 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 3543 if (db->db_blkid != DMU_SPILL_BLKID) 3544 ASSERT3P(db->db_blkptr, ==, 3545 &dn->dn_phys->dn_blkptr[db->db_blkid]); 3546 zio = dn->dn_zio; 3547 } 3548 3549 ASSERT(db->db_level == 0 \|\| data == db->db_buf); 3550 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 3551 ASSERT(zio); 3552 3553 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 
3554 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 3555 db->db.db_object, db->db_level, db->db_blkid); 3556 3557 if (db->db_blkid == DMU_SPILL_BLKID) 3558 wp_flag = WP_SPILL; 3559 wp_flag \|= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 3560 3561 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 3562 DB_DNODE_EXIT(db); 3563 3564 /* 3565 * We copy the blkptr now (rather than when we instantiate the dirty 3566 * record), because its value can change between open context and 3567 * syncing context. We do not need to hold dn_struct_rwlock to read 3568 * db_blkptr because we are in syncing context. 3569 / 3570* dr->dr_bp_copy = db->db_blkptr; 3571* 3572 if (db->db_level == 0 && 3573 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 3574 /* 3575 * The BP for this block has been provided by open context 3576 * (by dmu_sync() or dmu_buf_write_embedded()). 3577 / 3578* void contents = (data != NULL) ? data->b_data : NULL; 3579* 3580 dr->dr_zio = zio_write(zio, os->os_spa, txg, 3581 &dr->dr_bp_copy, contents, db->db.db_size, &zp, 3582 dbuf_write_override_ready, NULL, NULL, 3583 dbuf_write_override_done, 3584 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3585 mutex_enter(&db->db_mtx); 3586 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 3587 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 3588 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 3589 mutex_exit(&db->db_mtx); 3590 } else if (db->db_state == DB_NOFILL) { 3591 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF \|\| 3592 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 3593 dr->dr_zio = zio_write(zio, os->os_spa, txg, 3594 &dr->dr_bp_copy, NULL, db->db.db_size, &zp, 3595 dbuf_write_nofill_ready, NULL, NULL, 3596 dbuf_write_nofill_done, db, 3597 ZIO_PRIORITY_ASYNC_WRITE, 3598 ZIO_FLAG_MUSTSUCCEED \| ZIO_FLAG_NODATA, &zb); 3599 } else { 3600 ASSERT(arc_released(data)); 3601 3602 /* 3603 * For indirect blocks, we want to setup the children 3604 * ready callback so that we can properly handle an indirect 3605 * 
block that only contains holes. 3606 / 3607* arc_done_func_t children_ready_cb = NULL; 3608* if (db->db_level != 0) 3609 children_ready_cb = dbuf_write_children_ready; 3610 3611 dr->dr_zio = arc_write(zio, os->os_spa, txg, 3612 &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), 3613 &zp, dbuf_write_ready, children_ready_cb, 3614 dbuf_write_physdone, dbuf_write_done, db, 3615 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3616 } 3617}	2257 db->db_state = DB_UNCACHED; 2258 mutex_exit(&dn->dn_dbufs_mtx); 2259 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2260 2261 if (parent && parent != dn->dn_dbuf) 2262 dbuf_add_ref(parent, db); 2263 2264 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT \|\| 2265 refcount_count(&dn->dn_holds) > 0); 2266 (void) refcount_add(&dn->dn_holds, db); 2267 atomic_inc_32(&dn->dn_dbufs_count); 2268 2269 dprintf_dbuf(db, "db=%p\n", db); 2270 2271 return (db); 2272} 2273 2274typedef struct dbuf_prefetch_arg { 2275 spa_t dpa_spa; / The spa to issue the prefetch in. / 2276* zbookmark_phys_t dpa_zb; /* The target block to prefetch. / 2277* int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. / 2278* int dpa_curlevel; /* The current level that we're reading / 2279* dnode_t dpa_dnode; / The dnode associated with the prefetch / 2280* zio_priority_t dpa_prio; /* The priority I/Os should be issued at. / 2281* zio_t dpa_zio; / The parent zio_t for all prefetches. / 2282* arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. / 2283} dbuf_prefetch_arg_t; 2284* 2285/* 2286 * Actually issue the prefetch read for the block given. 
2287 / 2288static void 2289dbuf_issue_final_prefetch(dbuf_prefetch_arg_t dpa, blkptr_t bp) 2290{ 2291* if (BP_IS_HOLE(bp) \|\| BP_IS_EMBEDDED(bp)) 2292 return; 2293 2294 arc_flags_t aflags = 2295 dpa->dpa_aflags \| ARC_FLAG_NOWAIT \| ARC_FLAG_PREFETCH; 2296 2297 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2298 ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); 2299 ASSERT(dpa->dpa_zio != NULL); 2300 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, 2301 dpa->dpa_prio, ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE, 2302 &aflags, &dpa->dpa_zb); 2303} 2304 2305/* 2306 * Called when an indirect block above our prefetch target is read in. This 2307 * will either read in the next indirect block down the tree or issue the actual 2308 * prefetch if the next block down is our target. 2309 / 2310static void 2311dbuf_prefetch_indirect_done(zio_t zio, arc_buf_t abuf, void private) 2312{ 2313 dbuf_prefetch_arg_t dpa = private; 2314* 2315 ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); 2316 ASSERT3S(dpa->dpa_curlevel, >, 0); 2317 2318 /* 2319 * The dpa_dnode is only valid if we are called with a NULL 2320 * zio. This indicates that the arc_read() returned without 2321 * first calling zio_read() to issue a physical read. Once 2322 * a physical read is made the dpa_dnode must be invalidated 2323 * as the locks guarding it may have been dropped. If the 2324 * dpa_dnode is still valid, then we want to add it to the dbuf 2325 * cache. To do so, we must hold the dbuf associated with the block 2326 * we just prefetched, read its contents so that we associate it 2327 * with an arc_buf_t, and then release it. 
2328 / 2329* if (zio != NULL) { 2330 ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); 2331 if (zio->io_flags & ZIO_FLAG_RAW) { 2332 ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); 2333 } else { 2334 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); 2335 } 2336 ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); 2337 2338 dpa->dpa_dnode = NULL; 2339 } else if (dpa->dpa_dnode != NULL) { 2340 uint64_t curblkid = dpa->dpa_zb.zb_blkid >> 2341 (dpa->dpa_epbs * (dpa->dpa_curlevel - 2342 dpa->dpa_zb.zb_level)); 2343 dmu_buf_impl_t db = dbuf_hold_level(dpa->dpa_dnode, 2344* dpa->dpa_curlevel, curblkid, FTAG); 2345 (void) dbuf_read(db, NULL, 2346 DB_RF_MUST_SUCCEED \| DB_RF_NOPREFETCH \| DB_RF_HAVESTRUCT); 2347 dbuf_rele(db, FTAG); 2348 } 2349 2350 dpa->dpa_curlevel--; 2351 2352 uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> 2353 (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); 2354 blkptr_t bp = ((blkptr_t )abuf->b_data) + 2355 P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); 2356 if (BP_IS_HOLE(bp) \|\| (zio != NULL && zio->io_error != 0)) { 2357 kmem_free(dpa, sizeof (dpa)); 2358* } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { 2359 ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); 2360 dbuf_issue_final_prefetch(dpa, bp); 2361 kmem_free(dpa, sizeof (dpa)); 2362* } else { 2363 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2364 zbookmark_phys_t zb; 2365 2366 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2367 2368 SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, 2369 dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); 2370 2371 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2372 bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, 2373 ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE, 2374 &iter_aflags, &zb); 2375 } 2376 2377 arc_buf_destroy(abuf, private); 2378} 2379 2380/* 2381 * Issue prefetch reads for the given block on the given level. If the indirect 2382 * blocks above that block are not in memory, we will read them in 2383 * asynchronously. 
As a result, this call never blocks waiting for a read to 2384 * complete. 2385 / 2386void 2387dbuf_prefetch(dnode_t dn, int64_t level, uint64_t blkid, zio_priority_t prio, 2388 arc_flags_t aflags) 2389{ 2390 blkptr_t bp; 2391 int epbs, nlevels, curlevel; 2392 uint64_t curblkid; 2393 2394 ASSERT(blkid != DMU_BONUS_BLKID); 2395 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2396 2397 if (blkid > dn->dn_maxblkid) 2398 return; 2399 2400 if (dnode_block_freed(dn, blkid)) 2401 return; 2402 2403 /* 2404 * This dnode hasn't been written to disk yet, so there's nothing to 2405 * prefetch. 2406 / 2407* nlevels = dn->dn_phys->dn_nlevels; 2408 if (level >= nlevels \|\| dn->dn_phys->dn_nblkptr == 0) 2409 return; 2410 2411 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2412 if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) 2413 return; 2414 2415 dmu_buf_impl_t db = dbuf_find(dn->dn_objset, dn->dn_object, 2416* level, blkid); 2417 if (db != NULL) { 2418 mutex_exit(&db->db_mtx); 2419 /* 2420 * This dbuf already exists. It is either CACHED, or 2421 * (we assume) about to be read or filled. 2422 / 2423* return; 2424 } 2425 2426 /* 2427 * Find the closest ancestor (indirect block) of the target block 2428 * that is present in the cache. In this indirect block, we will 2429 * find the bp that is at curlevel, curblkid. 2430 / 2431* curlevel = level; 2432 curblkid = blkid; 2433 while (curlevel < nlevels - 1) { 2434 int parent_level = curlevel + 1; 2435 uint64_t parent_blkid = curblkid >> epbs; 2436 dmu_buf_impl_t db; 2437* 2438 if (dbuf_hold_impl(dn, parent_level, parent_blkid, 2439 FALSE, TRUE, FTAG, &db) == 0) { 2440 blkptr_t bpp = db->db_buf->b_data; 2441* bp = bpp[P2PHASE(curblkid, 1 << epbs)]; 2442 dbuf_rele(db, FTAG); 2443 break; 2444 } 2445 2446 curlevel = parent_level; 2447 curblkid = parent_blkid; 2448 } 2449 2450 if (curlevel == nlevels - 1) { 2451 /* No cached indirect blocks found. 
/ 2452* ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); 2453 bp = dn->dn_phys->dn_blkptr[curblkid]; 2454 } 2455 if (BP_IS_HOLE(&bp)) 2456 return; 2457 2458 ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); 2459 2460 zio_t pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, 2461* ZIO_FLAG_CANFAIL); 2462 2463 dbuf_prefetch_arg_t dpa = kmem_zalloc(sizeof (dpa), KM_SLEEP); 2464 dsl_dataset_t ds = dn->dn_objset->os_dsl_dataset; 2465* SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2466 dn->dn_object, level, blkid); 2467 dpa->dpa_curlevel = curlevel; 2468 dpa->dpa_prio = prio; 2469 dpa->dpa_aflags = aflags; 2470 dpa->dpa_spa = dn->dn_objset->os_spa; 2471 dpa->dpa_dnode = dn; 2472 dpa->dpa_epbs = epbs; 2473 dpa->dpa_zio = pio; 2474 2475 /* 2476 * If we have the indirect just above us, no need to do the asynchronous 2477 * prefetch chain; we'll just run the last step ourselves. If we're at 2478 * a higher level, though, we want to issue the prefetches for all the 2479 * indirect blocks asynchronously, so we can go on with whatever we were 2480 * doing. 2481 / 2482* if (curlevel == level) { 2483 ASSERT3U(curblkid, ==, blkid); 2484 dbuf_issue_final_prefetch(dpa, &bp); 2485 kmem_free(dpa, sizeof (dpa)); 2486* } else { 2487 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2488 zbookmark_phys_t zb; 2489 2490 SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2491 dn->dn_object, curlevel, curblkid); 2492 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2493 &bp, dbuf_prefetch_indirect_done, dpa, prio, 2494 ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE, 2495 &iter_aflags, &zb); 2496 } 2497 /* 2498 * We use pio here instead of dpa_zio since it's possible that 2499 * dpa may have already been freed. 2500 / 2501* zio_nowait(pio); 2502} 2503 2504/* 2505 * Returns with db_holds incremented, and db_mtx not held. 2506 * Note: dn_struct_rwlock must be held. 
2507 / 2508int 2509dbuf_hold_impl(dnode_t dn, uint8_t level, uint64_t blkid, 2510 boolean_t fail_sparse, boolean_t fail_uncached, 2511 void tag, dmu_buf_impl_t dbp) 2512{ 2513* dmu_buf_impl_t db, parent = NULL; 2514 2515 ASSERT(blkid != DMU_BONUS_BLKID); 2516 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2517 ASSERT3U(dn->dn_nlevels, >, level); 2518 2519 dbp = NULL; 2520top: 2521* /* dbuf_find() returns with db_mtx held / 2522* db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); 2523 2524 if (db == NULL) { 2525 blkptr_t bp = NULL; 2526* int err; 2527 2528 if (fail_uncached) 2529 return (SET_ERROR(ENOENT)); 2530 2531 ASSERT3P(parent, ==, NULL); 2532 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 2533 if (fail_sparse) { 2534 if (err == 0 && bp && BP_IS_HOLE(bp)) 2535 err = SET_ERROR(ENOENT); 2536 if (err) { 2537 if (parent) 2538 dbuf_rele(parent, NULL); 2539 return (err); 2540 } 2541 } 2542 if (err && err != ENOENT) 2543 return (err); 2544 db = dbuf_create(dn, level, blkid, parent, bp); 2545 } 2546 2547 if (fail_uncached && db->db_state != DB_CACHED) { 2548 mutex_exit(&db->db_mtx); 2549 return (SET_ERROR(ENOENT)); 2550 } 2551 2552 if (db->db_buf != NULL) 2553 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 2554 2555 ASSERT(db->db_buf == NULL \|\| arc_referenced(db->db_buf)); 2556 2557 /* 2558 * If this buffer is currently syncing out, and we are are 2559 * still referencing it from db_data, we need to make a copy 2560 * of it in case we decide we want to dirty it again in this txg. 
2561 / 2562* if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 2563 dn->dn_object != DMU_META_DNODE_OBJECT && 2564 db->db_state == DB_CACHED && db->db_data_pending) { 2565 dbuf_dirty_record_t dr = db->db_data_pending; 2566* 2567 if (dr->dt.dl.dr_data == db->db_buf) { 2568 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2569 2570 dbuf_set_data(db, 2571 arc_alloc_buf(dn->dn_objset->os_spa, 2572 db->db.db_size, db, type)); 2573 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 2574 db->db.db_size); 2575 } 2576 } 2577 2578 if (multilist_link_active(&db->db_cache_link)) { 2579 ASSERT(refcount_is_zero(&db->db_holds)); 2580 multilist_remove(&dbuf_cache, db); 2581 (void) refcount_remove_many(&dbuf_cache_size, 2582 db->db.db_size, db); 2583 } 2584 (void) refcount_add(&db->db_holds, tag); 2585 DBUF_VERIFY(db); 2586 mutex_exit(&db->db_mtx); 2587 2588 /* NOTE: we can't rele the parent until after we drop the db_mtx / 2589* if (parent) 2590 dbuf_rele(parent, NULL); 2591 2592 ASSERT3P(DB_DNODE(db), ==, dn); 2593 ASSERT3U(db->db_blkid, ==, blkid); 2594 ASSERT3U(db->db_level, ==, level); 2595 dbp = db; 2596* 2597 return (0); 2598} 2599 2600dmu_buf_impl_t * 2601dbuf_hold(dnode_t dn, uint64_t blkid, void tag) 2602{ 2603 return (dbuf_hold_level(dn, 0, blkid, tag)); 2604} 2605 2606dmu_buf_impl_t * 2607dbuf_hold_level(dnode_t dn, int level, uint64_t blkid, void tag) 2608{ 2609 dmu_buf_impl_t db; 2610* int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); 2611 return (err ? 
NULL : db); 2612} 2613 2614void 2615dbuf_create_bonus(dnode_t dn) 2616{ 2617* ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2618 2619 ASSERT(dn->dn_bonus == NULL); 2620 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2621} 2622 2623int 2624dbuf_spill_set_blksz(dmu_buf_t db_fake, uint64_t blksz, dmu_tx_t tx) 2625{ 2626 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2627 dnode_t dn; 2628* 2629 if (db->db_blkid != DMU_SPILL_BLKID) 2630 return (SET_ERROR(ENOTSUP)); 2631 if (blksz == 0) 2632 blksz = SPA_MINBLOCKSIZE; 2633 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 2634 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2635 2636 DB_DNODE_ENTER(db); 2637 dn = DB_DNODE(db); 2638 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2639 dbuf_new_size(db, blksz, tx); 2640 rw_exit(&dn->dn_struct_rwlock); 2641 DB_DNODE_EXIT(db); 2642 2643 return (0); 2644} 2645 2646void 2647dbuf_rm_spill(dnode_t dn, dmu_tx_t tx) 2648{ 2649 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2650} 2651 2652#pragma weak dmu_buf_add_ref = dbuf_add_ref 2653void 2654dbuf_add_ref(dmu_buf_impl_t db, void tag) 2655{ 2656 int64_t holds = refcount_add(&db->db_holds, tag); 2657 ASSERT3S(holds, >, 1); 2658} 2659 2660#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref 2661boolean_t 2662dbuf_try_add_ref(dmu_buf_t db_fake, objset_t os, uint64_t obj, uint64_t blkid, 2663 void tag) 2664{ 2665* dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2666 dmu_buf_impl_t found_db; 2667* boolean_t result = B_FALSE; 2668 2669 if (db->db_blkid == DMU_BONUS_BLKID) 2670 found_db = dbuf_find_bonus(os, obj); 2671 else 2672 found_db = dbuf_find(os, obj, 0, blkid); 2673 2674 if (found_db != NULL) { 2675 if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { 2676 (void) refcount_add(&db->db_holds, tag); 2677 result = B_TRUE; 2678 } 2679 mutex_exit(&db->db_mtx); 2680 } 2681 return (result); 2682} 2683 2684/* 2685 * If you call dbuf_rele() you had better not be referencing the dnode 
handle 2686 * unless you have some other direct or indirect hold on the dnode. (An indirect 2687 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2688 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2689 * dnode's parent dbuf evicting its dnode handles. 2690 / 2691void 2692dbuf_rele(dmu_buf_impl_t db, void tag) 2693{ 2694* mutex_enter(&db->db_mtx); 2695 dbuf_rele_and_unlock(db, tag); 2696} 2697 2698void 2699dmu_buf_rele(dmu_buf_t db, void tag) 2700{ 2701 dbuf_rele((dmu_buf_impl_t )db, tag); 2702} 2703* 2704/* 2705 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2706 * db_dirtycnt and db_holds to be updated atomically. 2707 / 2708void 2709dbuf_rele_and_unlock(dmu_buf_impl_t db, void tag) 2710{ 2711* int64_t holds; 2712 2713 ASSERT(MUTEX_HELD(&db->db_mtx)); 2714 DBUF_VERIFY(db); 2715 2716 /* 2717 * Remove the reference to the dbuf before removing its hold on the 2718 * dnode so we can guarantee in dnode_move() that a referenced bonus 2719 * buffer has a corresponding dnode hold. 2720 / 2721* holds = refcount_remove(&db->db_holds, tag); 2722 ASSERT(holds >= 0); 2723 2724 /* 2725 * We can't freeze indirects if there is a possibility that they 2726 * may be modified in the current syncing context. 2727 / 2728* if (db->db_buf != NULL && 2729 holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { 2730 arc_buf_freeze(db->db_buf); 2731 } 2732 2733 if (holds == db->db_dirtycnt && 2734 db->db_level == 0 && db->db_user_immediate_evict) 2735 dbuf_evict_user(db); 2736 2737 if (holds == 0) { 2738 if (db->db_blkid == DMU_BONUS_BLKID) { 2739 dnode_t dn; 2740* boolean_t evict_dbuf = db->db_pending_evict; 2741 2742 /* 2743 * If the dnode moves here, we cannot cross this 2744 * barrier until the move completes. 
2745 / 2746* DB_DNODE_ENTER(db); 2747 2748 dn = DB_DNODE(db); 2749 atomic_dec_32(&dn->dn_dbufs_count); 2750 2751 /* 2752 * Decrementing the dbuf count means that the bonus 2753 * buffer's dnode hold is no longer discounted in 2754 * dnode_move(). The dnode cannot move until after 2755 * the dnode_rele() below. 2756 / 2757* DB_DNODE_EXIT(db); 2758 2759 /* 2760 * Do not reference db after its lock is dropped. 2761 * Another thread may evict it. 2762 / 2763* mutex_exit(&db->db_mtx); 2764 2765 if (evict_dbuf) 2766 dnode_evict_bonus(dn); 2767 2768 dnode_rele(dn, db); 2769 } else if (db->db_buf == NULL) { 2770 /* 2771 * This is a special case: we never associated this 2772 * dbuf with any data allocated from the ARC. 2773 / 2774* ASSERT(db->db_state == DB_UNCACHED \|\| 2775 db->db_state == DB_NOFILL); 2776 dbuf_destroy(db); 2777 } else if (arc_released(db->db_buf)) { 2778 /* 2779 * This dbuf has anonymous data associated with it. 2780 / 2781* dbuf_destroy(db); 2782 } else { 2783 boolean_t do_arc_evict = B_FALSE; 2784 blkptr_t bp; 2785 spa_t spa = dmu_objset_spa(db->db_objset); 2786* 2787 if (!DBUF_IS_CACHEABLE(db) && 2788 db->db_blkptr != NULL && 2789 !BP_IS_HOLE(db->db_blkptr) && 2790 !BP_IS_EMBEDDED(db->db_blkptr)) { 2791 do_arc_evict = B_TRUE; 2792 bp = db->db_blkptr; 2793* } 2794 2795 if (!DBUF_IS_CACHEABLE(db) \|\| 2796 db->db_pending_evict) { 2797 dbuf_destroy(db); 2798 } else if (!multilist_link_active(&db->db_cache_link)) { 2799 multilist_insert(&dbuf_cache, db); 2800 (void) refcount_add_many(&dbuf_cache_size, 2801 db->db.db_size, db); 2802 mutex_exit(&db->db_mtx); 2803 2804 dbuf_evict_notify(); 2805 } 2806 2807 if (do_arc_evict) 2808 arc_freed(spa, &bp); 2809 } 2810 } else { 2811 mutex_exit(&db->db_mtx); 2812 } 2813 2814} 2815 2816#pragma weak dmu_buf_refcount = dbuf_refcount 2817uint64_t 2818dbuf_refcount(dmu_buf_impl_t db) 2819{ 2820* return (refcount_count(&db->db_holds)); 2821} 2822 2823void * 2824dmu_buf_replace_user(dmu_buf_t db_fake, dmu_buf_user_t 
old_user, 2825 dmu_buf_user_t new_user) 2826{ 2827* dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2828 2829 mutex_enter(&db->db_mtx); 2830 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2831 if (db->db_user == old_user) 2832 db->db_user = new_user; 2833 else 2834 old_user = db->db_user; 2835 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2836 mutex_exit(&db->db_mtx); 2837 2838 return (old_user); 2839} 2840 2841void * 2842dmu_buf_set_user(dmu_buf_t db_fake, dmu_buf_user_t user) 2843{ 2844 return (dmu_buf_replace_user(db_fake, NULL, user)); 2845} 2846 2847void * 2848dmu_buf_set_user_ie(dmu_buf_t db_fake, dmu_buf_user_t user) 2849{ 2850 dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2851 2852 db->db_user_immediate_evict = TRUE; 2853 return (dmu_buf_set_user(db_fake, user)); 2854} 2855 2856void * 2857dmu_buf_remove_user(dmu_buf_t db_fake, dmu_buf_user_t user) 2858{ 2859 return (dmu_buf_replace_user(db_fake, user, NULL)); 2860} 2861 2862void * 2863dmu_buf_get_user(dmu_buf_t db_fake) 2864{ 2865* dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake; 2866 2867 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2868 return (db->db_user); 2869} 2870 2871void 2872dmu_buf_user_evict_wait() 2873{ 2874 taskq_wait(dbu_evict_taskq); 2875} 2876 2877boolean_t 2878dmu_buf_freeable(dmu_buf_t dbuf) 2879{ 2880* boolean_t res = B_FALSE; 2881 dmu_buf_impl_t db = (dmu_buf_impl_t )dbuf; 2882 2883 if (db->db_blkptr) 2884 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2885 db->db_blkptr, db->db_blkptr->blk_birth); 2886 2887 return (res); 2888} 2889 2890blkptr_t * 2891dmu_buf_get_blkptr(dmu_buf_t db) 2892{ 2893* dmu_buf_impl_t dbi = (dmu_buf_impl_t )db; 2894 return (dbi->db_blkptr); 2895} 2896 2897objset_t * 2898dmu_buf_get_objset(dmu_buf_t db) 2899{ 2900* dmu_buf_impl_t dbi = (dmu_buf_impl_t )db; 2901 return (dbi->db_objset); 2902} 2903 2904dnode_t * 2905dmu_buf_dnode_enter(dmu_buf_t db) 2906{ 2907* dmu_buf_impl_t dbi = (dmu_buf_impl_t )db; 2908 DB_DNODE_ENTER(dbi); 2909 return (DB_DNODE(dbi)); 2910} 2911 
2912void 2913dmu_buf_dnode_exit(dmu_buf_t db) 2914{ 2915* dmu_buf_impl_t dbi = (dmu_buf_impl_t )db; 2916 DB_DNODE_EXIT(dbi); 2917} 2918 2919static void 2920dbuf_check_blkptr(dnode_t dn, dmu_buf_impl_t db) 2921{ 2922 /* ASSERT(dmu_tx_is_syncing(tx) / 2923* ASSERT(MUTEX_HELD(&db->db_mtx)); 2924 2925 if (db->db_blkptr != NULL) 2926 return; 2927 2928 if (db->db_blkid == DMU_SPILL_BLKID) { 2929 db->db_blkptr = &dn->dn_phys->dn_spill; 2930 BP_ZERO(db->db_blkptr); 2931 return; 2932 } 2933 if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2934 /* 2935 * This buffer was allocated at a time when there was 2936 * no available blkptrs from the dnode, or it was 2937 * inappropriate to hook it in (i.e., nlevels mis-match). 2938 / 2939* ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2940 ASSERT(db->db_parent == NULL); 2941 db->db_parent = dn->dn_dbuf; 2942 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2943 DBUF_VERIFY(db); 2944 } else { 2945 dmu_buf_impl_t parent = db->db_parent; 2946* int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2947 2948 ASSERT(dn->dn_phys->dn_nlevels > 1); 2949 if (parent == NULL) { 2950 mutex_exit(&db->db_mtx); 2951 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2952 parent = dbuf_hold_level(dn, db->db_level + 1, 2953 db->db_blkid >> epbs, db); 2954 rw_exit(&dn->dn_struct_rwlock); 2955 mutex_enter(&db->db_mtx); 2956 db->db_parent = parent; 2957 } 2958 db->db_blkptr = (blkptr_t )parent->db.db_data + 2959* (db->db_blkid & ((1ULL << epbs) - 1)); 2960 DBUF_VERIFY(db); 2961 } 2962} 2963 2964static void 2965dbuf_sync_indirect(dbuf_dirty_record_t dr, dmu_tx_t tx) 2966{ 2967 dmu_buf_impl_t db = dr->dr_dbuf; 2968* dnode_t dn; 2969* zio_t zio; 2970* 2971 ASSERT(dmu_tx_is_syncing(tx)); 2972 2973 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2974 2975 mutex_enter(&db->db_mtx); 2976 2977 ASSERT(db->db_level > 0); 2978 DBUF_VERIFY(db); 2979 2980 /* Read the block if it hasn't been read yet. 
/ 2981* if (db->db_buf == NULL) { 2982 mutex_exit(&db->db_mtx); 2983 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2984 mutex_enter(&db->db_mtx); 2985 } 2986 ASSERT3U(db->db_state, ==, DB_CACHED); 2987 ASSERT(db->db_buf != NULL); 2988 2989 DB_DNODE_ENTER(db); 2990 dn = DB_DNODE(db); 2991 /* Indirect block size must match what the dnode thinks it is. / 2992* ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2993 dbuf_check_blkptr(dn, db); 2994 DB_DNODE_EXIT(db); 2995 2996 /* Provide the pending dirty record to child dbufs / 2997* db->db_data_pending = dr; 2998 2999 mutex_exit(&db->db_mtx); 3000 dbuf_write(dr, db->db_buf, tx); 3001 3002 zio = dr->dr_zio; 3003 mutex_enter(&dr->dt.di.dr_mtx); 3004 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); 3005 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3006 mutex_exit(&dr->dt.di.dr_mtx); 3007 zio_nowait(zio); 3008} 3009 3010static void 3011dbuf_sync_leaf(dbuf_dirty_record_t dr, dmu_tx_t tx) 3012{ 3013 arc_buf_t *datap = &dr->dt.dl.dr_data; 3014* dmu_buf_impl_t db = dr->dr_dbuf; 3015* dnode_t dn; 3016* objset_t os; 3017* uint64_t txg = tx->tx_txg; 3018 3019 ASSERT(dmu_tx_is_syncing(tx)); 3020 3021 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 3022 3023 mutex_enter(&db->db_mtx); 3024 /* 3025 * To be synced, we must be dirtied. But we 3026 * might have been freed after the dirty. 
3027 / 3028* if (db->db_state == DB_UNCACHED) { 3029 /* This buffer has been freed since it was dirtied / 3030* ASSERT(db->db.db_data == NULL); 3031 } else if (db->db_state == DB_FILL) { 3032 /* This buffer was freed and is now being re-filled / 3033* ASSERT(db->db.db_data != dr->dt.dl.dr_data); 3034 } else { 3035 ASSERT(db->db_state == DB_CACHED \|\| db->db_state == DB_NOFILL); 3036 } 3037 DBUF_VERIFY(db); 3038 3039 DB_DNODE_ENTER(db); 3040 dn = DB_DNODE(db); 3041 3042 if (db->db_blkid == DMU_SPILL_BLKID) { 3043 mutex_enter(&dn->dn_mtx); 3044 dn->dn_phys->dn_flags \|= DNODE_FLAG_SPILL_BLKPTR; 3045 mutex_exit(&dn->dn_mtx); 3046 } 3047 3048 /* 3049 * If this is a bonus buffer, simply copy the bonus data into the 3050 * dnode. It will be written out when the dnode is synced (and it 3051 * will be synced, since it must have been dirty for dbuf_sync to 3052 * be called). 3053 / 3054* if (db->db_blkid == DMU_BONUS_BLKID) { 3055 dbuf_dirty_record_t *drp; 3056* 3057 ASSERT(datap != NULL); 3058* ASSERT0(db->db_level); 3059 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 3060 bcopy(datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 3061* DB_DNODE_EXIT(db); 3062 3063 if (datap != db->db.db_data) { 3064* zio_buf_free(datap, DN_MAX_BONUSLEN); 3065* arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 3066 } 3067 db->db_data_pending = NULL; 3068 drp = &db->db_last_dirty; 3069 while (drp != dr) 3070* drp = &(drp)->dr_next; 3071* ASSERT(dr->dr_next == NULL); 3072 ASSERT(dr->dr_dbuf == db); 3073 drp = dr->dr_next; 3074* if (dr->dr_dbuf->db_level != 0) { 3075 list_destroy(&dr->dt.di.dr_children); 3076 mutex_destroy(&dr->dt.di.dr_mtx); 3077 } 3078 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3079 ASSERT(db->db_dirtycnt > 0); 3080 db->db_dirtycnt -= 1; 3081 dbuf_rele_and_unlock(db, (void )(uintptr_t)txg); 3082* return; 3083 } 3084 3085 os = dn->dn_objset; 3086 3087 /* 3088 * This function may have dropped the db_mtx lock allowing a dmu_sync 3089 * operation to sneak in. 
As a result, we need to ensure that we 3090 * don't check the dr_override_state until we have returned from 3091 * dbuf_check_blkptr. 3092 / 3093* dbuf_check_blkptr(dn, db); 3094 3095 /* 3096 * If this buffer is in the middle of an immediate write, 3097 * wait for the synchronous IO to complete. 3098 / 3099* while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 3100 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 3101 cv_wait(&db->db_changed, &db->db_mtx); 3102 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 3103 } 3104 3105 if (db->db_state != DB_NOFILL && 3106 dn->dn_object != DMU_META_DNODE_OBJECT && 3107 refcount_count(&db->db_holds) > 1 && 3108 dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 3109 datap == db->db_buf) { 3110* /* 3111 * If this buffer is currently "in use" (i.e., there 3112 * are active holds and db_data still references it), 3113 * then make a copy before we start the write so that 3114 * any modifications from the open txg will not leak 3115 * into this write. 3116 * 3117 * NOTE: this copy does not need to be made for 3118 * objects only modified in the syncing context (e.g. 3119 * DNONE_DNODE blocks). 3120 / 3121* int blksz = arc_buf_size(datap); 3122* arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 3123 datap = arc_alloc_buf(os->os_spa, blksz, db, type); 3124* bcopy(db->db.db_data, (datap)->b_data, blksz); 3125* } 3126 db->db_data_pending = dr; 3127 3128 mutex_exit(&db->db_mtx); 3129 3130 dbuf_write(dr, datap, tx); 3131* 3132 ASSERT(!list_link_active(&dr->dr_dirty_node)); 3133 if (dn->dn_object == DMU_META_DNODE_OBJECT) { 3134 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 3135 DB_DNODE_EXIT(db); 3136 } else { 3137 /* 3138 * Although zio_nowait() does not "wait for an IO", it does 3139 * initiate the IO. If this is an empty write it seems plausible 3140 * that the IO could actually be completed before the nowait 3141 * returns. We need to DB_DNODE_EXIT() first in case 3142 * zio_nowait() invalidates the dbuf. 
3143 / 3144* DB_DNODE_EXIT(db); 3145 zio_nowait(dr->dr_zio); 3146 } 3147} 3148 3149void 3150dbuf_sync_list(list_t list, int level, dmu_tx_t tx) 3151{ 3152 dbuf_dirty_record_t dr; 3153* 3154 while (dr = list_head(list)) { 3155 if (dr->dr_zio != NULL) { 3156 /* 3157 * If we find an already initialized zio then we 3158 * are processing the meta-dnode, and we have finished. 3159 * The dbufs for all dnodes are put back on the list 3160 * during processing, so that we can zio_wait() 3161 * these IOs after initiating all child IOs. 3162 / 3163* ASSERT3U(dr->dr_dbuf->db.db_object, ==, 3164 DMU_META_DNODE_OBJECT); 3165 break; 3166 } 3167 if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && 3168 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { 3169 VERIFY3U(dr->dr_dbuf->db_level, ==, level); 3170 } 3171 list_remove(list, dr); 3172 if (dr->dr_dbuf->db_level > 0) 3173 dbuf_sync_indirect(dr, tx); 3174 else 3175 dbuf_sync_leaf(dr, tx); 3176 } 3177} 3178 3179/* ARGSUSED / 3180static void 3181dbuf_write_ready(zio_t zio, arc_buf_t buf, void vdb) 3182{ 3183 dmu_buf_impl_t db = vdb; 3184* dnode_t dn; 3185* blkptr_t bp = zio->io_bp; 3186* blkptr_t bp_orig = &zio->io_bp_orig; 3187* spa_t spa = zio->io_spa; 3188* int64_t delta; 3189 uint64_t fill = 0; 3190 int i; 3191 3192 ASSERT3P(db->db_blkptr, !=, NULL); 3193 ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); 3194 3195 DB_DNODE_ENTER(db); 3196 dn = DB_DNODE(db); 3197 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 3198 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 3199 zio->io_prev_space_delta = delta; 3200 3201 if (bp->blk_birth != 0) { 3202 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 3203 BP_GET_TYPE(bp) == dn->dn_type) \|\| 3204 (db->db_blkid == DMU_SPILL_BLKID && 3205 BP_GET_TYPE(bp) == dn->dn_bonustype) \|\| 3206 BP_IS_EMBEDDED(bp)); 3207 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 3208 } 3209 3210 mutex_enter(&db->db_mtx); 3211 3212#ifdef ZFS_DEBUG 3213 if (db->db_blkid == DMU_SPILL_BLKID) { 3214 
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 3215 ASSERT(!(BP_IS_HOLE(bp)) && 3216 db->db_blkptr == &dn->dn_phys->dn_spill); 3217 } 3218#endif 3219 3220 if (db->db_level == 0) { 3221 mutex_enter(&dn->dn_mtx); 3222 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 3223 db->db_blkid != DMU_SPILL_BLKID) 3224 dn->dn_phys->dn_maxblkid = db->db_blkid; 3225 mutex_exit(&dn->dn_mtx); 3226 3227 if (dn->dn_type == DMU_OT_DNODE) { 3228 dnode_phys_t dnp = db->db.db_data; 3229* for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 3230 i--, dnp++) { 3231 if (dnp->dn_type != DMU_OT_NONE) 3232 fill++; 3233 } 3234 } else { 3235 if (BP_IS_HOLE(bp)) { 3236 fill = 0; 3237 } else { 3238 fill = 1; 3239 } 3240 } 3241 } else { 3242 blkptr_t ibp = db->db.db_data; 3243* ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 3244 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 3245 if (BP_IS_HOLE(ibp)) 3246 continue; 3247 fill += BP_GET_FILL(ibp); 3248 } 3249 } 3250 DB_DNODE_EXIT(db); 3251 3252 if (!BP_IS_EMBEDDED(bp)) 3253 bp->blk_fill = fill; 3254 3255 mutex_exit(&db->db_mtx); 3256 3257 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 3258 db->db_blkptr = bp; 3259 rw_exit(&dn->dn_struct_rwlock); 3260} 3261 3262/* ARGSUSED / 3263/ 3264 * This function gets called just prior to running through the compression 3265 * stage of the zio pipeline. If we're an indirect block comprised of only 3266 * holes, then we want this indirect to be compressed away to a hole. In 3267 * order to do that we must zero out any information about the holes that 3268 * this indirect points to prior to before we try to compress it. 
3269 / 3270static void 3271dbuf_write_children_ready(zio_t zio, arc_buf_t buf, void vdb) 3272{ 3273 dmu_buf_impl_t db = vdb; 3274* dnode_t dn; 3275* blkptr_t bp; 3276* uint64_t i; 3277 int epbs; 3278 3279 ASSERT3U(db->db_level, >, 0); 3280 DB_DNODE_ENTER(db); 3281 dn = DB_DNODE(db); 3282 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 3283 3284 /* Determine if all our children are holes / 3285* for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { 3286 if (!BP_IS_HOLE(bp)) 3287 break; 3288 } 3289 3290 /* 3291 * If all the children are holes, then zero them all out so that 3292 * we may get compressed away. 3293 / 3294* if (i == 1 << epbs) { 3295 /* didn't find any non-holes / 3296* bzero(db->db.db_data, db->db.db_size); 3297 } 3298 DB_DNODE_EXIT(db); 3299} 3300 3301/* 3302 * The SPA will call this callback several times for each zio - once 3303 * for every physical child i/o (zio->io_phys_children times). This 3304 * allows the DMU to monitor the progress of each logical i/o. For example, 3305 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 3306 * block. There may be a long delay before all copies/fragments are completed, 3307 * so this callback allows us to retire dirty space gradually, as the physical 3308 * i/os complete. 3309 / 3310/ ARGSUSED / 3311static void 3312dbuf_write_physdone(zio_t zio, arc_buf_t buf, void arg) 3313{ 3314 dmu_buf_impl_t db = arg; 3315* objset_t os = db->db_objset; 3316* dsl_pool_t dp = dmu_objset_pool(os); 3317* dbuf_dirty_record_t dr; 3318* int delta = 0; 3319 3320 dr = db->db_data_pending; 3321 ASSERT3U(dr->dr_txg, ==, zio->io_txg); 3322 3323 /* 3324 * The callback will be called io_phys_children times. Retire one 3325 * portion of our dirty space each time we are called. Any rounding 3326 * error will be cleaned up by dsl_pool_sync()'s call to 3327 * dsl_pool_undirty_space(). 
3328 / 3329* delta = dr->dr_accounted / zio->io_phys_children; 3330 dsl_pool_undirty_space(dp, delta, zio->io_txg); 3331} 3332 3333/* ARGSUSED / 3334static void 3335dbuf_write_done(zio_t zio, arc_buf_t buf, void vdb) 3336{ 3337 dmu_buf_impl_t db = vdb; 3338* blkptr_t bp_orig = &zio->io_bp_orig; 3339* blkptr_t bp = db->db_blkptr; 3340* objset_t os = db->db_objset; 3341* dmu_tx_t tx = os->os_synctx; 3342* dbuf_dirty_record_t *drp, dr; 3343 3344 ASSERT0(zio->io_error); 3345 ASSERT(db->db_blkptr == bp); 3346 3347 /* 3348 * For nopwrites and rewrites we ensure that the bp matches our 3349 * original and bypass all the accounting. 3350 / 3351* if (zio->io_flags & (ZIO_FLAG_IO_REWRITE \| ZIO_FLAG_NOPWRITE)) { 3352 ASSERT(BP_EQUAL(bp, bp_orig)); 3353 } else { 3354 dsl_dataset_t ds = os->os_dsl_dataset; 3355* (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 3356 dsl_dataset_block_born(ds, bp, tx); 3357 } 3358 3359 mutex_enter(&db->db_mtx); 3360 3361 DBUF_VERIFY(db); 3362 3363 drp = &db->db_last_dirty; 3364 while ((dr = drp) != db->db_data_pending) 3365* drp = &dr->dr_next; 3366 ASSERT(!list_link_active(&dr->dr_dirty_node)); 3367 ASSERT(dr->dr_dbuf == db); 3368 ASSERT(dr->dr_next == NULL); 3369 drp = dr->dr_next; 3370* 3371#ifdef ZFS_DEBUG 3372 if (db->db_blkid == DMU_SPILL_BLKID) { 3373 dnode_t dn; 3374* 3375 DB_DNODE_ENTER(db); 3376 dn = DB_DNODE(db); 3377 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 3378 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 3379 db->db_blkptr == &dn->dn_phys->dn_spill); 3380 DB_DNODE_EXIT(db); 3381 } 3382#endif 3383 3384 if (db->db_level == 0) { 3385 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 3386 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 3387 if (db->db_state != DB_NOFILL) { 3388 if (dr->dt.dl.dr_data != db->db_buf) 3389 arc_buf_destroy(dr->dt.dl.dr_data, db); 3390 } 3391 } else { 3392 dnode_t dn; 3393* 3394 DB_DNODE_ENTER(db); 3395 dn = DB_DNODE(db); 3396 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3397 
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 3398 if (!BP_IS_HOLE(db->db_blkptr)) { 3399 int epbs = 3400 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 3401 ASSERT3U(db->db_blkid, <=, 3402 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 3403 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 3404 db->db.db_size); 3405 } 3406 DB_DNODE_EXIT(db); 3407 mutex_destroy(&dr->dt.di.dr_mtx); 3408 list_destroy(&dr->dt.di.dr_children); 3409 } 3410 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3411 3412 cv_broadcast(&db->db_changed); 3413 ASSERT(db->db_dirtycnt > 0); 3414 db->db_dirtycnt -= 1; 3415 db->db_data_pending = NULL; 3416 dbuf_rele_and_unlock(db, (void )(uintptr_t)tx->tx_txg); 3417} 3418* 3419static void 3420dbuf_write_nofill_ready(zio_t zio) 3421{ 3422* dbuf_write_ready(zio, NULL, zio->io_private); 3423} 3424 3425static void 3426dbuf_write_nofill_done(zio_t zio) 3427{ 3428* dbuf_write_done(zio, NULL, zio->io_private); 3429} 3430 3431static void 3432dbuf_write_override_ready(zio_t zio) 3433{ 3434* dbuf_dirty_record_t dr = zio->io_private; 3435* dmu_buf_impl_t db = dr->dr_dbuf; 3436* 3437 dbuf_write_ready(zio, NULL, db); 3438} 3439 3440static void 3441dbuf_write_override_done(zio_t zio) 3442{ 3443* dbuf_dirty_record_t dr = zio->io_private; 3444* dmu_buf_impl_t db = dr->dr_dbuf; 3445* blkptr_t obp = &dr->dt.dl.dr_overridden_by; 3446* 3447 mutex_enter(&db->db_mtx); 3448 if (!BP_EQUAL(zio->io_bp, obp)) { 3449 if (!BP_IS_HOLE(obp)) 3450 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 3451 arc_release(dr->dt.dl.dr_data, db); 3452 } 3453 mutex_exit(&db->db_mtx); 3454 3455 dbuf_write_done(zio, NULL, db); 3456} 3457 3458/* Issue I/O to commit a dirty buffer to disk. 
/ 3459static void 3460dbuf_write(dbuf_dirty_record_t dr, arc_buf_t data, dmu_tx_t tx) 3461{ 3462 dmu_buf_impl_t db = dr->dr_dbuf; 3463* dnode_t dn; 3464* objset_t os; 3465* dmu_buf_impl_t parent = db->db_parent; 3466* uint64_t txg = tx->tx_txg; 3467 zbookmark_phys_t zb; 3468 zio_prop_t zp; 3469 zio_t zio; 3470* int wp_flag = 0; 3471 3472 ASSERT(dmu_tx_is_syncing(tx)); 3473 3474 DB_DNODE_ENTER(db); 3475 dn = DB_DNODE(db); 3476 os = dn->dn_objset; 3477 3478 if (db->db_state != DB_NOFILL) { 3479 if (db->db_level > 0 \|\| dn->dn_type == DMU_OT_DNODE) { 3480 /* 3481 * Private object buffers are released here rather 3482 * than in dbuf_dirty() since they are only modified 3483 * in the syncing context and we don't want the 3484 * overhead of making multiple copies of the data. 3485 / 3486* if (BP_IS_HOLE(db->db_blkptr)) { 3487 arc_buf_thaw(data); 3488 } else { 3489 dbuf_release_bp(db); 3490 } 3491 } 3492 } 3493 3494 if (parent != dn->dn_dbuf) { 3495 /* Our parent is an indirect block. / 3496* /* We have a dirty parent that has been scheduled for write. / 3497* ASSERT(parent && parent->db_data_pending); 3498 /* Our parent's buffer is one level closer to the dnode. / 3499* ASSERT(db->db_level == parent->db_level-1); 3500 /* 3501 * We're about to modify our parent's db_data by modifying 3502 * our block pointer, so the parent must be released. 3503 / 3504* ASSERT(arc_released(parent->db_buf)); 3505 zio = parent->db_data_pending->dr_zio; 3506 } else { 3507 /* Our parent is the dnode itself. / 3508* ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 3509 db->db_blkid != DMU_SPILL_BLKID) \|\| 3510 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 3511 if (db->db_blkid != DMU_SPILL_BLKID) 3512 ASSERT3P(db->db_blkptr, ==, 3513 &dn->dn_phys->dn_blkptr[db->db_blkid]); 3514 zio = dn->dn_zio; 3515 } 3516 3517 ASSERT(db->db_level == 0 \|\| data == db->db_buf); 3518 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 3519 ASSERT(zio); 3520 3521 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 
3522 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 3523 db->db.db_object, db->db_level, db->db_blkid); 3524 3525 if (db->db_blkid == DMU_SPILL_BLKID) 3526 wp_flag = WP_SPILL; 3527 wp_flag \|= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 3528 3529 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 3530 DB_DNODE_EXIT(db); 3531 3532 /* 3533 * We copy the blkptr now (rather than when we instantiate the dirty 3534 * record), because its value can change between open context and 3535 * syncing context. We do not need to hold dn_struct_rwlock to read 3536 * db_blkptr because we are in syncing context. 3537 / 3538* dr->dr_bp_copy = db->db_blkptr; 3539* 3540 if (db->db_level == 0 && 3541 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 3542 /* 3543 * The BP for this block has been provided by open context 3544 * (by dmu_sync() or dmu_buf_write_embedded()). 3545 / 3546* void contents = (data != NULL) ? data->b_data : NULL; 3547* 3548 dr->dr_zio = zio_write(zio, os->os_spa, txg, 3549 &dr->dr_bp_copy, contents, db->db.db_size, &zp, 3550 dbuf_write_override_ready, NULL, NULL, 3551 dbuf_write_override_done, 3552 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3553 mutex_enter(&db->db_mtx); 3554 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 3555 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 3556 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 3557 mutex_exit(&db->db_mtx); 3558 } else if (db->db_state == DB_NOFILL) { 3559 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF \|\| 3560 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 3561 dr->dr_zio = zio_write(zio, os->os_spa, txg, 3562 &dr->dr_bp_copy, NULL, db->db.db_size, &zp, 3563 dbuf_write_nofill_ready, NULL, NULL, 3564 dbuf_write_nofill_done, db, 3565 ZIO_PRIORITY_ASYNC_WRITE, 3566 ZIO_FLAG_MUSTSUCCEED \| ZIO_FLAG_NODATA, &zb); 3567 } else { 3568 ASSERT(arc_released(data)); 3569 3570 /* 3571 * For indirect blocks, we want to setup the children 3572 * ready callback so that we can properly handle an indirect 3573 * 
block that only contains holes. 3574 / 3575* arc_done_func_t children_ready_cb = NULL; 3576* if (db->db_level != 0) 3577 children_ready_cb = dbuf_write_children_ready; 3578 3579 dr->dr_zio = arc_write(zio, os->os_spa, txg, 3580 &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), 3581 &zp, dbuf_write_ready, children_ready_cb, 3582 dbuf_write_physdone, dbuf_write_done, db, 3583 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3584 } 3585}