1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23285202Savg * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25288549Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 26168404Spjd */ 27168404Spjd 28168404Spjd#ifndef _SYS_DBUF_H 29168404Spjd#define _SYS_DBUF_H 30168404Spjd 31168404Spjd#include <sys/dmu.h> 32168404Spjd#include <sys/spa.h> 33168404Spjd#include <sys/txg.h> 34168404Spjd#include <sys/zio.h> 35168404Spjd#include <sys/arc.h> 36168404Spjd#include <sys/zfs_context.h> 37168404Spjd#include <sys/refcount.h> 38219089Spjd#include <sys/zrlock.h> 39307266Smav#include <sys/multilist.h> 40168404Spjd 41168404Spjd#ifdef __cplusplus 42168404Spjdextern "C" { 43168404Spjd#endif 44168404Spjd 45168404Spjd#define IN_DMU_SYNC 2 46168404Spjd 47168404Spjd/* 48168404Spjd * define flags for dbuf_read 49168404Spjd */ 50168404Spjd 51168404Spjd#define DB_RF_MUST_SUCCEED (1 << 0) 52168404Spjd#define DB_RF_CANFAIL (1 << 1) 53168404Spjd#define DB_RF_HAVESTRUCT (1 << 2) 54168404Spjd#define DB_RF_NOPREFETCH (1 << 3) 55168404Spjd#define DB_RF_NEVERWAIT (1 << 4) 56168404Spjd#define DB_RF_CACHED (1 << 5) 57168404Spjd 58168404Spjd/* 59219089Spjd * The simplified state transition diagram for dbufs looks like: 60168404Spjd * 61168404Spjd * +----> READ ----+ 62168404Spjd * | | 63168404Spjd * | V 64168404Spjd * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) 65219089Spjd * | ^ ^ 66219089Spjd * | | | 67219089Spjd * +----> FILL ----+ | 68219089Spjd * | | 69219089Spjd * | | 70219089Spjd * +--------> NOFILL -------+ 71270809Sdelphij * 72270809Sdelphij * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range 73270809Sdelphij * to find all dbufs in a range of a dnode and must be less than any other 74270809Sdelphij * dbuf_states_t (see comment on dn_dbufs in dnode.h). 75168404Spjd */ 76168404Spjdtypedef enum dbuf_states { 77270809Sdelphij DB_SEARCH = -1, 78168404Spjd DB_UNCACHED, 79168404Spjd DB_FILL, 80219089Spjd DB_NOFILL, 81168404Spjd DB_READ, 82168404Spjd DB_CACHED, 83168404Spjd DB_EVICTING 84168404Spjd} dbuf_states_t; 85168404Spjd 86168404Spjdstruct dnode; 87168404Spjdstruct dmu_tx; 88168404Spjd 89168404Spjd/* 90168404Spjd * level = 0 means the user data 91168404Spjd * level = 1 means the single indirect block 92168404Spjd * etc. 93168404Spjd */ 94168404Spjd 95168404Spjdstruct dmu_buf_impl; 96168404Spjd 97168404Spjdtypedef enum override_states { 98168404Spjd DR_NOT_OVERRIDDEN, 99168404Spjd DR_IN_DMU_SYNC, 100168404Spjd DR_OVERRIDDEN 101168404Spjd} override_states_t; 102168404Spjd 103168404Spjdtypedef struct dbuf_dirty_record { 104168404Spjd /* link on our parents dirty list */ 105168404Spjd list_node_t dr_dirty_node; 106168404Spjd 107168404Spjd /* transaction group this data will sync in */ 108168404Spjd uint64_t dr_txg; 109168404Spjd 110168404Spjd /* zio of outstanding write IO */ 111168404Spjd zio_t *dr_zio; 112168404Spjd 113168404Spjd /* pointer back to our dbuf */ 114168404Spjd struct dmu_buf_impl *dr_dbuf; 115168404Spjd 116168404Spjd /* pointer to next dirty record */ 117168404Spjd struct dbuf_dirty_record *dr_next; 118168404Spjd 119168404Spjd /* pointer to parent dirty record */ 120168404Spjd struct dbuf_dirty_record *dr_parent; 121168404Spjd 122260763Savg /* How much space was changed to dsl_pool_dirty_space() for this? */ 123260763Savg unsigned int dr_accounted; 124260763Savg 125304136Savg /* A copy of the bp that points to us */ 126304136Savg blkptr_t dr_bp_copy; 127304136Savg 128168404Spjd union dirty_types { 129168404Spjd struct dirty_indirect { 130168404Spjd 131168404Spjd /* protect access to list */ 132168404Spjd kmutex_t dr_mtx; 133168404Spjd 134168404Spjd /* Our list of dirty children */ 135168404Spjd list_t dr_children; 136168404Spjd } di; 137168404Spjd struct dirty_leaf { 138168404Spjd 139168404Spjd /* 140168404Spjd * dr_data is set when we dirty the buffer 141168404Spjd * so that we can retain the pointer even if it 142168404Spjd * gets COW'd in a subsequent transaction group. 143168404Spjd */ 144168404Spjd arc_buf_t *dr_data; 145168404Spjd blkptr_t dr_overridden_by; 146168404Spjd override_states_t dr_override_state; 147219089Spjd uint8_t dr_copies; 148243524Smm boolean_t dr_nopwrite; 149168404Spjd } dl; 150168404Spjd } dt; 151168404Spjd} dbuf_dirty_record_t; 152168404Spjd 153168404Spjdtypedef struct dmu_buf_impl { 154168404Spjd /* 155168404Spjd * The following members are immutable, with the exception of 156168404Spjd * db.db_data, which is protected by db_mtx. 157168404Spjd */ 158168404Spjd 159168404Spjd /* the publicly visible structure */ 160168404Spjd dmu_buf_t db; 161168404Spjd 162168404Spjd /* the objset we belong to */ 163219089Spjd struct objset *db_objset; 164168404Spjd 165168404Spjd /* 166219089Spjd * handle to safely access the dnode we belong to (NULL when evicted) 167168404Spjd */ 168219089Spjd struct dnode_handle *db_dnode_handle; 169168404Spjd 170168404Spjd /* 171168404Spjd * our parent buffer; if the dnode points to us directly, 172219089Spjd * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf 173168404Spjd * only accessed by sync thread ??? 174168404Spjd * (NULL when evicted) 175219089Spjd * May change from NULL to non-NULL under the protection of db_mtx 176219089Spjd * (see dbuf_check_blkptr()) 177168404Spjd */ 178168404Spjd struct dmu_buf_impl *db_parent; 179168404Spjd 180168404Spjd /* 181168404Spjd * link for hash table of all dmu_buf_impl_t's 182168404Spjd */ 183168404Spjd struct dmu_buf_impl *db_hash_next; 184168404Spjd 185168404Spjd /* our block number */ 186168404Spjd uint64_t db_blkid; 187168404Spjd 188168404Spjd /* 189168404Spjd * Pointer to the blkptr_t which points to us. May be NULL if we 190168404Spjd * don't have one yet. (NULL when evicted) 191168404Spjd */ 192168404Spjd blkptr_t *db_blkptr; 193168404Spjd 194168404Spjd /* 195168404Spjd * Our indirection level. Data buffers have db_level==0. 196168404Spjd * Indirect buffers which point to data buffers have 197168404Spjd * db_level==1. etc. Buffers which contain dnodes have 198168404Spjd * db_level==0, since the dnodes are stored in a file. 199168404Spjd */ 200168404Spjd uint8_t db_level; 201168404Spjd 202168404Spjd /* db_mtx protects the members below */ 203168404Spjd kmutex_t db_mtx; 204168404Spjd 205168404Spjd /* 206168404Spjd * Current state of the buffer 207168404Spjd */ 208168404Spjd dbuf_states_t db_state; 209168404Spjd 210168404Spjd /* 211168404Spjd * Refcount accessed by dmu_buf_{hold,rele}. 212168404Spjd * If nonzero, the buffer can't be destroyed. 213168404Spjd * Protected by db_mtx. 214168404Spjd */ 215168404Spjd refcount_t db_holds; 216168404Spjd 217168404Spjd /* buffer holding our data */ 218168404Spjd arc_buf_t *db_buf; 219168404Spjd 220168404Spjd kcondvar_t db_changed; 221168404Spjd dbuf_dirty_record_t *db_data_pending; 222168404Spjd 223168404Spjd /* pointer to most recent dirty record for this buffer */ 224168404Spjd dbuf_dirty_record_t *db_last_dirty; 225168404Spjd 226168404Spjd /* 227168404Spjd * Our link on the owner dnodes's dn_dbufs list. 228168404Spjd * Protected by its dn_dbufs_mtx. 229168404Spjd */ 230269845Sdelphij avl_node_t db_link; 231168404Spjd 232307266Smav /* 233307266Smav * Link in dbuf_cache. 234307266Smav */ 235307266Smav multilist_node_t db_cache_link; 236307266Smav 237168404Spjd /* Data which is unique to data (leaf) blocks: */ 238168404Spjd 239288549Smav /* User callback information. */ 240288549Smav dmu_buf_user_t *db_user; 241168404Spjd 242290754Smav /* 243290754Smav * Evict user data as soon as the dirty and reference 244290754Smav * counts are equal. 245290754Smav */ 246290754Smav uint8_t db_user_immediate_evict; 247290754Smav 248290754Smav /* 249290754Smav * This block was freed while a read or write was 250290754Smav * active. 251290754Smav */ 252168404Spjd uint8_t db_freed_in_flight; 253168404Spjd 254290754Smav /* 255290754Smav * dnode_evict_dbufs() or dnode_evict_bonus() tried to 256290754Smav * evict this dbuf, but couldn't due to outstanding 257290754Smav * references. Evict once the refcount drops to 0. 258290754Smav */ 259290754Smav uint8_t db_pending_evict; 260290754Smav 261168404Spjd uint8_t db_dirtycnt; 262168404Spjd} dmu_buf_impl_t; 263168404Spjd 264168404Spjd/* Note: the dbuf hash table is exposed only for the mdb module */ 265168404Spjd#define DBUF_MUTEXES 256 266168404Spjd#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) 267168404Spjdtypedef struct dbuf_hash_table { 268168404Spjd uint64_t hash_table_mask; 269168404Spjd dmu_buf_impl_t **hash_table; 270168404Spjd kmutex_t hash_mutexes[DBUF_MUTEXES]; 271168404Spjd} dbuf_hash_table_t; 272168404Spjd 273288571Smavuint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); 274168404Spjd 275168404Spjddmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); 276185029Spjdvoid dbuf_create_bonus(struct dnode *dn); 277219089Spjdint dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); 278219089Spjdvoid dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag); 279168404Spjd 280219089Spjdvoid dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); 281219089Spjd 282168404Spjddmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); 283168404Spjddmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, 284168404Spjd void *tag); 285288571Smavint dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, 286288571Smav boolean_t fail_sparse, boolean_t fail_uncached, 287168404Spjd void *tag, dmu_buf_impl_t **dbp); 288168404Spjd 289288571Smavvoid dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, 290288571Smav zio_priority_t prio, arc_flags_t aflags); 291168404Spjd 292168404Spjdvoid dbuf_add_ref(dmu_buf_impl_t *db, void *tag); 293288538Smavboolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, 294288538Smav uint64_t blkid, void *tag); 295168404Spjduint64_t dbuf_refcount(dmu_buf_impl_t *db); 296168404Spjd 297168404Spjdvoid dbuf_rele(dmu_buf_impl_t *db, void *tag); 298219089Spjdvoid dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); 299168404Spjd 300288538Smavdmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, 301288538Smav uint64_t blkid); 302168404Spjd 303168404Spjdint dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); 304219089Spjdvoid dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); 305168404Spjdvoid dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); 306168404Spjdvoid dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); 307209962Smmvoid dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); 308168404Spjddbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 309219089Spjdarc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); 310268649Sdelphijvoid dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 311268649Sdelphij bp_embedded_type_t etype, enum zio_compress comp, 312268649Sdelphij int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); 313168404Spjd 314307266Smavvoid dbuf_destroy(dmu_buf_impl_t *db); 315168404Spjd 316168404Spjdvoid dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 317168404Spjdvoid dbuf_unoverride(dbuf_dirty_record_t *dr); 318285202Savgvoid dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); 319219089Spjdvoid dbuf_release_bp(dmu_buf_impl_t *db); 320168404Spjd 321185029Spjdvoid dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, 322168404Spjd struct dmu_tx *); 323168404Spjd 324168404Spjdvoid dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); 325168404Spjd 326219089Spjd#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) 327219089Spjd#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) 328219089Spjd#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) 329219089Spjd#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) 330219089Spjd#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) 331219089Spjd 332168404Spjdvoid dbuf_init(void); 333168404Spjdvoid dbuf_fini(void); 334168404Spjd 335219089Spjdboolean_t dbuf_is_metadata(dmu_buf_impl_t *db); 336168404Spjd 337219089Spjd#define DBUF_GET_BUFC_TYPE(_db) \ 338248571Smm (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) 339185029Spjd 340219089Spjd#define DBUF_IS_CACHEABLE(_db) \ 341219089Spjd ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ 342248571Smm (dbuf_is_metadata(_db) && \ 343219089Spjd ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) 344185029Spjd 345219089Spjd#define DBUF_IS_L2CACHEABLE(_db) \ 346219089Spjd ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ 347248571Smm (dbuf_is_metadata(_db) && \ 348219089Spjd ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) 349219089Spjd 350325932Savg#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ 351325932Savg ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ 352325932Savg (((_level) > 0 || \ 353325932Savg DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ 354325932Savg ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) 355325932Savg 356168404Spjd#ifdef ZFS_DEBUG 357168404Spjd 358168404Spjd/* 359168404Spjd * There should be a ## between the string literal and fmt, to make it 360168404Spjd * clear that we're joining two strings together, but gcc does not 361168404Spjd * support that preprocessor token. 362168404Spjd */ 363168404Spjd#define dprintf_dbuf(dbuf, fmt, ...) do { \ 364168404Spjd if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ 365168404Spjd char __db_buf[32]; \ 366168404Spjd uint64_t __db_obj = (dbuf)->db.db_object; \ 367168404Spjd if (__db_obj == DMU_META_DNODE_OBJECT) \ 368168404Spjd (void) strcpy(__db_buf, "mdn"); \ 369168404Spjd else \ 370168404Spjd (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ 371168404Spjd (u_longlong_t)__db_obj); \ 372168404Spjd dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ 373168404Spjd "obj=%s lvl=%u blkid=%lld " fmt, \ 374168404Spjd __db_buf, (dbuf)->db_level, \ 375168404Spjd (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ 376168404Spjd } \ 377168404Spjd_NOTE(CONSTCOND) } while (0) 378168404Spjd 379168404Spjd#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ 380168404Spjd if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ 381168404Spjd char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ 382263397Sdelphij snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ 383168404Spjd dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ 384168404Spjd kmem_free(__blkbuf, BP_SPRINTF_LEN); \ 385219089Spjd } \ 386168404Spjd_NOTE(CONSTCOND) } while (0) 387168404Spjd 388168404Spjd#define DBUF_VERIFY(db) dbuf_verify(db) 389168404Spjd 390168404Spjd#else 391168404Spjd 392168404Spjd#define dprintf_dbuf(db, fmt, ...) 393168404Spjd#define dprintf_dbuf_bp(db, bp, fmt, ...) 394168404Spjd#define DBUF_VERIFY(db) 395168404Spjd 396168404Spjd#endif 397168404Spjd 398168404Spjd 399168404Spjd#ifdef __cplusplus 400168404Spjd} 401168404Spjd#endif 402168404Spjd 403168404Spjd#endif /* _SYS_DBUF_H */ 404