arc.c revision 209275
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.
 * At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic:  we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D.
 * Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).
Additionally, 104168404Spjd * the users of callbacks must ensure that their private data is 105168404Spjd * protected from simultaneous callbacks from arc_buf_evict() 106168404Spjd * and arc_do_user_evicts(). 107168404Spjd * 108168404Spjd * Note that the majority of the performance stats are manipulated 109168404Spjd * with atomic operations. 110185029Spjd * 111185029Spjd * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: 112185029Spjd * 113185029Spjd * - L2ARC buflist creation 114185029Spjd * - L2ARC buflist eviction 115185029Spjd * - L2ARC write completion, which walks L2ARC buflists 116185029Spjd * - ARC header destruction, as it removes from L2ARC buflists 117185029Spjd * - ARC header release, as it removes from L2ARC buflists 118168404Spjd */ 119168404Spjd 120168404Spjd#include <sys/spa.h> 121168404Spjd#include <sys/zio.h> 122168404Spjd#include <sys/zio_checksum.h> 123168404Spjd#include <sys/zfs_context.h> 124168404Spjd#include <sys/arc.h> 125168404Spjd#include <sys/refcount.h> 126185029Spjd#include <sys/vdev.h> 127168404Spjd#ifdef _KERNEL 128168404Spjd#include <sys/dnlc.h> 129168404Spjd#endif 130168404Spjd#include <sys/callb.h> 131168404Spjd#include <sys/kstat.h> 132168404Spjd#include <sys/sdt.h> 133168404Spjd 134191902Skmacy#include <vm/vm_pageout.h> 135191902Skmacy 136168404Spjdstatic kmutex_t arc_reclaim_thr_lock; 137168404Spjdstatic kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 138168404Spjdstatic uint8_t arc_thread_exit; 139168404Spjd 140185029Spjdextern int zfs_write_limit_shift; 141185029Spjdextern uint64_t zfs_write_limit_max; 142185029Spjdextern kmutex_t zfs_write_limit_lock; 143185029Spjd 144168404Spjd#define ARC_REDUCE_DNLC_PERCENT 3 145168404Spjduint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 146168404Spjd 147168404Spjdtypedef enum arc_reclaim_strategy { 148168404Spjd ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 149168404Spjd ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 150168404Spjd} 
arc_reclaim_strategy_t; 151168404Spjd 152168404Spjd/* number of seconds before growing cache again */ 153168404Spjdstatic int arc_grow_retry = 60; 154168404Spjd 155208373Smm/* shift of arc_c for calculating both min and max arc_p */ 156208373Smmstatic int arc_p_min_shift = 4; 157208373Smm 158208373Smm/* log2(fraction of arc to reclaim) */ 159208373Smmstatic int arc_shrink_shift = 5; 160208373Smm 161168404Spjd/* 162168404Spjd * minimum lifespan of a prefetch block in clock ticks 163168404Spjd * (initialized in arc_init()) 164168404Spjd */ 165168404Spjdstatic int arc_min_prefetch_lifespan; 166168404Spjd 167208373Smmstatic int arc_dead; 168194043Skmacyextern int zfs_prefetch_disable; 169168404Spjd 170168404Spjd/* 171185029Spjd * The arc has filled available memory and has now warmed up. 172185029Spjd */ 173185029Spjdstatic boolean_t arc_warm; 174185029Spjd 175185029Spjd/* 176168404Spjd * These tunables are for performance analysis. 177168404Spjd */ 178185029Spjduint64_t zfs_arc_max; 179185029Spjduint64_t zfs_arc_min; 180185029Spjduint64_t zfs_arc_meta_limit = 0; 181185029Spjdint zfs_mdcomp_disable = 0; 182208373Smmint zfs_arc_grow_retry = 0; 183208373Smmint zfs_arc_shrink_shift = 0; 184208373Smmint zfs_arc_p_min_shift = 0; 185185029Spjd 186185029SpjdTUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 187185029SpjdTUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); 188185029SpjdTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 189185029SpjdTUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); 190168473SpjdSYSCTL_DECL(_vfs_zfs); 191185029SpjdSYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 192168473Spjd "Maximum ARC size"); 193185029SpjdSYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 194168473Spjd "Minimum ARC size"); 195185029SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, 196185029Spjd &zfs_mdcomp_disable, 0, "Disable metadata compression"); 197168404Spjd 198168404Spjd/* 199185029Spjd * Note 
 * that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to a buffer, it is
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
228168404Spjd */ 229168404Spjd 230205264Skmacy#define ARCS_LOCK_PAD CACHE_LINE_SIZE 231205231Skmacystruct arcs_lock { 232205231Skmacy kmutex_t arcs_lock; 233205231Skmacy#ifdef _KERNEL 234205231Skmacy unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; 235205231Skmacy#endif 236205231Skmacy}; 237205231Skmacy 238205231Skmacy/* 239205231Skmacy * must be power of two for mask use to work 240205231Skmacy * 241205231Skmacy */ 242205231Skmacy#define ARC_BUFC_NUMDATALISTS 16 243205231Skmacy#define ARC_BUFC_NUMMETADATALISTS 16 244206796Spjd#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) 245205231Skmacy 246168404Spjdtypedef struct arc_state { 247185029Spjd uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 248185029Spjd uint64_t arcs_size; /* total amount of data in this state */ 249205231Skmacy list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ 250205264Skmacy struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); 251168404Spjd} arc_state_t; 252168404Spjd 253206796Spjd#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) 254205231Skmacy 255185029Spjd/* The 6 states: */ 256168404Spjdstatic arc_state_t ARC_anon; 257168404Spjdstatic arc_state_t ARC_mru; 258168404Spjdstatic arc_state_t ARC_mru_ghost; 259168404Spjdstatic arc_state_t ARC_mfu; 260168404Spjdstatic arc_state_t ARC_mfu_ghost; 261185029Spjdstatic arc_state_t ARC_l2c_only; 262168404Spjd 263168404Spjdtypedef struct arc_stats { 264168404Spjd kstat_named_t arcstat_hits; 265168404Spjd kstat_named_t arcstat_misses; 266168404Spjd kstat_named_t arcstat_demand_data_hits; 267168404Spjd kstat_named_t arcstat_demand_data_misses; 268168404Spjd kstat_named_t arcstat_demand_metadata_hits; 269168404Spjd kstat_named_t arcstat_demand_metadata_misses; 270168404Spjd kstat_named_t arcstat_prefetch_data_hits; 271168404Spjd kstat_named_t arcstat_prefetch_data_misses; 272168404Spjd kstat_named_t arcstat_prefetch_metadata_hits; 273168404Spjd 
kstat_named_t arcstat_prefetch_metadata_misses; 274168404Spjd kstat_named_t arcstat_mru_hits; 275168404Spjd kstat_named_t arcstat_mru_ghost_hits; 276168404Spjd kstat_named_t arcstat_mfu_hits; 277168404Spjd kstat_named_t arcstat_mfu_ghost_hits; 278205231Skmacy kstat_named_t arcstat_allocated; 279168404Spjd kstat_named_t arcstat_deleted; 280205231Skmacy kstat_named_t arcstat_stolen; 281168404Spjd kstat_named_t arcstat_recycle_miss; 282168404Spjd kstat_named_t arcstat_mutex_miss; 283168404Spjd kstat_named_t arcstat_evict_skip; 284208373Smm kstat_named_t arcstat_evict_l2_cached; 285208373Smm kstat_named_t arcstat_evict_l2_eligible; 286208373Smm kstat_named_t arcstat_evict_l2_ineligible; 287168404Spjd kstat_named_t arcstat_hash_elements; 288168404Spjd kstat_named_t arcstat_hash_elements_max; 289168404Spjd kstat_named_t arcstat_hash_collisions; 290168404Spjd kstat_named_t arcstat_hash_chains; 291168404Spjd kstat_named_t arcstat_hash_chain_max; 292168404Spjd kstat_named_t arcstat_p; 293168404Spjd kstat_named_t arcstat_c; 294168404Spjd kstat_named_t arcstat_c_min; 295168404Spjd kstat_named_t arcstat_c_max; 296168404Spjd kstat_named_t arcstat_size; 297185029Spjd kstat_named_t arcstat_hdr_size; 298208373Smm kstat_named_t arcstat_data_size; 299208373Smm kstat_named_t arcstat_other_size; 300185029Spjd kstat_named_t arcstat_l2_hits; 301185029Spjd kstat_named_t arcstat_l2_misses; 302185029Spjd kstat_named_t arcstat_l2_feeds; 303185029Spjd kstat_named_t arcstat_l2_rw_clash; 304208373Smm kstat_named_t arcstat_l2_read_bytes; 305208373Smm kstat_named_t arcstat_l2_write_bytes; 306185029Spjd kstat_named_t arcstat_l2_writes_sent; 307185029Spjd kstat_named_t arcstat_l2_writes_done; 308185029Spjd kstat_named_t arcstat_l2_writes_error; 309185029Spjd kstat_named_t arcstat_l2_writes_hdr_miss; 310185029Spjd kstat_named_t arcstat_l2_evict_lock_retry; 311185029Spjd kstat_named_t arcstat_l2_evict_reading; 312185029Spjd kstat_named_t arcstat_l2_free_on_write; 313185029Spjd kstat_named_t 
arcstat_l2_abort_lowmem; 314185029Spjd kstat_named_t arcstat_l2_cksum_bad; 315185029Spjd kstat_named_t arcstat_l2_io_error; 316185029Spjd kstat_named_t arcstat_l2_size; 317185029Spjd kstat_named_t arcstat_l2_hdr_size; 318185029Spjd kstat_named_t arcstat_memory_throttle_count; 319205231Skmacy kstat_named_t arcstat_l2_write_trylock_fail; 320205231Skmacy kstat_named_t arcstat_l2_write_passed_headroom; 321205231Skmacy kstat_named_t arcstat_l2_write_spa_mismatch; 322206796Spjd kstat_named_t arcstat_l2_write_in_l2; 323205231Skmacy kstat_named_t arcstat_l2_write_hdr_io_in_progress; 324205231Skmacy kstat_named_t arcstat_l2_write_not_cacheable; 325205231Skmacy kstat_named_t arcstat_l2_write_full; 326205231Skmacy kstat_named_t arcstat_l2_write_buffer_iter; 327205231Skmacy kstat_named_t arcstat_l2_write_pios; 328205231Skmacy kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 329205231Skmacy kstat_named_t arcstat_l2_write_buffer_list_iter; 330205231Skmacy kstat_named_t arcstat_l2_write_buffer_list_null_iter; 331168404Spjd} arc_stats_t; 332168404Spjd 333168404Spjdstatic arc_stats_t arc_stats = { 334168404Spjd { "hits", KSTAT_DATA_UINT64 }, 335168404Spjd { "misses", KSTAT_DATA_UINT64 }, 336168404Spjd { "demand_data_hits", KSTAT_DATA_UINT64 }, 337168404Spjd { "demand_data_misses", KSTAT_DATA_UINT64 }, 338168404Spjd { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 339168404Spjd { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 340168404Spjd { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 341168404Spjd { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 342168404Spjd { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 343168404Spjd { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 344168404Spjd { "mru_hits", KSTAT_DATA_UINT64 }, 345168404Spjd { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 346168404Spjd { "mfu_hits", KSTAT_DATA_UINT64 }, 347168404Spjd { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 348205231Skmacy { "allocated", KSTAT_DATA_UINT64 }, 349168404Spjd { "deleted", KSTAT_DATA_UINT64 }, 
350205231Skmacy { "stolen", KSTAT_DATA_UINT64 }, 351168404Spjd { "recycle_miss", KSTAT_DATA_UINT64 }, 352168404Spjd { "mutex_miss", KSTAT_DATA_UINT64 }, 353168404Spjd { "evict_skip", KSTAT_DATA_UINT64 }, 354208373Smm { "evict_l2_cached", KSTAT_DATA_UINT64 }, 355208373Smm { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 356208373Smm { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 357168404Spjd { "hash_elements", KSTAT_DATA_UINT64 }, 358168404Spjd { "hash_elements_max", KSTAT_DATA_UINT64 }, 359168404Spjd { "hash_collisions", KSTAT_DATA_UINT64 }, 360168404Spjd { "hash_chains", KSTAT_DATA_UINT64 }, 361168404Spjd { "hash_chain_max", KSTAT_DATA_UINT64 }, 362168404Spjd { "p", KSTAT_DATA_UINT64 }, 363168404Spjd { "c", KSTAT_DATA_UINT64 }, 364168404Spjd { "c_min", KSTAT_DATA_UINT64 }, 365168404Spjd { "c_max", KSTAT_DATA_UINT64 }, 366185029Spjd { "size", KSTAT_DATA_UINT64 }, 367185029Spjd { "hdr_size", KSTAT_DATA_UINT64 }, 368208373Smm { "data_size", KSTAT_DATA_UINT64 }, 369208373Smm { "other_size", KSTAT_DATA_UINT64 }, 370185029Spjd { "l2_hits", KSTAT_DATA_UINT64 }, 371185029Spjd { "l2_misses", KSTAT_DATA_UINT64 }, 372185029Spjd { "l2_feeds", KSTAT_DATA_UINT64 }, 373185029Spjd { "l2_rw_clash", KSTAT_DATA_UINT64 }, 374208373Smm { "l2_read_bytes", KSTAT_DATA_UINT64 }, 375208373Smm { "l2_write_bytes", KSTAT_DATA_UINT64 }, 376185029Spjd { "l2_writes_sent", KSTAT_DATA_UINT64 }, 377185029Spjd { "l2_writes_done", KSTAT_DATA_UINT64 }, 378185029Spjd { "l2_writes_error", KSTAT_DATA_UINT64 }, 379185029Spjd { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 380185029Spjd { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 381185029Spjd { "l2_evict_reading", KSTAT_DATA_UINT64 }, 382185029Spjd { "l2_free_on_write", KSTAT_DATA_UINT64 }, 383185029Spjd { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 384185029Spjd { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 385185029Spjd { "l2_io_error", KSTAT_DATA_UINT64 }, 386185029Spjd { "l2_size", KSTAT_DATA_UINT64 }, 387185029Spjd { "l2_hdr_size", KSTAT_DATA_UINT64 }, 
388205231Skmacy { "memory_throttle_count", KSTAT_DATA_UINT64 }, 389206796Spjd { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 390206796Spjd { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 391206796Spjd { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 392206796Spjd { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 393206796Spjd { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 394206796Spjd { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 395206796Spjd { "l2_write_full", KSTAT_DATA_UINT64 }, 396206796Spjd { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 397206796Spjd { "l2_write_pios", KSTAT_DATA_UINT64 }, 398206796Spjd { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 399206796Spjd { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 400206796Spjd { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 } 401168404Spjd}; 402168404Spjd 403168404Spjd#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 404168404Spjd 405168404Spjd#define ARCSTAT_INCR(stat, val) \ 406168404Spjd atomic_add_64(&arc_stats.stat.value.ui64, (val)); 407168404Spjd 408206796Spjd#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 409168404Spjd#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 410168404Spjd 411168404Spjd#define ARCSTAT_MAX(stat, val) { \ 412168404Spjd uint64_t m; \ 413168404Spjd while ((val) > (m = arc_stats.stat.value.ui64) && \ 414168404Spjd (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 415168404Spjd continue; \ 416168404Spjd} 417168404Spjd 418168404Spjd#define ARCSTAT_MAXSTAT(stat) \ 419168404Spjd ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 420168404Spjd 421168404Spjd/* 422168404Spjd * We define a macro to allow ARC hits/misses to be easily broken down by 423168404Spjd * two separate conditions, giving a total of four different subtypes for 424168404Spjd * each of hits and misses (so eight statistics total). 
425168404Spjd */ 426168404Spjd#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 427168404Spjd if (cond1) { \ 428168404Spjd if (cond2) { \ 429168404Spjd ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 430168404Spjd } else { \ 431168404Spjd ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 432168404Spjd } \ 433168404Spjd } else { \ 434168404Spjd if (cond2) { \ 435168404Spjd ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 436168404Spjd } else { \ 437168404Spjd ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 438168404Spjd } \ 439168404Spjd } 440168404Spjd 441168404Spjdkstat_t *arc_ksp; 442206796Spjdstatic arc_state_t *arc_anon; 443168404Spjdstatic arc_state_t *arc_mru; 444168404Spjdstatic arc_state_t *arc_mru_ghost; 445168404Spjdstatic arc_state_t *arc_mfu; 446168404Spjdstatic arc_state_t *arc_mfu_ghost; 447185029Spjdstatic arc_state_t *arc_l2c_only; 448168404Spjd 449168404Spjd/* 450168404Spjd * There are several ARC variables that are critical to export as kstats -- 451168404Spjd * but we don't want to have to grovel around in the kstat whenever we wish to 452168404Spjd * manipulate them. For these variables, we therefore define them to be in 453168404Spjd * terms of the statistic variable. This assures that we are not introducing 454168404Spjd * the possibility of inconsistency by having shadow copies of the variables, 455168404Spjd * while still allowing the code to be readable. 
456168404Spjd */ 457168404Spjd#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 458168404Spjd#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 459168404Spjd#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 460168404Spjd#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 461168404Spjd#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 462168404Spjd 463168404Spjdstatic int arc_no_grow; /* Don't try to grow cache size */ 464168404Spjdstatic uint64_t arc_tempreserve; 465185029Spjdstatic uint64_t arc_meta_used; 466185029Spjdstatic uint64_t arc_meta_limit; 467185029Spjdstatic uint64_t arc_meta_max = 0; 468185029SpjdSYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN, 469185029Spjd &arc_meta_used, 0, "ARC metadata used"); 470185029SpjdSYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN, 471185029Spjd &arc_meta_limit, 0, "ARC metadata limit"); 472168404Spjd 473185029Spjdtypedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 474185029Spjd 475168404Spjdtypedef struct arc_callback arc_callback_t; 476168404Spjd 477168404Spjdstruct arc_callback { 478168404Spjd void *acb_private; 479168404Spjd arc_done_func_t *acb_done; 480168404Spjd arc_buf_t *acb_buf; 481168404Spjd zio_t *acb_zio_dummy; 482168404Spjd arc_callback_t *acb_next; 483168404Spjd}; 484168404Spjd 485168404Spjdtypedef struct arc_write_callback arc_write_callback_t; 486168404Spjd 487168404Spjdstruct arc_write_callback { 488168404Spjd void *awcb_private; 489168404Spjd arc_done_func_t *awcb_ready; 490168404Spjd arc_done_func_t *awcb_done; 491168404Spjd arc_buf_t *awcb_buf; 492168404Spjd}; 493168404Spjd 494168404Spjdstruct arc_buf_hdr { 495168404Spjd /* protected by hash lock */ 496168404Spjd dva_t b_dva; 497168404Spjd uint64_t b_birth; 498168404Spjd uint64_t b_cksum0; 499168404Spjd 500168404Spjd kmutex_t b_freeze_lock; 501168404Spjd zio_cksum_t *b_freeze_cksum; 502168404Spjd 503168404Spjd arc_buf_hdr_t *b_hash_next; 504168404Spjd arc_buf_t 
*b_buf; 505168404Spjd uint32_t b_flags; 506168404Spjd uint32_t b_datacnt; 507168404Spjd 508168404Spjd arc_callback_t *b_acb; 509168404Spjd kcondvar_t b_cv; 510168404Spjd 511168404Spjd /* immutable */ 512168404Spjd arc_buf_contents_t b_type; 513168404Spjd uint64_t b_size; 514168404Spjd spa_t *b_spa; 515168404Spjd 516168404Spjd /* protected by arc state mutex */ 517168404Spjd arc_state_t *b_state; 518168404Spjd list_node_t b_arc_node; 519168404Spjd 520168404Spjd /* updated atomically */ 521168404Spjd clock_t b_arc_access; 522168404Spjd 523168404Spjd /* self protecting */ 524168404Spjd refcount_t b_refcnt; 525185029Spjd 526185029Spjd l2arc_buf_hdr_t *b_l2hdr; 527185029Spjd list_node_t b_l2node; 528168404Spjd}; 529168404Spjd 530168404Spjdstatic arc_buf_t *arc_eviction_list; 531168404Spjdstatic kmutex_t arc_eviction_mtx; 532168404Spjdstatic arc_buf_hdr_t arc_eviction_hdr; 533168404Spjdstatic void arc_get_data_buf(arc_buf_t *buf); 534168404Spjdstatic void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 535185029Spjdstatic int arc_evict_needed(arc_buf_contents_t type); 536185029Spjdstatic void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); 537168404Spjd 538208373Smmstatic boolean_t l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab); 539208373Smm 540168404Spjd#define GHOST_STATE(state) \ 541185029Spjd ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 542185029Spjd (state) == arc_l2c_only) 543168404Spjd 544168404Spjd/* 545168404Spjd * Private ARC flags. These flags are private ARC only flags that will show up 546168404Spjd * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 547168404Spjd * be passed in as arc_flags in things like arc_read. However, these flags 548168404Spjd * should never be passed and should only be set by ARC code. When adding new 549168404Spjd * public flags, make sure not to smash the private ones. 
550168404Spjd */ 551168404Spjd 552168404Spjd#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 553168404Spjd#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 554168404Spjd#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 555168404Spjd#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 556168404Spjd#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 557168404Spjd#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 558185029Spjd#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 559185029Spjd#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 560185029Spjd#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 561185029Spjd#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 562185029Spjd#define ARC_STORED (1 << 19) /* has been store()d to */ 563168404Spjd 564168404Spjd#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 565168404Spjd#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 566168404Spjd#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 567208373Smm#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 568168404Spjd#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 569168404Spjd#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 570185029Spjd#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 571185029Spjd#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 572185029Spjd#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 573185029Spjd (hdr)->b_l2hdr != NULL) 574185029Spjd#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 575185029Spjd#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 576185029Spjd#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 577168404Spjd 578168404Spjd/* 579185029Spjd * Other sizes 580185029Spjd */ 581185029Spjd 582185029Spjd#define HDR_SIZE ((int64_t)sizeof 
(arc_buf_hdr_t)) 583185029Spjd#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 584185029Spjd 585185029Spjd/* 586168404Spjd * Hash table routines 587168404Spjd */ 588168404Spjd 589205253Skmacy#define HT_LOCK_PAD CACHE_LINE_SIZE 590168404Spjd 591168404Spjdstruct ht_lock { 592168404Spjd kmutex_t ht_lock; 593168404Spjd#ifdef _KERNEL 594168404Spjd unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 595168404Spjd#endif 596168404Spjd}; 597168404Spjd 598168404Spjd#define BUF_LOCKS 256 599168404Spjdtypedef struct buf_hash_table { 600168404Spjd uint64_t ht_mask; 601168404Spjd arc_buf_hdr_t **ht_table; 602205264Skmacy struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 603168404Spjd} buf_hash_table_t; 604168404Spjd 605168404Spjdstatic buf_hash_table_t buf_hash_table; 606168404Spjd 607168404Spjd#define BUF_HASH_INDEX(spa, dva, birth) \ 608168404Spjd (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 609168404Spjd#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 610168404Spjd#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 611168404Spjd#define HDR_LOCK(buf) \ 612168404Spjd (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 613168404Spjd 614168404Spjduint64_t zfs_crc64_table[256]; 615168404Spjd 616185029Spjd/* 617185029Spjd * Level 2 ARC 618185029Spjd */ 619185029Spjd 620208373Smm#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 621208373Smm#define L2ARC_HEADROOM 2 /* num of writes */ 622208373Smm#define L2ARC_FEED_SECS 1 /* caching interval secs */ 623208373Smm#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 624185029Spjd 625185029Spjd#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 626185029Spjd#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 627185029Spjd 628185029Spjd/* 629185029Spjd * L2ARC Performance Tunables 630185029Spjd */ 631185029Spjduint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 632185029Spjduint64_t 
l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 633185029Spjduint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 634185029Spjduint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 635208373Smmuint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 636205231Skmacyboolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */ 637208373Smmboolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 638208373Smmboolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 639185029Spjd 640205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 641205231Skmacy &l2arc_write_max, 0, "max write size"); 642205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 643205231Skmacy &l2arc_write_boost, 0, "extra write during warmup"); 644205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 645205231Skmacy &l2arc_headroom, 0, "number of dev writes"); 646205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 647205231Skmacy &l2arc_feed_secs, 0, "interval seconds"); 648208373SmmSYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 649208373Smm &l2arc_feed_min_ms, 0, "min interval milliseconds"); 650205231Skmacy 651205231SkmacySYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 652205231Skmacy &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 653208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 654208373Smm &l2arc_feed_again, 0, "turbo warmup"); 655208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 656208373Smm &l2arc_norw, 0, "no reads during writes"); 657205231Skmacy 658205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 659205231Skmacy &ARC_anon.arcs_size, 0, "size of anonymous state"); 660205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 661205231Skmacy &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 
662205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 663205231Skmacy &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 664205231Skmacy 665205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 666205231Skmacy &ARC_mru.arcs_size, 0, "size of mru state"); 667205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 668205231Skmacy &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 669205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 670205231Skmacy &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 671205231Skmacy 672205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 673205231Skmacy &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 674205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 675205231Skmacy &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 676205231Skmacy "size of metadata in mru ghost state"); 677205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 678205231Skmacy &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 679205231Skmacy "size of data in mru ghost state"); 680205231Skmacy 681205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 682205231Skmacy &ARC_mfu.arcs_size, 0, "size of mfu state"); 683205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 684205231Skmacy &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 685205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 686205231Skmacy &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 687205231Skmacy 688205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 689205231Skmacy &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 690205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 691205231Skmacy &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 
692205231Skmacy "size of metadata in mfu ghost state"); 693205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 694205231Skmacy &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 695205231Skmacy "size of data in mfu ghost state"); 696205231Skmacy 697205231SkmacySYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 698205231Skmacy &ARC_l2c_only.arcs_size, 0, "size of mru state"); 699205231Skmacy 700185029Spjd/* 701185029Spjd * L2ARC Internals 702185029Spjd */ 703185029Spjdtypedef struct l2arc_dev { 704185029Spjd vdev_t *l2ad_vdev; /* vdev */ 705185029Spjd spa_t *l2ad_spa; /* spa */ 706185029Spjd uint64_t l2ad_hand; /* next write location */ 707185029Spjd uint64_t l2ad_write; /* desired write size, bytes */ 708185029Spjd uint64_t l2ad_boost; /* warmup write boost, bytes */ 709185029Spjd uint64_t l2ad_start; /* first addr on device */ 710185029Spjd uint64_t l2ad_end; /* last addr on device */ 711185029Spjd uint64_t l2ad_evict; /* last addr eviction reached */ 712185029Spjd boolean_t l2ad_first; /* first sweep through */ 713208373Smm boolean_t l2ad_writing; /* currently writing */ 714185029Spjd list_t *l2ad_buflist; /* buffer list */ 715185029Spjd list_node_t l2ad_node; /* device list node */ 716185029Spjd} l2arc_dev_t; 717185029Spjd 718185029Spjdstatic list_t L2ARC_dev_list; /* device list */ 719185029Spjdstatic list_t *l2arc_dev_list; /* device list pointer */ 720185029Spjdstatic kmutex_t l2arc_dev_mtx; /* device list mutex */ 721185029Spjdstatic l2arc_dev_t *l2arc_dev_last; /* last device used */ 722185029Spjdstatic kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 723185029Spjdstatic list_t L2ARC_free_on_write; /* free after write buf list */ 724185029Spjdstatic list_t *l2arc_free_on_write; /* free after write list ptr */ 725185029Spjdstatic kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 726185029Spjdstatic uint64_t l2arc_ndev; /* number of devices */ 727185029Spjd 728185029Spjdtypedef struct l2arc_read_callback { 
729185029Spjd arc_buf_t *l2rcb_buf; /* read buffer */ 730185029Spjd spa_t *l2rcb_spa; /* spa */ 731185029Spjd blkptr_t l2rcb_bp; /* original blkptr */ 732185029Spjd zbookmark_t l2rcb_zb; /* original bookmark */ 733185029Spjd int l2rcb_flags; /* original flags */ 734185029Spjd} l2arc_read_callback_t; 735185029Spjd 736185029Spjdtypedef struct l2arc_write_callback { 737185029Spjd l2arc_dev_t *l2wcb_dev; /* device info */ 738185029Spjd arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 739185029Spjd} l2arc_write_callback_t; 740185029Spjd 741185029Spjdstruct l2arc_buf_hdr { 742185029Spjd /* protected by arc_buf_hdr mutex */ 743185029Spjd l2arc_dev_t *b_dev; /* L2ARC device */ 744208373Smm uint64_t b_daddr; /* disk address, offset byte */ 745185029Spjd}; 746185029Spjd 747185029Spjdtypedef struct l2arc_data_free { 748185029Spjd /* protected by l2arc_free_on_write_mtx */ 749185029Spjd void *l2df_data; 750185029Spjd size_t l2df_size; 751185029Spjd void (*l2df_func)(void *, size_t); 752185029Spjd list_node_t l2df_list_node; 753185029Spjd} l2arc_data_free_t; 754185029Spjd 755185029Spjdstatic kmutex_t l2arc_feed_thr_lock; 756185029Spjdstatic kcondvar_t l2arc_feed_thr_cv; 757185029Spjdstatic uint8_t l2arc_thread_exit; 758185029Spjd 759185029Spjdstatic void l2arc_read_done(zio_t *zio); 760185029Spjdstatic void l2arc_hdr_stat_add(void); 761185029Spjdstatic void l2arc_hdr_stat_remove(void); 762185029Spjd 763168404Spjdstatic uint64_t 764185029Spjdbuf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) 765168404Spjd{ 766168404Spjd uintptr_t spav = (uintptr_t)spa; 767168404Spjd uint8_t *vdva = (uint8_t *)dva; 768168404Spjd uint64_t crc = -1ULL; 769168404Spjd int i; 770168404Spjd 771168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 772168404Spjd 773168404Spjd for (i = 0; i < sizeof (dva_t); i++) 774168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 775168404Spjd 776168404Spjd crc ^= (spav>>8) ^ birth; 777168404Spjd 778168404Spjd return (crc); 
779168404Spjd} 780168404Spjd 781168404Spjd#define BUF_EMPTY(buf) \ 782168404Spjd ((buf)->b_dva.dva_word[0] == 0 && \ 783168404Spjd (buf)->b_dva.dva_word[1] == 0 && \ 784168404Spjd (buf)->b_birth == 0) 785168404Spjd 786168404Spjd#define BUF_EQUAL(spa, dva, birth, buf) \ 787168404Spjd ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 788168404Spjd ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 789168404Spjd ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 790168404Spjd 791168404Spjdstatic arc_buf_hdr_t * 792185029Spjdbuf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 793168404Spjd{ 794168404Spjd uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 795168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 796168404Spjd arc_buf_hdr_t *buf; 797168404Spjd 798168404Spjd mutex_enter(hash_lock); 799168404Spjd for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 800168404Spjd buf = buf->b_hash_next) { 801168404Spjd if (BUF_EQUAL(spa, dva, birth, buf)) { 802168404Spjd *lockp = hash_lock; 803168404Spjd return (buf); 804168404Spjd } 805168404Spjd } 806168404Spjd mutex_exit(hash_lock); 807168404Spjd *lockp = NULL; 808168404Spjd return (NULL); 809168404Spjd} 810168404Spjd 811168404Spjd/* 812168404Spjd * Insert an entry into the hash table. If there is already an element 813168404Spjd * equal to elem in the hash table, then the already existing element 814168404Spjd * will be returned and the new element will not be inserted. 815168404Spjd * Otherwise returns NULL. 
816168404Spjd */ 817168404Spjdstatic arc_buf_hdr_t * 818168404Spjdbuf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 819168404Spjd{ 820168404Spjd uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 821168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 822168404Spjd arc_buf_hdr_t *fbuf; 823168404Spjd uint32_t i; 824168404Spjd 825168404Spjd ASSERT(!HDR_IN_HASH_TABLE(buf)); 826168404Spjd *lockp = hash_lock; 827168404Spjd mutex_enter(hash_lock); 828168404Spjd for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 829168404Spjd fbuf = fbuf->b_hash_next, i++) { 830168404Spjd if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 831168404Spjd return (fbuf); 832168404Spjd } 833168404Spjd 834168404Spjd buf->b_hash_next = buf_hash_table.ht_table[idx]; 835168404Spjd buf_hash_table.ht_table[idx] = buf; 836168404Spjd buf->b_flags |= ARC_IN_HASH_TABLE; 837168404Spjd 838168404Spjd /* collect some hash table performance data */ 839168404Spjd if (i > 0) { 840168404Spjd ARCSTAT_BUMP(arcstat_hash_collisions); 841168404Spjd if (i == 1) 842168404Spjd ARCSTAT_BUMP(arcstat_hash_chains); 843168404Spjd 844168404Spjd ARCSTAT_MAX(arcstat_hash_chain_max, i); 845168404Spjd } 846168404Spjd 847168404Spjd ARCSTAT_BUMP(arcstat_hash_elements); 848168404Spjd ARCSTAT_MAXSTAT(arcstat_hash_elements); 849168404Spjd 850168404Spjd return (NULL); 851168404Spjd} 852168404Spjd 853168404Spjdstatic void 854168404Spjdbuf_hash_remove(arc_buf_hdr_t *buf) 855168404Spjd{ 856168404Spjd arc_buf_hdr_t *fbuf, **bufp; 857168404Spjd uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 858168404Spjd 859168404Spjd ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 860168404Spjd ASSERT(HDR_IN_HASH_TABLE(buf)); 861168404Spjd 862168404Spjd bufp = &buf_hash_table.ht_table[idx]; 863168404Spjd while ((fbuf = *bufp) != buf) { 864168404Spjd ASSERT(fbuf != NULL); 865168404Spjd bufp = &fbuf->b_hash_next; 866168404Spjd } 867168404Spjd *bufp = buf->b_hash_next; 868168404Spjd buf->b_hash_next 
= NULL; 869168404Spjd buf->b_flags &= ~ARC_IN_HASH_TABLE; 870168404Spjd 871168404Spjd /* collect some hash table performance data */ 872168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_elements); 873168404Spjd 874168404Spjd if (buf_hash_table.ht_table[idx] && 875168404Spjd buf_hash_table.ht_table[idx]->b_hash_next == NULL) 876168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_chains); 877168404Spjd} 878168404Spjd 879168404Spjd/* 880168404Spjd * Global data structures and functions for the buf kmem cache. 881168404Spjd */ 882168404Spjdstatic kmem_cache_t *hdr_cache; 883168404Spjdstatic kmem_cache_t *buf_cache; 884168404Spjd 885168404Spjdstatic void 886168404Spjdbuf_fini(void) 887168404Spjd{ 888168404Spjd int i; 889168404Spjd 890168404Spjd kmem_free(buf_hash_table.ht_table, 891168404Spjd (buf_hash_table.ht_mask + 1) * sizeof (void *)); 892168404Spjd for (i = 0; i < BUF_LOCKS; i++) 893168404Spjd mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 894168404Spjd kmem_cache_destroy(hdr_cache); 895168404Spjd kmem_cache_destroy(buf_cache); 896168404Spjd} 897168404Spjd 898168404Spjd/* 899168404Spjd * Constructor callback - called when the cache is empty 900168404Spjd * and a new buf is requested. 
901168404Spjd */ 902168404Spjd/* ARGSUSED */ 903168404Spjdstatic int 904168404Spjdhdr_cons(void *vbuf, void *unused, int kmflag) 905168404Spjd{ 906168404Spjd arc_buf_hdr_t *buf = vbuf; 907168404Spjd 908168404Spjd bzero(buf, sizeof (arc_buf_hdr_t)); 909168404Spjd refcount_create(&buf->b_refcnt); 910168404Spjd cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 911185029Spjd mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 912208373Smm arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 913185029Spjd 914168404Spjd return (0); 915168404Spjd} 916168404Spjd 917185029Spjd/* ARGSUSED */ 918185029Spjdstatic int 919185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag) 920185029Spjd{ 921185029Spjd arc_buf_t *buf = vbuf; 922185029Spjd 923185029Spjd bzero(buf, sizeof (arc_buf_t)); 924185029Spjd rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); 925208373Smm arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 926208373Smm 927185029Spjd return (0); 928185029Spjd} 929185029Spjd 930168404Spjd/* 931168404Spjd * Destructor callback - called when a cached buf is 932168404Spjd * no longer required. 933168404Spjd */ 934168404Spjd/* ARGSUSED */ 935168404Spjdstatic void 936168404Spjdhdr_dest(void *vbuf, void *unused) 937168404Spjd{ 938168404Spjd arc_buf_hdr_t *buf = vbuf; 939168404Spjd 940168404Spjd refcount_destroy(&buf->b_refcnt); 941168404Spjd cv_destroy(&buf->b_cv); 942185029Spjd mutex_destroy(&buf->b_freeze_lock); 943208373Smm arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 944168404Spjd} 945168404Spjd 946185029Spjd/* ARGSUSED */ 947185029Spjdstatic void 948185029Spjdbuf_dest(void *vbuf, void *unused) 949185029Spjd{ 950185029Spjd arc_buf_t *buf = vbuf; 951185029Spjd 952185029Spjd rw_destroy(&buf->b_lock); 953208373Smm arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 954185029Spjd} 955185029Spjd 956168404Spjd/* 957168404Spjd * Reclaim callback -- invoked when memory is low. 
958168404Spjd */ 959168404Spjd/* ARGSUSED */ 960168404Spjdstatic void 961168404Spjdhdr_recl(void *unused) 962168404Spjd{ 963168404Spjd dprintf("hdr_recl called\n"); 964168404Spjd /* 965168404Spjd * umem calls the reclaim func when we destroy the buf cache, 966168404Spjd * which is after we do arc_fini(). 967168404Spjd */ 968168404Spjd if (!arc_dead) 969168404Spjd cv_signal(&arc_reclaim_thr_cv); 970168404Spjd} 971168404Spjd 972168404Spjdstatic void 973168404Spjdbuf_init(void) 974168404Spjd{ 975168404Spjd uint64_t *ct; 976168404Spjd uint64_t hsize = 1ULL << 12; 977168404Spjd int i, j; 978168404Spjd 979168404Spjd /* 980168404Spjd * The hash table is big enough to fill all of physical memory 981168404Spjd * with an average 64K block size. The table will take up 982168404Spjd * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 983168404Spjd */ 984168696Spjd while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) 985168404Spjd hsize <<= 1; 986168404Spjdretry: 987168404Spjd buf_hash_table.ht_mask = hsize - 1; 988168404Spjd buf_hash_table.ht_table = 989168404Spjd kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 990168404Spjd if (buf_hash_table.ht_table == NULL) { 991168404Spjd ASSERT(hsize > (1ULL << 8)); 992168404Spjd hsize >>= 1; 993168404Spjd goto retry; 994168404Spjd } 995168404Spjd 996168404Spjd hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 997168404Spjd 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 998168404Spjd buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 999185029Spjd 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1000168404Spjd 1001168404Spjd for (i = 0; i < 256; i++) 1002168404Spjd for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1003168404Spjd *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1004168404Spjd 1005168404Spjd for (i = 0; i < BUF_LOCKS; i++) { 1006168404Spjd mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1007168404Spjd NULL, MUTEX_DEFAULT, NULL); 1008168404Spjd } 1009168404Spjd} 
1010168404Spjd 1011168404Spjd#define ARC_MINTIME (hz>>4) /* 62 ms */ 1012168404Spjd 1013168404Spjdstatic void 1014168404Spjdarc_cksum_verify(arc_buf_t *buf) 1015168404Spjd{ 1016168404Spjd zio_cksum_t zc; 1017168404Spjd 1018168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1019168404Spjd return; 1020168404Spjd 1021168404Spjd mutex_enter(&buf->b_hdr->b_freeze_lock); 1022168404Spjd if (buf->b_hdr->b_freeze_cksum == NULL || 1023168404Spjd (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 1024168404Spjd mutex_exit(&buf->b_hdr->b_freeze_lock); 1025168404Spjd return; 1026168404Spjd } 1027168404Spjd fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1028168404Spjd if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1029168404Spjd panic("buffer modified while frozen!"); 1030168404Spjd mutex_exit(&buf->b_hdr->b_freeze_lock); 1031168404Spjd} 1032168404Spjd 1033185029Spjdstatic int 1034185029Spjdarc_cksum_equal(arc_buf_t *buf) 1035185029Spjd{ 1036185029Spjd zio_cksum_t zc; 1037185029Spjd int equal; 1038185029Spjd 1039185029Spjd mutex_enter(&buf->b_hdr->b_freeze_lock); 1040185029Spjd fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1041185029Spjd equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1042185029Spjd mutex_exit(&buf->b_hdr->b_freeze_lock); 1043185029Spjd 1044185029Spjd return (equal); 1045185029Spjd} 1046185029Spjd 1047168404Spjdstatic void 1048185029Spjdarc_cksum_compute(arc_buf_t *buf, boolean_t force) 1049168404Spjd{ 1050185029Spjd if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1051168404Spjd return; 1052168404Spjd 1053168404Spjd mutex_enter(&buf->b_hdr->b_freeze_lock); 1054168404Spjd if (buf->b_hdr->b_freeze_cksum != NULL) { 1055168404Spjd mutex_exit(&buf->b_hdr->b_freeze_lock); 1056168404Spjd return; 1057168404Spjd } 1058168404Spjd buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1059168404Spjd fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1060168404Spjd buf->b_hdr->b_freeze_cksum); 1061168404Spjd 
mutex_exit(&buf->b_hdr->b_freeze_lock); 1062168404Spjd} 1063168404Spjd 1064168404Spjdvoid 1065168404Spjdarc_buf_thaw(arc_buf_t *buf) 1066168404Spjd{ 1067185029Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 1068185029Spjd if (buf->b_hdr->b_state != arc_anon) 1069185029Spjd panic("modifying non-anon buffer!"); 1070185029Spjd if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1071185029Spjd panic("modifying buffer while i/o in progress!"); 1072185029Spjd arc_cksum_verify(buf); 1073185029Spjd } 1074168404Spjd 1075168404Spjd mutex_enter(&buf->b_hdr->b_freeze_lock); 1076168404Spjd if (buf->b_hdr->b_freeze_cksum != NULL) { 1077168404Spjd kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1078168404Spjd buf->b_hdr->b_freeze_cksum = NULL; 1079168404Spjd } 1080168404Spjd mutex_exit(&buf->b_hdr->b_freeze_lock); 1081168404Spjd} 1082168404Spjd 1083168404Spjdvoid 1084168404Spjdarc_buf_freeze(arc_buf_t *buf) 1085168404Spjd{ 1086168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1087168404Spjd return; 1088168404Spjd 1089168404Spjd ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1090168404Spjd buf->b_hdr->b_state == arc_anon); 1091185029Spjd arc_cksum_compute(buf, B_FALSE); 1092168404Spjd} 1093168404Spjd 1094168404Spjdstatic void 1095205231Skmacyget_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) 1096205231Skmacy{ 1097205231Skmacy uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); 1098205231Skmacy 1099206796Spjd if (ab->b_type == ARC_BUFC_METADATA) 1100206796Spjd buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1101205231Skmacy else { 1102206796Spjd buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1103205231Skmacy buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1104205231Skmacy } 1105205231Skmacy 1106205231Skmacy *list = &state->arcs_lists[buf_hashid]; 1107205231Skmacy *lock = ARCS_LOCK(state, buf_hashid); 1108205231Skmacy} 1109205231Skmacy 1110205231Skmacy 1111205231Skmacystatic void 1112168404Spjdadd_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, 
void *tag) 1113168404Spjd{ 1114205231Skmacy 1115168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 1116168404Spjd 1117168404Spjd if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1118168404Spjd (ab->b_state != arc_anon)) { 1119206796Spjd uint64_t delta = ab->b_size * ab->b_datacnt; 1120206796Spjd uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 1121205231Skmacy list_t *list; 1122205231Skmacy kmutex_t *lock; 1123168404Spjd 1124205231Skmacy get_buf_info(ab, ab->b_state, &list, &lock); 1125205231Skmacy ASSERT(!MUTEX_HELD(lock)); 1126205231Skmacy mutex_enter(lock); 1127168404Spjd ASSERT(list_link_active(&ab->b_arc_node)); 1128185029Spjd list_remove(list, ab); 1129168404Spjd if (GHOST_STATE(ab->b_state)) { 1130168404Spjd ASSERT3U(ab->b_datacnt, ==, 0); 1131168404Spjd ASSERT3P(ab->b_buf, ==, NULL); 1132168404Spjd delta = ab->b_size; 1133168404Spjd } 1134168404Spjd ASSERT(delta > 0); 1135185029Spjd ASSERT3U(*size, >=, delta); 1136185029Spjd atomic_add_64(size, -delta); 1137206794Spjd mutex_exit(lock); 1138185029Spjd /* remove the prefetch flag if we get a reference */ 1139168404Spjd if (ab->b_flags & ARC_PREFETCH) 1140168404Spjd ab->b_flags &= ~ARC_PREFETCH; 1141168404Spjd } 1142168404Spjd} 1143168404Spjd 1144168404Spjdstatic int 1145168404Spjdremove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1146168404Spjd{ 1147168404Spjd int cnt; 1148168404Spjd arc_state_t *state = ab->b_state; 1149168404Spjd 1150168404Spjd ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1151168404Spjd ASSERT(!GHOST_STATE(state)); 1152168404Spjd 1153168404Spjd if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1154168404Spjd (state != arc_anon)) { 1155185029Spjd uint64_t *size = &state->arcs_lsize[ab->b_type]; 1156205231Skmacy list_t *list; 1157205231Skmacy kmutex_t *lock; 1158185029Spjd 1159205231Skmacy get_buf_info(ab, state, &list, &lock); 1160205231Skmacy ASSERT(!MUTEX_HELD(lock)); 1161205231Skmacy mutex_enter(lock); 1162168404Spjd ASSERT(!list_link_active(&ab->b_arc_node)); 
1163205231Skmacy list_insert_head(list, ab); 1164168404Spjd ASSERT(ab->b_datacnt > 0); 1165185029Spjd atomic_add_64(size, ab->b_size * ab->b_datacnt); 1166206794Spjd mutex_exit(lock); 1167168404Spjd } 1168168404Spjd return (cnt); 1169168404Spjd} 1170168404Spjd 1171168404Spjd/* 1172168404Spjd * Move the supplied buffer to the indicated state. The mutex 1173168404Spjd * for the buffer must be held by the caller. 1174168404Spjd */ 1175168404Spjdstatic void 1176168404Spjdarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1177168404Spjd{ 1178168404Spjd arc_state_t *old_state = ab->b_state; 1179168404Spjd int64_t refcnt = refcount_count(&ab->b_refcnt); 1180168404Spjd uint64_t from_delta, to_delta; 1181205231Skmacy list_t *list; 1182205231Skmacy kmutex_t *lock; 1183168404Spjd 1184168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 1185168404Spjd ASSERT(new_state != old_state); 1186168404Spjd ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1187168404Spjd ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1188168404Spjd 1189168404Spjd from_delta = to_delta = ab->b_datacnt * ab->b_size; 1190168404Spjd 1191168404Spjd /* 1192168404Spjd * If this buffer is evictable, transfer it from the 1193168404Spjd * old state list to the new state list. 1194168404Spjd */ 1195168404Spjd if (refcnt == 0) { 1196168404Spjd if (old_state != arc_anon) { 1197205231Skmacy int use_mutex; 1198185029Spjd uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1199168404Spjd 1200205231Skmacy get_buf_info(ab, old_state, &list, &lock); 1201205231Skmacy use_mutex = !MUTEX_HELD(lock); 1202168404Spjd if (use_mutex) 1203205231Skmacy mutex_enter(lock); 1204168404Spjd 1205168404Spjd ASSERT(list_link_active(&ab->b_arc_node)); 1206205231Skmacy list_remove(list, ab); 1207168404Spjd 1208168404Spjd /* 1209168404Spjd * If prefetching out of the ghost cache, 1210168404Spjd * we will have a non-null datacnt. 
1211168404Spjd */ 1212168404Spjd if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1213168404Spjd /* ghost elements have a ghost size */ 1214168404Spjd ASSERT(ab->b_buf == NULL); 1215168404Spjd from_delta = ab->b_size; 1216168404Spjd } 1217185029Spjd ASSERT3U(*size, >=, from_delta); 1218185029Spjd atomic_add_64(size, -from_delta); 1219168404Spjd 1220168404Spjd if (use_mutex) 1221205231Skmacy mutex_exit(lock); 1222168404Spjd } 1223168404Spjd if (new_state != arc_anon) { 1224206796Spjd int use_mutex; 1225185029Spjd uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1226168404Spjd 1227205231Skmacy get_buf_info(ab, new_state, &list, &lock); 1228205231Skmacy use_mutex = !MUTEX_HELD(lock); 1229168404Spjd if (use_mutex) 1230205231Skmacy mutex_enter(lock); 1231168404Spjd 1232205231Skmacy list_insert_head(list, ab); 1233168404Spjd 1234168404Spjd /* ghost elements have a ghost size */ 1235168404Spjd if (GHOST_STATE(new_state)) { 1236168404Spjd ASSERT(ab->b_datacnt == 0); 1237168404Spjd ASSERT(ab->b_buf == NULL); 1238168404Spjd to_delta = ab->b_size; 1239168404Spjd } 1240185029Spjd atomic_add_64(size, to_delta); 1241168404Spjd 1242168404Spjd if (use_mutex) 1243205231Skmacy mutex_exit(lock); 1244168404Spjd } 1245168404Spjd } 1246168404Spjd 1247168404Spjd ASSERT(!BUF_EMPTY(ab)); 1248185029Spjd if (new_state == arc_anon) { 1249168404Spjd buf_hash_remove(ab); 1250168404Spjd } 1251168404Spjd 1252168404Spjd /* adjust state sizes */ 1253168404Spjd if (to_delta) 1254168404Spjd atomic_add_64(&new_state->arcs_size, to_delta); 1255168404Spjd if (from_delta) { 1256168404Spjd ASSERT3U(old_state->arcs_size, >=, from_delta); 1257168404Spjd atomic_add_64(&old_state->arcs_size, -from_delta); 1258168404Spjd } 1259168404Spjd ab->b_state = new_state; 1260185029Spjd 1261185029Spjd /* adjust l2arc hdr stats */ 1262185029Spjd if (new_state == arc_l2c_only) 1263185029Spjd l2arc_hdr_stat_add(); 1264185029Spjd else if (old_state == arc_l2c_only) 1265185029Spjd l2arc_hdr_stat_remove(); 
1266168404Spjd} 1267168404Spjd 1268185029Spjdvoid 1269208373Smmarc_space_consume(uint64_t space, arc_space_type_t type) 1270185029Spjd{ 1271208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1272208373Smm 1273208373Smm switch (type) { 1274208373Smm case ARC_SPACE_DATA: 1275208373Smm ARCSTAT_INCR(arcstat_data_size, space); 1276208373Smm break; 1277208373Smm case ARC_SPACE_OTHER: 1278208373Smm ARCSTAT_INCR(arcstat_other_size, space); 1279208373Smm break; 1280208373Smm case ARC_SPACE_HDRS: 1281208373Smm ARCSTAT_INCR(arcstat_hdr_size, space); 1282208373Smm break; 1283208373Smm case ARC_SPACE_L2HDRS: 1284208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1285208373Smm break; 1286208373Smm } 1287208373Smm 1288185029Spjd atomic_add_64(&arc_meta_used, space); 1289185029Spjd atomic_add_64(&arc_size, space); 1290185029Spjd} 1291185029Spjd 1292185029Spjdvoid 1293208373Smmarc_space_return(uint64_t space, arc_space_type_t type) 1294185029Spjd{ 1295208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1296208373Smm 1297208373Smm switch (type) { 1298208373Smm case ARC_SPACE_DATA: 1299208373Smm ARCSTAT_INCR(arcstat_data_size, -space); 1300208373Smm break; 1301208373Smm case ARC_SPACE_OTHER: 1302208373Smm ARCSTAT_INCR(arcstat_other_size, -space); 1303208373Smm break; 1304208373Smm case ARC_SPACE_HDRS: 1305208373Smm ARCSTAT_INCR(arcstat_hdr_size, -space); 1306208373Smm break; 1307208373Smm case ARC_SPACE_L2HDRS: 1308208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1309208373Smm break; 1310208373Smm } 1311208373Smm 1312185029Spjd ASSERT(arc_meta_used >= space); 1313185029Spjd if (arc_meta_max < arc_meta_used) 1314185029Spjd arc_meta_max = arc_meta_used; 1315185029Spjd atomic_add_64(&arc_meta_used, -space); 1316185029Spjd ASSERT(arc_size >= space); 1317185029Spjd atomic_add_64(&arc_size, -space); 1318185029Spjd} 1319185029Spjd 1320185029Spjdvoid * 1321185029Spjdarc_data_buf_alloc(uint64_t size) 1322185029Spjd{ 1323185029Spjd if (arc_evict_needed(ARC_BUFC_DATA)) 
1324185029Spjd cv_signal(&arc_reclaim_thr_cv); 1325185029Spjd atomic_add_64(&arc_size, size); 1326185029Spjd return (zio_data_buf_alloc(size)); 1327185029Spjd} 1328185029Spjd 1329185029Spjdvoid 1330185029Spjdarc_data_buf_free(void *buf, uint64_t size) 1331185029Spjd{ 1332185029Spjd zio_data_buf_free(buf, size); 1333185029Spjd ASSERT(arc_size >= size); 1334185029Spjd atomic_add_64(&arc_size, -size); 1335185029Spjd} 1336185029Spjd 1337168404Spjdarc_buf_t * 1338168404Spjdarc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1339168404Spjd{ 1340168404Spjd arc_buf_hdr_t *hdr; 1341168404Spjd arc_buf_t *buf; 1342168404Spjd 1343168404Spjd ASSERT3U(size, >, 0); 1344185029Spjd hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1345168404Spjd ASSERT(BUF_EMPTY(hdr)); 1346168404Spjd hdr->b_size = size; 1347168404Spjd hdr->b_type = type; 1348168404Spjd hdr->b_spa = spa; 1349168404Spjd hdr->b_state = arc_anon; 1350168404Spjd hdr->b_arc_access = 0; 1351185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1352168404Spjd buf->b_hdr = hdr; 1353168404Spjd buf->b_data = NULL; 1354168404Spjd buf->b_efunc = NULL; 1355168404Spjd buf->b_private = NULL; 1356168404Spjd buf->b_next = NULL; 1357168404Spjd hdr->b_buf = buf; 1358168404Spjd arc_get_data_buf(buf); 1359168404Spjd hdr->b_datacnt = 1; 1360168404Spjd hdr->b_flags = 0; 1361168404Spjd ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1362168404Spjd (void) refcount_add(&hdr->b_refcnt, tag); 1363168404Spjd 1364168404Spjd return (buf); 1365168404Spjd} 1366168404Spjd 1367168404Spjdstatic arc_buf_t * 1368168404Spjdarc_buf_clone(arc_buf_t *from) 1369168404Spjd{ 1370168404Spjd arc_buf_t *buf; 1371168404Spjd arc_buf_hdr_t *hdr = from->b_hdr; 1372168404Spjd uint64_t size = hdr->b_size; 1373168404Spjd 1374185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1375168404Spjd buf->b_hdr = hdr; 1376168404Spjd buf->b_data = NULL; 1377168404Spjd buf->b_efunc = NULL; 1378168404Spjd buf->b_private = NULL; 1379168404Spjd buf->b_next = 
hdr->b_buf; 1380168404Spjd hdr->b_buf = buf; 1381168404Spjd arc_get_data_buf(buf); 1382168404Spjd bcopy(from->b_data, buf->b_data, size); 1383168404Spjd hdr->b_datacnt += 1; 1384168404Spjd return (buf); 1385168404Spjd} 1386168404Spjd 1387168404Spjdvoid 1388168404Spjdarc_buf_add_ref(arc_buf_t *buf, void* tag) 1389168404Spjd{ 1390168404Spjd arc_buf_hdr_t *hdr; 1391168404Spjd kmutex_t *hash_lock; 1392168404Spjd 1393168404Spjd /* 1394185029Spjd * Check to see if this buffer is evicted. Callers 1395185029Spjd * must verify b_data != NULL to know if the add_ref 1396185029Spjd * was successful. 1397168404Spjd */ 1398185029Spjd rw_enter(&buf->b_lock, RW_READER); 1399185029Spjd if (buf->b_data == NULL) { 1400185029Spjd rw_exit(&buf->b_lock); 1401168404Spjd return; 1402168404Spjd } 1403185029Spjd hdr = buf->b_hdr; 1404185029Spjd ASSERT(hdr != NULL); 1405168404Spjd hash_lock = HDR_LOCK(hdr); 1406168404Spjd mutex_enter(hash_lock); 1407185029Spjd rw_exit(&buf->b_lock); 1408168404Spjd 1409168404Spjd ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1410168404Spjd add_reference(hdr, hash_lock, tag); 1411208373Smm DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1412168404Spjd arc_access(hdr, hash_lock); 1413168404Spjd mutex_exit(hash_lock); 1414168404Spjd ARCSTAT_BUMP(arcstat_hits); 1415168404Spjd ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1416168404Spjd demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1417168404Spjd data, metadata, hits); 1418168404Spjd} 1419168404Spjd 1420185029Spjd/* 1421185029Spjd * Free the arc data buffer. If it is an l2arc write in progress, 1422185029Spjd * the buffer is placed on l2arc_free_on_write to be freed later. 
1423185029Spjd */ 1424168404Spjdstatic void 1425185029Spjdarc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), 1426185029Spjd void *data, size_t size) 1427185029Spjd{ 1428185029Spjd if (HDR_L2_WRITING(hdr)) { 1429185029Spjd l2arc_data_free_t *df; 1430185029Spjd df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1431185029Spjd df->l2df_data = data; 1432185029Spjd df->l2df_size = size; 1433185029Spjd df->l2df_func = free_func; 1434185029Spjd mutex_enter(&l2arc_free_on_write_mtx); 1435185029Spjd list_insert_head(l2arc_free_on_write, df); 1436185029Spjd mutex_exit(&l2arc_free_on_write_mtx); 1437185029Spjd ARCSTAT_BUMP(arcstat_l2_free_on_write); 1438185029Spjd } else { 1439185029Spjd free_func(data, size); 1440185029Spjd } 1441185029Spjd} 1442185029Spjd 1443185029Spjdstatic void 1444168404Spjdarc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1445168404Spjd{ 1446168404Spjd arc_buf_t **bufp; 1447168404Spjd 1448168404Spjd /* free up data associated with the buf */ 1449168404Spjd if (buf->b_data) { 1450168404Spjd arc_state_t *state = buf->b_hdr->b_state; 1451168404Spjd uint64_t size = buf->b_hdr->b_size; 1452168404Spjd arc_buf_contents_t type = buf->b_hdr->b_type; 1453168404Spjd 1454168404Spjd arc_cksum_verify(buf); 1455168404Spjd if (!recycle) { 1456168404Spjd if (type == ARC_BUFC_METADATA) { 1457185029Spjd arc_buf_data_free(buf->b_hdr, zio_buf_free, 1458185029Spjd buf->b_data, size); 1459208373Smm arc_space_return(size, ARC_SPACE_DATA); 1460168404Spjd } else { 1461168404Spjd ASSERT(type == ARC_BUFC_DATA); 1462185029Spjd arc_buf_data_free(buf->b_hdr, 1463185029Spjd zio_data_buf_free, buf->b_data, size); 1464208373Smm ARCSTAT_INCR(arcstat_data_size, -size); 1465185029Spjd atomic_add_64(&arc_size, -size); 1466168404Spjd } 1467168404Spjd } 1468168404Spjd if (list_link_active(&buf->b_hdr->b_arc_node)) { 1469185029Spjd uint64_t *cnt = &state->arcs_lsize[type]; 1470185029Spjd 1471168404Spjd ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 
1472168404Spjd ASSERT(state != arc_anon); 1473185029Spjd 1474185029Spjd ASSERT3U(*cnt, >=, size); 1475185029Spjd atomic_add_64(cnt, -size); 1476168404Spjd } 1477168404Spjd ASSERT3U(state->arcs_size, >=, size); 1478168404Spjd atomic_add_64(&state->arcs_size, -size); 1479168404Spjd buf->b_data = NULL; 1480168404Spjd ASSERT(buf->b_hdr->b_datacnt > 0); 1481168404Spjd buf->b_hdr->b_datacnt -= 1; 1482168404Spjd } 1483168404Spjd 1484168404Spjd /* only remove the buf if requested */ 1485168404Spjd if (!all) 1486168404Spjd return; 1487168404Spjd 1488168404Spjd /* remove the buf from the hdr list */ 1489168404Spjd for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1490168404Spjd continue; 1491168404Spjd *bufp = buf->b_next; 1492168404Spjd 1493168404Spjd ASSERT(buf->b_efunc == NULL); 1494168404Spjd 1495168404Spjd /* clean up the buf */ 1496168404Spjd buf->b_hdr = NULL; 1497168404Spjd kmem_cache_free(buf_cache, buf); 1498168404Spjd} 1499168404Spjd 1500168404Spjdstatic void 1501168404Spjdarc_hdr_destroy(arc_buf_hdr_t *hdr) 1502168404Spjd{ 1503168404Spjd ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1504168404Spjd ASSERT3P(hdr->b_state, ==, arc_anon); 1505168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1506185029Spjd ASSERT(!(hdr->b_flags & ARC_STORED)); 1507168404Spjd 1508185029Spjd if (hdr->b_l2hdr != NULL) { 1509185029Spjd if (!MUTEX_HELD(&l2arc_buflist_mtx)) { 1510185029Spjd /* 1511185029Spjd * To prevent arc_free() and l2arc_evict() from 1512185029Spjd * attempting to free the same buffer at the same time, 1513185029Spjd * a FREE_IN_PROGRESS flag is given to arc_free() to 1514185029Spjd * give it priority. l2arc_evict() can't destroy this 1515185029Spjd * header while we are waiting on l2arc_buflist_mtx. 1516185029Spjd * 1517185029Spjd * The hdr may be removed from l2ad_buflist before we 1518185029Spjd * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
1519185029Spjd */ 1520185029Spjd mutex_enter(&l2arc_buflist_mtx); 1521185029Spjd if (hdr->b_l2hdr != NULL) { 1522185029Spjd list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, 1523185029Spjd hdr); 1524185029Spjd } 1525185029Spjd mutex_exit(&l2arc_buflist_mtx); 1526185029Spjd } else { 1527185029Spjd list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); 1528185029Spjd } 1529185029Spjd ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1530185029Spjd kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); 1531185029Spjd if (hdr->b_state == arc_l2c_only) 1532185029Spjd l2arc_hdr_stat_remove(); 1533185029Spjd hdr->b_l2hdr = NULL; 1534185029Spjd } 1535185029Spjd 1536168404Spjd if (!BUF_EMPTY(hdr)) { 1537168404Spjd ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1538168404Spjd bzero(&hdr->b_dva, sizeof (dva_t)); 1539168404Spjd hdr->b_birth = 0; 1540168404Spjd hdr->b_cksum0 = 0; 1541168404Spjd } 1542168404Spjd while (hdr->b_buf) { 1543168404Spjd arc_buf_t *buf = hdr->b_buf; 1544168404Spjd 1545168404Spjd if (buf->b_efunc) { 1546168404Spjd mutex_enter(&arc_eviction_mtx); 1547185029Spjd rw_enter(&buf->b_lock, RW_WRITER); 1548168404Spjd ASSERT(buf->b_hdr != NULL); 1549168404Spjd arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1550168404Spjd hdr->b_buf = buf->b_next; 1551168404Spjd buf->b_hdr = &arc_eviction_hdr; 1552168404Spjd buf->b_next = arc_eviction_list; 1553168404Spjd arc_eviction_list = buf; 1554185029Spjd rw_exit(&buf->b_lock); 1555168404Spjd mutex_exit(&arc_eviction_mtx); 1556168404Spjd } else { 1557168404Spjd arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1558168404Spjd } 1559168404Spjd } 1560168404Spjd if (hdr->b_freeze_cksum != NULL) { 1561168404Spjd kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1562168404Spjd hdr->b_freeze_cksum = NULL; 1563168404Spjd } 1564168404Spjd 1565168404Spjd ASSERT(!list_link_active(&hdr->b_arc_node)); 1566168404Spjd ASSERT3P(hdr->b_hash_next, ==, NULL); 1567168404Spjd ASSERT3P(hdr->b_acb, ==, NULL); 1568168404Spjd kmem_cache_free(hdr_cache, hdr); 1569168404Spjd} 
1570168404Spjd 1571168404Spjdvoid 1572168404Spjdarc_buf_free(arc_buf_t *buf, void *tag) 1573168404Spjd{ 1574168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 1575168404Spjd int hashed = hdr->b_state != arc_anon; 1576168404Spjd 1577168404Spjd ASSERT(buf->b_efunc == NULL); 1578168404Spjd ASSERT(buf->b_data != NULL); 1579168404Spjd 1580168404Spjd if (hashed) { 1581168404Spjd kmutex_t *hash_lock = HDR_LOCK(hdr); 1582168404Spjd 1583168404Spjd mutex_enter(hash_lock); 1584168404Spjd (void) remove_reference(hdr, hash_lock, tag); 1585168404Spjd if (hdr->b_datacnt > 1) 1586168404Spjd arc_buf_destroy(buf, FALSE, TRUE); 1587168404Spjd else 1588168404Spjd hdr->b_flags |= ARC_BUF_AVAILABLE; 1589168404Spjd mutex_exit(hash_lock); 1590168404Spjd } else if (HDR_IO_IN_PROGRESS(hdr)) { 1591168404Spjd int destroy_hdr; 1592168404Spjd /* 1593168404Spjd * We are in the middle of an async write. Don't destroy 1594168404Spjd * this buffer unless the write completes before we finish 1595168404Spjd * decrementing the reference count. 
1596168404Spjd */ 1597168404Spjd mutex_enter(&arc_eviction_mtx); 1598168404Spjd (void) remove_reference(hdr, NULL, tag); 1599168404Spjd ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1600168404Spjd destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1601168404Spjd mutex_exit(&arc_eviction_mtx); 1602168404Spjd if (destroy_hdr) 1603168404Spjd arc_hdr_destroy(hdr); 1604168404Spjd } else { 1605168404Spjd if (remove_reference(hdr, NULL, tag) > 0) { 1606168404Spjd ASSERT(HDR_IO_ERROR(hdr)); 1607168404Spjd arc_buf_destroy(buf, FALSE, TRUE); 1608168404Spjd } else { 1609168404Spjd arc_hdr_destroy(hdr); 1610168404Spjd } 1611168404Spjd } 1612168404Spjd} 1613168404Spjd 1614168404Spjdint 1615168404Spjdarc_buf_remove_ref(arc_buf_t *buf, void* tag) 1616168404Spjd{ 1617168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 1618168404Spjd kmutex_t *hash_lock = HDR_LOCK(hdr); 1619168404Spjd int no_callback = (buf->b_efunc == NULL); 1620168404Spjd 1621168404Spjd if (hdr->b_state == arc_anon) { 1622168404Spjd arc_buf_free(buf, tag); 1623168404Spjd return (no_callback); 1624168404Spjd } 1625168404Spjd 1626168404Spjd mutex_enter(hash_lock); 1627168404Spjd ASSERT(hdr->b_state != arc_anon); 1628168404Spjd ASSERT(buf->b_data != NULL); 1629168404Spjd 1630168404Spjd (void) remove_reference(hdr, hash_lock, tag); 1631168404Spjd if (hdr->b_datacnt > 1) { 1632168404Spjd if (no_callback) 1633168404Spjd arc_buf_destroy(buf, FALSE, TRUE); 1634168404Spjd } else if (no_callback) { 1635168404Spjd ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1636168404Spjd hdr->b_flags |= ARC_BUF_AVAILABLE; 1637168404Spjd } 1638168404Spjd ASSERT(no_callback || hdr->b_datacnt > 1 || 1639168404Spjd refcount_is_zero(&hdr->b_refcnt)); 1640168404Spjd mutex_exit(hash_lock); 1641168404Spjd return (no_callback); 1642168404Spjd} 1643168404Spjd 1644168404Spjdint 1645168404Spjdarc_buf_size(arc_buf_t *buf) 1646168404Spjd{ 1647168404Spjd return (buf->b_hdr->b_size); 1648168404Spjd} 1649168404Spjd 1650168404Spjd/* 1651168404Spjd * Evict buffers from list 
until we've removed the specified number of 1652168404Spjd * bytes. Move the removed buffers to the appropriate evict state. 1653168404Spjd * If the recycle flag is set, then attempt to "recycle" a buffer: 1654168404Spjd * - look for a buffer to evict that is `bytes' long. 1655168404Spjd * - return the data block from this buffer rather than freeing it. 1656168404Spjd * This flag is used by callers that are trying to make space for a 1657168404Spjd * new buffer in a full arc cache. 1658185029Spjd * 1659185029Spjd * This function makes a "best effort". It skips over any buffers 1660185029Spjd * it can't get a hash_lock on, and so may not catch all candidates. 1661185029Spjd * It may also return without evicting as much space as requested. 1662168404Spjd */ 1663168404Spjdstatic void * 1664185029Spjdarc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, 1665168404Spjd arc_buf_contents_t type) 1666168404Spjd{ 1667168404Spjd arc_state_t *evicted_state; 1668168404Spjd uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1669205231Skmacy int64_t bytes_remaining; 1670168404Spjd arc_buf_hdr_t *ab, *ab_prev = NULL; 1671205231Skmacy list_t *evicted_list, *list, *evicted_list_start, *list_start; 1672205231Skmacy kmutex_t *lock, *evicted_lock; 1673168404Spjd kmutex_t *hash_lock; 1674168404Spjd boolean_t have_lock; 1675168404Spjd void *stolen = NULL; 1676205231Skmacy static int evict_metadata_offset, evict_data_offset; 1677205231Skmacy int i, idx, offset, list_count, count; 1678168404Spjd 1679168404Spjd ASSERT(state == arc_mru || state == arc_mfu); 1680168404Spjd 1681168404Spjd evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 1682206796Spjd 1683205231Skmacy if (type == ARC_BUFC_METADATA) { 1684205231Skmacy offset = 0; 1685205231Skmacy list_count = ARC_BUFC_NUMMETADATALISTS; 1686205231Skmacy list_start = &state->arcs_lists[0]; 1687205231Skmacy evicted_list_start = &evicted_state->arcs_lists[0]; 1688205231Skmacy idx = evict_metadata_offset; 1689205231Skmacy } else { 1690205231Skmacy offset = ARC_BUFC_NUMMETADATALISTS; 1691205231Skmacy list_start = &state->arcs_lists[offset]; 1692205231Skmacy evicted_list_start = &evicted_state->arcs_lists[offset]; 1693205231Skmacy list_count = ARC_BUFC_NUMDATALISTS; 1694205231Skmacy idx = evict_data_offset; 1695205231Skmacy } 1696205231Skmacy bytes_remaining = evicted_state->arcs_lsize[type]; 1697205231Skmacy count = 0; 1698206796Spjd 1699205231Skmacyevict_start: 1700205231Skmacy list = &list_start[idx]; 1701205231Skmacy evicted_list = &evicted_list_start[idx]; 1702205231Skmacy lock = ARCS_LOCK(state, (offset + idx)); 1703206796Spjd evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 1704168404Spjd 1705205231Skmacy mutex_enter(lock); 1706205231Skmacy mutex_enter(evicted_lock); 1707205231Skmacy 1708185029Spjd for (ab = list_tail(list); ab; ab = ab_prev) { 1709185029Spjd ab_prev = list_prev(list, ab); 1710205231Skmacy bytes_remaining -= (ab->b_size * ab->b_datacnt); 1711168404Spjd /* prefetch buffers have a minimum lifespan */ 1712168404Spjd if (HDR_IO_IN_PROGRESS(ab) || 1713185029Spjd (spa && ab->b_spa != spa) || 1714168404Spjd (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1715174049Sjb LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1716168404Spjd skipped++; 1717168404Spjd continue; 1718168404Spjd } 1719168404Spjd /* "lookahead" for better eviction candidate */ 1720168404Spjd if (recycle && ab->b_size != bytes && 1721168404Spjd ab_prev && ab_prev->b_size == bytes) 1722168404Spjd continue; 1723168404Spjd hash_lock = HDR_LOCK(ab); 1724168404Spjd have_lock = MUTEX_HELD(hash_lock); 1725168404Spjd if (have_lock || 
mutex_tryenter(hash_lock)) { 1726168404Spjd ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1727168404Spjd ASSERT(ab->b_datacnt > 0); 1728168404Spjd while (ab->b_buf) { 1729168404Spjd arc_buf_t *buf = ab->b_buf; 1730185029Spjd if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { 1731185029Spjd missed += 1; 1732185029Spjd break; 1733185029Spjd } 1734168404Spjd if (buf->b_data) { 1735168404Spjd bytes_evicted += ab->b_size; 1736168404Spjd if (recycle && ab->b_type == type && 1737185029Spjd ab->b_size == bytes && 1738185029Spjd !HDR_L2_WRITING(ab)) { 1739168404Spjd stolen = buf->b_data; 1740168404Spjd recycle = FALSE; 1741168404Spjd } 1742168404Spjd } 1743168404Spjd if (buf->b_efunc) { 1744168404Spjd mutex_enter(&arc_eviction_mtx); 1745168404Spjd arc_buf_destroy(buf, 1746168404Spjd buf->b_data == stolen, FALSE); 1747168404Spjd ab->b_buf = buf->b_next; 1748168404Spjd buf->b_hdr = &arc_eviction_hdr; 1749168404Spjd buf->b_next = arc_eviction_list; 1750168404Spjd arc_eviction_list = buf; 1751168404Spjd mutex_exit(&arc_eviction_mtx); 1752185029Spjd rw_exit(&buf->b_lock); 1753168404Spjd } else { 1754185029Spjd rw_exit(&buf->b_lock); 1755168404Spjd arc_buf_destroy(buf, 1756168404Spjd buf->b_data == stolen, TRUE); 1757168404Spjd } 1758168404Spjd } 1759208373Smm 1760208373Smm if (ab->b_l2hdr) { 1761208373Smm ARCSTAT_INCR(arcstat_evict_l2_cached, 1762208373Smm ab->b_size); 1763208373Smm } else { 1764208373Smm if (l2arc_write_eligible(ab->b_spa, ab)) { 1765208373Smm ARCSTAT_INCR(arcstat_evict_l2_eligible, 1766208373Smm ab->b_size); 1767208373Smm } else { 1768208373Smm ARCSTAT_INCR( 1769208373Smm arcstat_evict_l2_ineligible, 1770208373Smm ab->b_size); 1771208373Smm } 1772208373Smm } 1773208373Smm 1774185029Spjd if (ab->b_datacnt == 0) { 1775185029Spjd arc_change_state(evicted_state, ab, hash_lock); 1776185029Spjd ASSERT(HDR_IN_HASH_TABLE(ab)); 1777185029Spjd ab->b_flags |= ARC_IN_HASH_TABLE; 1778185029Spjd ab->b_flags &= ~ARC_BUF_AVAILABLE; 1779185029Spjd DTRACE_PROBE1(arc__evict, 
arc_buf_hdr_t *, ab); 1780185029Spjd } 1781168404Spjd if (!have_lock) 1782168404Spjd mutex_exit(hash_lock); 1783168404Spjd if (bytes >= 0 && bytes_evicted >= bytes) 1784168404Spjd break; 1785205231Skmacy if (bytes_remaining > 0) { 1786205231Skmacy mutex_exit(evicted_lock); 1787205231Skmacy mutex_exit(lock); 1788206796Spjd idx = ((idx + 1) & (list_count - 1)); 1789205231Skmacy count++; 1790205231Skmacy goto evict_start; 1791205231Skmacy } 1792168404Spjd } else { 1793168404Spjd missed += 1; 1794168404Spjd } 1795168404Spjd } 1796168404Spjd 1797205231Skmacy mutex_exit(evicted_lock); 1798205231Skmacy mutex_exit(lock); 1799206796Spjd 1800206796Spjd idx = ((idx + 1) & (list_count - 1)); 1801205231Skmacy count++; 1802168404Spjd 1803205231Skmacy if (bytes_evicted < bytes) { 1804205231Skmacy if (count < list_count) 1805205231Skmacy goto evict_start; 1806205231Skmacy else 1807205231Skmacy dprintf("only evicted %lld bytes from %x", 1808205231Skmacy (longlong_t)bytes_evicted, state); 1809205231Skmacy } 1810206796Spjd if (type == ARC_BUFC_METADATA) 1811205231Skmacy evict_metadata_offset = idx; 1812205231Skmacy else 1813205231Skmacy evict_data_offset = idx; 1814206796Spjd 1815168404Spjd if (skipped) 1816168404Spjd ARCSTAT_INCR(arcstat_evict_skip, skipped); 1817168404Spjd 1818168404Spjd if (missed) 1819168404Spjd ARCSTAT_INCR(arcstat_mutex_miss, missed); 1820168404Spjd 1821185029Spjd /* 1822185029Spjd * We have just evicted some date into the ghost state, make 1823185029Spjd * sure we also adjust the ghost state size if necessary. 
1824185029Spjd */ 1825185029Spjd if (arc_no_grow && 1826185029Spjd arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 1827185029Spjd int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 1828185029Spjd arc_mru_ghost->arcs_size - arc_c; 1829185029Spjd 1830185029Spjd if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 1831185029Spjd int64_t todelete = 1832185029Spjd MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 1833185029Spjd arc_evict_ghost(arc_mru_ghost, NULL, todelete); 1834185029Spjd } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 1835185029Spjd int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 1836185029Spjd arc_mru_ghost->arcs_size + 1837185029Spjd arc_mfu_ghost->arcs_size - arc_c); 1838185029Spjd arc_evict_ghost(arc_mfu_ghost, NULL, todelete); 1839185029Spjd } 1840185029Spjd } 1841205231Skmacy if (stolen) 1842205231Skmacy ARCSTAT_BUMP(arcstat_stolen); 1843185029Spjd 1844168404Spjd return (stolen); 1845168404Spjd} 1846168404Spjd 1847168404Spjd/* 1848168404Spjd * Remove buffers from list until we've removed the specified number of 1849168404Spjd * bytes. Destroy the buffers that are removed. 
1850168404Spjd */ 1851168404Spjdstatic void 1852185029Spjdarc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) 1853168404Spjd{ 1854168404Spjd arc_buf_hdr_t *ab, *ab_prev; 1855205231Skmacy list_t *list, *list_start; 1856205231Skmacy kmutex_t *hash_lock, *lock; 1857168404Spjd uint64_t bytes_deleted = 0; 1858168404Spjd uint64_t bufs_skipped = 0; 1859205231Skmacy static int evict_offset; 1860205231Skmacy int list_count, idx = evict_offset; 1861205231Skmacy int offset, count = 0; 1862168404Spjd 1863168404Spjd ASSERT(GHOST_STATE(state)); 1864205231Skmacy 1865205231Skmacy /* 1866205231Skmacy * data lists come after metadata lists 1867205231Skmacy */ 1868205231Skmacy list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 1869205231Skmacy list_count = ARC_BUFC_NUMDATALISTS; 1870205231Skmacy offset = ARC_BUFC_NUMMETADATALISTS; 1871206796Spjd 1872205231Skmacyevict_start: 1873205231Skmacy list = &list_start[idx]; 1874205231Skmacy lock = ARCS_LOCK(state, idx + offset); 1875205231Skmacy 1876205231Skmacy mutex_enter(lock); 1877185029Spjd for (ab = list_tail(list); ab; ab = ab_prev) { 1878185029Spjd ab_prev = list_prev(list, ab); 1879185029Spjd if (spa && ab->b_spa != spa) 1880185029Spjd continue; 1881168404Spjd hash_lock = HDR_LOCK(ab); 1882168404Spjd if (mutex_tryenter(hash_lock)) { 1883168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1884168404Spjd ASSERT(ab->b_buf == NULL); 1885168404Spjd ARCSTAT_BUMP(arcstat_deleted); 1886168404Spjd bytes_deleted += ab->b_size; 1887185029Spjd 1888185029Spjd if (ab->b_l2hdr != NULL) { 1889185029Spjd /* 1890185029Spjd * This buffer is cached on the 2nd Level ARC; 1891185029Spjd * don't destroy the header. 
1892185029Spjd */ 1893185029Spjd arc_change_state(arc_l2c_only, ab, hash_lock); 1894185029Spjd mutex_exit(hash_lock); 1895185029Spjd } else { 1896185029Spjd arc_change_state(arc_anon, ab, hash_lock); 1897185029Spjd mutex_exit(hash_lock); 1898185029Spjd arc_hdr_destroy(ab); 1899185029Spjd } 1900185029Spjd 1901168404Spjd DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1902168404Spjd if (bytes >= 0 && bytes_deleted >= bytes) 1903168404Spjd break; 1904168404Spjd } else { 1905168404Spjd if (bytes < 0) { 1906205231Skmacy /* 1907205231Skmacy * we're draining the ARC, retry 1908205231Skmacy */ 1909205231Skmacy mutex_exit(lock); 1910168404Spjd mutex_enter(hash_lock); 1911168404Spjd mutex_exit(hash_lock); 1912205231Skmacy goto evict_start; 1913168404Spjd } 1914168404Spjd bufs_skipped += 1; 1915168404Spjd } 1916168404Spjd } 1917205231Skmacy mutex_exit(lock); 1918206796Spjd idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 1919205231Skmacy count++; 1920206796Spjd 1921205231Skmacy if (count < list_count) 1922205231Skmacy goto evict_start; 1923206796Spjd 1924205231Skmacy evict_offset = idx; 1925205231Skmacy if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 1926185029Spjd (bytes < 0 || bytes_deleted < bytes)) { 1927205231Skmacy list_start = &state->arcs_lists[0]; 1928205231Skmacy list_count = ARC_BUFC_NUMMETADATALISTS; 1929205231Skmacy offset = count = 0; 1930205231Skmacy goto evict_start; 1931185029Spjd } 1932185029Spjd 1933168404Spjd if (bufs_skipped) { 1934168404Spjd ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1935168404Spjd ASSERT(bytes >= 0); 1936168404Spjd } 1937168404Spjd 1938168404Spjd if (bytes_deleted < bytes) 1939168404Spjd dprintf("only deleted %lld bytes from %p", 1940168404Spjd (longlong_t)bytes_deleted, state); 1941168404Spjd} 1942168404Spjd 1943168404Spjdstatic void 1944168404Spjdarc_adjust(void) 1945168404Spjd{ 1946208373Smm int64_t adjustment, delta; 1947168404Spjd 1948208373Smm /* 1949208373Smm * Adjust MRU size 1950208373Smm 
*/ 1951168404Spjd 1952209275Smm adjustment = MIN((int64_t)(arc_size - arc_c), 1953209275Smm (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 1954209275Smm arc_p)); 1955208373Smm 1956208373Smm if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 1957208373Smm delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 1958208373Smm (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); 1959208373Smm adjustment -= delta; 1960168404Spjd } 1961168404Spjd 1962208373Smm if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1963208373Smm delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 1964208373Smm (void) arc_evict(arc_mru, NULL, delta, FALSE, 1965185029Spjd ARC_BUFC_METADATA); 1966185029Spjd } 1967185029Spjd 1968208373Smm /* 1969208373Smm * Adjust MFU size 1970208373Smm */ 1971168404Spjd 1972208373Smm adjustment = arc_size - arc_c; 1973208373Smm 1974208373Smm if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 1975208373Smm delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 1976208373Smm (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); 1977208373Smm adjustment -= delta; 1978168404Spjd } 1979168404Spjd 1980208373Smm if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1981208373Smm int64_t delta = MIN(adjustment, 1982208373Smm arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 1983208373Smm (void) arc_evict(arc_mfu, NULL, delta, FALSE, 1984208373Smm ARC_BUFC_METADATA); 1985208373Smm } 1986168404Spjd 1987208373Smm /* 1988208373Smm * Adjust ghost lists 1989208373Smm */ 1990168404Spjd 1991208373Smm adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 1992168404Spjd 1993208373Smm if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 1994208373Smm delta = MIN(arc_mru_ghost->arcs_size, adjustment); 1995208373Smm arc_evict_ghost(arc_mru_ghost, NULL, delta); 1996208373Smm } 1997185029Spjd 1998208373Smm adjustment = 1999208373Smm arc_mru_ghost->arcs_size + 
arc_mfu_ghost->arcs_size - arc_c; 2000208373Smm 2001208373Smm if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2002208373Smm delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2003208373Smm arc_evict_ghost(arc_mfu_ghost, NULL, delta); 2004168404Spjd } 2005168404Spjd} 2006168404Spjd 2007168404Spjdstatic void 2008168404Spjdarc_do_user_evicts(void) 2009168404Spjd{ 2010191903Skmacy static arc_buf_t *tmp_arc_eviction_list; 2011191903Skmacy 2012191903Skmacy /* 2013191903Skmacy * Move list over to avoid LOR 2014191903Skmacy */ 2015206796Spjdrestart: 2016168404Spjd mutex_enter(&arc_eviction_mtx); 2017191903Skmacy tmp_arc_eviction_list = arc_eviction_list; 2018191903Skmacy arc_eviction_list = NULL; 2019191903Skmacy mutex_exit(&arc_eviction_mtx); 2020191903Skmacy 2021191903Skmacy while (tmp_arc_eviction_list != NULL) { 2022191903Skmacy arc_buf_t *buf = tmp_arc_eviction_list; 2023191903Skmacy tmp_arc_eviction_list = buf->b_next; 2024185029Spjd rw_enter(&buf->b_lock, RW_WRITER); 2025168404Spjd buf->b_hdr = NULL; 2026185029Spjd rw_exit(&buf->b_lock); 2027168404Spjd 2028168404Spjd if (buf->b_efunc != NULL) 2029168404Spjd VERIFY(buf->b_efunc(buf) == 0); 2030168404Spjd 2031168404Spjd buf->b_efunc = NULL; 2032168404Spjd buf->b_private = NULL; 2033168404Spjd kmem_cache_free(buf_cache, buf); 2034168404Spjd } 2035191903Skmacy 2036191903Skmacy if (arc_eviction_list != NULL) 2037191903Skmacy goto restart; 2038168404Spjd} 2039168404Spjd 2040168404Spjd/* 2041185029Spjd * Flush all *evictable* data from the cache for the given spa. 2042168404Spjd * NOTE: this will not touch "active" (i.e. referenced) data. 
2043168404Spjd */ 2044168404Spjdvoid 2045185029Spjdarc_flush(spa_t *spa) 2046168404Spjd{ 2047205231Skmacy while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2048185029Spjd (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); 2049185029Spjd if (spa) 2050185029Spjd break; 2051185029Spjd } 2052205231Skmacy while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2053185029Spjd (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); 2054185029Spjd if (spa) 2055185029Spjd break; 2056185029Spjd } 2057205231Skmacy while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2058185029Spjd (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); 2059185029Spjd if (spa) 2060185029Spjd break; 2061185029Spjd } 2062205231Skmacy while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2063185029Spjd (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); 2064185029Spjd if (spa) 2065185029Spjd break; 2066185029Spjd } 2067168404Spjd 2068185029Spjd arc_evict_ghost(arc_mru_ghost, spa, -1); 2069185029Spjd arc_evict_ghost(arc_mfu_ghost, spa, -1); 2070168404Spjd 2071168404Spjd mutex_enter(&arc_reclaim_thr_lock); 2072168404Spjd arc_do_user_evicts(); 2073168404Spjd mutex_exit(&arc_reclaim_thr_lock); 2074185029Spjd ASSERT(spa || arc_eviction_list == NULL); 2075168404Spjd} 2076168404Spjd 2077168404Spjdvoid 2078168404Spjdarc_shrink(void) 2079168404Spjd{ 2080168404Spjd if (arc_c > arc_c_min) { 2081168404Spjd uint64_t to_free; 2082168404Spjd 2083168404Spjd#ifdef _KERNEL 2084168404Spjd to_free = arc_c >> arc_shrink_shift; 2085168404Spjd#else 2086168404Spjd to_free = arc_c >> arc_shrink_shift; 2087168404Spjd#endif 2088168404Spjd if (arc_c > arc_c_min + to_free) 2089168404Spjd atomic_add_64(&arc_c, -to_free); 2090168404Spjd else 2091168404Spjd arc_c = arc_c_min; 2092168404Spjd 2093168404Spjd atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2094168404Spjd if (arc_c > arc_size) 2095168404Spjd arc_c = MAX(arc_size, arc_c_min); 2096168404Spjd if (arc_p > arc_c) 2097168404Spjd arc_p = (arc_c >> 1); 2098168404Spjd 
ASSERT(arc_c >= arc_c_min); 2099168404Spjd ASSERT((int64_t)arc_p >= 0); 2100168404Spjd } 2101168404Spjd 2102168404Spjd if (arc_size > arc_c) 2103168404Spjd arc_adjust(); 2104168404Spjd} 2105168404Spjd 2106185029Spjdstatic int needfree = 0; 2107168404Spjd 2108168404Spjdstatic int 2109168404Spjdarc_reclaim_needed(void) 2110168404Spjd{ 2111168404Spjd#if 0 2112168404Spjd uint64_t extra; 2113168404Spjd#endif 2114168404Spjd 2115168404Spjd#ifdef _KERNEL 2116197816Skmacy if (needfree) 2117197816Skmacy return (1); 2118197816Skmacy if (arc_size > arc_c_max) 2119197816Skmacy return (1); 2120197816Skmacy if (arc_size <= arc_c_min) 2121197816Skmacy return (0); 2122168404Spjd 2123191902Skmacy /* 2124206796Spjd * If pages are needed or we're within 2048 pages 2125191902Skmacy * of needing to page need to reclaim 2126191902Skmacy */ 2127191902Skmacy if (vm_pages_needed || (vm_paging_target() > -2048)) 2128191902Skmacy return (1); 2129191902Skmacy 2130168404Spjd#if 0 2131168404Spjd /* 2132185029Spjd * take 'desfree' extra pages, so we reclaim sooner, rather than later 2133185029Spjd */ 2134185029Spjd extra = desfree; 2135185029Spjd 2136185029Spjd /* 2137185029Spjd * check that we're out of range of the pageout scanner. It starts to 2138185029Spjd * schedule paging if freemem is less than lotsfree and needfree. 2139185029Spjd * lotsfree is the high-water mark for pageout, and needfree is the 2140185029Spjd * number of needed free pages. We add extra pages here to make sure 2141185029Spjd * the scanner doesn't start up while we're freeing memory. 2142185029Spjd */ 2143185029Spjd if (freemem < lotsfree + needfree + extra) 2144185029Spjd return (1); 2145185029Spjd 2146185029Spjd /* 2147168404Spjd * check to make sure that swapfs has enough space so that anon 2148185029Spjd * reservations can still succeed. anon_resvmem() checks that the 2149168404Spjd * availrmem is greater than swapfs_minfree, and the number of reserved 2150168404Spjd * swap pages. 
We also add a bit of extra here just to prevent 2151168404Spjd * circumstances from getting really dire. 2152168404Spjd */ 2153168404Spjd if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2154168404Spjd return (1); 2155168404Spjd 2156168404Spjd#if defined(__i386) 2157168404Spjd /* 2158168404Spjd * If we're on an i386 platform, it's possible that we'll exhaust the 2159168404Spjd * kernel heap space before we ever run out of available physical 2160168404Spjd * memory. Most checks of the size of the heap_area compare against 2161168404Spjd * tune.t_minarmem, which is the minimum available real memory that we 2162168404Spjd * can have in the system. However, this is generally fixed at 25 pages 2163168404Spjd * which is so low that it's useless. In this comparison, we seek to 2164168404Spjd * calculate the total heap-size, and reclaim if more than 3/4ths of the 2165185029Spjd * heap is allocated. (Or, in the calculation, if less than 1/4th is 2166168404Spjd * free) 2167168404Spjd */ 2168168404Spjd if (btop(vmem_size(heap_arena, VMEM_FREE)) < 2169168404Spjd (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 2170168404Spjd return (1); 2171168404Spjd#endif 2172168404Spjd#else 2173175633Spjd if (kmem_used() > (kmem_size() * 3) / 4) 2174168404Spjd return (1); 2175168404Spjd#endif 2176168404Spjd 2177168404Spjd#else 2178168404Spjd if (spa_get_random(100) == 0) 2179168404Spjd return (1); 2180168404Spjd#endif 2181168404Spjd return (0); 2182168404Spjd} 2183168404Spjd 2184208454Spjdextern kmem_cache_t *zio_buf_cache[]; 2185208454Spjdextern kmem_cache_t *zio_data_buf_cache[]; 2186208454Spjd 2187168404Spjdstatic void 2188168404Spjdarc_kmem_reap_now(arc_reclaim_strategy_t strat) 2189168404Spjd{ 2190168404Spjd size_t i; 2191168404Spjd kmem_cache_t *prev_cache = NULL; 2192168404Spjd kmem_cache_t *prev_data_cache = NULL; 2193168404Spjd 2194168404Spjd#ifdef _KERNEL 2195185029Spjd if (arc_meta_used >= arc_meta_limit) { 2196185029Spjd /* 2197185029Spjd * We are exceeding 
our meta-data cache limit. 2198185029Spjd * Purge some DNLC entries to release holds on meta-data. 2199185029Spjd */ 2200185029Spjd dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 2201185029Spjd } 2202168404Spjd#if defined(__i386) 2203168404Spjd /* 2204168404Spjd * Reclaim unused memory from all kmem caches. 2205168404Spjd */ 2206168404Spjd kmem_reap(); 2207168404Spjd#endif 2208168404Spjd#endif 2209168404Spjd 2210168404Spjd /* 2211185029Spjd * An aggressive reclamation will shrink the cache size as well as 2212168404Spjd * reap free buffers from the arc kmem caches. 2213168404Spjd */ 2214168404Spjd if (strat == ARC_RECLAIM_AGGR) 2215168404Spjd arc_shrink(); 2216168404Spjd 2217168404Spjd for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2218168404Spjd if (zio_buf_cache[i] != prev_cache) { 2219168404Spjd prev_cache = zio_buf_cache[i]; 2220168404Spjd kmem_cache_reap_now(zio_buf_cache[i]); 2221168404Spjd } 2222168404Spjd if (zio_data_buf_cache[i] != prev_data_cache) { 2223168404Spjd prev_data_cache = zio_data_buf_cache[i]; 2224168404Spjd kmem_cache_reap_now(zio_data_buf_cache[i]); 2225168404Spjd } 2226168404Spjd } 2227168404Spjd kmem_cache_reap_now(buf_cache); 2228168404Spjd kmem_cache_reap_now(hdr_cache); 2229168404Spjd} 2230168404Spjd 2231168404Spjdstatic void 2232168404Spjdarc_reclaim_thread(void *dummy __unused) 2233168404Spjd{ 2234168404Spjd clock_t growtime = 0; 2235168404Spjd arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2236168404Spjd callb_cpr_t cpr; 2237168404Spjd 2238168404Spjd CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2239168404Spjd 2240168404Spjd mutex_enter(&arc_reclaim_thr_lock); 2241168404Spjd while (arc_thread_exit == 0) { 2242168404Spjd if (arc_reclaim_needed()) { 2243168404Spjd 2244168404Spjd if (arc_no_grow) { 2245168404Spjd if (last_reclaim == ARC_RECLAIM_CONS) { 2246168404Spjd last_reclaim = ARC_RECLAIM_AGGR; 2247168404Spjd } else { 2248168404Spjd last_reclaim = ARC_RECLAIM_CONS; 
2249168404Spjd } 2250168404Spjd } else { 2251168404Spjd arc_no_grow = TRUE; 2252168404Spjd last_reclaim = ARC_RECLAIM_AGGR; 2253168404Spjd membar_producer(); 2254168404Spjd } 2255168404Spjd 2256168404Spjd /* reset the growth delay for every reclaim */ 2257174049Sjb growtime = LBOLT + (arc_grow_retry * hz); 2258168404Spjd 2259185029Spjd if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 2260168404Spjd /* 2261185029Spjd * If needfree is TRUE our vm_lowmem hook 2262168404Spjd * was called and in that case we must free some 2263168404Spjd * memory, so switch to aggressive mode. 2264168404Spjd */ 2265168404Spjd arc_no_grow = TRUE; 2266168404Spjd last_reclaim = ARC_RECLAIM_AGGR; 2267168404Spjd } 2268168404Spjd arc_kmem_reap_now(last_reclaim); 2269185029Spjd arc_warm = B_TRUE; 2270185029Spjd 2271185029Spjd } else if (arc_no_grow && LBOLT >= growtime) { 2272168404Spjd arc_no_grow = FALSE; 2273168404Spjd } 2274168404Spjd 2275209275Smm arc_adjust(); 2276168404Spjd 2277168404Spjd if (arc_eviction_list != NULL) 2278168404Spjd arc_do_user_evicts(); 2279168404Spjd 2280168404Spjd if (arc_reclaim_needed()) { 2281185029Spjd needfree = 0; 2282168404Spjd#ifdef _KERNEL 2283185029Spjd wakeup(&needfree); 2284168404Spjd#endif 2285168404Spjd } 2286168404Spjd 2287168404Spjd /* block until needed, or one second, whichever is shorter */ 2288168404Spjd CALLB_CPR_SAFE_BEGIN(&cpr); 2289168404Spjd (void) cv_timedwait(&arc_reclaim_thr_cv, 2290168404Spjd &arc_reclaim_thr_lock, hz); 2291168404Spjd CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2292168404Spjd } 2293168404Spjd 2294168404Spjd arc_thread_exit = 0; 2295168404Spjd cv_broadcast(&arc_reclaim_thr_cv); 2296168404Spjd CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2297168404Spjd thread_exit(); 2298168404Spjd} 2299168404Spjd 2300168404Spjd/* 2301168404Spjd * Adapt arc info given the number of bytes we are trying to add and 2302168404Spjd * the state that we are comming from. 
This function is only called 2303168404Spjd * when we are adding new content to the cache. 2304168404Spjd */ 2305168404Spjdstatic void 2306168404Spjdarc_adapt(int bytes, arc_state_t *state) 2307168404Spjd{ 2308168404Spjd int mult; 2309208373Smm uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2310168404Spjd 2311185029Spjd if (state == arc_l2c_only) 2312185029Spjd return; 2313185029Spjd 2314168404Spjd ASSERT(bytes > 0); 2315168404Spjd /* 2316168404Spjd * Adapt the target size of the MRU list: 2317168404Spjd * - if we just hit in the MRU ghost list, then increase 2318168404Spjd * the target size of the MRU list. 2319168404Spjd * - if we just hit in the MFU ghost list, then increase 2320168404Spjd * the target size of the MFU list by decreasing the 2321168404Spjd * target size of the MRU list. 2322168404Spjd */ 2323168404Spjd if (state == arc_mru_ghost) { 2324168404Spjd mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 2325168404Spjd 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2326209275Smm mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 2327168404Spjd 2328208373Smm arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2329168404Spjd } else if (state == arc_mfu_ghost) { 2330208373Smm uint64_t delta; 2331208373Smm 2332168404Spjd mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 
2333168404Spjd 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2334209275Smm mult = MIN(mult, 10); 2335168404Spjd 2336208373Smm delta = MIN(bytes * mult, arc_p); 2337208373Smm arc_p = MAX(arc_p_min, arc_p - delta); 2338168404Spjd } 2339168404Spjd ASSERT((int64_t)arc_p >= 0); 2340168404Spjd 2341168404Spjd if (arc_reclaim_needed()) { 2342168404Spjd cv_signal(&arc_reclaim_thr_cv); 2343168404Spjd return; 2344168404Spjd } 2345168404Spjd 2346168404Spjd if (arc_no_grow) 2347168404Spjd return; 2348168404Spjd 2349168404Spjd if (arc_c >= arc_c_max) 2350168404Spjd return; 2351168404Spjd 2352168404Spjd /* 2353168404Spjd * If we're within (2 * maxblocksize) bytes of the target 2354168404Spjd * cache size, increment the target cache size 2355168404Spjd */ 2356168404Spjd if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2357168404Spjd atomic_add_64(&arc_c, (int64_t)bytes); 2358168404Spjd if (arc_c > arc_c_max) 2359168404Spjd arc_c = arc_c_max; 2360168404Spjd else if (state == arc_anon) 2361168404Spjd atomic_add_64(&arc_p, (int64_t)bytes); 2362168404Spjd if (arc_p > arc_c) 2363168404Spjd arc_p = arc_c; 2364168404Spjd } 2365168404Spjd ASSERT((int64_t)arc_p >= 0); 2366168404Spjd} 2367168404Spjd 2368168404Spjd/* 2369168404Spjd * Check if the cache has reached its limits and eviction is required 2370168404Spjd * prior to insert. 2371168404Spjd */ 2372168404Spjdstatic int 2373185029Spjdarc_evict_needed(arc_buf_contents_t type) 2374168404Spjd{ 2375185029Spjd if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2376185029Spjd return (1); 2377185029Spjd 2378185029Spjd#if 0 2379185029Spjd#ifdef _KERNEL 2380185029Spjd /* 2381185029Spjd * If zio data pages are being allocated out of a separate heap segment, 2382185029Spjd * then enforce that the size of available vmem for this area remains 2383185029Spjd * above about 1/32nd free. 
2384185029Spjd */ 2385185029Spjd if (type == ARC_BUFC_DATA && zio_arena != NULL && 2386185029Spjd vmem_size(zio_arena, VMEM_FREE) < 2387185029Spjd (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 2388185029Spjd return (1); 2389185029Spjd#endif 2390185029Spjd#endif 2391185029Spjd 2392168404Spjd if (arc_reclaim_needed()) 2393168404Spjd return (1); 2394168404Spjd 2395168404Spjd return (arc_size > arc_c); 2396168404Spjd} 2397168404Spjd 2398168404Spjd/* 2399168404Spjd * The buffer, supplied as the first argument, needs a data block. 2400168404Spjd * So, if we are at cache max, determine which cache should be victimized. 2401168404Spjd * We have the following cases: 2402168404Spjd * 2403168404Spjd * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2404168404Spjd * In this situation if we're out of space, but the resident size of the MFU is 2405168404Spjd * under the limit, victimize the MFU cache to satisfy this insertion request. 2406168404Spjd * 2407168404Spjd * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2408168404Spjd * Here, we've used up all of the available space for the MRU, so we need to 2409168404Spjd * evict from our own cache instead. Evict from the set of resident MRU 2410168404Spjd * entries. 2411168404Spjd * 2412168404Spjd * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2413168404Spjd * c minus p represents the MFU space in the cache, since p is the size of the 2414168404Spjd * cache that is dedicated to the MRU. In this situation there's still space on 2415168404Spjd * the MFU side, so the MRU side needs to be victimized. 2416168404Spjd * 2417168404Spjd * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2418168404Spjd * MFU's resident set is consuming more space than it has been allotted. In 2419168404Spjd * this situation, we must victimize our own cache, the MFU, for this insertion. 
2420168404Spjd */ 2421168404Spjdstatic void 2422168404Spjdarc_get_data_buf(arc_buf_t *buf) 2423168404Spjd{ 2424168404Spjd arc_state_t *state = buf->b_hdr->b_state; 2425168404Spjd uint64_t size = buf->b_hdr->b_size; 2426168404Spjd arc_buf_contents_t type = buf->b_hdr->b_type; 2427168404Spjd 2428168404Spjd arc_adapt(size, state); 2429168404Spjd 2430168404Spjd /* 2431168404Spjd * We have not yet reached cache maximum size, 2432168404Spjd * just allocate a new buffer. 2433168404Spjd */ 2434185029Spjd if (!arc_evict_needed(type)) { 2435168404Spjd if (type == ARC_BUFC_METADATA) { 2436168404Spjd buf->b_data = zio_buf_alloc(size); 2437208373Smm arc_space_consume(size, ARC_SPACE_DATA); 2438168404Spjd } else { 2439168404Spjd ASSERT(type == ARC_BUFC_DATA); 2440168404Spjd buf->b_data = zio_data_buf_alloc(size); 2441208373Smm ARCSTAT_INCR(arcstat_data_size, size); 2442185029Spjd atomic_add_64(&arc_size, size); 2443168404Spjd } 2444168404Spjd goto out; 2445168404Spjd } 2446168404Spjd 2447168404Spjd /* 2448168404Spjd * If we are prefetching from the mfu ghost list, this buffer 2449168404Spjd * will end up on the mru list; so steal space from there. 2450168404Spjd */ 2451168404Spjd if (state == arc_mfu_ghost) 2452168404Spjd state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2453168404Spjd else if (state == arc_mru_ghost) 2454168404Spjd state = arc_mru; 2455168404Spjd 2456168404Spjd if (state == arc_mru || state == arc_anon) { 2457168404Spjd uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2458208373Smm state = (arc_mfu->arcs_lsize[type] >= size && 2459185029Spjd arc_p > mru_used) ? arc_mfu : arc_mru; 2460168404Spjd } else { 2461168404Spjd /* MFU cases */ 2462168404Spjd uint64_t mfu_space = arc_c - arc_p; 2463208373Smm state = (arc_mru->arcs_lsize[type] >= size && 2464185029Spjd mfu_space > arc_mfu->arcs_size) ? 
arc_mru : arc_mfu; 2465168404Spjd } 2466185029Spjd if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2467168404Spjd if (type == ARC_BUFC_METADATA) { 2468168404Spjd buf->b_data = zio_buf_alloc(size); 2469208373Smm arc_space_consume(size, ARC_SPACE_DATA); 2470168404Spjd } else { 2471168404Spjd ASSERT(type == ARC_BUFC_DATA); 2472168404Spjd buf->b_data = zio_data_buf_alloc(size); 2473208373Smm ARCSTAT_INCR(arcstat_data_size, size); 2474185029Spjd atomic_add_64(&arc_size, size); 2475168404Spjd } 2476168404Spjd ARCSTAT_BUMP(arcstat_recycle_miss); 2477168404Spjd } 2478168404Spjd ASSERT(buf->b_data != NULL); 2479168404Spjdout: 2480168404Spjd /* 2481168404Spjd * Update the state size. Note that ghost states have a 2482168404Spjd * "ghost size" and so don't need to be updated. 2483168404Spjd */ 2484168404Spjd if (!GHOST_STATE(buf->b_hdr->b_state)) { 2485168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 2486168404Spjd 2487168404Spjd atomic_add_64(&hdr->b_state->arcs_size, size); 2488168404Spjd if (list_link_active(&hdr->b_arc_node)) { 2489168404Spjd ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2490185029Spjd atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2491168404Spjd } 2492168404Spjd /* 2493168404Spjd * If we are growing the cache, and we are adding anonymous 2494168404Spjd * data, and we have outgrown arc_p, update arc_p 2495168404Spjd */ 2496168404Spjd if (arc_size < arc_c && hdr->b_state == arc_anon && 2497168404Spjd arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2498168404Spjd arc_p = MIN(arc_c, arc_p + size); 2499168404Spjd } 2500205231Skmacy ARCSTAT_BUMP(arcstat_allocated); 2501168404Spjd} 2502168404Spjd 2503168404Spjd/* 2504168404Spjd * This routine is called whenever a buffer is accessed. 2505168404Spjd * NOTE: the hash lock is dropped in this function. 
2506168404Spjd */ 2507168404Spjdstatic void 2508168404Spjdarc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2509168404Spjd{ 2510168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 2511168404Spjd 2512168404Spjd if (buf->b_state == arc_anon) { 2513168404Spjd /* 2514168404Spjd * This buffer is not in the cache, and does not 2515168404Spjd * appear in our "ghost" list. Add the new buffer 2516168404Spjd * to the MRU state. 2517168404Spjd */ 2518168404Spjd 2519168404Spjd ASSERT(buf->b_arc_access == 0); 2520174049Sjb buf->b_arc_access = LBOLT; 2521168404Spjd DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2522168404Spjd arc_change_state(arc_mru, buf, hash_lock); 2523168404Spjd 2524168404Spjd } else if (buf->b_state == arc_mru) { 2525168404Spjd /* 2526168404Spjd * If this buffer is here because of a prefetch, then either: 2527168404Spjd * - clear the flag if this is a "referencing" read 2528168404Spjd * (any subsequent access will bump this into the MFU state). 2529168404Spjd * or 2530168404Spjd * - move the buffer to the head of the list if this is 2531168404Spjd * another prefetch (to make it less likely to be evicted). 2532168404Spjd */ 2533168404Spjd if ((buf->b_flags & ARC_PREFETCH) != 0) { 2534168404Spjd if (refcount_count(&buf->b_refcnt) == 0) { 2535168404Spjd ASSERT(list_link_active(&buf->b_arc_node)); 2536168404Spjd } else { 2537168404Spjd buf->b_flags &= ~ARC_PREFETCH; 2538168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 2539168404Spjd } 2540174049Sjb buf->b_arc_access = LBOLT; 2541168404Spjd return; 2542168404Spjd } 2543168404Spjd 2544168404Spjd /* 2545168404Spjd * This buffer has been "accessed" only once so far, 2546168404Spjd * but it is still in the cache. Move it to the MFU 2547168404Spjd * state. 2548168404Spjd */ 2549174049Sjb if (LBOLT > buf->b_arc_access + ARC_MINTIME) { 2550168404Spjd /* 2551168404Spjd * More than 125ms have passed since we 2552168404Spjd * instantiated this buffer. Move it to the 2553168404Spjd * most frequently used state. 
2554168404Spjd */ 2555174049Sjb buf->b_arc_access = LBOLT; 2556168404Spjd DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2557168404Spjd arc_change_state(arc_mfu, buf, hash_lock); 2558168404Spjd } 2559168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 2560168404Spjd } else if (buf->b_state == arc_mru_ghost) { 2561168404Spjd arc_state_t *new_state; 2562168404Spjd /* 2563168404Spjd * This buffer has been "accessed" recently, but 2564168404Spjd * was evicted from the cache. Move it to the 2565168404Spjd * MFU state. 2566168404Spjd */ 2567168404Spjd 2568168404Spjd if (buf->b_flags & ARC_PREFETCH) { 2569168404Spjd new_state = arc_mru; 2570168404Spjd if (refcount_count(&buf->b_refcnt) > 0) 2571168404Spjd buf->b_flags &= ~ARC_PREFETCH; 2572168404Spjd DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2573168404Spjd } else { 2574168404Spjd new_state = arc_mfu; 2575168404Spjd DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2576168404Spjd } 2577168404Spjd 2578174049Sjb buf->b_arc_access = LBOLT; 2579168404Spjd arc_change_state(new_state, buf, hash_lock); 2580168404Spjd 2581168404Spjd ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2582168404Spjd } else if (buf->b_state == arc_mfu) { 2583168404Spjd /* 2584168404Spjd * This buffer has been accessed more than once and is 2585168404Spjd * still in the cache. Keep it in the MFU state. 2586168404Spjd * 2587168404Spjd * NOTE: an add_reference() that occurred when we did 2588168404Spjd * the arc_read() will have kicked this off the list. 2589168404Spjd * If it was a prefetch, we will explicitly move it to 2590168404Spjd * the head of the list now. 
2591168404Spjd */ 2592168404Spjd if ((buf->b_flags & ARC_PREFETCH) != 0) { 2593168404Spjd ASSERT(refcount_count(&buf->b_refcnt) == 0); 2594168404Spjd ASSERT(list_link_active(&buf->b_arc_node)); 2595168404Spjd } 2596168404Spjd ARCSTAT_BUMP(arcstat_mfu_hits); 2597174049Sjb buf->b_arc_access = LBOLT; 2598168404Spjd } else if (buf->b_state == arc_mfu_ghost) { 2599168404Spjd arc_state_t *new_state = arc_mfu; 2600168404Spjd /* 2601168404Spjd * This buffer has been accessed more than once but has 2602168404Spjd * been evicted from the cache. Move it back to the 2603168404Spjd * MFU state. 2604168404Spjd */ 2605168404Spjd 2606168404Spjd if (buf->b_flags & ARC_PREFETCH) { 2607168404Spjd /* 2608168404Spjd * This is a prefetch access... 2609168404Spjd * move this block back to the MRU state. 2610168404Spjd */ 2611168404Spjd ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 2612168404Spjd new_state = arc_mru; 2613168404Spjd } 2614168404Spjd 2615174049Sjb buf->b_arc_access = LBOLT; 2616168404Spjd DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2617168404Spjd arc_change_state(new_state, buf, hash_lock); 2618168404Spjd 2619168404Spjd ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2620185029Spjd } else if (buf->b_state == arc_l2c_only) { 2621185029Spjd /* 2622185029Spjd * This buffer is on the 2nd Level ARC. 
2623185029Spjd */ 2624185029Spjd 2625185029Spjd buf->b_arc_access = LBOLT; 2626185029Spjd DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2627185029Spjd arc_change_state(arc_mfu, buf, hash_lock); 2628168404Spjd } else { 2629168404Spjd ASSERT(!"invalid arc state"); 2630168404Spjd } 2631168404Spjd} 2632168404Spjd 2633168404Spjd/* a generic arc_done_func_t which you can use */ 2634168404Spjd/* ARGSUSED */ 2635168404Spjdvoid 2636168404Spjdarc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2637168404Spjd{ 2638168404Spjd bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2639168404Spjd VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2640168404Spjd} 2641168404Spjd 2642185029Spjd/* a generic arc_done_func_t */ 2643168404Spjdvoid 2644168404Spjdarc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2645168404Spjd{ 2646168404Spjd arc_buf_t **bufp = arg; 2647168404Spjd if (zio && zio->io_error) { 2648168404Spjd VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2649168404Spjd *bufp = NULL; 2650168404Spjd } else { 2651168404Spjd *bufp = buf; 2652168404Spjd } 2653168404Spjd} 2654168404Spjd 2655168404Spjdstatic void 2656168404Spjdarc_read_done(zio_t *zio) 2657168404Spjd{ 2658168404Spjd arc_buf_hdr_t *hdr, *found; 2659168404Spjd arc_buf_t *buf; 2660168404Spjd arc_buf_t *abuf; /* buffer we're assigning to callback */ 2661168404Spjd kmutex_t *hash_lock; 2662168404Spjd arc_callback_t *callback_list, *acb; 2663168404Spjd int freeable = FALSE; 2664168404Spjd 2665168404Spjd buf = zio->io_private; 2666168404Spjd hdr = buf->b_hdr; 2667168404Spjd 2668168404Spjd /* 2669168404Spjd * The hdr was inserted into hash-table and removed from lists 2670168404Spjd * prior to starting I/O. We should find this header, since 2671168404Spjd * it's in the hash table, and it should be legit since it's 2672168404Spjd * not possible to evict it during the I/O. The only possible 2673168404Spjd * reason for it not to be found is if we were freed during the 2674168404Spjd * read. 
2675168404Spjd */ 2676168404Spjd found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 2677168404Spjd &hash_lock); 2678168404Spjd 2679168404Spjd ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2680185029Spjd (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2681185029Spjd (found == hdr && HDR_L2_READING(hdr))); 2682168404Spjd 2683185029Spjd hdr->b_flags &= ~ARC_L2_EVICTED; 2684185029Spjd if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2685185029Spjd hdr->b_flags &= ~ARC_L2CACHE; 2686206796Spjd 2687168404Spjd /* byteswap if necessary */ 2688168404Spjd callback_list = hdr->b_acb; 2689168404Spjd ASSERT(callback_list != NULL); 2690209101Smm if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2691185029Spjd arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2692185029Spjd byteswap_uint64_array : 2693185029Spjd dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; 2694185029Spjd func(buf->b_data, hdr->b_size); 2695185029Spjd } 2696168404Spjd 2697185029Spjd arc_cksum_compute(buf, B_FALSE); 2698168404Spjd 2699168404Spjd /* create copies of the data buffer for the callers */ 2700168404Spjd abuf = buf; 2701168404Spjd for (acb = callback_list; acb; acb = acb->acb_next) { 2702168404Spjd if (acb->acb_done) { 2703168404Spjd if (abuf == NULL) 2704168404Spjd abuf = arc_buf_clone(buf); 2705168404Spjd acb->acb_buf = abuf; 2706168404Spjd abuf = NULL; 2707168404Spjd } 2708168404Spjd } 2709168404Spjd hdr->b_acb = NULL; 2710168404Spjd hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2711168404Spjd ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2712168404Spjd if (abuf == buf) 2713168404Spjd hdr->b_flags |= ARC_BUF_AVAILABLE; 2714168404Spjd 2715168404Spjd ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2716168404Spjd 2717168404Spjd if (zio->io_error != 0) { 2718168404Spjd hdr->b_flags |= ARC_IO_ERROR; 2719168404Spjd if (hdr->b_state != arc_anon) 2720168404Spjd arc_change_state(arc_anon, hdr, hash_lock); 2721168404Spjd if 
(HDR_IN_HASH_TABLE(hdr)) 2722168404Spjd buf_hash_remove(hdr); 2723168404Spjd freeable = refcount_is_zero(&hdr->b_refcnt); 2724168404Spjd } 2725168404Spjd 2726168404Spjd /* 2727168404Spjd * Broadcast before we drop the hash_lock to avoid the possibility 2728168404Spjd * that the hdr (and hence the cv) might be freed before we get to 2729168404Spjd * the cv_broadcast(). 2730168404Spjd */ 2731168404Spjd cv_broadcast(&hdr->b_cv); 2732168404Spjd 2733168404Spjd if (hash_lock) { 2734168404Spjd /* 2735168404Spjd * Only call arc_access on anonymous buffers. This is because 2736168404Spjd * if we've issued an I/O for an evicted buffer, we've already 2737168404Spjd * called arc_access (to prevent any simultaneous readers from 2738168404Spjd * getting confused). 2739168404Spjd */ 2740168404Spjd if (zio->io_error == 0 && hdr->b_state == arc_anon) 2741168404Spjd arc_access(hdr, hash_lock); 2742168404Spjd mutex_exit(hash_lock); 2743168404Spjd } else { 2744168404Spjd /* 2745168404Spjd * This block was freed while we waited for the read to 2746168404Spjd * complete. It has been removed from the hash table and 2747168404Spjd * moved to the anonymous state (so that it won't show up 2748168404Spjd * in the cache). 
2749168404Spjd */ 2750168404Spjd ASSERT3P(hdr->b_state, ==, arc_anon); 2751168404Spjd freeable = refcount_is_zero(&hdr->b_refcnt); 2752168404Spjd } 2753168404Spjd 2754168404Spjd /* execute each callback and free its structure */ 2755168404Spjd while ((acb = callback_list) != NULL) { 2756168404Spjd if (acb->acb_done) 2757168404Spjd acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2758168404Spjd 2759168404Spjd if (acb->acb_zio_dummy != NULL) { 2760168404Spjd acb->acb_zio_dummy->io_error = zio->io_error; 2761168404Spjd zio_nowait(acb->acb_zio_dummy); 2762168404Spjd } 2763168404Spjd 2764168404Spjd callback_list = acb->acb_next; 2765168404Spjd kmem_free(acb, sizeof (arc_callback_t)); 2766168404Spjd } 2767168404Spjd 2768168404Spjd if (freeable) 2769168404Spjd arc_hdr_destroy(hdr); 2770168404Spjd} 2771168404Spjd 2772168404Spjd/* 2773168404Spjd * "Read" the block block at the specified DVA (in bp) via the 2774168404Spjd * cache. If the block is found in the cache, invoke the provided 2775168404Spjd * callback immediately and return. Note that the `zio' parameter 2776168404Spjd * in the callback will be NULL in this case, since no IO was 2777168404Spjd * required. If the block is not in the cache pass the read request 2778168404Spjd * on to the spa with a substitute callback function, so that the 2779168404Spjd * requested block will be added to the cache. 2780168404Spjd * 2781168404Spjd * If a read request arrives for a block that has a read in-progress, 2782168404Spjd * either wait for the in-progress read to complete (and return the 2783168404Spjd * results); or, if this is a read with a "done" func, add a record 2784168404Spjd * to the read to invoke the "done" func when the read completes, 2785168404Spjd * and return; or just return. 2786168404Spjd * 2787168404Spjd * arc_read_done() will invoke all the requested "done" functions 2788168404Spjd * for readers of this block. 
2789185029Spjd * 2790185029Spjd * Normal callers should use arc_read and pass the arc buffer and offset 2791185029Spjd * for the bp. But if you know you don't need locking, you can use 2792185029Spjd * arc_read_bp. 2793168404Spjd */ 2794168404Spjdint 2795185029Spjdarc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, 2796185029Spjd arc_done_func_t *done, void *private, int priority, int zio_flags, 2797185029Spjd uint32_t *arc_flags, const zbookmark_t *zb) 2798168404Spjd{ 2799185029Spjd int err; 2800185029Spjd 2801185029Spjd ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); 2802185029Spjd ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); 2803185029Spjd rw_enter(&pbuf->b_lock, RW_READER); 2804185029Spjd 2805185029Spjd err = arc_read_nolock(pio, spa, bp, done, private, priority, 2806185029Spjd zio_flags, arc_flags, zb); 2807185029Spjd rw_exit(&pbuf->b_lock); 2808185029Spjd return (err); 2809185029Spjd} 2810185029Spjd 2811185029Spjdint 2812185029Spjdarc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, 2813185029Spjd arc_done_func_t *done, void *private, int priority, int zio_flags, 2814185029Spjd uint32_t *arc_flags, const zbookmark_t *zb) 2815185029Spjd{ 2816168404Spjd arc_buf_hdr_t *hdr; 2817168404Spjd arc_buf_t *buf; 2818168404Spjd kmutex_t *hash_lock; 2819185029Spjd zio_t *rzio; 2820168404Spjd 2821168404Spjdtop: 2822168404Spjd hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2823168404Spjd if (hdr && hdr->b_datacnt > 0) { 2824168404Spjd 2825168404Spjd *arc_flags |= ARC_CACHED; 2826168404Spjd 2827168404Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 2828168404Spjd 2829168404Spjd if (*arc_flags & ARC_WAIT) { 2830168404Spjd cv_wait(&hdr->b_cv, hash_lock); 2831168404Spjd mutex_exit(hash_lock); 2832168404Spjd goto top; 2833168404Spjd } 2834168404Spjd ASSERT(*arc_flags & ARC_NOWAIT); 2835168404Spjd 2836168404Spjd if (done) { 2837168404Spjd arc_callback_t *acb = NULL; 2838168404Spjd 2839168404Spjd acb = kmem_zalloc(sizeof 
(arc_callback_t), 2840168404Spjd KM_SLEEP); 2841168404Spjd acb->acb_done = done; 2842168404Spjd acb->acb_private = private; 2843168404Spjd if (pio != NULL) 2844168404Spjd acb->acb_zio_dummy = zio_null(pio, 2845185029Spjd spa, NULL, NULL, zio_flags); 2846168404Spjd 2847168404Spjd ASSERT(acb->acb_done != NULL); 2848168404Spjd acb->acb_next = hdr->b_acb; 2849168404Spjd hdr->b_acb = acb; 2850168404Spjd add_reference(hdr, hash_lock, private); 2851168404Spjd mutex_exit(hash_lock); 2852168404Spjd return (0); 2853168404Spjd } 2854168404Spjd mutex_exit(hash_lock); 2855168404Spjd return (0); 2856168404Spjd } 2857168404Spjd 2858168404Spjd ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2859168404Spjd 2860168404Spjd if (done) { 2861168404Spjd add_reference(hdr, hash_lock, private); 2862168404Spjd /* 2863168404Spjd * If this block is already in use, create a new 2864168404Spjd * copy of the data so that we will be guaranteed 2865168404Spjd * that arc_release() will always succeed. 2866168404Spjd */ 2867168404Spjd buf = hdr->b_buf; 2868168404Spjd ASSERT(buf); 2869168404Spjd ASSERT(buf->b_data); 2870168404Spjd if (HDR_BUF_AVAILABLE(hdr)) { 2871168404Spjd ASSERT(buf->b_efunc == NULL); 2872168404Spjd hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2873168404Spjd } else { 2874168404Spjd buf = arc_buf_clone(buf); 2875168404Spjd } 2876168404Spjd } else if (*arc_flags & ARC_PREFETCH && 2877168404Spjd refcount_count(&hdr->b_refcnt) == 0) { 2878168404Spjd hdr->b_flags |= ARC_PREFETCH; 2879168404Spjd } 2880168404Spjd DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2881168404Spjd arc_access(hdr, hash_lock); 2882185029Spjd if (*arc_flags & ARC_L2CACHE) 2883185029Spjd hdr->b_flags |= ARC_L2CACHE; 2884168404Spjd mutex_exit(hash_lock); 2885168404Spjd ARCSTAT_BUMP(arcstat_hits); 2886168404Spjd ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2887168404Spjd demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2888168404Spjd data, metadata, hits); 2889168404Spjd 2890168404Spjd if (done) 
2891168404Spjd done(NULL, buf, private); 2892168404Spjd } else { 2893168404Spjd uint64_t size = BP_GET_LSIZE(bp); 2894168404Spjd arc_callback_t *acb; 2895185029Spjd vdev_t *vd = NULL; 2896208373Smm uint64_t addr; 2897208373Smm boolean_t devw = B_FALSE; 2898168404Spjd 2899168404Spjd if (hdr == NULL) { 2900168404Spjd /* this block is not in the cache */ 2901168404Spjd arc_buf_hdr_t *exists; 2902168404Spjd arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2903168404Spjd buf = arc_buf_alloc(spa, size, private, type); 2904168404Spjd hdr = buf->b_hdr; 2905168404Spjd hdr->b_dva = *BP_IDENTITY(bp); 2906168404Spjd hdr->b_birth = bp->blk_birth; 2907168404Spjd hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2908168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 2909168404Spjd if (exists) { 2910168404Spjd /* somebody beat us to the hash insert */ 2911168404Spjd mutex_exit(hash_lock); 2912168404Spjd bzero(&hdr->b_dva, sizeof (dva_t)); 2913168404Spjd hdr->b_birth = 0; 2914168404Spjd hdr->b_cksum0 = 0; 2915168404Spjd (void) arc_buf_remove_ref(buf, private); 2916168404Spjd goto top; /* restart the IO request */ 2917168404Spjd } 2918168404Spjd /* if this is a prefetch, we don't have a reference */ 2919168404Spjd if (*arc_flags & ARC_PREFETCH) { 2920168404Spjd (void) remove_reference(hdr, hash_lock, 2921168404Spjd private); 2922168404Spjd hdr->b_flags |= ARC_PREFETCH; 2923168404Spjd } 2924185029Spjd if (*arc_flags & ARC_L2CACHE) 2925185029Spjd hdr->b_flags |= ARC_L2CACHE; 2926168404Spjd if (BP_GET_LEVEL(bp) > 0) 2927168404Spjd hdr->b_flags |= ARC_INDIRECT; 2928168404Spjd } else { 2929168404Spjd /* this block is in the ghost cache */ 2930168404Spjd ASSERT(GHOST_STATE(hdr->b_state)); 2931168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2932168404Spjd ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2933168404Spjd ASSERT(hdr->b_buf == NULL); 2934168404Spjd 2935168404Spjd /* if this is a prefetch, we don't have a reference */ 2936168404Spjd if (*arc_flags & ARC_PREFETCH) 2937168404Spjd 
hdr->b_flags |= ARC_PREFETCH; 2938168404Spjd else 2939168404Spjd add_reference(hdr, hash_lock, private); 2940185029Spjd if (*arc_flags & ARC_L2CACHE) 2941185029Spjd hdr->b_flags |= ARC_L2CACHE; 2942185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2943168404Spjd buf->b_hdr = hdr; 2944168404Spjd buf->b_data = NULL; 2945168404Spjd buf->b_efunc = NULL; 2946168404Spjd buf->b_private = NULL; 2947168404Spjd buf->b_next = NULL; 2948168404Spjd hdr->b_buf = buf; 2949168404Spjd arc_get_data_buf(buf); 2950168404Spjd ASSERT(hdr->b_datacnt == 0); 2951168404Spjd hdr->b_datacnt = 1; 2952168404Spjd 2953168404Spjd } 2954168404Spjd 2955168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2956168404Spjd acb->acb_done = done; 2957168404Spjd acb->acb_private = private; 2958168404Spjd 2959168404Spjd ASSERT(hdr->b_acb == NULL); 2960168404Spjd hdr->b_acb = acb; 2961168404Spjd hdr->b_flags |= ARC_IO_IN_PROGRESS; 2962168404Spjd 2963168404Spjd /* 2964168404Spjd * If the buffer has been evicted, migrate it to a present state 2965168404Spjd * before issuing the I/O. Once we drop the hash-table lock, 2966168404Spjd * the header will be marked as I/O in progress and have an 2967168404Spjd * attached buffer. At this point, anybody who finds this 2968168404Spjd * buffer ought to notice that it's legit but has a pending I/O. 2969168404Spjd */ 2970168404Spjd 2971168404Spjd if (GHOST_STATE(hdr->b_state)) 2972168404Spjd arc_access(hdr, hash_lock); 2973185029Spjd 2974185029Spjd if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 2975185029Spjd (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 2976208373Smm devw = hdr->b_l2hdr->b_dev->l2ad_writing; 2977185029Spjd addr = hdr->b_l2hdr->b_daddr; 2978185029Spjd /* 2979185029Spjd * Lock out device removal. 
2980185029Spjd */ 2981185029Spjd if (vdev_is_dead(vd) || 2982185029Spjd !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 2983185029Spjd vd = NULL; 2984185029Spjd } 2985185029Spjd 2986168404Spjd mutex_exit(hash_lock); 2987168404Spjd 2988168404Spjd ASSERT3U(hdr->b_size, ==, size); 2989168404Spjd DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 2990168404Spjd zbookmark_t *, zb); 2991168404Spjd ARCSTAT_BUMP(arcstat_misses); 2992168404Spjd ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2993168404Spjd demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2994168404Spjd data, metadata, misses); 2995168404Spjd 2996208373Smm if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 2997185029Spjd /* 2998185029Spjd * Read from the L2ARC if the following are true: 2999185029Spjd * 1. The L2ARC vdev was previously cached. 3000185029Spjd * 2. This buffer still has L2ARC metadata. 3001185029Spjd * 3. This buffer isn't currently writing to the L2ARC. 3002185029Spjd * 4. The L2ARC entry wasn't evicted, which may 3003185029Spjd * also have invalidated the vdev. 3004208373Smm * 5. This isn't prefetch and l2arc_noprefetch is set. 3005185029Spjd */ 3006185029Spjd if (hdr->b_l2hdr != NULL && 3007208373Smm !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 3008208373Smm !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 3009185029Spjd l2arc_read_callback_t *cb; 3010185029Spjd 3011185029Spjd DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 3012185029Spjd ARCSTAT_BUMP(arcstat_l2_hits); 3013185029Spjd 3014185029Spjd cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 3015185029Spjd KM_SLEEP); 3016185029Spjd cb->l2rcb_buf = buf; 3017185029Spjd cb->l2rcb_spa = spa; 3018185029Spjd cb->l2rcb_bp = *bp; 3019185029Spjd cb->l2rcb_zb = *zb; 3020185029Spjd cb->l2rcb_flags = zio_flags; 3021185029Spjd 3022185029Spjd /* 3023185029Spjd * l2arc read. The SCL_L2ARC lock will be 3024185029Spjd * released by l2arc_read_done(). 
3025185029Spjd */ 3026185029Spjd rzio = zio_read_phys(pio, vd, addr, size, 3027206796Spjd buf->b_data, ZIO_CHECKSUM_OFF, 3028185029Spjd l2arc_read_done, cb, priority, zio_flags | 3029185029Spjd ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | 3030185029Spjd ZIO_FLAG_DONT_PROPAGATE | 3031185029Spjd ZIO_FLAG_DONT_RETRY, B_FALSE); 3032185029Spjd DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 3033185029Spjd zio_t *, rzio); 3034208373Smm ARCSTAT_INCR(arcstat_l2_read_bytes, size); 3035185029Spjd 3036185029Spjd if (*arc_flags & ARC_NOWAIT) { 3037185029Spjd zio_nowait(rzio); 3038185029Spjd return (0); 3039185029Spjd } 3040185029Spjd 3041185029Spjd ASSERT(*arc_flags & ARC_WAIT); 3042185029Spjd if (zio_wait(rzio) == 0) 3043185029Spjd return (0); 3044185029Spjd 3045185029Spjd /* l2arc read error; goto zio_read() */ 3046185029Spjd } else { 3047185029Spjd DTRACE_PROBE1(l2arc__miss, 3048185029Spjd arc_buf_hdr_t *, hdr); 3049185029Spjd ARCSTAT_BUMP(arcstat_l2_misses); 3050185029Spjd if (HDR_L2_WRITING(hdr)) 3051185029Spjd ARCSTAT_BUMP(arcstat_l2_rw_clash); 3052185029Spjd spa_config_exit(spa, SCL_L2ARC, vd); 3053185029Spjd } 3054208373Smm } else { 3055208373Smm if (vd != NULL) 3056208373Smm spa_config_exit(spa, SCL_L2ARC, vd); 3057208373Smm if (l2arc_ndev != 0) { 3058208373Smm DTRACE_PROBE1(l2arc__miss, 3059208373Smm arc_buf_hdr_t *, hdr); 3060208373Smm ARCSTAT_BUMP(arcstat_l2_misses); 3061208373Smm } 3062185029Spjd } 3063185029Spjd 3064168404Spjd rzio = zio_read(pio, spa, bp, buf->b_data, size, 3065185029Spjd arc_read_done, buf, priority, zio_flags, zb); 3066168404Spjd 3067168404Spjd if (*arc_flags & ARC_WAIT) 3068168404Spjd return (zio_wait(rzio)); 3069168404Spjd 3070168404Spjd ASSERT(*arc_flags & ARC_NOWAIT); 3071168404Spjd zio_nowait(rzio); 3072168404Spjd } 3073168404Spjd return (0); 3074168404Spjd} 3075168404Spjd 3076168404Spjd/* 3077168404Spjd * arc_read() variant to support pool traversal. 
If the block is already 3078168404Spjd * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 3079168404Spjd * The idea is that we don't want pool traversal filling up memory, but 3080168404Spjd * if the ARC already has the data anyway, we shouldn't pay for the I/O. 3081168404Spjd */ 3082168404Spjdint 3083168404Spjdarc_tryread(spa_t *spa, blkptr_t *bp, void *data) 3084168404Spjd{ 3085168404Spjd arc_buf_hdr_t *hdr; 3086168404Spjd kmutex_t *hash_mtx; 3087168404Spjd int rc = 0; 3088168404Spjd 3089168404Spjd hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 3090168404Spjd 3091168404Spjd if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 3092168404Spjd arc_buf_t *buf = hdr->b_buf; 3093168404Spjd 3094168404Spjd ASSERT(buf); 3095168404Spjd while (buf->b_data == NULL) { 3096168404Spjd buf = buf->b_next; 3097168404Spjd ASSERT(buf); 3098168404Spjd } 3099168404Spjd bcopy(buf->b_data, data, hdr->b_size); 3100168404Spjd } else { 3101168404Spjd rc = ENOENT; 3102168404Spjd } 3103168404Spjd 3104168404Spjd if (hash_mtx) 3105168404Spjd mutex_exit(hash_mtx); 3106168404Spjd 3107168404Spjd return (rc); 3108168404Spjd} 3109168404Spjd 3110168404Spjdvoid 3111168404Spjdarc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 3112168404Spjd{ 3113168404Spjd ASSERT(buf->b_hdr != NULL); 3114168404Spjd ASSERT(buf->b_hdr->b_state != arc_anon); 3115168404Spjd ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 3116168404Spjd buf->b_efunc = func; 3117168404Spjd buf->b_private = private; 3118168404Spjd} 3119168404Spjd 3120168404Spjd/* 3121168404Spjd * This is used by the DMU to let the ARC know that a buffer is 3122168404Spjd * being evicted, so the ARC should clean up. If this arc buf 3123168404Spjd * is not yet in the evicted state, it will be put there. 
3124168404Spjd */ 3125168404Spjdint 3126168404Spjdarc_buf_evict(arc_buf_t *buf) 3127168404Spjd{ 3128168404Spjd arc_buf_hdr_t *hdr; 3129168404Spjd kmutex_t *hash_lock; 3130168404Spjd arc_buf_t **bufp; 3131205231Skmacy list_t *list, *evicted_list; 3132205231Skmacy kmutex_t *lock, *evicted_lock; 3133206796Spjd 3134185029Spjd rw_enter(&buf->b_lock, RW_WRITER); 3135168404Spjd hdr = buf->b_hdr; 3136168404Spjd if (hdr == NULL) { 3137168404Spjd /* 3138168404Spjd * We are in arc_do_user_evicts(). 3139168404Spjd */ 3140168404Spjd ASSERT(buf->b_data == NULL); 3141185029Spjd rw_exit(&buf->b_lock); 3142168404Spjd return (0); 3143185029Spjd } else if (buf->b_data == NULL) { 3144185029Spjd arc_buf_t copy = *buf; /* structure assignment */ 3145185029Spjd /* 3146185029Spjd * We are on the eviction list; process this buffer now 3147185029Spjd * but let arc_do_user_evicts() do the reaping. 3148185029Spjd */ 3149185029Spjd buf->b_efunc = NULL; 3150185029Spjd rw_exit(&buf->b_lock); 3151185029Spjd VERIFY(copy.b_efunc(©) == 0); 3152185029Spjd return (1); 3153168404Spjd } 3154168404Spjd hash_lock = HDR_LOCK(hdr); 3155168404Spjd mutex_enter(hash_lock); 3156168404Spjd 3157168404Spjd ASSERT(buf->b_hdr == hdr); 3158168404Spjd ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 3159168404Spjd ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 3160168404Spjd 3161168404Spjd /* 3162168404Spjd * Pull this buffer off of the hdr 3163168404Spjd */ 3164168404Spjd bufp = &hdr->b_buf; 3165168404Spjd while (*bufp != buf) 3166168404Spjd bufp = &(*bufp)->b_next; 3167168404Spjd *bufp = buf->b_next; 3168168404Spjd 3169168404Spjd ASSERT(buf->b_data != NULL); 3170168404Spjd arc_buf_destroy(buf, FALSE, FALSE); 3171168404Spjd 3172168404Spjd if (hdr->b_datacnt == 0) { 3173168404Spjd arc_state_t *old_state = hdr->b_state; 3174168404Spjd arc_state_t *evicted_state; 3175168404Spjd 3176168404Spjd ASSERT(refcount_is_zero(&hdr->b_refcnt)); 3177168404Spjd 3178168404Spjd evicted_state = 3179168404Spjd 
(old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 3180168404Spjd 3181205231Skmacy get_buf_info(hdr, old_state, &list, &lock); 3182205231Skmacy get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock); 3183205231Skmacy mutex_enter(lock); 3184205231Skmacy mutex_enter(evicted_lock); 3185168404Spjd 3186168404Spjd arc_change_state(evicted_state, hdr, hash_lock); 3187168404Spjd ASSERT(HDR_IN_HASH_TABLE(hdr)); 3188185029Spjd hdr->b_flags |= ARC_IN_HASH_TABLE; 3189185029Spjd hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3190168404Spjd 3191205231Skmacy mutex_exit(evicted_lock); 3192205231Skmacy mutex_exit(lock); 3193168404Spjd } 3194168404Spjd mutex_exit(hash_lock); 3195185029Spjd rw_exit(&buf->b_lock); 3196168404Spjd 3197168404Spjd VERIFY(buf->b_efunc(buf) == 0); 3198168404Spjd buf->b_efunc = NULL; 3199168404Spjd buf->b_private = NULL; 3200168404Spjd buf->b_hdr = NULL; 3201168404Spjd kmem_cache_free(buf_cache, buf); 3202168404Spjd return (1); 3203168404Spjd} 3204168404Spjd 3205168404Spjd/* 3206168404Spjd * Release this buffer from the cache. This must be done 3207168404Spjd * after a read and prior to modifying the buffer contents. 3208168404Spjd * If the buffer has more than one reference, we must make 3209185029Spjd * a new hdr for the buffer. 
3210168404Spjd */ 3211168404Spjdvoid 3212168404Spjdarc_release(arc_buf_t *buf, void *tag) 3213168404Spjd{ 3214185029Spjd arc_buf_hdr_t *hdr; 3215185029Spjd kmutex_t *hash_lock; 3216185029Spjd l2arc_buf_hdr_t *l2hdr; 3217185029Spjd uint64_t buf_size; 3218208373Smm boolean_t released = B_FALSE; 3219168404Spjd 3220185029Spjd rw_enter(&buf->b_lock, RW_WRITER); 3221185029Spjd hdr = buf->b_hdr; 3222185029Spjd 3223168404Spjd /* this buffer is not on any list */ 3224168404Spjd ASSERT(refcount_count(&hdr->b_refcnt) > 0); 3225185029Spjd ASSERT(!(hdr->b_flags & ARC_STORED)); 3226168404Spjd 3227168404Spjd if (hdr->b_state == arc_anon) { 3228168404Spjd /* this buffer is already released */ 3229168404Spjd ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 3230168404Spjd ASSERT(BUF_EMPTY(hdr)); 3231168404Spjd ASSERT(buf->b_efunc == NULL); 3232168404Spjd arc_buf_thaw(buf); 3233185029Spjd rw_exit(&buf->b_lock); 3234208373Smm released = B_TRUE; 3235208373Smm } else { 3236208373Smm hash_lock = HDR_LOCK(hdr); 3237208373Smm mutex_enter(hash_lock); 3238168404Spjd } 3239168404Spjd 3240185029Spjd l2hdr = hdr->b_l2hdr; 3241185029Spjd if (l2hdr) { 3242185029Spjd mutex_enter(&l2arc_buflist_mtx); 3243185029Spjd hdr->b_l2hdr = NULL; 3244185029Spjd buf_size = hdr->b_size; 3245185029Spjd } 3246185029Spjd 3247208373Smm if (released) 3248208373Smm goto out; 3249208373Smm 3250168404Spjd /* 3251168404Spjd * Do we have more than one buf? 3252168404Spjd */ 3253185029Spjd if (hdr->b_datacnt > 1) { 3254168404Spjd arc_buf_hdr_t *nhdr; 3255168404Spjd arc_buf_t **bufp; 3256168404Spjd uint64_t blksz = hdr->b_size; 3257168404Spjd spa_t *spa = hdr->b_spa; 3258168404Spjd arc_buf_contents_t type = hdr->b_type; 3259185029Spjd uint32_t flags = hdr->b_flags; 3260168404Spjd 3261185029Spjd ASSERT(hdr->b_buf != buf || buf->b_next != NULL); 3262168404Spjd /* 3263168404Spjd * Pull the data off of this buf and attach it to 3264168404Spjd * a new anonymous buf. 
3265168404Spjd */ 3266168404Spjd (void) remove_reference(hdr, hash_lock, tag); 3267168404Spjd bufp = &hdr->b_buf; 3268168404Spjd while (*bufp != buf) 3269168404Spjd bufp = &(*bufp)->b_next; 3270168404Spjd *bufp = (*bufp)->b_next; 3271168404Spjd buf->b_next = NULL; 3272168404Spjd 3273168404Spjd ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3274168404Spjd atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3275168404Spjd if (refcount_is_zero(&hdr->b_refcnt)) { 3276185029Spjd uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3277185029Spjd ASSERT3U(*size, >=, hdr->b_size); 3278185029Spjd atomic_add_64(size, -hdr->b_size); 3279168404Spjd } 3280168404Spjd hdr->b_datacnt -= 1; 3281168404Spjd arc_cksum_verify(buf); 3282168404Spjd 3283168404Spjd mutex_exit(hash_lock); 3284168404Spjd 3285185029Spjd nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3286168404Spjd nhdr->b_size = blksz; 3287168404Spjd nhdr->b_spa = spa; 3288168404Spjd nhdr->b_type = type; 3289168404Spjd nhdr->b_buf = buf; 3290168404Spjd nhdr->b_state = arc_anon; 3291168404Spjd nhdr->b_arc_access = 0; 3292185029Spjd nhdr->b_flags = flags & ARC_L2_WRITING; 3293185029Spjd nhdr->b_l2hdr = NULL; 3294168404Spjd nhdr->b_datacnt = 1; 3295168404Spjd nhdr->b_freeze_cksum = NULL; 3296168404Spjd (void) refcount_add(&nhdr->b_refcnt, tag); 3297168404Spjd buf->b_hdr = nhdr; 3298185029Spjd rw_exit(&buf->b_lock); 3299168404Spjd atomic_add_64(&arc_anon->arcs_size, blksz); 3300168404Spjd } else { 3301185029Spjd rw_exit(&buf->b_lock); 3302168404Spjd ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3303168404Spjd ASSERT(!list_link_active(&hdr->b_arc_node)); 3304168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3305168404Spjd arc_change_state(arc_anon, hdr, hash_lock); 3306168404Spjd hdr->b_arc_access = 0; 3307168404Spjd mutex_exit(hash_lock); 3308185029Spjd 3309168404Spjd bzero(&hdr->b_dva, sizeof (dva_t)); 3310168404Spjd hdr->b_birth = 0; 3311168404Spjd hdr->b_cksum0 = 0; 3312168404Spjd arc_buf_thaw(buf); 3313168404Spjd } 
3314168404Spjd buf->b_efunc = NULL; 3315168404Spjd buf->b_private = NULL; 3316185029Spjd 3317208373Smmout: 3318185029Spjd if (l2hdr) { 3319185029Spjd list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3320185029Spjd kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3321185029Spjd ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3322185029Spjd mutex_exit(&l2arc_buflist_mtx); 3323185029Spjd } 3324168404Spjd} 3325168404Spjd 3326168404Spjdint 3327168404Spjdarc_released(arc_buf_t *buf) 3328168404Spjd{ 3329185029Spjd int released; 3330185029Spjd 3331185029Spjd rw_enter(&buf->b_lock, RW_READER); 3332185029Spjd released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3333185029Spjd rw_exit(&buf->b_lock); 3334185029Spjd return (released); 3335168404Spjd} 3336168404Spjd 3337168404Spjdint 3338168404Spjdarc_has_callback(arc_buf_t *buf) 3339168404Spjd{ 3340185029Spjd int callback; 3341185029Spjd 3342185029Spjd rw_enter(&buf->b_lock, RW_READER); 3343185029Spjd callback = (buf->b_efunc != NULL); 3344185029Spjd rw_exit(&buf->b_lock); 3345185029Spjd return (callback); 3346168404Spjd} 3347168404Spjd 3348168404Spjd#ifdef ZFS_DEBUG 3349168404Spjdint 3350168404Spjdarc_referenced(arc_buf_t *buf) 3351168404Spjd{ 3352185029Spjd int referenced; 3353185029Spjd 3354185029Spjd rw_enter(&buf->b_lock, RW_READER); 3355185029Spjd referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3356185029Spjd rw_exit(&buf->b_lock); 3357185029Spjd return (referenced); 3358168404Spjd} 3359168404Spjd#endif 3360168404Spjd 3361168404Spjdstatic void 3362168404Spjdarc_write_ready(zio_t *zio) 3363168404Spjd{ 3364168404Spjd arc_write_callback_t *callback = zio->io_private; 3365168404Spjd arc_buf_t *buf = callback->awcb_buf; 3366185029Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 3367168404Spjd 3368185029Spjd ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3369185029Spjd callback->awcb_ready(zio, buf, callback->awcb_private); 3370185029Spjd 3371185029Spjd /* 3372185029Spjd * If the IO is already in progress, then this is a 
re-write 3373185029Spjd * attempt, so we need to thaw and re-compute the cksum. 3374185029Spjd * It is the responsibility of the callback to handle the 3375185029Spjd * accounting for any re-write attempt. 3376185029Spjd */ 3377185029Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 3378185029Spjd mutex_enter(&hdr->b_freeze_lock); 3379185029Spjd if (hdr->b_freeze_cksum != NULL) { 3380185029Spjd kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3381185029Spjd hdr->b_freeze_cksum = NULL; 3382185029Spjd } 3383185029Spjd mutex_exit(&hdr->b_freeze_lock); 3384168404Spjd } 3385185029Spjd arc_cksum_compute(buf, B_FALSE); 3386185029Spjd hdr->b_flags |= ARC_IO_IN_PROGRESS; 3387168404Spjd} 3388168404Spjd 3389168404Spjdstatic void 3390168404Spjdarc_write_done(zio_t *zio) 3391168404Spjd{ 3392168404Spjd arc_write_callback_t *callback = zio->io_private; 3393168404Spjd arc_buf_t *buf = callback->awcb_buf; 3394168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 3395168404Spjd 3396168404Spjd hdr->b_acb = NULL; 3397168404Spjd 3398168404Spjd hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3399168404Spjd hdr->b_birth = zio->io_bp->blk_birth; 3400168404Spjd hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3401168404Spjd /* 3402168404Spjd * If the block to be written was all-zero, we may have 3403168404Spjd * compressed it away. In this case no write was performed 3404168404Spjd * so there will be no dva/birth-date/checksum. The buffer 3405168404Spjd * must therefor remain anonymous (and uncached). 3406168404Spjd */ 3407168404Spjd if (!BUF_EMPTY(hdr)) { 3408168404Spjd arc_buf_hdr_t *exists; 3409168404Spjd kmutex_t *hash_lock; 3410168404Spjd 3411168404Spjd arc_cksum_verify(buf); 3412168404Spjd 3413168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 3414168404Spjd if (exists) { 3415168404Spjd /* 3416168404Spjd * This can only happen if we overwrite for 3417168404Spjd * sync-to-convergence, because we remove 3418168404Spjd * buffers from the hash table when we arc_free(). 
3419168404Spjd */ 3420185029Spjd ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); 3421168404Spjd ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 3422168404Spjd BP_IDENTITY(zio->io_bp))); 3423168404Spjd ASSERT3U(zio->io_bp_orig.blk_birth, ==, 3424168404Spjd zio->io_bp->blk_birth); 3425168404Spjd 3426168404Spjd ASSERT(refcount_is_zero(&exists->b_refcnt)); 3427168404Spjd arc_change_state(arc_anon, exists, hash_lock); 3428168404Spjd mutex_exit(hash_lock); 3429168404Spjd arc_hdr_destroy(exists); 3430168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 3431168404Spjd ASSERT3P(exists, ==, NULL); 3432168404Spjd } 3433168404Spjd hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3434185029Spjd /* if it's not anon, we are doing a scrub */ 3435185029Spjd if (hdr->b_state == arc_anon) 3436185029Spjd arc_access(hdr, hash_lock); 3437168404Spjd mutex_exit(hash_lock); 3438168404Spjd } else if (callback->awcb_done == NULL) { 3439168404Spjd int destroy_hdr; 3440168404Spjd /* 3441168404Spjd * This is an anonymous buffer with no user callback, 3442168404Spjd * destroy it if there are no active references. 
3443168404Spjd */ 3444168404Spjd mutex_enter(&arc_eviction_mtx); 3445168404Spjd destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 3446168404Spjd hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3447168404Spjd mutex_exit(&arc_eviction_mtx); 3448168404Spjd if (destroy_hdr) 3449168404Spjd arc_hdr_destroy(hdr); 3450168404Spjd } else { 3451168404Spjd hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3452168404Spjd } 3453185029Spjd hdr->b_flags &= ~ARC_STORED; 3454168404Spjd 3455168404Spjd if (callback->awcb_done) { 3456168404Spjd ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3457168404Spjd callback->awcb_done(zio, buf, callback->awcb_private); 3458168404Spjd } 3459168404Spjd 3460168404Spjd kmem_free(callback, sizeof (arc_write_callback_t)); 3461168404Spjd} 3462168404Spjd 3463185029Spjdstatic void 3464185029Spjdwrite_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) 3465185029Spjd{ 3466185029Spjd boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); 3467185029Spjd 3468185029Spjd /* Determine checksum setting */ 3469185029Spjd if (ismd) { 3470185029Spjd /* 3471185029Spjd * Metadata always gets checksummed. If the data 3472185029Spjd * checksum is multi-bit correctable, and it's not a 3473185029Spjd * ZBT-style checksum, then it's suitable for metadata 3474185029Spjd * as well. Otherwise, the metadata checksum defaults 3475185029Spjd * to fletcher4. 
3476185029Spjd */ 3477185029Spjd if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && 3478185029Spjd !zio_checksum_table[wp->wp_oschecksum].ci_zbt) 3479185029Spjd zp->zp_checksum = wp->wp_oschecksum; 3480185029Spjd else 3481185029Spjd zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; 3482185029Spjd } else { 3483185029Spjd zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, 3484185029Spjd wp->wp_oschecksum); 3485185029Spjd } 3486185029Spjd 3487185029Spjd /* Determine compression setting */ 3488185029Spjd if (ismd) { 3489185029Spjd /* 3490185029Spjd * XXX -- we should design a compression algorithm 3491185029Spjd * that specializes in arrays of bps. 3492185029Spjd */ 3493185029Spjd zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : 3494185029Spjd ZIO_COMPRESS_LZJB; 3495185029Spjd } else { 3496185029Spjd zp->zp_compress = zio_compress_select(wp->wp_dncompress, 3497185029Spjd wp->wp_oscompress); 3498185029Spjd } 3499185029Spjd 3500185029Spjd zp->zp_type = wp->wp_type; 3501185029Spjd zp->zp_level = wp->wp_level; 3502185029Spjd zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); 3503185029Spjd} 3504185029Spjd 3505168404Spjdzio_t * 3506185029Spjdarc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, 3507185029Spjd boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 3508168404Spjd arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 3509185029Spjd int zio_flags, const zbookmark_t *zb) 3510168404Spjd{ 3511168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 3512168404Spjd arc_write_callback_t *callback; 3513185029Spjd zio_t *zio; 3514185029Spjd zio_prop_t zp; 3515168404Spjd 3516185029Spjd ASSERT(ready != NULL); 3517168404Spjd ASSERT(!HDR_IO_ERROR(hdr)); 3518168404Spjd ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3519168404Spjd ASSERT(hdr->b_acb == 0); 3520185029Spjd if (l2arc) 3521185029Spjd hdr->b_flags |= ARC_L2CACHE; 3522168404Spjd callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3523168404Spjd 
callback->awcb_ready = ready; 3524168404Spjd callback->awcb_done = done; 3525168404Spjd callback->awcb_private = private; 3526168404Spjd callback->awcb_buf = buf; 3527168404Spjd 3528185029Spjd write_policy(spa, wp, &zp); 3529185029Spjd zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, 3530185029Spjd arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 3531185029Spjd 3532168404Spjd return (zio); 3533168404Spjd} 3534168404Spjd 3535168404Spjdint 3536168404Spjdarc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 3537168404Spjd zio_done_func_t *done, void *private, uint32_t arc_flags) 3538168404Spjd{ 3539168404Spjd arc_buf_hdr_t *ab; 3540168404Spjd kmutex_t *hash_lock; 3541168404Spjd zio_t *zio; 3542168404Spjd 3543168404Spjd /* 3544168404Spjd * If this buffer is in the cache, release it, so it 3545168404Spjd * can be re-used. 3546168404Spjd */ 3547168404Spjd ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 3548168404Spjd if (ab != NULL) { 3549168404Spjd /* 3550168404Spjd * The checksum of blocks to free is not always 3551168404Spjd * preserved (eg. on the deadlist). However, if it is 3552168404Spjd * nonzero, it should match what we have in the cache. 3553168404Spjd */ 3554168404Spjd ASSERT(bp->blk_cksum.zc_word[0] == 0 || 3555185029Spjd bp->blk_cksum.zc_word[0] == ab->b_cksum0 || 3556185029Spjd bp->blk_fill == BLK_FILL_ALREADY_FREED); 3557185029Spjd 3558168404Spjd if (ab->b_state != arc_anon) 3559168404Spjd arc_change_state(arc_anon, ab, hash_lock); 3560168404Spjd if (HDR_IO_IN_PROGRESS(ab)) { 3561168404Spjd /* 3562168404Spjd * This should only happen when we prefetch. 
3563168404Spjd */ 3564168404Spjd ASSERT(ab->b_flags & ARC_PREFETCH); 3565168404Spjd ASSERT3U(ab->b_datacnt, ==, 1); 3566168404Spjd ab->b_flags |= ARC_FREED_IN_READ; 3567168404Spjd if (HDR_IN_HASH_TABLE(ab)) 3568168404Spjd buf_hash_remove(ab); 3569168404Spjd ab->b_arc_access = 0; 3570168404Spjd bzero(&ab->b_dva, sizeof (dva_t)); 3571168404Spjd ab->b_birth = 0; 3572168404Spjd ab->b_cksum0 = 0; 3573168404Spjd ab->b_buf->b_efunc = NULL; 3574168404Spjd ab->b_buf->b_private = NULL; 3575168404Spjd mutex_exit(hash_lock); 3576168404Spjd } else if (refcount_is_zero(&ab->b_refcnt)) { 3577185029Spjd ab->b_flags |= ARC_FREE_IN_PROGRESS; 3578168404Spjd mutex_exit(hash_lock); 3579168404Spjd arc_hdr_destroy(ab); 3580168404Spjd ARCSTAT_BUMP(arcstat_deleted); 3581168404Spjd } else { 3582168404Spjd /* 3583168404Spjd * We still have an active reference on this 3584168404Spjd * buffer. This can happen, e.g., from 3585168404Spjd * dbuf_unoverride(). 3586168404Spjd */ 3587168404Spjd ASSERT(!HDR_IN_HASH_TABLE(ab)); 3588168404Spjd ab->b_arc_access = 0; 3589168404Spjd bzero(&ab->b_dva, sizeof (dva_t)); 3590168404Spjd ab->b_birth = 0; 3591168404Spjd ab->b_cksum0 = 0; 3592168404Spjd ab->b_buf->b_efunc = NULL; 3593168404Spjd ab->b_buf->b_private = NULL; 3594168404Spjd mutex_exit(hash_lock); 3595168404Spjd } 3596168404Spjd } 3597168404Spjd 3598185029Spjd zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); 3599168404Spjd 3600168404Spjd if (arc_flags & ARC_WAIT) 3601168404Spjd return (zio_wait(zio)); 3602168404Spjd 3603168404Spjd ASSERT(arc_flags & ARC_NOWAIT); 3604168404Spjd zio_nowait(zio); 3605168404Spjd 3606168404Spjd return (0); 3607168404Spjd} 3608168404Spjd 3609185029Spjdstatic int 3610185029Spjdarc_memory_throttle(uint64_t reserve, uint64_t txg) 3611185029Spjd{ 3612185029Spjd#ifdef _KERNEL 3613185029Spjd uint64_t inflight_data = arc_anon->arcs_size; 3614185029Spjd uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count); 3615185029Spjd static uint64_t page_load = 
0; 3616185029Spjd static uint64_t last_txg = 0; 3617185029Spjd 3618185029Spjd#if 0 3619185029Spjd#if defined(__i386) 3620185029Spjd available_memory = 3621185029Spjd MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3622185029Spjd#endif 3623185029Spjd#endif 3624185029Spjd if (available_memory >= zfs_write_limit_max) 3625185029Spjd return (0); 3626185029Spjd 3627185029Spjd if (txg > last_txg) { 3628185029Spjd last_txg = txg; 3629185029Spjd page_load = 0; 3630185029Spjd } 3631185029Spjd /* 3632185029Spjd * If we are in pageout, we know that memory is already tight, 3633185029Spjd * the arc is already going to be evicting, so we just want to 3634185029Spjd * continue to let page writes occur as quickly as possible. 3635185029Spjd */ 3636185029Spjd if (curproc == pageproc) { 3637185029Spjd if (page_load > available_memory / 4) 3638185029Spjd return (ERESTART); 3639185029Spjd /* Note: reserve is inflated, so we deflate */ 3640185029Spjd page_load += reserve / 8; 3641185029Spjd return (0); 3642185029Spjd } else if (page_load > 0 && arc_reclaim_needed()) { 3643185029Spjd /* memory is low, delay before restarting */ 3644185029Spjd ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3645185029Spjd return (EAGAIN); 3646185029Spjd } 3647185029Spjd page_load = 0; 3648185029Spjd 3649185029Spjd if (arc_size > arc_c_min) { 3650185029Spjd uint64_t evictable_memory = 3651185029Spjd arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3652185029Spjd arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3653185029Spjd arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3654185029Spjd arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3655185029Spjd available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3656185029Spjd } 3657185029Spjd 3658185029Spjd if (inflight_data > available_memory / 4) { 3659185029Spjd ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3660185029Spjd return (ERESTART); 3661185029Spjd } 3662185029Spjd#endif 3663185029Spjd return (0); 3664185029Spjd} 3665185029Spjd 3666168404Spjdvoid 
3667185029Spjdarc_tempreserve_clear(uint64_t reserve) 3668168404Spjd{ 3669185029Spjd atomic_add_64(&arc_tempreserve, -reserve); 3670168404Spjd ASSERT((int64_t)arc_tempreserve >= 0); 3671168404Spjd} 3672168404Spjd 3673168404Spjdint 3674185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg) 3675168404Spjd{ 3676185029Spjd int error; 3677185029Spjd 3678168404Spjd#ifdef ZFS_DEBUG 3679168404Spjd /* 3680168404Spjd * Once in a while, fail for no reason. Everything should cope. 3681168404Spjd */ 3682168404Spjd if (spa_get_random(10000) == 0) { 3683168404Spjd dprintf("forcing random failure\n"); 3684168404Spjd return (ERESTART); 3685168404Spjd } 3686168404Spjd#endif 3687185029Spjd if (reserve > arc_c/4 && !arc_no_grow) 3688185029Spjd arc_c = MIN(arc_c_max, reserve * 4); 3689185029Spjd if (reserve > arc_c) 3690168404Spjd return (ENOMEM); 3691168404Spjd 3692168404Spjd /* 3693185029Spjd * Writes will, almost always, require additional memory allocations 3694185029Spjd * in order to compress/encrypt/etc the data. We therefor need to 3695185029Spjd * make sure that there is sufficient available memory for this. 3696185029Spjd */ 3697185029Spjd if (error = arc_memory_throttle(reserve, txg)) 3698185029Spjd return (error); 3699185029Spjd 3700185029Spjd /* 3701168404Spjd * Throttle writes when the amount of dirty data in the cache 3702168404Spjd * gets too large. We try to keep the cache less than half full 3703168404Spjd * of dirty blocks so that our sync times don't grow too large. 3704168404Spjd * Note: if two requests come in concurrently, we might let them 3705168404Spjd * both succeed, when one of them should fail. Not a huge deal. 
3706168404Spjd */ 3707185029Spjd if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 3708185029Spjd arc_anon->arcs_size > arc_c / 4) { 3709185029Spjd dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3710185029Spjd "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3711185029Spjd arc_tempreserve>>10, 3712185029Spjd arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3713185029Spjd arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3714185029Spjd reserve>>10, arc_c>>10); 3715168404Spjd return (ERESTART); 3716168404Spjd } 3717185029Spjd atomic_add_64(&arc_tempreserve, reserve); 3718168404Spjd return (0); 3719168404Spjd} 3720168404Spjd 3721168582Spjdstatic kmutex_t arc_lowmem_lock; 3722168404Spjd#ifdef _KERNEL 3723168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL; 3724168404Spjd 3725168404Spjdstatic void 3726168566Spjdarc_lowmem(void *arg __unused, int howto __unused) 3727168404Spjd{ 3728168404Spjd 3729168566Spjd /* Serialize access via arc_lowmem_lock. */ 3730168566Spjd mutex_enter(&arc_lowmem_lock); 3731185029Spjd needfree = 1; 3732168404Spjd cv_signal(&arc_reclaim_thr_cv); 3733185029Spjd while (needfree) 3734185029Spjd tsleep(&needfree, 0, "zfs:lowmem", hz / 5); 3735168566Spjd mutex_exit(&arc_lowmem_lock); 3736168404Spjd} 3737168404Spjd#endif 3738168404Spjd 3739168404Spjdvoid 3740168404Spjdarc_init(void) 3741168404Spjd{ 3742193953Skmacy int prefetch_tunable_set = 0; 3743205231Skmacy int i; 3744205231Skmacy 3745168404Spjd mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3746168404Spjd cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3747168566Spjd mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 3748168404Spjd 3749168404Spjd /* Convert seconds to clock ticks */ 3750168404Spjd arc_min_prefetch_lifespan = 1 * hz; 3751168404Spjd 3752168404Spjd /* Start out with 1/8 of all memory */ 3753168566Spjd arc_c = kmem_size() / 8; 3754192360Skmacy#if 0 3755192360Skmacy#ifdef _KERNEL 3756192360Skmacy /* 3757192360Skmacy * On architectures 
where the physical memory can be larger 3758192360Skmacy * than the addressable space (intel in 32-bit mode), we may 3759192360Skmacy * need to limit the cache to 1/8 of VM size. 3760192360Skmacy */ 3761192360Skmacy arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3762192360Skmacy#endif 3763192360Skmacy#endif 3764168566Spjd /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 3765168566Spjd arc_c_min = MAX(arc_c / 4, 64<<18); 3766168566Spjd /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 3767168404Spjd if (arc_c * 8 >= 1<<30) 3768168404Spjd arc_c_max = (arc_c * 8) - (1<<30); 3769168404Spjd else 3770168404Spjd arc_c_max = arc_c_min; 3771175633Spjd arc_c_max = MAX(arc_c * 5, arc_c_max); 3772168481Spjd#ifdef _KERNEL 3773168404Spjd /* 3774168404Spjd * Allow the tunables to override our calculations if they are 3775168566Spjd * reasonable (ie. over 16MB) 3776168404Spjd */ 3777168566Spjd if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size()) 3778168404Spjd arc_c_max = zfs_arc_max; 3779168566Spjd if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max) 3780168404Spjd arc_c_min = zfs_arc_min; 3781168481Spjd#endif 3782168404Spjd arc_c = arc_c_max; 3783168404Spjd arc_p = (arc_c >> 1); 3784168404Spjd 3785185029Spjd /* limit meta-data to 1/4 of the arc capacity */ 3786185029Spjd arc_meta_limit = arc_c_max / 4; 3787185029Spjd 3788185029Spjd /* Allow the tunable to override if it is reasonable */ 3789185029Spjd if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3790185029Spjd arc_meta_limit = zfs_arc_meta_limit; 3791185029Spjd 3792185029Spjd if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3793185029Spjd arc_c_min = arc_meta_limit / 2; 3794185029Spjd 3795208373Smm if (zfs_arc_grow_retry > 0) 3796208373Smm arc_grow_retry = zfs_arc_grow_retry; 3797208373Smm 3798208373Smm if (zfs_arc_shrink_shift > 0) 3799208373Smm arc_shrink_shift = zfs_arc_shrink_shift; 3800208373Smm 3801208373Smm if 
(zfs_arc_p_min_shift > 0) 3802208373Smm arc_p_min_shift = zfs_arc_p_min_shift; 3803208373Smm 3804168404Spjd /* if kmem_flags are set, lets try to use less memory */ 3805168404Spjd if (kmem_debugging()) 3806168404Spjd arc_c = arc_c / 2; 3807168404Spjd if (arc_c < arc_c_min) 3808168404Spjd arc_c = arc_c_min; 3809168404Spjd 3810168473Spjd zfs_arc_min = arc_c_min; 3811168473Spjd zfs_arc_max = arc_c_max; 3812168473Spjd 3813168404Spjd arc_anon = &ARC_anon; 3814168404Spjd arc_mru = &ARC_mru; 3815168404Spjd arc_mru_ghost = &ARC_mru_ghost; 3816168404Spjd arc_mfu = &ARC_mfu; 3817168404Spjd arc_mfu_ghost = &ARC_mfu_ghost; 3818185029Spjd arc_l2c_only = &ARC_l2c_only; 3819168404Spjd arc_size = 0; 3820168404Spjd 3821205231Skmacy for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 3822205231Skmacy mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 3823205231Skmacy NULL, MUTEX_DEFAULT, NULL); 3824205231Skmacy mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 3825205231Skmacy NULL, MUTEX_DEFAULT, NULL); 3826205231Skmacy mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 3827205231Skmacy NULL, MUTEX_DEFAULT, NULL); 3828205231Skmacy mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 3829205231Skmacy NULL, MUTEX_DEFAULT, NULL); 3830205231Skmacy mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 3831205231Skmacy NULL, MUTEX_DEFAULT, NULL); 3832205231Skmacy mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 3833205231Skmacy NULL, MUTEX_DEFAULT, NULL); 3834206796Spjd 3835205231Skmacy list_create(&arc_mru->arcs_lists[i], 3836205231Skmacy sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3837205231Skmacy list_create(&arc_mru_ghost->arcs_lists[i], 3838205231Skmacy sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3839205231Skmacy list_create(&arc_mfu->arcs_lists[i], 3840205231Skmacy sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3841205231Skmacy list_create(&arc_mfu_ghost->arcs_lists[i], 3842205231Skmacy sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 
3843205231Skmacy list_create(&arc_mfu_ghost->arcs_lists[i], 3844205231Skmacy sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3845205231Skmacy list_create(&arc_l2c_only->arcs_lists[i], 3846205231Skmacy sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3847205231Skmacy } 3848168404Spjd 3849168404Spjd buf_init(); 3850168404Spjd 3851168404Spjd arc_thread_exit = 0; 3852168404Spjd arc_eviction_list = NULL; 3853168404Spjd mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3854168404Spjd bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3855168404Spjd 3856168404Spjd arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3857168404Spjd sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3858168404Spjd 3859168404Spjd if (arc_ksp != NULL) { 3860168404Spjd arc_ksp->ks_data = &arc_stats; 3861168404Spjd kstat_install(arc_ksp); 3862168404Spjd } 3863168404Spjd 3864168404Spjd (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3865168404Spjd TS_RUN, minclsyspri); 3866168404Spjd 3867168404Spjd#ifdef _KERNEL 3868168566Spjd arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 3869168404Spjd EVENTHANDLER_PRI_FIRST); 3870168404Spjd#endif 3871168404Spjd 3872168404Spjd arc_dead = FALSE; 3873185029Spjd arc_warm = B_FALSE; 3874168566Spjd 3875185029Spjd if (zfs_write_limit_max == 0) 3876185029Spjd zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 3877185029Spjd else 3878185029Spjd zfs_write_limit_shift = 0; 3879185029Spjd mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 3880185029Spjd 3881168566Spjd#ifdef _KERNEL 3882194043Skmacy if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 3883193953Skmacy prefetch_tunable_set = 1; 3884206796Spjd 3885193878Skmacy#ifdef __i386__ 3886193953Skmacy if (prefetch_tunable_set == 0) { 3887196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 3888196863Strasz "-- to enable,\n"); 3889196863Strasz 
printf(" add \"vfs.zfs.prefetch_disable=0\" " 3890196863Strasz "to /boot/loader.conf.\n"); 3891194043Skmacy zfs_prefetch_disable=1; 3892193878Skmacy } 3893206796Spjd#else 3894193878Skmacy if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 3895193953Skmacy prefetch_tunable_set == 0) { 3896196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default if less " 3897196941Strasz "than 4GB of RAM is present;\n" 3898196863Strasz " to enable, add \"vfs.zfs.prefetch_disable=0\" " 3899196863Strasz "to /boot/loader.conf.\n"); 3900194043Skmacy zfs_prefetch_disable=1; 3901193878Skmacy } 3902206796Spjd#endif 3903175633Spjd /* Warn about ZFS memory and address space requirements. */ 3904168696Spjd if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 3905168987Sbmah printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 3906168987Sbmah "expect unstable behavior.\n"); 3907175633Spjd } 3908175633Spjd if (kmem_size() < 512 * (1 << 20)) { 3909173419Spjd printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 3910168987Sbmah "expect unstable behavior.\n"); 3911185029Spjd printf(" Consider tuning vm.kmem_size and " 3912173419Spjd "vm.kmem_size_max\n"); 3913185029Spjd printf(" in /boot/loader.conf.\n"); 3914168566Spjd } 3915168566Spjd#endif 3916168404Spjd} 3917168404Spjd 3918168404Spjdvoid 3919168404Spjdarc_fini(void) 3920168404Spjd{ 3921205231Skmacy int i; 3922206796Spjd 3923168404Spjd mutex_enter(&arc_reclaim_thr_lock); 3924168404Spjd arc_thread_exit = 1; 3925168404Spjd cv_signal(&arc_reclaim_thr_cv); 3926168404Spjd while (arc_thread_exit != 0) 3927168404Spjd cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 3928168404Spjd mutex_exit(&arc_reclaim_thr_lock); 3929168404Spjd 3930185029Spjd arc_flush(NULL); 3931168404Spjd 3932168404Spjd arc_dead = TRUE; 3933168404Spjd 3934168404Spjd if (arc_ksp != NULL) { 3935168404Spjd kstat_delete(arc_ksp); 3936168404Spjd arc_ksp = NULL; 3937168404Spjd } 3938168404Spjd 3939168404Spjd 
mutex_destroy(&arc_eviction_mtx); 3940168404Spjd mutex_destroy(&arc_reclaim_thr_lock); 3941168404Spjd cv_destroy(&arc_reclaim_thr_cv); 3942168404Spjd 3943205231Skmacy for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 3944205231Skmacy list_destroy(&arc_mru->arcs_lists[i]); 3945205231Skmacy list_destroy(&arc_mru_ghost->arcs_lists[i]); 3946205231Skmacy list_destroy(&arc_mfu->arcs_lists[i]); 3947205231Skmacy list_destroy(&arc_mfu_ghost->arcs_lists[i]); 3948206795Spjd list_destroy(&arc_l2c_only->arcs_lists[i]); 3949168404Spjd 3950205231Skmacy mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 3951205231Skmacy mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 3952205231Skmacy mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 3953205231Skmacy mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 3954205231Skmacy mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 3955206795Spjd mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 3956205231Skmacy } 3957206796Spjd 3958185029Spjd mutex_destroy(&zfs_write_limit_lock); 3959185029Spjd 3960168404Spjd buf_fini(); 3961168404Spjd 3962168582Spjd mutex_destroy(&arc_lowmem_lock); 3963168404Spjd#ifdef _KERNEL 3964168566Spjd if (arc_event_lowmem != NULL) 3965168566Spjd EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 3966168404Spjd#endif 3967168404Spjd} 3968185029Spjd 3969185029Spjd/* 3970185029Spjd * Level 2 ARC 3971185029Spjd * 3972185029Spjd * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 3973185029Spjd * It uses dedicated storage devices to hold cached data, which are populated 3974185029Spjd * using large infrequent writes. The main role of this cache is to boost 3975185029Spjd * the performance of random read workloads. The intended L2ARC devices 3976185029Spjd * include short-stroked disks, solid state disks, and other media with 3977185029Spjd * substantially faster read latency than disk. 
3978185029Spjd * 3979185029Spjd * +-----------------------+ 3980185029Spjd * | ARC | 3981185029Spjd * +-----------------------+ 3982185029Spjd * | ^ ^ 3983185029Spjd * | | | 3984185029Spjd * l2arc_feed_thread() arc_read() 3985185029Spjd * | | | 3986185029Spjd * | l2arc read | 3987185029Spjd * V | | 3988185029Spjd * +---------------+ | 3989185029Spjd * | L2ARC | | 3990185029Spjd * +---------------+ | 3991185029Spjd * | ^ | 3992185029Spjd * l2arc_write() | | 3993185029Spjd * | | | 3994185029Spjd * V | | 3995185029Spjd * +-------+ +-------+ 3996185029Spjd * | vdev | | vdev | 3997185029Spjd * | cache | | cache | 3998185029Spjd * +-------+ +-------+ 3999185029Spjd * +=========+ .-----. 4000185029Spjd * : L2ARC : |-_____-| 4001185029Spjd * : devices : | Disks | 4002185029Spjd * +=========+ `-_____-' 4003185029Spjd * 4004185029Spjd * Read requests are satisfied from the following sources, in order: 4005185029Spjd * 4006185029Spjd * 1) ARC 4007185029Spjd * 2) vdev cache of L2ARC devices 4008185029Spjd * 3) L2ARC devices 4009185029Spjd * 4) vdev cache of disks 4010185029Spjd * 5) disks 4011185029Spjd * 4012185029Spjd * Some L2ARC device types exhibit extremely slow write performance. 4013185029Spjd * To accommodate for this there are some significant differences between 4014185029Spjd * the L2ARC and traditional cache design: 4015185029Spjd * 4016185029Spjd * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 4017185029Spjd * the ARC behave as usual, freeing buffers and placing headers on ghost 4018185029Spjd * lists. The ARC does not send buffers to the L2ARC during eviction as 4019185029Spjd * this would add inflated write latencies for all ARC memory pressure. 4020185029Spjd * 4021185029Spjd * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 
4022185029Spjd * It does this by periodically scanning buffers from the eviction-end of 4023185029Spjd * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 4024185029Spjd * not already there. It scans until a headroom of buffers is satisfied, 4025185029Spjd * which itself is a buffer for ARC eviction. The thread that does this is 4026185029Spjd * l2arc_feed_thread(), illustrated below; example sizes are included to 4027185029Spjd * provide a better sense of ratio than this diagram: 4028185029Spjd * 4029185029Spjd * head --> tail 4030185029Spjd * +---------------------+----------+ 4031185029Spjd * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 4032185029Spjd * +---------------------+----------+ | o L2ARC eligible 4033185029Spjd * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 4034185029Spjd * +---------------------+----------+ | 4035185029Spjd * 15.9 Gbytes ^ 32 Mbytes | 4036185029Spjd * headroom | 4037185029Spjd * l2arc_feed_thread() 4038185029Spjd * | 4039185029Spjd * l2arc write hand <--[oooo]--' 4040185029Spjd * | 8 Mbyte 4041185029Spjd * | write max 4042185029Spjd * V 4043185029Spjd * +==============================+ 4044185029Spjd * L2ARC dev |####|#|###|###| |####| ... | 4045185029Spjd * +==============================+ 4046185029Spjd * 32 Gbytes 4047185029Spjd * 4048185029Spjd * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 4049185029Spjd * evicted, then the L2ARC has cached a buffer much sooner than it probably 4050185029Spjd * needed to, potentially wasting L2ARC device bandwidth and storage. It is 4051185029Spjd * safe to say that this is an uncommon case, since buffers at the end of 4052185029Spjd * the ARC lists have moved there due to inactivity. 4053185029Spjd * 4054185029Spjd * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 4055185029Spjd * then the L2ARC simply misses copying some buffers. 
This serves as a 4056185029Spjd * pressure valve to prevent heavy read workloads from both stalling the ARC 4057185029Spjd * with waits and clogging the L2ARC with writes. This also helps prevent 4058185029Spjd * the potential for the L2ARC to churn if it attempts to cache content too 4059185029Spjd * quickly, such as during backups of the entire pool. 4060185029Spjd * 4061185029Spjd * 5. After system boot and before the ARC has filled main memory, there are 4062185029Spjd * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 4063185029Spjd * lists can remain mostly static. Instead of searching from tail of these 4064185029Spjd * lists as pictured, the l2arc_feed_thread() will search from the list heads 4065185029Spjd * for eligible buffers, greatly increasing its chance of finding them. 4066185029Spjd * 4067185029Spjd * The L2ARC device write speed is also boosted during this time so that 4068185029Spjd * the L2ARC warms up faster. Since there have been no ARC evictions yet, 4069185029Spjd * there are no L2ARC reads, and no fear of degrading read performance 4070185029Spjd * through increased writes. 4071185029Spjd * 4072185029Spjd * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 4073185029Spjd * the vdev queue can aggregate them into larger and fewer writes. Each 4074185029Spjd * device is written to in a rotor fashion, sweeping writes through 4075185029Spjd * available space then repeating. 4076185029Spjd * 4077185029Spjd * 7. The L2ARC does not store dirty content. It never needs to flush 4078185029Spjd * write buffers back to disk based storage. 4079185029Spjd * 4080185029Spjd * 8. If an ARC buffer is written (and dirtied) which also exists in the 4081185029Spjd * L2ARC, the now stale L2ARC buffer is immediately dropped. 
4082185029Spjd * 4083185029Spjd * The performance of the L2ARC can be tweaked by a number of tunables, which 4084185029Spjd * may be necessary for different workloads: 4085185029Spjd * 4086185029Spjd * l2arc_write_max max write bytes per interval 4087185029Spjd * l2arc_write_boost extra write bytes during device warmup 4088185029Spjd * l2arc_noprefetch skip caching prefetched buffers 4089185029Spjd * l2arc_headroom number of max device writes to precache 4090185029Spjd * l2arc_feed_secs seconds between L2ARC writing 4091185029Spjd * 4092185029Spjd * Tunables may be removed or added as future performance improvements are 4093185029Spjd * integrated, and also may become zpool properties. 4094208373Smm * 4095208373Smm * There are three key functions that control how the L2ARC warms up: 4096208373Smm * 4097208373Smm * l2arc_write_eligible() check if a buffer is eligible to cache 4098208373Smm * l2arc_write_size() calculate how much to write 4099208373Smm * l2arc_write_interval() calculate sleep delay between writes 4100208373Smm * 4101208373Smm * These three functions determine what to write, how much, and how quickly 4102208373Smm * to send writes. 4103185029Spjd */ 4104185029Spjd 4105208373Smmstatic boolean_t 4106208373Smml2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab) 4107208373Smm{ 4108208373Smm /* 4109208373Smm * A buffer is *not* eligible for the L2ARC if it: 4110208373Smm * 1. belongs to a different spa. 4111208373Smm * 2. is already cached on the L2ARC. 4112208373Smm * 3. has an I/O in progress (it may be an incomplete read). 4113208373Smm * 4. is flagged not eligible (zfs property). 
4114208373Smm */ 4115208373Smm if (ab->b_spa != spa) { 4116208373Smm ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 4117208373Smm return (B_FALSE); 4118208373Smm } 4119208373Smm if (ab->b_l2hdr != NULL) { 4120208373Smm ARCSTAT_BUMP(arcstat_l2_write_in_l2); 4121208373Smm return (B_FALSE); 4122208373Smm } 4123208373Smm if (HDR_IO_IN_PROGRESS(ab)) { 4124208373Smm ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 4125208373Smm return (B_FALSE); 4126208373Smm } 4127208373Smm if (!HDR_L2CACHE(ab)) { 4128208373Smm ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 4129208373Smm return (B_FALSE); 4130208373Smm } 4131208373Smm 4132208373Smm return (B_TRUE); 4133208373Smm} 4134208373Smm 4135208373Smmstatic uint64_t 4136208373Smml2arc_write_size(l2arc_dev_t *dev) 4137208373Smm{ 4138208373Smm uint64_t size; 4139208373Smm 4140208373Smm size = dev->l2ad_write; 4141208373Smm 4142208373Smm if (arc_warm == B_FALSE) 4143208373Smm size += dev->l2ad_boost; 4144208373Smm 4145208373Smm return (size); 4146208373Smm 4147208373Smm} 4148208373Smm 4149208373Smmstatic clock_t 4150208373Smml2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4151208373Smm{ 4152208373Smm clock_t interval, next; 4153208373Smm 4154208373Smm /* 4155208373Smm * If the ARC lists are busy, increase our write rate; if the 4156208373Smm * lists are stale, idle back. This is achieved by checking 4157208373Smm * how much we previously wrote - if it was more than half of 4158208373Smm * what we wanted, schedule the next write much sooner. 
4159208373Smm */ 4160208373Smm if (l2arc_feed_again && wrote > (wanted / 2)) 4161208373Smm interval = (hz * l2arc_feed_min_ms) / 1000; 4162208373Smm else 4163208373Smm interval = hz * l2arc_feed_secs; 4164208373Smm 4165208373Smm next = MAX(LBOLT, MIN(LBOLT + interval, began + interval)); 4166208373Smm 4167208373Smm return (next); 4168208373Smm} 4169208373Smm 4170185029Spjdstatic void 4171185029Spjdl2arc_hdr_stat_add(void) 4172185029Spjd{ 4173185029Spjd ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4174185029Spjd ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4175185029Spjd} 4176185029Spjd 4177185029Spjdstatic void 4178185029Spjdl2arc_hdr_stat_remove(void) 4179185029Spjd{ 4180185029Spjd ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4181185029Spjd ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4182185029Spjd} 4183185029Spjd 4184185029Spjd/* 4185185029Spjd * Cycle through L2ARC devices. This is how L2ARC load balances. 4186185029Spjd * If a device is returned, this also returns holding the spa config lock. 4187185029Spjd */ 4188185029Spjdstatic l2arc_dev_t * 4189185029Spjdl2arc_dev_get_next(void) 4190185029Spjd{ 4191185029Spjd l2arc_dev_t *first, *next = NULL; 4192185029Spjd 4193185029Spjd /* 4194185029Spjd * Lock out the removal of spas (spa_namespace_lock), then removal 4195185029Spjd * of cache devices (l2arc_dev_mtx). Once a device has been selected, 4196185029Spjd * both locks will be dropped and a spa config lock held instead. 
4197185029Spjd */ 4198185029Spjd mutex_enter(&spa_namespace_lock); 4199185029Spjd mutex_enter(&l2arc_dev_mtx); 4200185029Spjd 4201185029Spjd /* if there are no vdevs, there is nothing to do */ 4202185029Spjd if (l2arc_ndev == 0) 4203185029Spjd goto out; 4204185029Spjd 4205185029Spjd first = NULL; 4206185029Spjd next = l2arc_dev_last; 4207185029Spjd do { 4208185029Spjd /* loop around the list looking for a non-faulted vdev */ 4209185029Spjd if (next == NULL) { 4210185029Spjd next = list_head(l2arc_dev_list); 4211185029Spjd } else { 4212185029Spjd next = list_next(l2arc_dev_list, next); 4213185029Spjd if (next == NULL) 4214185029Spjd next = list_head(l2arc_dev_list); 4215185029Spjd } 4216185029Spjd 4217185029Spjd /* if we have come back to the start, bail out */ 4218185029Spjd if (first == NULL) 4219185029Spjd first = next; 4220185029Spjd else if (next == first) 4221185029Spjd break; 4222185029Spjd 4223185029Spjd } while (vdev_is_dead(next->l2ad_vdev)); 4224185029Spjd 4225185029Spjd /* if we were unable to find any usable vdevs, return NULL */ 4226185029Spjd if (vdev_is_dead(next->l2ad_vdev)) 4227185029Spjd next = NULL; 4228185029Spjd 4229185029Spjd l2arc_dev_last = next; 4230185029Spjd 4231185029Spjdout: 4232185029Spjd mutex_exit(&l2arc_dev_mtx); 4233185029Spjd 4234185029Spjd /* 4235185029Spjd * Grab the config lock to prevent the 'next' device from being 4236185029Spjd * removed while we are writing to it. 4237185029Spjd */ 4238185029Spjd if (next != NULL) 4239185029Spjd spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4240185029Spjd mutex_exit(&spa_namespace_lock); 4241185029Spjd 4242185029Spjd return (next); 4243185029Spjd} 4244185029Spjd 4245185029Spjd/* 4246185029Spjd * Free buffers that were tagged for destruction. 
4247185029Spjd */ 4248185029Spjdstatic void 4249185029Spjdl2arc_do_free_on_write() 4250185029Spjd{ 4251185029Spjd list_t *buflist; 4252185029Spjd l2arc_data_free_t *df, *df_prev; 4253185029Spjd 4254185029Spjd mutex_enter(&l2arc_free_on_write_mtx); 4255185029Spjd buflist = l2arc_free_on_write; 4256185029Spjd 4257185029Spjd for (df = list_tail(buflist); df; df = df_prev) { 4258185029Spjd df_prev = list_prev(buflist, df); 4259185029Spjd ASSERT(df->l2df_data != NULL); 4260185029Spjd ASSERT(df->l2df_func != NULL); 4261185029Spjd df->l2df_func(df->l2df_data, df->l2df_size); 4262185029Spjd list_remove(buflist, df); 4263185029Spjd kmem_free(df, sizeof (l2arc_data_free_t)); 4264185029Spjd } 4265185029Spjd 4266185029Spjd mutex_exit(&l2arc_free_on_write_mtx); 4267185029Spjd} 4268185029Spjd 4269185029Spjd/* 4270185029Spjd * A write to a cache device has completed. Update all headers to allow 4271185029Spjd * reads from these buffers to begin. 4272185029Spjd */ 4273185029Spjdstatic void 4274185029Spjdl2arc_write_done(zio_t *zio) 4275185029Spjd{ 4276185029Spjd l2arc_write_callback_t *cb; 4277185029Spjd l2arc_dev_t *dev; 4278185029Spjd list_t *buflist; 4279185029Spjd arc_buf_hdr_t *head, *ab, *ab_prev; 4280185029Spjd l2arc_buf_hdr_t *abl2; 4281185029Spjd kmutex_t *hash_lock; 4282185029Spjd 4283185029Spjd cb = zio->io_private; 4284185029Spjd ASSERT(cb != NULL); 4285185029Spjd dev = cb->l2wcb_dev; 4286185029Spjd ASSERT(dev != NULL); 4287185029Spjd head = cb->l2wcb_head; 4288185029Spjd ASSERT(head != NULL); 4289185029Spjd buflist = dev->l2ad_buflist; 4290185029Spjd ASSERT(buflist != NULL); 4291185029Spjd DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 4292185029Spjd l2arc_write_callback_t *, cb); 4293185029Spjd 4294185029Spjd if (zio->io_error != 0) 4295185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_error); 4296185029Spjd 4297185029Spjd mutex_enter(&l2arc_buflist_mtx); 4298185029Spjd 4299185029Spjd /* 4300185029Spjd * All writes completed, or an error was hit. 
4301185029Spjd */ 4302185029Spjd for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4303185029Spjd ab_prev = list_prev(buflist, ab); 4304185029Spjd 4305185029Spjd hash_lock = HDR_LOCK(ab); 4306185029Spjd if (!mutex_tryenter(hash_lock)) { 4307185029Spjd /* 4308185029Spjd * This buffer misses out. It may be in a stage 4309185029Spjd * of eviction. Its ARC_L2_WRITING flag will be 4310185029Spjd * left set, denying reads to this buffer. 4311185029Spjd */ 4312185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4313185029Spjd continue; 4314185029Spjd } 4315185029Spjd 4316185029Spjd if (zio->io_error != 0) { 4317185029Spjd /* 4318185029Spjd * Error - drop L2ARC entry. 4319185029Spjd */ 4320185029Spjd list_remove(buflist, ab); 4321185029Spjd abl2 = ab->b_l2hdr; 4322185029Spjd ab->b_l2hdr = NULL; 4323185029Spjd kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4324185029Spjd ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4325185029Spjd } 4326185029Spjd 4327185029Spjd /* 4328185029Spjd * Allow ARC to begin reads to this L2ARC entry. 4329185029Spjd */ 4330185029Spjd ab->b_flags &= ~ARC_L2_WRITING; 4331185029Spjd 4332185029Spjd mutex_exit(hash_lock); 4333185029Spjd } 4334185029Spjd 4335185029Spjd atomic_inc_64(&l2arc_writes_done); 4336185029Spjd list_remove(buflist, head); 4337185029Spjd kmem_cache_free(hdr_cache, head); 4338185029Spjd mutex_exit(&l2arc_buflist_mtx); 4339185029Spjd 4340185029Spjd l2arc_do_free_on_write(); 4341185029Spjd 4342185029Spjd kmem_free(cb, sizeof (l2arc_write_callback_t)); 4343185029Spjd} 4344185029Spjd 4345185029Spjd/* 4346185029Spjd * A read to a cache device completed. Validate buffer contents before 4347185029Spjd * handing over to the regular ARC routines. 
4348185029Spjd */ 4349185029Spjdstatic void 4350185029Spjdl2arc_read_done(zio_t *zio) 4351185029Spjd{ 4352185029Spjd l2arc_read_callback_t *cb; 4353185029Spjd arc_buf_hdr_t *hdr; 4354185029Spjd arc_buf_t *buf; 4355185029Spjd kmutex_t *hash_lock; 4356185029Spjd int equal; 4357185029Spjd 4358185029Spjd ASSERT(zio->io_vd != NULL); 4359185029Spjd ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4360185029Spjd 4361185029Spjd spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4362185029Spjd 4363185029Spjd cb = zio->io_private; 4364185029Spjd ASSERT(cb != NULL); 4365185029Spjd buf = cb->l2rcb_buf; 4366185029Spjd ASSERT(buf != NULL); 4367185029Spjd hdr = buf->b_hdr; 4368185029Spjd ASSERT(hdr != NULL); 4369185029Spjd 4370185029Spjd hash_lock = HDR_LOCK(hdr); 4371185029Spjd mutex_enter(hash_lock); 4372185029Spjd 4373185029Spjd /* 4374185029Spjd * Check this survived the L2ARC journey. 4375185029Spjd */ 4376185029Spjd equal = arc_cksum_equal(buf); 4377185029Spjd if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4378185029Spjd mutex_exit(hash_lock); 4379185029Spjd zio->io_private = buf; 4380185029Spjd zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 4381185029Spjd zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 4382185029Spjd arc_read_done(zio); 4383185029Spjd } else { 4384185029Spjd mutex_exit(hash_lock); 4385185029Spjd /* 4386185029Spjd * Buffer didn't survive caching. Increment stats and 4387185029Spjd * reissue to the original storage device. 4388185029Spjd */ 4389185029Spjd if (zio->io_error != 0) { 4390185029Spjd ARCSTAT_BUMP(arcstat_l2_io_error); 4391185029Spjd } else { 4392185029Spjd zio->io_error = EIO; 4393185029Spjd } 4394185029Spjd if (!equal) 4395185029Spjd ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4396185029Spjd 4397185029Spjd /* 4398185029Spjd * If there's no waiter, issue an async i/o to the primary 4399185029Spjd * storage now. If there *is* a waiter, the caller must 4400185029Spjd * issue the i/o in a context where it's OK to block. 
4401185029Spjd */ 4402185029Spjd if (zio->io_waiter == NULL) 4403185029Spjd zio_nowait(zio_read(zio->io_parent, 4404185029Spjd cb->l2rcb_spa, &cb->l2rcb_bp, 4405185029Spjd buf->b_data, zio->io_size, arc_read_done, buf, 4406185029Spjd zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4407185029Spjd } 4408185029Spjd 4409185029Spjd kmem_free(cb, sizeof (l2arc_read_callback_t)); 4410185029Spjd} 4411185029Spjd 4412185029Spjd/* 4413185029Spjd * This is the list priority from which the L2ARC will search for pages to 4414185029Spjd * cache. This is used within loops (0..3) to cycle through lists in the 4415185029Spjd * desired order. This order can have a significant effect on cache 4416185029Spjd * performance. 4417185029Spjd * 4418185029Spjd * Currently the metadata lists are hit first, MFU then MRU, followed by 4419185029Spjd * the data lists. This function returns a locked list, and also returns 4420185029Spjd * the lock pointer. 4421185029Spjd */ 4422185029Spjdstatic list_t * 4423185029Spjdl2arc_list_locked(int list_num, kmutex_t **lock) 4424185029Spjd{ 4425185029Spjd list_t *list; 4426205231Skmacy int idx; 4427185029Spjd 4428206796Spjd ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); 4429206796Spjd 4430205231Skmacy if (list_num < ARC_BUFC_NUMMETADATALISTS) { 4431205231Skmacy idx = list_num; 4432205231Skmacy list = &arc_mfu->arcs_lists[idx]; 4433205231Skmacy *lock = ARCS_LOCK(arc_mfu, idx); 4434206796Spjd } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { 4435205231Skmacy idx = list_num - ARC_BUFC_NUMMETADATALISTS; 4436205231Skmacy list = &arc_mru->arcs_lists[idx]; 4437205231Skmacy *lock = ARCS_LOCK(arc_mru, idx); 4438206796Spjd } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + 4439205231Skmacy ARC_BUFC_NUMDATALISTS)) { 4440205231Skmacy idx = list_num - ARC_BUFC_NUMMETADATALISTS; 4441205231Skmacy list = &arc_mfu->arcs_lists[idx]; 4442205231Skmacy *lock = ARCS_LOCK(arc_mfu, idx); 4443205231Skmacy } else { 4444205231Skmacy idx = list_num - 
ARC_BUFC_NUMLISTS; 4445205231Skmacy list = &arc_mru->arcs_lists[idx]; 4446205231Skmacy *lock = ARCS_LOCK(arc_mru, idx); 4447185029Spjd } 4448185029Spjd 4449185029Spjd ASSERT(!(MUTEX_HELD(*lock))); 4450185029Spjd mutex_enter(*lock); 4451185029Spjd return (list); 4452185029Spjd} 4453185029Spjd 4454185029Spjd/* 4455185029Spjd * Evict buffers from the device write hand to the distance specified in 4456185029Spjd * bytes. This distance may span populated buffers, it may span nothing. 4457185029Spjd * This is clearing a region on the L2ARC device ready for writing. 4458185029Spjd * If the 'all' boolean is set, every buffer is evicted. 4459185029Spjd */ 4460185029Spjdstatic void 4461185029Spjdl2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 4462185029Spjd{ 4463185029Spjd list_t *buflist; 4464185029Spjd l2arc_buf_hdr_t *abl2; 4465185029Spjd arc_buf_hdr_t *ab, *ab_prev; 4466185029Spjd kmutex_t *hash_lock; 4467185029Spjd uint64_t taddr; 4468185029Spjd 4469185029Spjd buflist = dev->l2ad_buflist; 4470185029Spjd 4471185029Spjd if (buflist == NULL) 4472185029Spjd return; 4473185029Spjd 4474185029Spjd if (!all && dev->l2ad_first) { 4475185029Spjd /* 4476185029Spjd * This is the first sweep through the device. There is 4477185029Spjd * nothing to evict. 4478185029Spjd */ 4479185029Spjd return; 4480185029Spjd } 4481185029Spjd 4482185029Spjd if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 4483185029Spjd /* 4484185029Spjd * When nearing the end of the device, evict to the end 4485185029Spjd * before the device write hand jumps to the start. 
4486185029Spjd */ 4487185029Spjd taddr = dev->l2ad_end; 4488185029Spjd } else { 4489185029Spjd taddr = dev->l2ad_hand + distance; 4490185029Spjd } 4491185029Spjd DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 4492185029Spjd uint64_t, taddr, boolean_t, all); 4493185029Spjd 4494185029Spjdtop: 4495185029Spjd mutex_enter(&l2arc_buflist_mtx); 4496185029Spjd for (ab = list_tail(buflist); ab; ab = ab_prev) { 4497185029Spjd ab_prev = list_prev(buflist, ab); 4498185029Spjd 4499185029Spjd hash_lock = HDR_LOCK(ab); 4500185029Spjd if (!mutex_tryenter(hash_lock)) { 4501185029Spjd /* 4502185029Spjd * Missed the hash lock. Retry. 4503185029Spjd */ 4504185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 4505185029Spjd mutex_exit(&l2arc_buflist_mtx); 4506185029Spjd mutex_enter(hash_lock); 4507185029Spjd mutex_exit(hash_lock); 4508185029Spjd goto top; 4509185029Spjd } 4510185029Spjd 4511185029Spjd if (HDR_L2_WRITE_HEAD(ab)) { 4512185029Spjd /* 4513185029Spjd * We hit a write head node. Leave it for 4514185029Spjd * l2arc_write_done(). 4515185029Spjd */ 4516185029Spjd list_remove(buflist, ab); 4517185029Spjd mutex_exit(hash_lock); 4518185029Spjd continue; 4519185029Spjd } 4520185029Spjd 4521185029Spjd if (!all && ab->b_l2hdr != NULL && 4522185029Spjd (ab->b_l2hdr->b_daddr > taddr || 4523185029Spjd ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 4524185029Spjd /* 4525185029Spjd * We've evicted to the target address, 4526185029Spjd * or the end of the device. 4527185029Spjd */ 4528185029Spjd mutex_exit(hash_lock); 4529185029Spjd break; 4530185029Spjd } 4531185029Spjd 4532185029Spjd if (HDR_FREE_IN_PROGRESS(ab)) { 4533185029Spjd /* 4534185029Spjd * Already on the path to destruction. 4535185029Spjd */ 4536185029Spjd mutex_exit(hash_lock); 4537185029Spjd continue; 4538185029Spjd } 4539185029Spjd 4540185029Spjd if (ab->b_state == arc_l2c_only) { 4541185029Spjd ASSERT(!HDR_L2_READING(ab)); 4542185029Spjd /* 4543185029Spjd * This doesn't exist in the ARC. Destroy. 
4544185029Spjd * arc_hdr_destroy() will call list_remove() 4545185029Spjd * and decrement arcstat_l2_size. 4546185029Spjd */ 4547185029Spjd arc_change_state(arc_anon, ab, hash_lock); 4548185029Spjd arc_hdr_destroy(ab); 4549185029Spjd } else { 4550185029Spjd /* 4551185029Spjd * Invalidate issued or about to be issued 4552185029Spjd * reads, since we may be about to write 4553185029Spjd * over this location. 4554185029Spjd */ 4555185029Spjd if (HDR_L2_READING(ab)) { 4556185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_reading); 4557185029Spjd ab->b_flags |= ARC_L2_EVICTED; 4558185029Spjd } 4559185029Spjd 4560185029Spjd /* 4561185029Spjd * Tell ARC this no longer exists in L2ARC. 4562185029Spjd */ 4563185029Spjd if (ab->b_l2hdr != NULL) { 4564185029Spjd abl2 = ab->b_l2hdr; 4565185029Spjd ab->b_l2hdr = NULL; 4566185029Spjd kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4567185029Spjd ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4568185029Spjd } 4569185029Spjd list_remove(buflist, ab); 4570185029Spjd 4571185029Spjd /* 4572185029Spjd * This may have been leftover after a 4573185029Spjd * failed write. 4574185029Spjd */ 4575185029Spjd ab->b_flags &= ~ARC_L2_WRITING; 4576185029Spjd } 4577185029Spjd mutex_exit(hash_lock); 4578185029Spjd } 4579185029Spjd mutex_exit(&l2arc_buflist_mtx); 4580185029Spjd 4581185029Spjd spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); 4582185029Spjd dev->l2ad_evict = taddr; 4583185029Spjd} 4584185029Spjd 4585185029Spjd/* 4586185029Spjd * Find and write ARC buffers to the L2ARC device. 4587185029Spjd * 4588185029Spjd * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4589185029Spjd * for reading until they have completed writing. 
4590185029Spjd */ 4591208373Smmstatic uint64_t 4592185029Spjdl2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 4593185029Spjd{ 4594185029Spjd arc_buf_hdr_t *ab, *ab_prev, *head; 4595185029Spjd l2arc_buf_hdr_t *hdrl2; 4596185029Spjd list_t *list; 4597185029Spjd uint64_t passed_sz, write_sz, buf_sz, headroom; 4598185029Spjd void *buf_data; 4599185029Spjd kmutex_t *hash_lock, *list_lock; 4600185029Spjd boolean_t have_lock, full; 4601185029Spjd l2arc_write_callback_t *cb; 4602185029Spjd zio_t *pio, *wzio; 4603185029Spjd int try; 4604185029Spjd 4605185029Spjd ASSERT(dev->l2ad_vdev != NULL); 4606185029Spjd 4607185029Spjd pio = NULL; 4608185029Spjd write_sz = 0; 4609185029Spjd full = B_FALSE; 4610185029Spjd head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4611185029Spjd head->b_flags |= ARC_L2_WRITE_HEAD; 4612185029Spjd 4613205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 4614185029Spjd /* 4615185029Spjd * Copy buffers for L2ARC writing. 4616185029Spjd */ 4617185029Spjd mutex_enter(&l2arc_buflist_mtx); 4618206796Spjd for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { 4619185029Spjd list = l2arc_list_locked(try, &list_lock); 4620185029Spjd passed_sz = 0; 4621205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 4622185029Spjd 4623185029Spjd /* 4624185029Spjd * L2ARC fast warmup. 4625185029Spjd * 4626185029Spjd * Until the ARC is warm and starts to evict, read from the 4627185029Spjd * head of the ARC lists rather than the tail. 
4628185029Spjd */ 4629185029Spjd headroom = target_sz * l2arc_headroom; 4630185029Spjd if (arc_warm == B_FALSE) 4631185029Spjd ab = list_head(list); 4632185029Spjd else 4633185029Spjd ab = list_tail(list); 4634206796Spjd if (ab == NULL) 4635205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 4636185029Spjd 4637185029Spjd for (; ab; ab = ab_prev) { 4638185029Spjd if (arc_warm == B_FALSE) 4639185029Spjd ab_prev = list_next(list, ab); 4640185029Spjd else 4641185029Spjd ab_prev = list_prev(list, ab); 4642205231Skmacy ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size); 4643206796Spjd 4644185029Spjd hash_lock = HDR_LOCK(ab); 4645185029Spjd have_lock = MUTEX_HELD(hash_lock); 4646185029Spjd if (!have_lock && !mutex_tryenter(hash_lock)) { 4647205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 4648185029Spjd /* 4649185029Spjd * Skip this buffer rather than waiting. 4650185029Spjd */ 4651185029Spjd continue; 4652185029Spjd } 4653185029Spjd 4654185029Spjd passed_sz += ab->b_size; 4655185029Spjd if (passed_sz > headroom) { 4656185029Spjd /* 4657185029Spjd * Searched too far. 4658185029Spjd */ 4659185029Spjd mutex_exit(hash_lock); 4660205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 4661185029Spjd break; 4662185029Spjd } 4663185029Spjd 4664208373Smm if (!l2arc_write_eligible(spa, ab)) { 4665185029Spjd mutex_exit(hash_lock); 4666185029Spjd continue; 4667185029Spjd } 4668185029Spjd 4669185029Spjd if ((write_sz + ab->b_size) > target_sz) { 4670185029Spjd full = B_TRUE; 4671185029Spjd mutex_exit(hash_lock); 4672205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_full); 4673185029Spjd break; 4674185029Spjd } 4675185029Spjd 4676185029Spjd if (pio == NULL) { 4677185029Spjd /* 4678185029Spjd * Insert a dummy header on the buflist so 4679185029Spjd * l2arc_write_done() can find where the 4680185029Spjd * write buffers begin without searching. 
4681185029Spjd */ 4682185029Spjd list_insert_head(dev->l2ad_buflist, head); 4683185029Spjd 4684185029Spjd cb = kmem_alloc( 4685185029Spjd sizeof (l2arc_write_callback_t), KM_SLEEP); 4686185029Spjd cb->l2wcb_dev = dev; 4687185029Spjd cb->l2wcb_head = head; 4688185029Spjd pio = zio_root(spa, l2arc_write_done, cb, 4689185029Spjd ZIO_FLAG_CANFAIL); 4690205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_pios); 4691185029Spjd } 4692185029Spjd 4693185029Spjd /* 4694185029Spjd * Create and add a new L2ARC header. 4695185029Spjd */ 4696185029Spjd hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4697185029Spjd hdrl2->b_dev = dev; 4698185029Spjd hdrl2->b_daddr = dev->l2ad_hand; 4699185029Spjd 4700206792Spjd ab->b_flags |= ARC_L2_WRITING; 4701185029Spjd ab->b_l2hdr = hdrl2; 4702185029Spjd list_insert_head(dev->l2ad_buflist, ab); 4703185029Spjd buf_data = ab->b_buf->b_data; 4704185029Spjd buf_sz = ab->b_size; 4705185029Spjd 4706185029Spjd /* 4707185029Spjd * Compute and store the buffer cksum before 4708185029Spjd * writing. On debug the cksum is verified first. 4709185029Spjd */ 4710185029Spjd arc_cksum_verify(ab->b_buf); 4711185029Spjd arc_cksum_compute(ab->b_buf, B_TRUE); 4712185029Spjd 4713185029Spjd mutex_exit(hash_lock); 4714185029Spjd 4715185029Spjd wzio = zio_write_phys(pio, dev->l2ad_vdev, 4716185029Spjd dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4717185029Spjd NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4718185029Spjd ZIO_FLAG_CANFAIL, B_FALSE); 4719185029Spjd 4720185029Spjd DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 4721185029Spjd zio_t *, wzio); 4722185029Spjd (void) zio_nowait(wzio); 4723185029Spjd 4724185029Spjd /* 4725185029Spjd * Keep the clock hand suitably device-aligned. 
4726185029Spjd */ 4727185029Spjd buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 4728185029Spjd 4729185029Spjd write_sz += buf_sz; 4730185029Spjd dev->l2ad_hand += buf_sz; 4731185029Spjd } 4732185029Spjd 4733185029Spjd mutex_exit(list_lock); 4734185029Spjd 4735185029Spjd if (full == B_TRUE) 4736185029Spjd break; 4737185029Spjd } 4738185029Spjd mutex_exit(&l2arc_buflist_mtx); 4739185029Spjd 4740185029Spjd if (pio == NULL) { 4741185029Spjd ASSERT3U(write_sz, ==, 0); 4742185029Spjd kmem_cache_free(hdr_cache, head); 4743208373Smm return (0); 4744185029Spjd } 4745185029Spjd 4746185029Spjd ASSERT3U(write_sz, <=, target_sz); 4747185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_sent); 4748208373Smm ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); 4749185029Spjd ARCSTAT_INCR(arcstat_l2_size, write_sz); 4750185029Spjd spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); 4751185029Spjd 4752185029Spjd /* 4753185029Spjd * Bump device hand to the device start if it is approaching the end. 4754185029Spjd * l2arc_evict() will already have evicted ahead for this case. 4755185029Spjd */ 4756185029Spjd if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4757185029Spjd spa_l2cache_space_update(dev->l2ad_vdev, 0, 4758185029Spjd dev->l2ad_end - dev->l2ad_hand); 4759185029Spjd dev->l2ad_hand = dev->l2ad_start; 4760185029Spjd dev->l2ad_evict = dev->l2ad_start; 4761185029Spjd dev->l2ad_first = B_FALSE; 4762185029Spjd } 4763185029Spjd 4764208373Smm dev->l2ad_writing = B_TRUE; 4765185029Spjd (void) zio_wait(pio); 4766208373Smm dev->l2ad_writing = B_FALSE; 4767208373Smm 4768208373Smm return (write_sz); 4769185029Spjd} 4770185029Spjd 4771185029Spjd/* 4772185029Spjd * This thread feeds the L2ARC at regular intervals. This is the beating 4773185029Spjd * heart of the L2ARC. 
4774185029Spjd */ 4775185029Spjdstatic void 4776185029Spjdl2arc_feed_thread(void *dummy __unused) 4777185029Spjd{ 4778185029Spjd callb_cpr_t cpr; 4779185029Spjd l2arc_dev_t *dev; 4780185029Spjd spa_t *spa; 4781208373Smm uint64_t size, wrote; 4782208373Smm clock_t begin, next = LBOLT; 4783185029Spjd 4784185029Spjd CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 4785185029Spjd 4786185029Spjd mutex_enter(&l2arc_feed_thr_lock); 4787185029Spjd 4788185029Spjd while (l2arc_thread_exit == 0) { 4789185029Spjd CALLB_CPR_SAFE_BEGIN(&cpr); 4790185029Spjd (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 4791208373Smm next - LBOLT); 4792185029Spjd CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 4793208373Smm next = LBOLT + hz; 4794185029Spjd 4795185029Spjd /* 4796185029Spjd * Quick check for L2ARC devices. 4797185029Spjd */ 4798185029Spjd mutex_enter(&l2arc_dev_mtx); 4799185029Spjd if (l2arc_ndev == 0) { 4800185029Spjd mutex_exit(&l2arc_dev_mtx); 4801185029Spjd continue; 4802185029Spjd } 4803185029Spjd mutex_exit(&l2arc_dev_mtx); 4804208373Smm begin = LBOLT; 4805185029Spjd 4806185029Spjd /* 4807185029Spjd * This selects the next l2arc device to write to, and in 4808185029Spjd * doing so the next spa to feed from: dev->l2ad_spa. This 4809185029Spjd * will return NULL if there are now no l2arc devices or if 4810185029Spjd * they are all faulted. 4811185029Spjd * 4812185029Spjd * If a device is returned, its spa's config lock is also 4813185029Spjd * held to prevent device removal. l2arc_dev_get_next() 4814185029Spjd * will grab and release l2arc_dev_mtx. 4815185029Spjd */ 4816185029Spjd if ((dev = l2arc_dev_get_next()) == NULL) 4817185029Spjd continue; 4818185029Spjd 4819185029Spjd spa = dev->l2ad_spa; 4820185029Spjd ASSERT(spa != NULL); 4821185029Spjd 4822185029Spjd /* 4823185029Spjd * Avoid contributing to memory pressure. 
4824185029Spjd */ 4825185029Spjd if (arc_reclaim_needed()) { 4826185029Spjd ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 4827185029Spjd spa_config_exit(spa, SCL_L2ARC, dev); 4828185029Spjd continue; 4829185029Spjd } 4830185029Spjd 4831185029Spjd ARCSTAT_BUMP(arcstat_l2_feeds); 4832185029Spjd 4833208373Smm size = l2arc_write_size(dev); 4834185029Spjd 4835185029Spjd /* 4836185029Spjd * Evict L2ARC buffers that will be overwritten. 4837185029Spjd */ 4838185029Spjd l2arc_evict(dev, size, B_FALSE); 4839185029Spjd 4840185029Spjd /* 4841185029Spjd * Write ARC buffers. 4842185029Spjd */ 4843208373Smm wrote = l2arc_write_buffers(spa, dev, size); 4844208373Smm 4845208373Smm /* 4846208373Smm * Calculate interval between writes. 4847208373Smm */ 4848208373Smm next = l2arc_write_interval(begin, size, wrote); 4849185029Spjd spa_config_exit(spa, SCL_L2ARC, dev); 4850185029Spjd } 4851185029Spjd 4852185029Spjd l2arc_thread_exit = 0; 4853185029Spjd cv_broadcast(&l2arc_feed_thr_cv); 4854185029Spjd CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 4855185029Spjd thread_exit(); 4856185029Spjd} 4857185029Spjd 4858185029Spjdboolean_t 4859185029Spjdl2arc_vdev_present(vdev_t *vd) 4860185029Spjd{ 4861185029Spjd l2arc_dev_t *dev; 4862185029Spjd 4863185029Spjd mutex_enter(&l2arc_dev_mtx); 4864185029Spjd for (dev = list_head(l2arc_dev_list); dev != NULL; 4865185029Spjd dev = list_next(l2arc_dev_list, dev)) { 4866185029Spjd if (dev->l2ad_vdev == vd) 4867185029Spjd break; 4868185029Spjd } 4869185029Spjd mutex_exit(&l2arc_dev_mtx); 4870185029Spjd 4871185029Spjd return (dev != NULL); 4872185029Spjd} 4873185029Spjd 4874185029Spjd/* 4875185029Spjd * Add a vdev for use by the L2ARC. By this point the spa has already 4876185029Spjd * validated the vdev and opened it. 
4877185029Spjd */ 4878185029Spjdvoid 4879185029Spjdl2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) 4880185029Spjd{ 4881185029Spjd l2arc_dev_t *adddev; 4882185029Spjd 4883185029Spjd ASSERT(!l2arc_vdev_present(vd)); 4884185029Spjd 4885185029Spjd /* 4886185029Spjd * Create a new l2arc device entry. 4887185029Spjd */ 4888185029Spjd adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 4889185029Spjd adddev->l2ad_spa = spa; 4890185029Spjd adddev->l2ad_vdev = vd; 4891185029Spjd adddev->l2ad_write = l2arc_write_max; 4892185029Spjd adddev->l2ad_boost = l2arc_write_boost; 4893185029Spjd adddev->l2ad_start = start; 4894185029Spjd adddev->l2ad_end = end; 4895185029Spjd adddev->l2ad_hand = adddev->l2ad_start; 4896185029Spjd adddev->l2ad_evict = adddev->l2ad_start; 4897185029Spjd adddev->l2ad_first = B_TRUE; 4898208373Smm adddev->l2ad_writing = B_FALSE; 4899185029Spjd ASSERT3U(adddev->l2ad_write, >, 0); 4900185029Spjd 4901185029Spjd /* 4902185029Spjd * This is a list of all ARC buffers that are still valid on the 4903185029Spjd * device. 4904185029Spjd */ 4905185029Spjd adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 4906185029Spjd list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 4907185029Spjd offsetof(arc_buf_hdr_t, b_l2node)); 4908185029Spjd 4909185029Spjd spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); 4910185029Spjd 4911185029Spjd /* 4912185029Spjd * Add device to global list 4913185029Spjd */ 4914185029Spjd mutex_enter(&l2arc_dev_mtx); 4915185029Spjd list_insert_head(l2arc_dev_list, adddev); 4916185029Spjd atomic_inc_64(&l2arc_ndev); 4917185029Spjd mutex_exit(&l2arc_dev_mtx); 4918185029Spjd} 4919185029Spjd 4920185029Spjd/* 4921185029Spjd * Remove a vdev from the L2ARC. 
4922185029Spjd */ 4923185029Spjdvoid 4924185029Spjdl2arc_remove_vdev(vdev_t *vd) 4925185029Spjd{ 4926185029Spjd l2arc_dev_t *dev, *nextdev, *remdev = NULL; 4927185029Spjd 4928185029Spjd /* 4929185029Spjd * Find the device by vdev 4930185029Spjd */ 4931185029Spjd mutex_enter(&l2arc_dev_mtx); 4932185029Spjd for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 4933185029Spjd nextdev = list_next(l2arc_dev_list, dev); 4934185029Spjd if (vd == dev->l2ad_vdev) { 4935185029Spjd remdev = dev; 4936185029Spjd break; 4937185029Spjd } 4938185029Spjd } 4939185029Spjd ASSERT(remdev != NULL); 4940185029Spjd 4941185029Spjd /* 4942185029Spjd * Remove device from global list 4943185029Spjd */ 4944185029Spjd list_remove(l2arc_dev_list, remdev); 4945185029Spjd l2arc_dev_last = NULL; /* may have been invalidated */ 4946185029Spjd atomic_dec_64(&l2arc_ndev); 4947185029Spjd mutex_exit(&l2arc_dev_mtx); 4948185029Spjd 4949185029Spjd /* 4950185029Spjd * Clear all buflists and ARC references. L2ARC device flush. 
4951185029Spjd */ 4952185029Spjd l2arc_evict(remdev, 0, B_TRUE); 4953185029Spjd list_destroy(remdev->l2ad_buflist); 4954185029Spjd kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 4955185029Spjd kmem_free(remdev, sizeof (l2arc_dev_t)); 4956185029Spjd} 4957185029Spjd 4958185029Spjdvoid 4959185029Spjdl2arc_init(void) 4960185029Spjd{ 4961185029Spjd l2arc_thread_exit = 0; 4962185029Spjd l2arc_ndev = 0; 4963185029Spjd l2arc_writes_sent = 0; 4964185029Spjd l2arc_writes_done = 0; 4965185029Spjd 4966185029Spjd mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4967185029Spjd cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 4968185029Spjd mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 4969185029Spjd mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 4970185029Spjd mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 4971185029Spjd 4972185029Spjd l2arc_dev_list = &L2ARC_dev_list; 4973185029Spjd l2arc_free_on_write = &L2ARC_free_on_write; 4974185029Spjd list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 4975185029Spjd offsetof(l2arc_dev_t, l2ad_node)); 4976185029Spjd list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 4977185029Spjd offsetof(l2arc_data_free_t, l2df_list_node)); 4978185029Spjd} 4979185029Spjd 4980185029Spjdvoid 4981185029Spjdl2arc_fini(void) 4982185029Spjd{ 4983185029Spjd /* 4984185029Spjd * This is called from dmu_fini(), which is called from spa_fini(); 4985185029Spjd * Because of this, we can assume that all l2arc devices have 4986185029Spjd * already been removed when the pools themselves were removed. 
4987185029Spjd */ 4988185029Spjd 4989185029Spjd l2arc_do_free_on_write(); 4990185029Spjd 4991185029Spjd mutex_destroy(&l2arc_feed_thr_lock); 4992185029Spjd cv_destroy(&l2arc_feed_thr_cv); 4993185029Spjd mutex_destroy(&l2arc_dev_mtx); 4994185029Spjd mutex_destroy(&l2arc_buflist_mtx); 4995185029Spjd mutex_destroy(&l2arc_free_on_write_mtx); 4996185029Spjd 4997185029Spjd list_destroy(l2arc_dev_list); 4998185029Spjd list_destroy(l2arc_free_on_write); 4999185029Spjd} 5000185029Spjd 5001185029Spjdvoid 5002185029Spjdl2arc_start(void) 5003185029Spjd{ 5004185029Spjd if (!(spa_mode & FWRITE)) 5005185029Spjd return; 5006185029Spjd 5007185029Spjd (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 5008185029Spjd TS_RUN, minclsyspri); 5009185029Spjd} 5010185029Spjd 5011185029Spjdvoid 5012185029Spjdl2arc_stop(void) 5013185029Spjd{ 5014185029Spjd if (!(spa_mode & FWRITE)) 5015185029Spjd return; 5016185029Spjd 5017185029Spjd mutex_enter(&l2arc_feed_thr_lock); 5018185029Spjd cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 5019185029Spjd l2arc_thread_exit = 1; 5020185029Spjd while (l2arc_thread_exit != 0) 5021185029Spjd cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 5022185029Spjd mutex_exit(&l2arc_feed_thr_lock); 5023185029Spjd} 5024