arc.c revision 168582
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).
 * We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes; rather, they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/sdt.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * These tunables are for performance analysis.
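 *
 * Usage sketch (FreeBSD-specific; the values are illustrative only):
 * the TUNABLE_ULONG hooks below let an administrator cap or floor the
 * ARC at boot time via /boot/loader.conf, for example
 *
 *	vfs.zfs.arc_max="536870912"	(limit the ARC to 512MB)
 *	vfs.zfs.arc_min="134217728"	(keep at least 128MB)
 *
 * The matching sysctls are registered read-only (CTLFLAG_RD), so at
 * run time they only report the values that were chosen at boot.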
152 */ 153u_long zfs_arc_max; 154u_long zfs_arc_min; 155TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max); 156TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min); 157SYSCTL_DECL(_vfs_zfs); 158SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RD, &zfs_arc_max, 0, 159 "Maximum ARC size"); 160SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RD, &zfs_arc_min, 0, 161 "Minimum ARC size"); 162 163/* 164 * Note that buffers can be on one of 5 states: 165 * ARC_anon - anonymous (discussed below) 166 * ARC_mru - recently used, currently cached 167 * ARC_mru_ghost - recentely used, no longer in cache 168 * ARC_mfu - frequently used, currently cached 169 * ARC_mfu_ghost - frequently used, no longer in cache 170 * When there are no active references to the buffer, they 171 * are linked onto one of the lists in arc. These are the 172 * only buffers that can be evicted or deleted. 173 * 174 * Anonymous buffers are buffers that are not associated with 175 * a DVA. These are buffers that hold dirty block copies 176 * before they are written to stable storage. By definition, 177 * they are "ref'd" and are considered part of arc_mru 178 * that cannot be freed. Generally, they will aquire a DVA 179 * as they are written and migrate onto the arc_mru list. 180 */ 181 182typedef struct arc_state { 183 list_t arcs_list; /* linked list of evictable buffer in state */ 184 uint64_t arcs_lsize; /* total size of buffers in the linked list */ 185 uint64_t arcs_size; /* total size of all buffers in this state */ 186 kmutex_t arcs_mtx; 187} arc_state_t; 188 189/* The 5 states: */ 190static arc_state_t ARC_anon; 191static arc_state_t ARC_mru; 192static arc_state_t ARC_mru_ghost; 193static arc_state_t ARC_mfu; 194static arc_state_t ARC_mfu_ghost; 195 196typedef struct arc_stats { 197 kstat_named_t arcstat_hits; 198 kstat_named_t arcstat_misses; 199 kstat_named_t arcstat_demand_data_hits; 200 kstat_named_t arcstat_demand_data_misses; 201 kstat_named_t arcstat_demand_metadata_hits; 202 kstat_named_t arcstat_demand_metadata_misses; 203 kstat_named_t arcstat_prefetch_data_hits; 204 kstat_named_t arcstat_prefetch_data_misses; 205 kstat_named_t arcstat_prefetch_metadata_hits; 206 kstat_named_t arcstat_prefetch_metadata_misses; 207 kstat_named_t arcstat_mru_hits; 208 kstat_named_t arcstat_mru_ghost_hits; 209 kstat_named_t arcstat_mfu_hits; 210 kstat_named_t arcstat_mfu_ghost_hits; 211 kstat_named_t arcstat_deleted; 212 kstat_named_t arcstat_recycle_miss; 213 kstat_named_t arcstat_mutex_miss; 214 kstat_named_t arcstat_evict_skip; 215 kstat_named_t arcstat_hash_elements; 216 kstat_named_t arcstat_hash_elements_max; 217 kstat_named_t arcstat_hash_collisions; 218 kstat_named_t arcstat_hash_chains; 219 kstat_named_t arcstat_hash_chain_max; 220 kstat_named_t arcstat_p; 221 kstat_named_t arcstat_c; 222 kstat_named_t arcstat_c_min; 223 kstat_named_t arcstat_c_max; 224 kstat_named_t arcstat_size; 225} arc_stats_t; 226 227static arc_stats_t arc_stats = { 228 { "hits", KSTAT_DATA_UINT64 }, 229 { "misses", KSTAT_DATA_UINT64 }, 230 { "demand_data_hits", KSTAT_DATA_UINT64 }, 231 { "demand_data_misses", KSTAT_DATA_UINT64 }, 232 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 233 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 234 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 235 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 236 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 237 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 238 { "mru_hits", KSTAT_DATA_UINT64 }, 239 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 240 { "mfu_hits", KSTAT_DATA_UINT64 }, 
241 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 242 { "deleted", KSTAT_DATA_UINT64 }, 243 { "recycle_miss", KSTAT_DATA_UINT64 }, 244 { "mutex_miss", KSTAT_DATA_UINT64 }, 245 { "evict_skip", KSTAT_DATA_UINT64 }, 246 { "hash_elements", KSTAT_DATA_UINT64 }, 247 { "hash_elements_max", KSTAT_DATA_UINT64 }, 248 { "hash_collisions", KSTAT_DATA_UINT64 }, 249 { "hash_chains", KSTAT_DATA_UINT64 }, 250 { "hash_chain_max", KSTAT_DATA_UINT64 }, 251 { "p", KSTAT_DATA_UINT64 }, 252 { "c", KSTAT_DATA_UINT64 }, 253 { "c_min", KSTAT_DATA_UINT64 }, 254 { "c_max", KSTAT_DATA_UINT64 }, 255 { "size", KSTAT_DATA_UINT64 } 256}; 257 258#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 259 260#define ARCSTAT_INCR(stat, val) \ 261 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 262 263#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 264#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 265 266#define ARCSTAT_MAX(stat, val) { \ 267 uint64_t m; \ 268 while ((val) > (m = arc_stats.stat.value.ui64) && \ 269 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 270 continue; \ 271} 272 273#define ARCSTAT_MAXSTAT(stat) \ 274 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 275 276/* 277 * We define a macro to allow ARC hits/misses to be easily broken down by 278 * two separate conditions, giving a total of four different subtypes for 279 * each of hits and misses (so eight statistics total). 280 */ 281#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 282 if (cond1) { \ 283 if (cond2) { \ 284 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 285 } else { \ 286 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 287 } \ 288 } else { \ 289 if (cond2) { \ 290 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 291 } else { \ 292 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 293 } \ 294 } 295 296kstat_t *arc_ksp; 297static arc_state_t *arc_anon; 298static arc_state_t *arc_mru; 299static arc_state_t *arc_mru_ghost; 300static arc_state_t *arc_mfu; 301static arc_state_t *arc_mfu_ghost; 302 303/* 304 * There are several ARC variables that are critical to export as kstats -- 305 * but we don't want to have to grovel around in the kstat whenever we wish to 306 * manipulate them. For these variables, we therefore define them to be in 307 * terms of the statistic variable. This assures that we are not introducing 308 * the possibility of inconsistency by having shadow copies of the variables, 309 * while still allowing the code to be readable. 
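 *
 * As an illustrative sketch of the idea: rather than keeping a separate
 * shadow variable such as
 *
 *	static uint64_t arc_size;	(can drift out of sync with the kstat)
 *
 * and copying it into arc_stats on every kstat read, the macros below
 * make the kstat value itself the variable, e.g.
 *
 *	#define	arc_size	ARCSTAT(arcstat_size)
 *
 * so every load and store of arc_size operates directly on
 * arc_stats.arcstat_size.value.ui64.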
310 */ 311#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 312#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 313#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 314#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 315#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 316 317static int arc_no_grow; /* Don't try to grow cache size */ 318static uint64_t arc_tempreserve; 319 320typedef struct arc_callback arc_callback_t; 321 322struct arc_callback { 323 void *acb_private; 324 arc_done_func_t *acb_done; 325 arc_byteswap_func_t *acb_byteswap; 326 arc_buf_t *acb_buf; 327 zio_t *acb_zio_dummy; 328 arc_callback_t *acb_next; 329}; 330 331typedef struct arc_write_callback arc_write_callback_t; 332 333struct arc_write_callback { 334 void *awcb_private; 335 arc_done_func_t *awcb_ready; 336 arc_done_func_t *awcb_done; 337 arc_buf_t *awcb_buf; 338}; 339 340struct arc_buf_hdr { 341 /* protected by hash lock */ 342 dva_t b_dva; 343 uint64_t b_birth; 344 uint64_t b_cksum0; 345 346 kmutex_t b_freeze_lock; 347 zio_cksum_t *b_freeze_cksum; 348 349 arc_buf_hdr_t *b_hash_next; 350 arc_buf_t *b_buf; 351 uint32_t b_flags; 352 uint32_t b_datacnt; 353 354 arc_callback_t *b_acb; 355 kcondvar_t b_cv; 356 357 /* immutable */ 358 arc_buf_contents_t b_type; 359 uint64_t b_size; 360 spa_t *b_spa; 361 362 /* protected by arc state mutex */ 363 arc_state_t *b_state; 364 list_node_t b_arc_node; 365 366 /* updated atomically */ 367 clock_t b_arc_access; 368 369 /* self protecting */ 370 refcount_t b_refcnt; 371}; 372 373static arc_buf_t *arc_eviction_list; 374static kmutex_t arc_eviction_mtx; 375static arc_buf_hdr_t arc_eviction_hdr; 376static void arc_get_data_buf(arc_buf_t *buf); 377static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 378 379#define GHOST_STATE(state) \ 380 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) 381 382/* 383 * Private ARC flags. These flags are private ARC only flags that will show up 384 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 385 * be passed in as arc_flags in things like arc_read. However, these flags 386 * should never be passed and should only be set by ARC code. When adding new 387 * public flags, make sure not to smash the private ones. 
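 *
 * (Illustrative note: the private flags below start at bit 9, on the
 * assumption that the public arc_flags used in this file -- ARC_WAIT,
 * ARC_NOWAIT, ARC_PREFETCH and ARC_CACHED, declared in arc.h -- occupy
 * the lower bits; new public flags must therefore stay below bit 9.)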
388 */ 389 390#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 391#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 392#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 393#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 394#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 395#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 396 397#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 398#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 399#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 400#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 401#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 402 403/* 404 * Hash table routines 405 */ 406 407#define HT_LOCK_PAD 128 408 409struct ht_lock { 410 kmutex_t ht_lock; 411#ifdef _KERNEL 412 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 413#endif 414}; 415 416#define BUF_LOCKS 256 417typedef struct buf_hash_table { 418 uint64_t ht_mask; 419 arc_buf_hdr_t **ht_table; 420 struct ht_lock ht_locks[BUF_LOCKS]; 421} buf_hash_table_t; 422 423static buf_hash_table_t buf_hash_table; 424 425#define BUF_HASH_INDEX(spa, dva, birth) \ 426 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 427#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 428#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 429#define HDR_LOCK(buf) \ 430 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 431 432uint64_t zfs_crc64_table[256]; 433 434static uint64_t 435buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 436{ 437 uintptr_t spav = (uintptr_t)spa; 438 uint8_t *vdva = (uint8_t *)dva; 439 uint64_t crc = -1ULL; 440 int i; 441 442 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 443 444 for (i = 0; i < sizeof (dva_t); i++) 445 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 446 447 crc ^= (spav>>8) ^ birth; 448 449 return (crc); 450} 451 452#define BUF_EMPTY(buf) \ 453 ((buf)->b_dva.dva_word[0] == 0 && \ 454 (buf)->b_dva.dva_word[1] == 0 && \ 455 (buf)->b_birth == 0) 456 457#define BUF_EQUAL(spa, dva, birth, buf) \ 458 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 459 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 460 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 461 462static arc_buf_hdr_t * 463buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 464{ 465 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 466 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 467 arc_buf_hdr_t *buf; 468 469 mutex_enter(hash_lock); 470 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 471 buf = buf->b_hash_next) { 472 if (BUF_EQUAL(spa, dva, birth, buf)) { 473 *lockp = hash_lock; 474 return (buf); 475 } 476 } 477 mutex_exit(hash_lock); 478 *lockp = NULL; 479 return (NULL); 480} 481 482/* 483 * Insert an entry into the hash table. If there is already an element 484 * equal to elem in the hash table, then the already existing element 485 * will be returned and the new element will not be inserted. 486 * Otherwise returns NULL. 
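 *
 * An illustrative caller sketch (the variable names are examples only):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *found = buf_hash_insert(hdr, &hash_lock);
 *	if (found != NULL)
 *		... an equal header was already present; use 'found'
 *		    and discard 'hdr' ...
 *	...
 *	mutex_exit(hash_lock);	... *lockp is returned held either way ...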
487 */ 488static arc_buf_hdr_t * 489buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 490{ 491 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 492 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 493 arc_buf_hdr_t *fbuf; 494 uint32_t i; 495 496 ASSERT(!HDR_IN_HASH_TABLE(buf)); 497 *lockp = hash_lock; 498 mutex_enter(hash_lock); 499 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 500 fbuf = fbuf->b_hash_next, i++) { 501 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 502 return (fbuf); 503 } 504 505 buf->b_hash_next = buf_hash_table.ht_table[idx]; 506 buf_hash_table.ht_table[idx] = buf; 507 buf->b_flags |= ARC_IN_HASH_TABLE; 508 509 /* collect some hash table performance data */ 510 if (i > 0) { 511 ARCSTAT_BUMP(arcstat_hash_collisions); 512 if (i == 1) 513 ARCSTAT_BUMP(arcstat_hash_chains); 514 515 ARCSTAT_MAX(arcstat_hash_chain_max, i); 516 } 517 518 ARCSTAT_BUMP(arcstat_hash_elements); 519 ARCSTAT_MAXSTAT(arcstat_hash_elements); 520 521 return (NULL); 522} 523 524static void 525buf_hash_remove(arc_buf_hdr_t *buf) 526{ 527 arc_buf_hdr_t *fbuf, **bufp; 528 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 529 530 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 531 ASSERT(HDR_IN_HASH_TABLE(buf)); 532 533 bufp = &buf_hash_table.ht_table[idx]; 534 while ((fbuf = *bufp) != buf) { 535 ASSERT(fbuf != NULL); 536 bufp = &fbuf->b_hash_next; 537 } 538 *bufp = buf->b_hash_next; 539 buf->b_hash_next = NULL; 540 buf->b_flags &= ~ARC_IN_HASH_TABLE; 541 542 /* collect some hash table performance data */ 543 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 544 545 if (buf_hash_table.ht_table[idx] && 546 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 547 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 548} 549 550/* 551 * Global data structures and functions for the buf kmem cache. 552 */ 553static kmem_cache_t *hdr_cache; 554static kmem_cache_t *buf_cache; 555 556static void 557buf_fini(void) 558{ 559 int i; 560 561 kmem_free(buf_hash_table.ht_table, 562 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 563 for (i = 0; i < BUF_LOCKS; i++) 564 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 565 kmem_cache_destroy(hdr_cache); 566 kmem_cache_destroy(buf_cache); 567} 568 569/* 570 * Constructor callback - called when the cache is empty 571 * and a new buf is requested. 572 */ 573/* ARGSUSED */ 574static int 575hdr_cons(void *vbuf, void *unused, int kmflag) 576{ 577 arc_buf_hdr_t *buf = vbuf; 578 579 bzero(buf, sizeof (arc_buf_hdr_t)); 580 refcount_create(&buf->b_refcnt); 581 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 582 return (0); 583} 584 585/* 586 * Destructor callback - called when a cached buf is 587 * no longer required. 588 */ 589/* ARGSUSED */ 590static void 591hdr_dest(void *vbuf, void *unused) 592{ 593 arc_buf_hdr_t *buf = vbuf; 594 595 refcount_destroy(&buf->b_refcnt); 596 cv_destroy(&buf->b_cv); 597} 598 599/* 600 * Reclaim callback -- invoked when memory is low. 601 */ 602/* ARGSUSED */ 603static void 604hdr_recl(void *unused) 605{ 606 dprintf("hdr_recl called\n"); 607 /* 608 * umem calls the reclaim func when we destroy the buf cache, 609 * which is after we do arc_fini(). 610 */ 611 if (!arc_dead) 612 cv_signal(&arc_reclaim_thr_cv); 613} 614 615static void 616buf_init(void) 617{ 618 uint64_t *ct; 619 uint64_t hsize = 1ULL << 12; 620 int i, j; 621 622 /* 623 * The hash table is big enough to fill all of physical memory 624 * with an average 64K block size. The table will take up 625 * totalmem*sizeof(void*)/64K (eg. 
128KB/GB with 8-byte pointers). 626 */ 627 while (hsize * 65536 < physmem * PAGESIZE) 628 hsize <<= 1; 629retry: 630 buf_hash_table.ht_mask = hsize - 1; 631 buf_hash_table.ht_table = 632 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 633 if (buf_hash_table.ht_table == NULL) { 634 ASSERT(hsize > (1ULL << 8)); 635 hsize >>= 1; 636 goto retry; 637 } 638 639 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 640 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 641 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 642 0, NULL, NULL, NULL, NULL, NULL, 0); 643 644 for (i = 0; i < 256; i++) 645 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 646 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 647 648 for (i = 0; i < BUF_LOCKS; i++) { 649 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 650 NULL, MUTEX_DEFAULT, NULL); 651 } 652} 653 654#define ARC_MINTIME (hz>>4) /* 62 ms */ 655 656static void 657arc_cksum_verify(arc_buf_t *buf) 658{ 659 zio_cksum_t zc; 660 661 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 662 return; 663 664 mutex_enter(&buf->b_hdr->b_freeze_lock); 665 if (buf->b_hdr->b_freeze_cksum == NULL || 666 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 667 mutex_exit(&buf->b_hdr->b_freeze_lock); 668 return; 669 } 670 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 671 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 672 panic("buffer modified while frozen!"); 673 mutex_exit(&buf->b_hdr->b_freeze_lock); 674} 675 676static void 677arc_cksum_compute(arc_buf_t *buf) 678{ 679 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 680 return; 681 682 mutex_enter(&buf->b_hdr->b_freeze_lock); 683 if (buf->b_hdr->b_freeze_cksum != NULL) { 684 mutex_exit(&buf->b_hdr->b_freeze_lock); 685 return; 686 } 687 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 688 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 689 buf->b_hdr->b_freeze_cksum); 690 mutex_exit(&buf->b_hdr->b_freeze_lock); 691} 692 693void 694arc_buf_thaw(arc_buf_t *buf) 695{ 696 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 697 return; 698 699 if (buf->b_hdr->b_state != arc_anon) 700 panic("modifying non-anon buffer!"); 701 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 702 panic("modifying buffer while i/o in progress!"); 703 arc_cksum_verify(buf); 704 mutex_enter(&buf->b_hdr->b_freeze_lock); 705 if (buf->b_hdr->b_freeze_cksum != NULL) { 706 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 707 buf->b_hdr->b_freeze_cksum = NULL; 708 } 709 mutex_exit(&buf->b_hdr->b_freeze_lock); 710} 711 712void 713arc_buf_freeze(arc_buf_t *buf) 714{ 715 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 716 return; 717 718 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 719 buf->b_hdr->b_state == arc_anon); 720 arc_cksum_compute(buf); 721} 722 723static void 724add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 725{ 726 ASSERT(MUTEX_HELD(hash_lock)); 727 728 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 729 (ab->b_state != arc_anon)) { 730 uint64_t delta = ab->b_size * ab->b_datacnt; 731 732 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 733 mutex_enter(&ab->b_state->arcs_mtx); 734 ASSERT(list_link_active(&ab->b_arc_node)); 735 list_remove(&ab->b_state->arcs_list, ab); 736 if (GHOST_STATE(ab->b_state)) { 737 ASSERT3U(ab->b_datacnt, ==, 0); 738 ASSERT3P(ab->b_buf, ==, NULL); 739 delta = ab->b_size; 740 } 741 ASSERT(delta > 0); 742 ASSERT3U(ab->b_state->arcs_lsize, >=, delta); 743 atomic_add_64(&ab->b_state->arcs_lsize, -delta); 744 mutex_exit(&ab->b_state->arcs_mtx); 745 /* remove the prefetch flag is we 
get a reference */ 746 if (ab->b_flags & ARC_PREFETCH) 747 ab->b_flags &= ~ARC_PREFETCH; 748 } 749} 750 751static int 752remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 753{ 754 int cnt; 755 arc_state_t *state = ab->b_state; 756 757 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 758 ASSERT(!GHOST_STATE(state)); 759 760 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 761 (state != arc_anon)) { 762 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 763 mutex_enter(&state->arcs_mtx); 764 ASSERT(!list_link_active(&ab->b_arc_node)); 765 list_insert_head(&state->arcs_list, ab); 766 ASSERT(ab->b_datacnt > 0); 767 atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); 768 ASSERT3U(state->arcs_size, >=, state->arcs_lsize); 769 mutex_exit(&state->arcs_mtx); 770 } 771 return (cnt); 772} 773 774/* 775 * Move the supplied buffer to the indicated state. The mutex 776 * for the buffer must be held by the caller. 777 */ 778static void 779arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 780{ 781 arc_state_t *old_state = ab->b_state; 782 int64_t refcnt = refcount_count(&ab->b_refcnt); 783 uint64_t from_delta, to_delta; 784 785 ASSERT(MUTEX_HELD(hash_lock)); 786 ASSERT(new_state != old_state); 787 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 788 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 789 790 from_delta = to_delta = ab->b_datacnt * ab->b_size; 791 792 /* 793 * If this buffer is evictable, transfer it from the 794 * old state list to the new state list. 795 */ 796 if (refcnt == 0) { 797 if (old_state != arc_anon) { 798 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 799 800 if (use_mutex) 801 mutex_enter(&old_state->arcs_mtx); 802 803 ASSERT(list_link_active(&ab->b_arc_node)); 804 list_remove(&old_state->arcs_list, ab); 805 806 /* 807 * If prefetching out of the ghost cache, 808 * we will have a non-null datacnt. 
809 */ 810 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 811 /* ghost elements have a ghost size */ 812 ASSERT(ab->b_buf == NULL); 813 from_delta = ab->b_size; 814 } 815 ASSERT3U(old_state->arcs_lsize, >=, from_delta); 816 atomic_add_64(&old_state->arcs_lsize, -from_delta); 817 818 if (use_mutex) 819 mutex_exit(&old_state->arcs_mtx); 820 } 821 if (new_state != arc_anon) { 822 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 823 824 if (use_mutex) 825 mutex_enter(&new_state->arcs_mtx); 826 827 list_insert_head(&new_state->arcs_list, ab); 828 829 /* ghost elements have a ghost size */ 830 if (GHOST_STATE(new_state)) { 831 ASSERT(ab->b_datacnt == 0); 832 ASSERT(ab->b_buf == NULL); 833 to_delta = ab->b_size; 834 } 835 atomic_add_64(&new_state->arcs_lsize, to_delta); 836 ASSERT3U(new_state->arcs_size + to_delta, >=, 837 new_state->arcs_lsize); 838 839 if (use_mutex) 840 mutex_exit(&new_state->arcs_mtx); 841 } 842 } 843 844 ASSERT(!BUF_EMPTY(ab)); 845 if (new_state == arc_anon && old_state != arc_anon) { 846 buf_hash_remove(ab); 847 } 848 849 /* adjust state sizes */ 850 if (to_delta) 851 atomic_add_64(&new_state->arcs_size, to_delta); 852 if (from_delta) { 853 ASSERT3U(old_state->arcs_size, >=, from_delta); 854 atomic_add_64(&old_state->arcs_size, -from_delta); 855 } 856 ab->b_state = new_state; 857} 858 859arc_buf_t * 860arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 861{ 862 arc_buf_hdr_t *hdr; 863 arc_buf_t *buf; 864 865 ASSERT3U(size, >, 0); 866 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 867 ASSERT(BUF_EMPTY(hdr)); 868 hdr->b_size = size; 869 hdr->b_type = type; 870 hdr->b_spa = spa; 871 hdr->b_state = arc_anon; 872 hdr->b_arc_access = 0; 873 mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 874 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 875 buf->b_hdr = hdr; 876 buf->b_data = NULL; 877 buf->b_efunc = NULL; 878 buf->b_private = NULL; 879 buf->b_next = NULL; 880 hdr->b_buf = buf; 881 arc_get_data_buf(buf); 882 hdr->b_datacnt = 1; 883 hdr->b_flags = 0; 884 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 885 (void) refcount_add(&hdr->b_refcnt, tag); 886 887 return (buf); 888} 889 890static arc_buf_t * 891arc_buf_clone(arc_buf_t *from) 892{ 893 arc_buf_t *buf; 894 arc_buf_hdr_t *hdr = from->b_hdr; 895 uint64_t size = hdr->b_size; 896 897 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 898 buf->b_hdr = hdr; 899 buf->b_data = NULL; 900 buf->b_efunc = NULL; 901 buf->b_private = NULL; 902 buf->b_next = hdr->b_buf; 903 hdr->b_buf = buf; 904 arc_get_data_buf(buf); 905 bcopy(from->b_data, buf->b_data, size); 906 hdr->b_datacnt += 1; 907 return (buf); 908} 909 910void 911arc_buf_add_ref(arc_buf_t *buf, void* tag) 912{ 913 arc_buf_hdr_t *hdr; 914 kmutex_t *hash_lock; 915 916 /* 917 * Check to see if this buffer is currently being evicted via 918 * arc_do_user_evicts(). 919 */ 920 mutex_enter(&arc_eviction_mtx); 921 hdr = buf->b_hdr; 922 if (hdr == NULL) { 923 mutex_exit(&arc_eviction_mtx); 924 return; 925 } 926 hash_lock = HDR_LOCK(hdr); 927 mutex_exit(&arc_eviction_mtx); 928 929 mutex_enter(hash_lock); 930 if (buf->b_data == NULL) { 931 /* 932 * This buffer is evicted. 
933 */ 934 mutex_exit(hash_lock); 935 return; 936 } 937 938 ASSERT(buf->b_hdr == hdr); 939 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 940 add_reference(hdr, hash_lock, tag); 941 arc_access(hdr, hash_lock); 942 mutex_exit(hash_lock); 943 ARCSTAT_BUMP(arcstat_hits); 944 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 945 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 946 data, metadata, hits); 947} 948 949static void 950arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 951{ 952 arc_buf_t **bufp; 953 954 /* free up data associated with the buf */ 955 if (buf->b_data) { 956 arc_state_t *state = buf->b_hdr->b_state; 957 uint64_t size = buf->b_hdr->b_size; 958 arc_buf_contents_t type = buf->b_hdr->b_type; 959 960 arc_cksum_verify(buf); 961 if (!recycle) { 962 if (type == ARC_BUFC_METADATA) { 963 zio_buf_free(buf->b_data, size); 964 } else { 965 ASSERT(type == ARC_BUFC_DATA); 966 zio_data_buf_free(buf->b_data, size); 967 } 968 atomic_add_64(&arc_size, -size); 969 } 970 if (list_link_active(&buf->b_hdr->b_arc_node)) { 971 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 972 ASSERT(state != arc_anon); 973 ASSERT3U(state->arcs_lsize, >=, size); 974 atomic_add_64(&state->arcs_lsize, -size); 975 } 976 ASSERT3U(state->arcs_size, >=, size); 977 atomic_add_64(&state->arcs_size, -size); 978 buf->b_data = NULL; 979 ASSERT(buf->b_hdr->b_datacnt > 0); 980 buf->b_hdr->b_datacnt -= 1; 981 } 982 983 /* only remove the buf if requested */ 984 if (!all) 985 return; 986 987 /* remove the buf from the hdr list */ 988 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 989 continue; 990 *bufp = buf->b_next; 991 992 ASSERT(buf->b_efunc == NULL); 993 994 /* clean up the buf */ 995 buf->b_hdr = NULL; 996 kmem_cache_free(buf_cache, buf); 997} 998 999static void 1000arc_hdr_destroy(arc_buf_hdr_t *hdr) 1001{ 1002 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1003 ASSERT3P(hdr->b_state, ==, arc_anon); 1004 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1005 1006 if (!BUF_EMPTY(hdr)) { 1007 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1008 bzero(&hdr->b_dva, sizeof (dva_t)); 1009 hdr->b_birth = 0; 1010 hdr->b_cksum0 = 0; 1011 } 1012 while (hdr->b_buf) { 1013 arc_buf_t *buf = hdr->b_buf; 1014 1015 if (buf->b_efunc) { 1016 mutex_enter(&arc_eviction_mtx); 1017 ASSERT(buf->b_hdr != NULL); 1018 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1019 hdr->b_buf = buf->b_next; 1020 buf->b_hdr = &arc_eviction_hdr; 1021 buf->b_next = arc_eviction_list; 1022 arc_eviction_list = buf; 1023 mutex_exit(&arc_eviction_mtx); 1024 } else { 1025 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1026 } 1027 } 1028 if (hdr->b_freeze_cksum != NULL) { 1029 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1030 hdr->b_freeze_cksum = NULL; 1031 } 1032 mutex_destroy(&hdr->b_freeze_lock); 1033 1034 ASSERT(!list_link_active(&hdr->b_arc_node)); 1035 ASSERT3P(hdr->b_hash_next, ==, NULL); 1036 ASSERT3P(hdr->b_acb, ==, NULL); 1037 kmem_cache_free(hdr_cache, hdr); 1038} 1039 1040void 1041arc_buf_free(arc_buf_t *buf, void *tag) 1042{ 1043 arc_buf_hdr_t *hdr = buf->b_hdr; 1044 int hashed = hdr->b_state != arc_anon; 1045 1046 ASSERT(buf->b_efunc == NULL); 1047 ASSERT(buf->b_data != NULL); 1048 1049 if (hashed) { 1050 kmutex_t *hash_lock = HDR_LOCK(hdr); 1051 1052 mutex_enter(hash_lock); 1053 (void) remove_reference(hdr, hash_lock, tag); 1054 if (hdr->b_datacnt > 1) 1055 arc_buf_destroy(buf, FALSE, TRUE); 1056 else 1057 hdr->b_flags |= ARC_BUF_AVAILABLE; 1058 mutex_exit(hash_lock); 1059 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1060 int 
destroy_hdr; 1061 /* 1062 * We are in the middle of an async write. Don't destroy 1063 * this buffer unless the write completes before we finish 1064 * decrementing the reference count. 1065 */ 1066 mutex_enter(&arc_eviction_mtx); 1067 (void) remove_reference(hdr, NULL, tag); 1068 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1069 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1070 mutex_exit(&arc_eviction_mtx); 1071 if (destroy_hdr) 1072 arc_hdr_destroy(hdr); 1073 } else { 1074 if (remove_reference(hdr, NULL, tag) > 0) { 1075 ASSERT(HDR_IO_ERROR(hdr)); 1076 arc_buf_destroy(buf, FALSE, TRUE); 1077 } else { 1078 arc_hdr_destroy(hdr); 1079 } 1080 } 1081} 1082 1083int 1084arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1085{ 1086 arc_buf_hdr_t *hdr = buf->b_hdr; 1087 kmutex_t *hash_lock = HDR_LOCK(hdr); 1088 int no_callback = (buf->b_efunc == NULL); 1089 1090 if (hdr->b_state == arc_anon) { 1091 arc_buf_free(buf, tag); 1092 return (no_callback); 1093 } 1094 1095 mutex_enter(hash_lock); 1096 ASSERT(hdr->b_state != arc_anon); 1097 ASSERT(buf->b_data != NULL); 1098 1099 (void) remove_reference(hdr, hash_lock, tag); 1100 if (hdr->b_datacnt > 1) { 1101 if (no_callback) 1102 arc_buf_destroy(buf, FALSE, TRUE); 1103 } else if (no_callback) { 1104 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1105 hdr->b_flags |= ARC_BUF_AVAILABLE; 1106 } 1107 ASSERT(no_callback || hdr->b_datacnt > 1 || 1108 refcount_is_zero(&hdr->b_refcnt)); 1109 mutex_exit(hash_lock); 1110 return (no_callback); 1111} 1112 1113int 1114arc_buf_size(arc_buf_t *buf) 1115{ 1116 return (buf->b_hdr->b_size); 1117} 1118 1119/* 1120 * Evict buffers from list until we've removed the specified number of 1121 * bytes. Move the removed buffers to the appropriate evict state. 1122 * If the recycle flag is set, then attempt to "recycle" a buffer: 1123 * - look for a buffer to evict that is `bytes' long. 1124 * - return the data block from this buffer rather than freeing it. 1125 * This flag is used by callers that are trying to make space for a 1126 * new buffer in a full arc cache. 1127 */ 1128static void * 1129arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, 1130 arc_buf_contents_t type) 1131{ 1132 arc_state_t *evicted_state; 1133 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1134 arc_buf_hdr_t *ab, *ab_prev = NULL; 1135 kmutex_t *hash_lock; 1136 boolean_t have_lock; 1137 void *stolen = NULL; 1138 1139 ASSERT(state == arc_mru || state == arc_mfu); 1140 1141 evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 1142 1143 mutex_enter(&state->arcs_mtx); 1144 mutex_enter(&evicted_state->arcs_mtx); 1145 1146 for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { 1147 ab_prev = list_prev(&state->arcs_list, ab); 1148 /* prefetch buffers have a minimum lifespan */ 1149 if (HDR_IO_IN_PROGRESS(ab) || 1150 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1151 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1152 skipped++; 1153 continue; 1154 } 1155 /* "lookahead" for better eviction candidate */ 1156 if (recycle && ab->b_size != bytes && 1157 ab_prev && ab_prev->b_size == bytes) 1158 continue; 1159 hash_lock = HDR_LOCK(ab); 1160 have_lock = MUTEX_HELD(hash_lock); 1161 if (have_lock || mutex_tryenter(hash_lock)) { 1162 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1163 ASSERT(ab->b_datacnt > 0); 1164 while (ab->b_buf) { 1165 arc_buf_t *buf = ab->b_buf; 1166 if (buf->b_data) { 1167 bytes_evicted += ab->b_size; 1168 if (recycle && ab->b_type == type && 1169 ab->b_size == bytes) { 1170 stolen = buf->b_data; 1171 recycle = FALSE; 1172 } 1173 } 1174 if (buf->b_efunc) { 1175 mutex_enter(&arc_eviction_mtx); 1176 arc_buf_destroy(buf, 1177 buf->b_data == stolen, FALSE); 1178 ab->b_buf = buf->b_next; 1179 buf->b_hdr = &arc_eviction_hdr; 1180 buf->b_next = arc_eviction_list; 1181 arc_eviction_list = buf; 1182 mutex_exit(&arc_eviction_mtx); 1183 } else { 1184 arc_buf_destroy(buf, 1185 buf->b_data == stolen, TRUE); 1186 } 1187 } 1188 ASSERT(ab->b_datacnt == 0); 1189 arc_change_state(evicted_state, ab, hash_lock); 1190 ASSERT(HDR_IN_HASH_TABLE(ab)); 1191 ab->b_flags = ARC_IN_HASH_TABLE; 1192 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1193 if (!have_lock) 1194 mutex_exit(hash_lock); 1195 if (bytes >= 0 && bytes_evicted >= bytes) 1196 break; 1197 } else { 1198 missed += 1; 1199 } 1200 } 1201 1202 mutex_exit(&evicted_state->arcs_mtx); 1203 mutex_exit(&state->arcs_mtx); 1204 1205 if (bytes_evicted < bytes) 1206 dprintf("only evicted %lld bytes from %x", 1207 (longlong_t)bytes_evicted, state); 1208 1209 if (skipped) 1210 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1211 1212 if (missed) 1213 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1214 1215 return (stolen); 1216} 1217 1218/* 1219 * Remove buffers from list until we've removed the specified number of 1220 * bytes. Destroy the buffers that are removed. 
1221 */ 1222static void 1223arc_evict_ghost(arc_state_t *state, int64_t bytes) 1224{ 1225 arc_buf_hdr_t *ab, *ab_prev; 1226 kmutex_t *hash_lock; 1227 uint64_t bytes_deleted = 0; 1228 uint64_t bufs_skipped = 0; 1229 1230 ASSERT(GHOST_STATE(state)); 1231top: 1232 mutex_enter(&state->arcs_mtx); 1233 for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { 1234 ab_prev = list_prev(&state->arcs_list, ab); 1235 hash_lock = HDR_LOCK(ab); 1236 if (mutex_tryenter(hash_lock)) { 1237 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1238 ASSERT(ab->b_buf == NULL); 1239 arc_change_state(arc_anon, ab, hash_lock); 1240 mutex_exit(hash_lock); 1241 ARCSTAT_BUMP(arcstat_deleted); 1242 bytes_deleted += ab->b_size; 1243 arc_hdr_destroy(ab); 1244 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1245 if (bytes >= 0 && bytes_deleted >= bytes) 1246 break; 1247 } else { 1248 if (bytes < 0) { 1249 mutex_exit(&state->arcs_mtx); 1250 mutex_enter(hash_lock); 1251 mutex_exit(hash_lock); 1252 goto top; 1253 } 1254 bufs_skipped += 1; 1255 } 1256 } 1257 mutex_exit(&state->arcs_mtx); 1258 1259 if (bufs_skipped) { 1260 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1261 ASSERT(bytes >= 0); 1262 } 1263 1264 if (bytes_deleted < bytes) 1265 dprintf("only deleted %lld bytes from %p", 1266 (longlong_t)bytes_deleted, state); 1267} 1268 1269static void 1270arc_adjust(void) 1271{ 1272 int64_t top_sz, mru_over, arc_over, todelete; 1273 1274 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1275 1276 if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { 1277 int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); 1278 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); 1279 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1280 } 1281 1282 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; 1283 1284 if (mru_over > 0) { 1285 if (arc_mru_ghost->arcs_lsize > 0) { 1286 todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); 1287 arc_evict_ghost(arc_mru_ghost, todelete); 1288 } 1289 } 1290 1291 if ((arc_over = arc_size - arc_c) > 0) { 1292 int64_t tbl_over; 1293 1294 if (arc_mfu->arcs_lsize > 0) { 1295 int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); 1296 (void) arc_evict(arc_mfu, toevict, FALSE, 1297 ARC_BUFC_UNDEF); 1298 } 1299 1300 tbl_over = arc_size + arc_mru_ghost->arcs_lsize + 1301 arc_mfu_ghost->arcs_lsize - arc_c*2; 1302 1303 if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { 1304 todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); 1305 arc_evict_ghost(arc_mfu_ghost, todelete); 1306 } 1307 } 1308} 1309 1310static void 1311arc_do_user_evicts(void) 1312{ 1313 mutex_enter(&arc_eviction_mtx); 1314 while (arc_eviction_list != NULL) { 1315 arc_buf_t *buf = arc_eviction_list; 1316 arc_eviction_list = buf->b_next; 1317 buf->b_hdr = NULL; 1318 mutex_exit(&arc_eviction_mtx); 1319 1320 if (buf->b_efunc != NULL) 1321 VERIFY(buf->b_efunc(buf) == 0); 1322 1323 buf->b_efunc = NULL; 1324 buf->b_private = NULL; 1325 kmem_cache_free(buf_cache, buf); 1326 mutex_enter(&arc_eviction_mtx); 1327 } 1328 mutex_exit(&arc_eviction_mtx); 1329} 1330 1331/* 1332 * Flush all *evictable* data from the cache. 1333 * NOTE: this will not touch "active" (i.e. referenced) data. 
1334 */ 1335void 1336arc_flush(void) 1337{ 1338 while (list_head(&arc_mru->arcs_list)) 1339 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); 1340 while (list_head(&arc_mfu->arcs_list)) 1341 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); 1342 1343 arc_evict_ghost(arc_mru_ghost, -1); 1344 arc_evict_ghost(arc_mfu_ghost, -1); 1345 1346 mutex_enter(&arc_reclaim_thr_lock); 1347 arc_do_user_evicts(); 1348 mutex_exit(&arc_reclaim_thr_lock); 1349 ASSERT(arc_eviction_list == NULL); 1350} 1351 1352int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1353 1354void 1355arc_shrink(void) 1356{ 1357 if (arc_c > arc_c_min) { 1358 uint64_t to_free; 1359 1360#ifdef _KERNEL 1361 to_free = arc_c >> arc_shrink_shift; 1362#else 1363 to_free = arc_c >> arc_shrink_shift; 1364#endif 1365 if (arc_c > arc_c_min + to_free) 1366 atomic_add_64(&arc_c, -to_free); 1367 else 1368 arc_c = arc_c_min; 1369 1370 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1371 if (arc_c > arc_size) 1372 arc_c = MAX(arc_size, arc_c_min); 1373 if (arc_p > arc_c) 1374 arc_p = (arc_c >> 1); 1375 ASSERT(arc_c >= arc_c_min); 1376 ASSERT((int64_t)arc_p >= 0); 1377 } 1378 1379 if (arc_size > arc_c) 1380 arc_adjust(); 1381} 1382 1383static int zfs_needfree = 0; 1384 1385static int 1386arc_reclaim_needed(void) 1387{ 1388#if 0 1389 uint64_t extra; 1390#endif 1391 1392#ifdef _KERNEL 1393 1394 if (zfs_needfree) 1395 return (1); 1396 1397#if 0 1398 /* 1399 * check to make sure that swapfs has enough space so that anon 1400 * reservations can still succeeed. anon_resvmem() checks that the 1401 * availrmem is greater than swapfs_minfree, and the number of reserved 1402 * swap pages. We also add a bit of extra here just to prevent 1403 * circumstances from getting really dire. 1404 */ 1405 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1406 return (1); 1407 1408 /* 1409 * If zio data pages are being allocated out of a separate heap segment, 1410 * then check that the size of available vmem for this area remains 1411 * above 1/4th free. This needs to be done when the size of the 1412 * non-default segment is smaller than physical memory, so we could 1413 * conceivably run out of VA in that segment before running out of 1414 * physical memory. 1415 */ 1416 if (zio_arena != NULL) { 1417 size_t arc_ziosize = 1418 btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); 1419 1420 if ((physmem > arc_ziosize) && 1421 (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) 1422 return (1); 1423 } 1424 1425#if defined(__i386) 1426 /* 1427 * If we're on an i386 platform, it's possible that we'll exhaust the 1428 * kernel heap space before we ever run out of available physical 1429 * memory. Most checks of the size of the heap_area compare against 1430 * tune.t_minarmem, which is the minimum available real memory that we 1431 * can have in the system. However, this is generally fixed at 25 pages 1432 * which is so low that it's useless. In this comparison, we seek to 1433 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1434 * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is 1435 * free) 1436 */ 1437 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1438 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1439 return (1); 1440#endif 1441#else 1442 if (kmem_used() > kmem_size() / 2) 1443 return (1); 1444#endif 1445 1446#else 1447 if (spa_get_random(100) == 0) 1448 return (1); 1449#endif 1450 return (0); 1451} 1452 1453static void 1454arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1455{ 1456#ifdef ZIO_USE_UMA 1457 size_t i; 1458 kmem_cache_t *prev_cache = NULL; 1459 kmem_cache_t *prev_data_cache = NULL; 1460 extern kmem_cache_t *zio_buf_cache[]; 1461 extern kmem_cache_t *zio_data_buf_cache[]; 1462#endif 1463 1464#ifdef _KERNEL 1465 /* 1466 * First purge some DNLC entries, in case the DNLC is using 1467 * up too much memory. 1468 */ 1469 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1470 1471#if defined(__i386) 1472 /* 1473 * Reclaim unused memory from all kmem caches. 1474 */ 1475 kmem_reap(); 1476#endif 1477#endif 1478 1479 /* 1480 * An agressive reclamation will shrink the cache size as well as 1481 * reap free buffers from the arc kmem caches. 1482 */ 1483 if (strat == ARC_RECLAIM_AGGR) 1484 arc_shrink(); 1485 1486#ifdef ZIO_USE_UMA 1487 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1488 if (zio_buf_cache[i] != prev_cache) { 1489 prev_cache = zio_buf_cache[i]; 1490 kmem_cache_reap_now(zio_buf_cache[i]); 1491 } 1492 if (zio_data_buf_cache[i] != prev_data_cache) { 1493 prev_data_cache = zio_data_buf_cache[i]; 1494 kmem_cache_reap_now(zio_data_buf_cache[i]); 1495 } 1496 } 1497#endif 1498 kmem_cache_reap_now(buf_cache); 1499 kmem_cache_reap_now(hdr_cache); 1500} 1501 1502static void 1503arc_reclaim_thread(void *dummy __unused) 1504{ 1505 clock_t growtime = 0; 1506 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1507 callb_cpr_t cpr; 1508 1509 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1510 1511 mutex_enter(&arc_reclaim_thr_lock); 1512 while (arc_thread_exit == 0) { 1513 if (arc_reclaim_needed()) { 1514 1515 if (arc_no_grow) { 1516 if (last_reclaim == ARC_RECLAIM_CONS) { 1517 last_reclaim = ARC_RECLAIM_AGGR; 1518 } else { 1519 last_reclaim = ARC_RECLAIM_CONS; 1520 } 1521 } else { 1522 arc_no_grow = TRUE; 1523 last_reclaim = ARC_RECLAIM_AGGR; 1524 membar_producer(); 1525 } 1526 1527 /* reset the growth delay for every reclaim */ 1528 growtime = lbolt + (arc_grow_retry * hz); 1529 ASSERT(growtime > 0); 1530 1531 if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { 1532 /* 1533 * If zfs_needfree is TRUE our vm_lowmem hook 1534 * was called and in that case we must free some 1535 * memory, so switch to aggressive mode. 
1536 */ 1537 arc_no_grow = TRUE; 1538 last_reclaim = ARC_RECLAIM_AGGR; 1539 } 1540 arc_kmem_reap_now(last_reclaim); 1541 } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { 1542 arc_no_grow = FALSE; 1543 } 1544 1545 if (zfs_needfree || 1546 (2 * arc_c < arc_size + 1547 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) 1548 arc_adjust(); 1549 1550 if (arc_eviction_list != NULL) 1551 arc_do_user_evicts(); 1552 1553 if (arc_reclaim_needed()) { 1554 zfs_needfree = 0; 1555#ifdef _KERNEL 1556 wakeup(&zfs_needfree); 1557#endif 1558 } 1559 1560 /* block until needed, or one second, whichever is shorter */ 1561 CALLB_CPR_SAFE_BEGIN(&cpr); 1562 (void) cv_timedwait(&arc_reclaim_thr_cv, 1563 &arc_reclaim_thr_lock, hz); 1564 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1565 } 1566 1567 arc_thread_exit = 0; 1568 cv_broadcast(&arc_reclaim_thr_cv); 1569 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1570 thread_exit(); 1571} 1572 1573/* 1574 * Adapt arc info given the number of bytes we are trying to add and 1575 * the state that we are comming from. This function is only called 1576 * when we are adding new content to the cache. 1577 */ 1578static void 1579arc_adapt(int bytes, arc_state_t *state) 1580{ 1581 int mult; 1582 1583 ASSERT(bytes > 0); 1584 /* 1585 * Adapt the target size of the MRU list: 1586 * - if we just hit in the MRU ghost list, then increase 1587 * the target size of the MRU list. 1588 * - if we just hit in the MFU ghost list, then increase 1589 * the target size of the MFU list by decreasing the 1590 * target size of the MRU list. 1591 */ 1592 if (state == arc_mru_ghost) { 1593 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1594 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1595 1596 arc_p = MIN(arc_c, arc_p + bytes * mult); 1597 } else if (state == arc_mfu_ghost) { 1598 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1599 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1600 1601 arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 1602 } 1603 ASSERT((int64_t)arc_p >= 0); 1604 1605 if (arc_reclaim_needed()) { 1606 cv_signal(&arc_reclaim_thr_cv); 1607 return; 1608 } 1609 1610 if (arc_no_grow) 1611 return; 1612 1613 if (arc_c >= arc_c_max) 1614 return; 1615 1616 /* 1617 * If we're within (2 * maxblocksize) bytes of the target 1618 * cache size, increment the target cache size 1619 */ 1620 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1621 atomic_add_64(&arc_c, (int64_t)bytes); 1622 if (arc_c > arc_c_max) 1623 arc_c = arc_c_max; 1624 else if (state == arc_anon) 1625 atomic_add_64(&arc_p, (int64_t)bytes); 1626 if (arc_p > arc_c) 1627 arc_p = arc_c; 1628 } 1629 ASSERT((int64_t)arc_p >= 0); 1630} 1631 1632/* 1633 * Check if the cache has reached its limits and eviction is required 1634 * prior to insert. 1635 */ 1636static int 1637arc_evict_needed() 1638{ 1639 if (arc_reclaim_needed()) 1640 return (1); 1641 1642 return (arc_size > arc_c); 1643} 1644 1645/* 1646 * The buffer, supplied as the first argument, needs a data block. 1647 * So, if we are at cache max, determine which cache should be victimized. 1648 * We have the following cases: 1649 * 1650 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 1651 * In this situation if we're out of space, but the resident size of the MFU is 1652 * under the limit, victimize the MFU cache to satisfy this insertion request. 1653 * 1654 * 2. 
 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
 * Here, we've used up all of the available space for the MRU, so we need to
 * evict from our own cache instead.  Evict from the set of resident MRU
 * entries.
 *
 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
 * c minus p represents the MFU space in the cache, since p is the size of the
 * cache that is dedicated to the MRU.  In this situation there's still space
 * on the MFU side, so the MRU side needs to be victimized.
 *
 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
 * MFU's resident set is consuming more space than it has been allotted.  In
 * this situation, we must victimize our own cache, the MFU, for this insertion.
 */
static void
arc_get_data_buf(arc_buf_t *buf)
{
	arc_state_t *state = buf->b_hdr->b_state;
	uint64_t size = buf->b_hdr->b_size;
	arc_buf_contents_t type = buf->b_hdr->b_type;

	arc_adapt(size, state);

	/*
	 * We have not yet reached cache maximum size,
	 * just allocate a new buffer.
	 */
	if (!arc_evict_needed()) {
		if (type == ARC_BUFC_METADATA) {
			buf->b_data = zio_buf_alloc(size);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			buf->b_data = zio_data_buf_alloc(size);
		}
		atomic_add_64(&arc_size, size);
		goto out;
	}

	/*
	 * If we are prefetching from the mfu ghost list, this buffer
	 * will end up on the mru list; so steal space from there.
	 */
	if (state == arc_mfu_ghost)
		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
	else if (state == arc_mru_ghost)
		state = arc_mru;

	if (state == arc_mru || state == arc_anon) {
		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
		state = (arc_p > mru_used) ? arc_mfu : arc_mru;
	} else {
		/* MFU cases */
		uint64_t mfu_space = arc_c - arc_p;
		state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
	}
	if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
		if (type == ARC_BUFC_METADATA) {
			buf->b_data = zio_buf_alloc(size);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			buf->b_data = zio_data_buf_alloc(size);
		}
		atomic_add_64(&arc_size, size);
		ARCSTAT_BUMP(arcstat_recycle_miss);
	}
	ASSERT(buf->b_data != NULL);
out:
	/*
	 * Update the state size.  Note that ghost states have a
	 * "ghost size" and so don't need to be updated.
	 */
	if (!GHOST_STATE(buf->b_hdr->b_state)) {
		arc_buf_hdr_t *hdr = buf->b_hdr;

		atomic_add_64(&hdr->b_state->arcs_size, size);
		if (list_link_active(&hdr->b_arc_node)) {
			ASSERT(refcount_is_zero(&hdr->b_refcnt));
			atomic_add_64(&hdr->b_state->arcs_lsize, size);
		}
		/*
		 * If we are growing the cache, and we are adding anonymous
		 * data, and we have outgrown arc_p, update arc_p
		 */
		if (arc_size < arc_c && hdr->b_state == arc_anon &&
		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
			arc_p = MIN(arc_c, arc_p + size);
	}
}

/*
 * This routine is called whenever a buffer is accessed.
 * NOTE: the hash lock is dropped in this function.
 */
static void
arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if (buf->b_state == arc_anon) {
		/*
		 * This buffer is not in the cache, and does not
		 * appear in our "ghost" list.  Add the new buffer
		 * to the MRU state.
		 */

		ASSERT(buf->b_arc_access == 0);
		buf->b_arc_access = lbolt;
		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
		arc_change_state(arc_mru, buf, hash_lock);

	} else if (buf->b_state == arc_mru) {
		/*
		 * If this buffer is here because of a prefetch, then either:
		 * - clear the flag if this is a "referencing" read
		 *   (any subsequent access will bump this into the MFU state).
		 * or
		 * - move the buffer to the head of the list if this is
		 *   another prefetch (to make it less likely to be evicted).
		 */
		if ((buf->b_flags & ARC_PREFETCH) != 0) {
			if (refcount_count(&buf->b_refcnt) == 0) {
				ASSERT(list_link_active(&buf->b_arc_node));
				mutex_enter(&arc_mru->arcs_mtx);
				list_remove(&arc_mru->arcs_list, buf);
				list_insert_head(&arc_mru->arcs_list, buf);
				mutex_exit(&arc_mru->arcs_mtx);
			} else {
				buf->b_flags &= ~ARC_PREFETCH;
				ARCSTAT_BUMP(arcstat_mru_hits);
			}
			buf->b_arc_access = lbolt;
			return;
		}

		/*
		 * This buffer has been "accessed" only once so far,
		 * but it is still in the cache.  Move it to the MFU
		 * state.
		 */
		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
			/*
			 * More than 125ms have passed since we
			 * instantiated this buffer.  Move it to the
			 * most frequently used state.
			 */
			buf->b_arc_access = lbolt;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
			arc_change_state(arc_mfu, buf, hash_lock);
		}
		ARCSTAT_BUMP(arcstat_mru_hits);
	} else if (buf->b_state == arc_mru_ghost) {
		arc_state_t *new_state;
		/*
		 * This buffer has been "accessed" recently, but
		 * was evicted from the cache.  Move it to the
		 * MFU state.
		 */

		if (buf->b_flags & ARC_PREFETCH) {
			new_state = arc_mru;
			if (refcount_count(&buf->b_refcnt) > 0)
				buf->b_flags &= ~ARC_PREFETCH;
			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
		} else {
			new_state = arc_mfu;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
		}

		buf->b_arc_access = lbolt;
		arc_change_state(new_state, buf, hash_lock);

		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
	} else if (buf->b_state == arc_mfu) {
		/*
		 * This buffer has been accessed more than once and is
		 * still in the cache.  Keep it in the MFU state.
		 *
		 * NOTE: an add_reference() that occurred when we did
		 * the arc_read() will have kicked this off the list.
		 * If it was a prefetch, we will explicitly move it to
		 * the head of the list now.
		 */
		if ((buf->b_flags & ARC_PREFETCH) != 0) {
			ASSERT(refcount_count(&buf->b_refcnt) == 0);
			ASSERT(list_link_active(&buf->b_arc_node));
			mutex_enter(&arc_mfu->arcs_mtx);
			list_remove(&arc_mfu->arcs_list, buf);
			list_insert_head(&arc_mfu->arcs_list, buf);
			mutex_exit(&arc_mfu->arcs_mtx);
		}
		ARCSTAT_BUMP(arcstat_mfu_hits);
		buf->b_arc_access = lbolt;
	} else if (buf->b_state == arc_mfu_ghost) {
		arc_state_t *new_state = arc_mfu;
		/*
		 * This buffer has been accessed more than once but has
		 * been evicted from the cache.  Move it back to the
		 * MFU state.
		 */

		if (buf->b_flags & ARC_PREFETCH) {
			/*
			 * This is a prefetch access...
			 * move this block back to the MRU state.
			 */
			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
			new_state = arc_mru;
		}

		buf->b_arc_access = lbolt;
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
		arc_change_state(new_state, buf, hash_lock);

		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
	} else {
		ASSERT(!"invalid arc state");
	}
}

/* a generic arc_done_func_t which you can use */
/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}

/* a generic arc_done_func_t which you can use */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	arc_buf_t **bufp = arg;
	if (zio && zio->io_error) {
		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
		*bufp = NULL;
	} else {
		*bufp = buf;
	}
}

static void
arc_read_done(zio_t *zio)
{
	arc_buf_hdr_t *hdr, *found;
	arc_buf_t *buf;
	arc_buf_t *abuf;	/* buffer we're assigning to callback */
	kmutex_t *hash_lock;
	arc_callback_t *callback_list, *acb;
	int freeable = FALSE;

	buf = zio->io_private;
	hdr = buf->b_hdr;

	/*
	 * The hdr was inserted into hash-table and removed from lists
	 * prior to starting I/O.  We should find this header, since
	 * it's in the hash table, and it should be legit since it's
	 * not possible to evict it during the I/O.  The only possible
	 * reason for it not to be found is if we were freed during the
	 * read.
	 */
	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
	    &hash_lock);

	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));

	/* byteswap if necessary */
	callback_list = hdr->b_acb;
	ASSERT(callback_list != NULL);
	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
		callback_list->acb_byteswap(buf->b_data, hdr->b_size);

	arc_cksum_compute(buf);

	/* create copies of the data buffer for the callers */
	abuf = buf;
	for (acb = callback_list; acb; acb = acb->acb_next) {
		if (acb->acb_done) {
			if (abuf == NULL)
				abuf = arc_buf_clone(buf);
			acb->acb_buf = abuf;
			abuf = NULL;
		}
	}
	hdr->b_acb = NULL;
	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	ASSERT(!HDR_BUF_AVAILABLE(hdr));
	if (abuf == buf)
		hdr->b_flags |= ARC_BUF_AVAILABLE;

	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);

	if (zio->io_error != 0) {
		hdr->b_flags |= ARC_IO_ERROR;
		if (hdr->b_state != arc_anon)
			arc_change_state(arc_anon, hdr, hash_lock);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
		freeable = refcount_is_zero(&hdr->b_refcnt);
		/* convert checksum errors into IO errors */
		if (zio->io_error == ECKSUM)
			zio->io_error = EIO;
	}

	/*
	 * Broadcast before we drop the hash_lock to avoid the possibility
	 * that the hdr (and hence the cv) might be freed before we get to
	 * the cv_broadcast().
	 */
	cv_broadcast(&hdr->b_cv);

	if (hash_lock) {
		/*
		 * Only call arc_access on anonymous buffers.  This is because
		 * if we've issued an I/O for an evicted buffer, we've already
		 * called arc_access (to prevent any simultaneous readers from
		 * getting confused).
		 */
		if (zio->io_error == 0 && hdr->b_state == arc_anon)
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_state, ==, arc_anon);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done)
			acb->acb_done(zio, acb->acb_buf, acb->acb_private);

		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_destroy(hdr);
}

/*
 * "Read" the block at the specified DVA (in bp) via the
 * cache.  If the block is found in the cache, invoke the provided
 * callback immediately and return.  Note that the `zio' parameter
 * in the callback will be NULL in this case, since no IO was
 * required.  If the block is not in the cache pass the read request
 * on to the spa with a substitute callback function, so that the
 * requested block will be added to the cache.
 *
 * If a read request arrives for a block that has a read in-progress,
 * either wait for the in-progress read to complete (and return the
 * results); or, if this is a read with a "done" func, add a record
 * to the read to invoke the "done" func when the read completes,
 * and return; or just return.
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
 */
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t *arc_flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	zio_t *rzio;

top:
	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (hdr && hdr->b_datacnt > 0) {

		*arc_flags |= ARC_CACHED;

		if (HDR_IO_IN_PROGRESS(hdr)) {

			if (*arc_flags & ARC_WAIT) {
				cv_wait(&hdr->b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			ASSERT(*arc_flags & ARC_NOWAIT);

			if (done) {
				arc_callback_t *acb = NULL;

				acb = kmem_zalloc(sizeof (arc_callback_t),
				    KM_SLEEP);
				acb->acb_done = done;
				acb->acb_private = private;
				acb->acb_byteswap = swap;
				if (pio != NULL)
					acb->acb_zio_dummy = zio_null(pio,
					    spa, NULL, NULL, flags);

				ASSERT(acb->acb_done != NULL);
				acb->acb_next = hdr->b_acb;
				hdr->b_acb = acb;
				add_reference(hdr, hash_lock, private);
				mutex_exit(hash_lock);
				return (0);
			}
			mutex_exit(hash_lock);
			return (0);
		}

		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);

		if (done) {
			add_reference(hdr, hash_lock, private);
			/*
			 * If this block is already in use, create a new
			 * copy of the data so that we will be guaranteed
			 * that arc_release() will always succeed.
			 */
			buf = hdr->b_buf;
			ASSERT(buf);
			ASSERT(buf->b_data);
			if (HDR_BUF_AVAILABLE(hdr)) {
				ASSERT(buf->b_efunc == NULL);
				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
			} else {
				buf = arc_buf_clone(buf);
			}
		} else if (*arc_flags & ARC_PREFETCH &&
		    refcount_count(&hdr->b_refcnt) == 0) {
			hdr->b_flags |= ARC_PREFETCH;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
		ARCSTAT_BUMP(arcstat_hits);
		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
		    data, metadata, hits);

		if (done)
			done(NULL, buf, private);
	} else {
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t *acb;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t *exists;
			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
			buf = arc_buf_alloc(spa, size, private, type);
			hdr = buf->b_hdr;
			hdr->b_dva = *BP_IDENTITY(bp);
			hdr->b_birth = bp->blk_birth;
			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				bzero(&hdr->b_dva, sizeof (dva_t));
				hdr->b_birth = 0;
				hdr->b_cksum0 = 0;
				(void) arc_buf_remove_ref(buf, private);
				goto top; /* restart the IO request */
			}
			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH) {
				(void) remove_reference(hdr, hash_lock,
				    private);
				hdr->b_flags |= ARC_PREFETCH;
			}
			if (BP_GET_LEVEL(bp) > 0)
				hdr->b_flags |= ARC_INDIRECT;
		} else {
			/* this block is in the ghost cache */
			ASSERT(GHOST_STATE(hdr->b_state));
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
			ASSERT(hdr->b_buf == NULL);

			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH)
				hdr->b_flags |= ARC_PREFETCH;
			else
				add_reference(hdr, hash_lock, private);
			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_hdr = hdr;
			buf->b_data = NULL;
			buf->b_efunc = NULL;
			buf->b_private = NULL;
			buf->b_next = NULL;
			hdr->b_buf = buf;
			arc_get_data_buf(buf);
			ASSERT(hdr->b_datacnt == 0);
			hdr->b_datacnt = 1;

		}

		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;
		acb->acb_byteswap = swap;

		ASSERT(hdr->b_acb == NULL);
		hdr->b_acb = acb;
		hdr->b_flags |= ARC_IO_IN_PROGRESS;

		/*
		 * If the buffer has been evicted, migrate it to a present state
		 * before issuing the I/O.  Once we drop the hash-table lock,
		 * the header will be marked as I/O in progress and have an
		 * attached buffer.  At this point, anybody who finds this
		 * buffer ought to notice that it's legit but has a pending I/O.
		 */

		if (GHOST_STATE(hdr->b_state))
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);

		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
		    zbookmark_t *, zb);
		ARCSTAT_BUMP(arcstat_misses);
		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
		    data, metadata, misses);

		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, flags, zb);

		if (*arc_flags & ARC_WAIT)
			return (zio_wait(rzio));

		ASSERT(*arc_flags & ARC_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}

/*
 * arc_read() variant to support pool traversal.  If the block is already
 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
 * The idea is that we don't want pool traversal filling up memory, but
 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
 */
int
arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_mtx;
	int rc = 0;

	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);

	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
		arc_buf_t *buf = hdr->b_buf;

		ASSERT(buf);
		while (buf->b_data == NULL) {
			buf = buf->b_next;
			ASSERT(buf);
		}
		bcopy(buf->b_data, data, hdr->b_size);
	} else {
		rc = ENOENT;
	}

	if (hash_mtx)
		mutex_exit(hash_mtx);

	return (rc);
}

void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
	ASSERT(buf->b_hdr != NULL);
	ASSERT(buf->b_hdr->b_state != arc_anon);
	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
	buf->b_efunc = func;
	buf->b_private = private;
}

/*
 * This is used by the DMU to let the ARC know that a buffer is
 * being evicted, so the ARC should clean up.  If this arc buf
 * is not yet in the evicted state, it will be put there.
 */
int
arc_buf_evict(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	arc_buf_t **bufp;

	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		/*
		 * We are in arc_do_user_evicts().
		 */
		ASSERT(buf->b_data == NULL);
		mutex_exit(&arc_eviction_mtx);
		return (0);
	}
	hash_lock = HDR_LOCK(hdr);
	mutex_exit(&arc_eviction_mtx);

	mutex_enter(hash_lock);

	if (buf->b_data == NULL) {
		/*
		 * We are on the eviction list.
		 */
		mutex_exit(hash_lock);
		mutex_enter(&arc_eviction_mtx);
		if (buf->b_hdr == NULL) {
			/*
			 * We are already in arc_do_user_evicts().
			 */
			mutex_exit(&arc_eviction_mtx);
			return (0);
		} else {
			arc_buf_t copy = *buf; /* structure assignment */
			/*
			 * Process this buffer now
			 * but let arc_do_user_evicts() do the reaping.
			 */
			buf->b_efunc = NULL;
			mutex_exit(&arc_eviction_mtx);
			VERIFY(copy.b_efunc(&copy) == 0);
			return (1);
		}
	}

	ASSERT(buf->b_hdr == hdr);
	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);

	/*
	 * Pull this buffer off of the hdr
	 */
	bufp = &hdr->b_buf;
	while (*bufp != buf)
		bufp = &(*bufp)->b_next;
	*bufp = buf->b_next;

	ASSERT(buf->b_data != NULL);
	arc_buf_destroy(buf, FALSE, FALSE);

	if (hdr->b_datacnt == 0) {
		arc_state_t *old_state = hdr->b_state;
		arc_state_t *evicted_state;

		ASSERT(refcount_is_zero(&hdr->b_refcnt));

		evicted_state =
		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

		mutex_enter(&old_state->arcs_mtx);
		mutex_enter(&evicted_state->arcs_mtx);

		arc_change_state(evicted_state, hdr, hash_lock);
		ASSERT(HDR_IN_HASH_TABLE(hdr));
		hdr->b_flags = ARC_IN_HASH_TABLE;

		mutex_exit(&evicted_state->arcs_mtx);
		mutex_exit(&old_state->arcs_mtx);
	}
	mutex_exit(hash_lock);

	VERIFY(buf->b_efunc(buf) == 0);
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
	return (1);
}

/*
 * Release this buffer from the cache.  This must be done
 * after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must make
 * a new hdr for the buffer.
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc_anon) {
		/* this buffer is already released */
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
		ASSERT(BUF_EMPTY(hdr));
		ASSERT(buf->b_efunc == NULL);
		arc_buf_thaw(buf);
		return;
	}

	mutex_enter(hash_lock);

	/*
	 * Do we have more than one buf?
	 */
	if (hdr->b_buf != buf || buf->b_next != NULL) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		uint64_t blksz = hdr->b_size;
		spa_t *spa = hdr->b_spa;
		arc_buf_contents_t type = hdr->b_type;

		ASSERT(hdr->b_datacnt > 1);
		/*
		 * Pull the data off of this buf and attach it to
		 * a new anonymous buf.
		 */
		(void) remove_reference(hdr, hash_lock, tag);
		bufp = &hdr->b_buf;
		while (*bufp != buf)
			bufp = &(*bufp)->b_next;
		*bufp = (*bufp)->b_next;
		buf->b_next = NULL;

		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
		if (refcount_is_zero(&hdr->b_refcnt)) {
			ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
			atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
		}
		hdr->b_datacnt -= 1;
		arc_cksum_verify(buf);

		mutex_exit(hash_lock);

		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_type = type;
		nhdr->b_buf = buf;
		nhdr->b_state = arc_anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = 0;
		nhdr->b_datacnt = 1;
		nhdr->b_freeze_cksum = NULL;
		mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
		(void) refcount_add(&nhdr->b_refcnt, tag);
		buf->b_hdr = nhdr;
		atomic_add_64(&arc_anon->arcs_size, blksz);

		hdr = nhdr;
	} else {
		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		arc_change_state(arc_anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		mutex_exit(hash_lock);
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
		arc_buf_thaw(buf);
	}
	buf->b_efunc = NULL;
	buf->b_private = NULL;
}

int
arc_released(arc_buf_t *buf)
{
	return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
}

int
arc_has_callback(arc_buf_t *buf)
{
	return (buf->b_efunc != NULL);
}

#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
{
	return (refcount_count(&buf->b_hdr->b_refcnt));
}
#endif

static void
arc_write_ready(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;

	if (callback->awcb_ready) {
		ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
		callback->awcb_ready(zio, buf, callback->awcb_private);
	}
	arc_cksum_compute(buf);
}

static void
arc_write_done(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	hdr->b_acb = NULL;

	/* this buffer is on no lists and is not in the hash table */
	ASSERT3P(hdr->b_state, ==, arc_anon);

	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
	hdr->b_birth = zio->io_bp->blk_birth;
	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	/*
	 * If the block to be written was all-zero, we may have
	 * compressed it away.  In this case no write was performed
	 * so there will be no dva/birth-date/checksum.  The buffer
	 * must therefore remain anonymous (and uncached).
	 */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		arc_cksum_verify(buf);

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
			    BP_IDENTITY(zio->io_bp)));
			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
			    zio->io_bp->blk_birth);

			ASSERT(refcount_is_zero(&exists->b_refcnt));
			arc_change_state(arc_anon, exists, hash_lock);
			mutex_exit(hash_lock);
			arc_hdr_destroy(exists);
			exists = buf_hash_insert(hdr, &hash_lock);
			ASSERT3P(exists, ==, NULL);
		}
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else if (callback->awcb_done == NULL) {
		int destroy_hdr;
		/*
		 * This is an anonymous buffer with no user callback,
		 * destroy it if there are no active references.
		 */
		mutex_enter(&arc_eviction_mtx);
		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	}

	if (callback->awcb_done) {
		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
		callback->awcb_done(zio, buf, callback->awcb_private);
	}

	kmem_free(callback, sizeof (arc_write_callback_t));
}

zio_t *
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_write_callback_t *callback;
	zio_t *zio;

	/* this is a private buffer - no locking required */
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(BUF_EMPTY(hdr));
	ASSERT(!HDR_IO_ERROR(hdr));
	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
	ASSERT(hdr->b_acb == 0);
	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
	callback->awcb_ready = ready;
	callback->awcb_done = done;
	callback->awcb_private = private;
	callback->awcb_buf = buf;
	hdr->b_flags |= ARC_IO_IN_PROGRESS;
	zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
	    priority, flags, zb);

	return (zio);
}

int
arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, uint32_t arc_flags)
{
	arc_buf_hdr_t *ab;
	kmutex_t *hash_lock;
	zio_t *zio;

	/*
	 * If this buffer is in the cache, release it, so it
	 * can be re-used.
	 */
	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (ab != NULL) {
		/*
		 * The checksum of blocks to free is not always
		 * preserved (eg. on the deadlist).  However, if it is
		 * nonzero, it should match what we have in the cache.
		 */
		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
		if (ab->b_state != arc_anon)
			arc_change_state(arc_anon, ab, hash_lock);
		if (HDR_IO_IN_PROGRESS(ab)) {
			/*
			 * This should only happen when we prefetch.
			 */
			ASSERT(ab->b_flags & ARC_PREFETCH);
			ASSERT3U(ab->b_datacnt, ==, 1);
			ab->b_flags |= ARC_FREED_IN_READ;
			if (HDR_IN_HASH_TABLE(ab))
				buf_hash_remove(ab);
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			ab->b_buf->b_efunc = NULL;
			ab->b_buf->b_private = NULL;
			mutex_exit(hash_lock);
		} else if (refcount_is_zero(&ab->b_refcnt)) {
			mutex_exit(hash_lock);
			arc_hdr_destroy(ab);
			ARCSTAT_BUMP(arcstat_deleted);
		} else {
			/*
			 * We still have an active reference on this
			 * buffer.  This can happen, e.g., from
			 * dbuf_unoverride().
			 */
			ASSERT(!HDR_IN_HASH_TABLE(ab));
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			ab->b_buf->b_efunc = NULL;
			ab->b_buf->b_private = NULL;
			mutex_exit(hash_lock);
		}
	}

	zio = zio_free(pio, spa, txg, bp, done, private);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(zio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(zio);

	return (0);
}

void
arc_tempreserve_clear(uint64_t tempreserve)
{
	atomic_add_64(&arc_tempreserve, -tempreserve);
	ASSERT((int64_t)arc_tempreserve >= 0);
}

int
arc_tempreserve_space(uint64_t tempreserve)
{
#ifdef ZFS_DEBUG
	/*
	 * Once in a while, fail for no reason.  Everything should cope.
	 */
	if (spa_get_random(10000) == 0) {
		dprintf("forcing random failure\n");
		return (ERESTART);
	}
#endif
	if (tempreserve > arc_c/4 && !arc_no_grow)
		arc_c = MIN(arc_c_max, tempreserve * 4);
	if (tempreserve > arc_c)
		return (ENOMEM);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large.  We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail.  Not a huge deal.
	 *
	 * XXX The limit should be adjusted dynamically to keep the time
	 * to sync a dataset fixed (around 1-5 seconds?).
	 */

	if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
	    arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
		    "tempreserve=%lluK arc_c=%lluK\n",
		    arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
		    tempreserve>>10, arc_c>>10);
		return (ERESTART);
	}
	atomic_add_64(&arc_tempreserve, tempreserve);
	return (0);
}

static kmutex_t arc_lowmem_lock;
#ifdef _KERNEL
static eventhandler_tag arc_event_lowmem = NULL;

static void
arc_lowmem(void *arg __unused, int howto __unused)
{

	/* Serialize access via arc_lowmem_lock. */
	mutex_enter(&arc_lowmem_lock);
	zfs_needfree = 1;
	cv_signal(&arc_reclaim_thr_cv);
	while (zfs_needfree)
		tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
	mutex_exit(&arc_lowmem_lock);
}
#endif

void
arc_init(void)
{
	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	arc_min_prefetch_lifespan = 1 * hz;

	/* Start out with 1/8 of all memory */
	arc_c = kmem_size() / 8;
#if 0
#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
	 */
	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif
#endif
	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
	arc_c_min = MAX(arc_c / 4, 64<<18);
	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
	if (arc_c * 8 >= 1<<30)
		arc_c_max = (arc_c * 8) - (1<<30);
	else
		arc_c_max = arc_c_min;
	arc_c_max = MAX(arc_c * 4, arc_c_max);
#ifdef _KERNEL
	/*
	 * Allow the tunables to override our calculations if they are
	 * reasonable (ie. over 16MB)
	 */
	if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
		arc_c_max = zfs_arc_max;
	if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
		arc_c_min = zfs_arc_min;
#endif
	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	/* if kmem_flags are set, lets try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;
	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;
	zfs_arc_max = arc_c_max;

	arc_anon = &ARC_anon;
	arc_mru = &ARC_mru;
	arc_mru_ghost = &ARC_mru_ghost;
	arc_mfu = &ARC_mfu;
	arc_mfu_ghost = &ARC_mfu_ghost;
	arc_size = 0;

	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);

	list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));

	buf_init();

	arc_thread_exit = 0;
	arc_eviction_list = NULL;
	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));

	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

	if (arc_ksp != NULL) {
		arc_ksp->ks_data = &arc_stats;
		kstat_install(arc_ksp);
	}

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

#ifdef _KERNEL
	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
	    EVENTHANDLER_PRI_FIRST);
#endif

	arc_dead = FALSE;

#ifdef _KERNEL
	/* Warn about ZFS memory requirements. */
	if ((physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
		printf("ZFS WARNING: Recommended minimum of RAM size is 512MB, "
		    "expect unstable behaviour.\n");
	} else if (kmem_size() < 256 * (1 << 20)) {
		printf("ZFS WARNING: Recommended minimum of kmem_map size is "
		    "256MB, expect unstable behaviour.\n");
		printf("             Consider tuning vm.kmem_size and "
		    "vm.kmem_size_max in /boot/loader.conf.\n");
	}
#endif
}
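/*
 * Worked example of the sizing logic in arc_init() above (a sketch, not a
 * guarantee of any particular configuration): on a machine where
 * kmem_size() returns 1GB and no tunables are set, arc_c starts at
 * 1GB / 8 = 128MB and arc_c_min = MAX(128MB / 4, 16MB) = 32MB.  Since
 * arc_c * 8 = 1GB, arc_c_max is first (1GB - 1GB) = 0 and is then raised
 * to MAX(arc_c * 4, 0) = 512MB.  arc_c is then set to arc_c_max (512MB)
 * and arc_p to half of that (256MB).
 */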
void
arc_fini(void)
{
	mutex_enter(&arc_reclaim_thr_lock);
	arc_thread_exit = 1;
	cv_signal(&arc_reclaim_thr_cv);
	while (arc_thread_exit != 0)
		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
	mutex_exit(&arc_reclaim_thr_lock);

	arc_flush();

	arc_dead = TRUE;

	if (arc_ksp != NULL) {
		kstat_delete(arc_ksp);
		arc_ksp = NULL;
	}

	mutex_destroy(&arc_eviction_mtx);
	mutex_destroy(&arc_reclaim_thr_lock);
	cv_destroy(&arc_reclaim_thr_cv);

	list_destroy(&arc_mru->arcs_list);
	list_destroy(&arc_mru_ghost->arcs_list);
	list_destroy(&arc_mfu->arcs_list);
	list_destroy(&arc_mfu_ghost->arcs_list);

	mutex_destroy(&arc_anon->arcs_mtx);
	mutex_destroy(&arc_mru->arcs_mtx);
	mutex_destroy(&arc_mru_ghost->arcs_mtx);
	mutex_destroy(&arc_mfu->arcs_mtx);
	mutex_destroy(&arc_mfu_ghost->arcs_mtx);

	buf_fini();

	mutex_destroy(&arc_lowmem_lock);
#ifdef _KERNEL
	if (arc_event_lowmem != NULL)
		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif
}
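/*
 * Illustrative sketch, compiled out: a synchronous, cached read through
 * arc_read() with ARC_WAIT, using the generic arc_getbuf_func() callback
 * defined earlier in this file.  example_read_sync() is hypothetical (real
 * callers live in the DMU); on a cache hit the callback runs immediately
 * with a NULL zio, and the caller drops its hold with arc_buf_remove_ref()
 * once it is done with the data.
 */
#if 0
static int
example_read_sync(spa_t *spa, blkptr_t *bp, zbookmark_t *zb, arc_buf_t **bufp)
{
	uint32_t aflags = ARC_WAIT;
	int error;

	*bufp = NULL;
	error = arc_read(NULL, spa, bp, byteswap_uint64_array,
	    arc_getbuf_func, bufp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL,
	    &aflags, zb);
	if (error != 0)
		return (error);
	if (*bufp == NULL)
		return (EIO);

	/* ... consume (*bufp)->b_data, then drop the hold ... */
	(void) arc_buf_remove_ref(*bufp, bufp);
	return (0);
}
#endif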