arc.c revision 168473
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it is simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.
 * The arc_read() interface uses method 1, while the internal
 * arc algorithms for adjusting the cache use method 2.  We
 * therefore provide two types of locks: 1) the hash table lock
 * array, and 2) the arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/sdt.h>

#define	ARC_FREE_AT_ONCE	4194304

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,	/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS	/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * These tunables are for performance analysis.
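 *
 * A minimal, illustrative example of how they might be set on FreeBSD
 * (the values are placeholders, not tuning advice); they are loader
 * tunables and appear as read-only sysctls at runtime:
 *
 *	# /boot/loader.conf
 *	vfs.zfs.arc_max="536870912"	# 512MB
 *	vfs.zfs.arc_min="67108864"	# 64MB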
 */
u_long zfs_arc_max;
u_long zfs_arc_min;
TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max);
TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RD, &zfs_arc_max, 0,
    "Maximum ARC size");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RD, &zfs_arc_min, 0,
    "Minimum ARC size");

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to a buffer, it is linked
 * onto one of the lists in arc.  These are the only buffers that
 * can be evicted or deleted.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 */

typedef struct arc_state {
	list_t	arcs_list;	/* linked list of evictable buffers in state */
	uint64_t arcs_lsize;	/* total size of buffers in the linked list */
	uint64_t arcs_size;	/* total size of all buffers in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_recycle_miss;
	kstat_named_t arcstat_mutex_miss;
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
243 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 244 { "deleted", KSTAT_DATA_UINT64 }, 245 { "recycle_miss", KSTAT_DATA_UINT64 }, 246 { "mutex_miss", KSTAT_DATA_UINT64 }, 247 { "evict_skip", KSTAT_DATA_UINT64 }, 248 { "hash_elements", KSTAT_DATA_UINT64 }, 249 { "hash_elements_max", KSTAT_DATA_UINT64 }, 250 { "hash_collisions", KSTAT_DATA_UINT64 }, 251 { "hash_chains", KSTAT_DATA_UINT64 }, 252 { "hash_chain_max", KSTAT_DATA_UINT64 }, 253 { "p", KSTAT_DATA_UINT64 }, 254 { "c", KSTAT_DATA_UINT64 }, 255 { "c_min", KSTAT_DATA_UINT64 }, 256 { "c_max", KSTAT_DATA_UINT64 }, 257 { "size", KSTAT_DATA_UINT64 } 258}; 259 260#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 261 262#define ARCSTAT_INCR(stat, val) \ 263 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 264 265#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 266#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 267 268#define ARCSTAT_MAX(stat, val) { \ 269 uint64_t m; \ 270 while ((val) > (m = arc_stats.stat.value.ui64) && \ 271 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 272 continue; \ 273} 274 275#define ARCSTAT_MAXSTAT(stat) \ 276 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 277 278/* 279 * We define a macro to allow ARC hits/misses to be easily broken down by 280 * two separate conditions, giving a total of four different subtypes for 281 * each of hits and misses (so eight statistics total). 282 */ 283#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 284 if (cond1) { \ 285 if (cond2) { \ 286 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 287 } else { \ 288 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 289 } \ 290 } else { \ 291 if (cond2) { \ 292 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 293 } else { \ 294 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 295 } \ 296 } 297 298kstat_t *arc_ksp; 299static arc_state_t *arc_anon; 300static arc_state_t *arc_mru; 301static arc_state_t *arc_mru_ghost; 302static arc_state_t *arc_mfu; 303static arc_state_t *arc_mfu_ghost; 304 305/* 306 * There are several ARC variables that are critical to export as kstats -- 307 * but we don't want to have to grovel around in the kstat whenever we wish to 308 * manipulate them. For these variables, we therefore define them to be in 309 * terms of the statistic variable. This assures that we are not introducing 310 * the possibility of inconsistency by having shadow copies of the variables, 311 * while still allowing the code to be readable. 
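 * For example, arc_size below is simply another name for
 * arc_stats.arcstat_size.value.ui64, so updating it through these
 * macros also updates the exported kstat.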
312 */ 313#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 314#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 315#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 316#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 317#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 318 319static int arc_no_grow; /* Don't try to grow cache size */ 320static uint64_t arc_tempreserve; 321 322typedef struct arc_callback arc_callback_t; 323 324struct arc_callback { 325 void *acb_private; 326 arc_done_func_t *acb_done; 327 arc_byteswap_func_t *acb_byteswap; 328 arc_buf_t *acb_buf; 329 zio_t *acb_zio_dummy; 330 arc_callback_t *acb_next; 331}; 332 333typedef struct arc_write_callback arc_write_callback_t; 334 335struct arc_write_callback { 336 void *awcb_private; 337 arc_done_func_t *awcb_ready; 338 arc_done_func_t *awcb_done; 339 arc_buf_t *awcb_buf; 340}; 341 342struct arc_buf_hdr { 343 /* protected by hash lock */ 344 dva_t b_dva; 345 uint64_t b_birth; 346 uint64_t b_cksum0; 347 348 kmutex_t b_freeze_lock; 349 zio_cksum_t *b_freeze_cksum; 350 351 arc_buf_hdr_t *b_hash_next; 352 arc_buf_t *b_buf; 353 uint32_t b_flags; 354 uint32_t b_datacnt; 355 356 arc_callback_t *b_acb; 357 kcondvar_t b_cv; 358 359 /* immutable */ 360 arc_buf_contents_t b_type; 361 uint64_t b_size; 362 spa_t *b_spa; 363 364 /* protected by arc state mutex */ 365 arc_state_t *b_state; 366 list_node_t b_arc_node; 367 368 /* updated atomically */ 369 clock_t b_arc_access; 370 371 /* self protecting */ 372 refcount_t b_refcnt; 373}; 374 375static arc_buf_t *arc_eviction_list; 376static kmutex_t arc_eviction_mtx; 377static arc_buf_hdr_t arc_eviction_hdr; 378static void arc_get_data_buf(arc_buf_t *buf); 379static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 380 381#define GHOST_STATE(state) \ 382 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) 383 384/* 385 * Private ARC flags. These flags are private ARC only flags that will show up 386 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 387 * be passed in as arc_flags in things like arc_read. However, these flags 388 * should never be passed and should only be set by ARC code. When adding new 389 * public flags, make sure not to smash the private ones. 
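 * (The public flags, such as ARC_PREFETCH, ARC_WAIT and ARC_NOWAIT
 * declared in arc.h, occupy the low bit positions, which is why the
 * private flags below start at (1 << 9).)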
390 */ 391 392#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 393#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 394#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 395#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 396#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 397#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 398 399#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 400#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 401#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 402#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 403#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 404 405/* 406 * Hash table routines 407 */ 408 409#define HT_LOCK_PAD 128 410 411struct ht_lock { 412 kmutex_t ht_lock; 413#ifdef _KERNEL 414 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 415#endif 416}; 417 418#define BUF_LOCKS 256 419typedef struct buf_hash_table { 420 uint64_t ht_mask; 421 arc_buf_hdr_t **ht_table; 422 struct ht_lock ht_locks[BUF_LOCKS]; 423} buf_hash_table_t; 424 425static buf_hash_table_t buf_hash_table; 426 427#define BUF_HASH_INDEX(spa, dva, birth) \ 428 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 429#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 430#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 431#define HDR_LOCK(buf) \ 432 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 433 434uint64_t zfs_crc64_table[256]; 435 436static uint64_t 437buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 438{ 439 uintptr_t spav = (uintptr_t)spa; 440 uint8_t *vdva = (uint8_t *)dva; 441 uint64_t crc = -1ULL; 442 int i; 443 444 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 445 446 for (i = 0; i < sizeof (dva_t); i++) 447 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 448 449 crc ^= (spav>>8) ^ birth; 450 451 return (crc); 452} 453 454#define BUF_EMPTY(buf) \ 455 ((buf)->b_dva.dva_word[0] == 0 && \ 456 (buf)->b_dva.dva_word[1] == 0 && \ 457 (buf)->b_birth == 0) 458 459#define BUF_EQUAL(spa, dva, birth, buf) \ 460 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 461 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 462 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 463 464static arc_buf_hdr_t * 465buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 466{ 467 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 468 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 469 arc_buf_hdr_t *buf; 470 471 mutex_enter(hash_lock); 472 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 473 buf = buf->b_hash_next) { 474 if (BUF_EQUAL(spa, dva, birth, buf)) { 475 *lockp = hash_lock; 476 return (buf); 477 } 478 } 479 mutex_exit(hash_lock); 480 *lockp = NULL; 481 return (NULL); 482} 483 484/* 485 * Insert an entry into the hash table. If there is already an element 486 * equal to elem in the hash table, then the already existing element 487 * will be returned and the new element will not be inserted. 488 * Otherwise returns NULL. 
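 *
 * A sketch of the typical caller pattern (illustrative only, not a new
 * interface; see the real uses later in this file).  Note that the hash
 * lock is returned held in either case and must be dropped by the caller:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists;
 *
 *	exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {
 *		... an equal header won the race; use 'exists' instead ...
 *	}
 *	...
 *	mutex_exit(hash_lock);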
489 */ 490static arc_buf_hdr_t * 491buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 492{ 493 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 494 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 495 arc_buf_hdr_t *fbuf; 496 uint32_t i; 497 498 ASSERT(!HDR_IN_HASH_TABLE(buf)); 499 *lockp = hash_lock; 500 mutex_enter(hash_lock); 501 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 502 fbuf = fbuf->b_hash_next, i++) { 503 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 504 return (fbuf); 505 } 506 507 buf->b_hash_next = buf_hash_table.ht_table[idx]; 508 buf_hash_table.ht_table[idx] = buf; 509 buf->b_flags |= ARC_IN_HASH_TABLE; 510 511 /* collect some hash table performance data */ 512 if (i > 0) { 513 ARCSTAT_BUMP(arcstat_hash_collisions); 514 if (i == 1) 515 ARCSTAT_BUMP(arcstat_hash_chains); 516 517 ARCSTAT_MAX(arcstat_hash_chain_max, i); 518 } 519 520 ARCSTAT_BUMP(arcstat_hash_elements); 521 ARCSTAT_MAXSTAT(arcstat_hash_elements); 522 523 return (NULL); 524} 525 526static void 527buf_hash_remove(arc_buf_hdr_t *buf) 528{ 529 arc_buf_hdr_t *fbuf, **bufp; 530 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 531 532 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 533 ASSERT(HDR_IN_HASH_TABLE(buf)); 534 535 bufp = &buf_hash_table.ht_table[idx]; 536 while ((fbuf = *bufp) != buf) { 537 ASSERT(fbuf != NULL); 538 bufp = &fbuf->b_hash_next; 539 } 540 *bufp = buf->b_hash_next; 541 buf->b_hash_next = NULL; 542 buf->b_flags &= ~ARC_IN_HASH_TABLE; 543 544 /* collect some hash table performance data */ 545 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 546 547 if (buf_hash_table.ht_table[idx] && 548 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 549 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 550} 551 552/* 553 * Global data structures and functions for the buf kmem cache. 554 */ 555static kmem_cache_t *hdr_cache; 556static kmem_cache_t *buf_cache; 557 558static void 559buf_fini(void) 560{ 561 int i; 562 563 kmem_free(buf_hash_table.ht_table, 564 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 565 for (i = 0; i < BUF_LOCKS; i++) 566 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 567 kmem_cache_destroy(hdr_cache); 568 kmem_cache_destroy(buf_cache); 569} 570 571/* 572 * Constructor callback - called when the cache is empty 573 * and a new buf is requested. 574 */ 575/* ARGSUSED */ 576static int 577hdr_cons(void *vbuf, void *unused, int kmflag) 578{ 579 arc_buf_hdr_t *buf = vbuf; 580 581 bzero(buf, sizeof (arc_buf_hdr_t)); 582 refcount_create(&buf->b_refcnt); 583 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 584 return (0); 585} 586 587/* 588 * Destructor callback - called when a cached buf is 589 * no longer required. 590 */ 591/* ARGSUSED */ 592static void 593hdr_dest(void *vbuf, void *unused) 594{ 595 arc_buf_hdr_t *buf = vbuf; 596 597 refcount_destroy(&buf->b_refcnt); 598 cv_destroy(&buf->b_cv); 599} 600 601/* 602 * Reclaim callback -- invoked when memory is low. 603 */ 604/* ARGSUSED */ 605static void 606hdr_recl(void *unused) 607{ 608 dprintf("hdr_recl called\n"); 609 /* 610 * umem calls the reclaim func when we destroy the buf cache, 611 * which is after we do arc_fini(). 612 */ 613 if (!arc_dead) 614 cv_signal(&arc_reclaim_thr_cv); 615} 616 617static void 618buf_init(void) 619{ 620 uint64_t *ct; 621 uint64_t hsize = 1ULL << 12; 622 int i, j; 623 624 /* 625 * The hash table is big enough to fill all of physical memory 626 * with an average 64K block size. The table will take up 627 * totalmem*sizeof(void*)/64K (eg. 
128KB/GB with 8-byte pointers). 628 */ 629 while (hsize * 65536 < physmem * PAGESIZE) 630 hsize <<= 1; 631retry: 632 buf_hash_table.ht_mask = hsize - 1; 633 buf_hash_table.ht_table = 634 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 635 if (buf_hash_table.ht_table == NULL) { 636 ASSERT(hsize > (1ULL << 8)); 637 hsize >>= 1; 638 goto retry; 639 } 640 641 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 642 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 643 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 644 0, NULL, NULL, NULL, NULL, NULL, 0); 645 646 for (i = 0; i < 256; i++) 647 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 648 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 649 650 for (i = 0; i < BUF_LOCKS; i++) { 651 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 652 NULL, MUTEX_DEFAULT, NULL); 653 } 654} 655 656#define ARC_MINTIME (hz>>4) /* 62 ms */ 657 658static void 659arc_cksum_verify(arc_buf_t *buf) 660{ 661 zio_cksum_t zc; 662 663 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 664 return; 665 666 mutex_enter(&buf->b_hdr->b_freeze_lock); 667 if (buf->b_hdr->b_freeze_cksum == NULL || 668 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 669 mutex_exit(&buf->b_hdr->b_freeze_lock); 670 return; 671 } 672 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 673 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 674 panic("buffer modified while frozen!"); 675 mutex_exit(&buf->b_hdr->b_freeze_lock); 676} 677 678static void 679arc_cksum_compute(arc_buf_t *buf) 680{ 681 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 682 return; 683 684 mutex_enter(&buf->b_hdr->b_freeze_lock); 685 if (buf->b_hdr->b_freeze_cksum != NULL) { 686 mutex_exit(&buf->b_hdr->b_freeze_lock); 687 return; 688 } 689 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 690 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 691 buf->b_hdr->b_freeze_cksum); 692 mutex_exit(&buf->b_hdr->b_freeze_lock); 693} 694 695void 696arc_buf_thaw(arc_buf_t *buf) 697{ 698 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 699 return; 700 701 if (buf->b_hdr->b_state != arc_anon) 702 panic("modifying non-anon buffer!"); 703 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 704 panic("modifying buffer while i/o in progress!"); 705 arc_cksum_verify(buf); 706 mutex_enter(&buf->b_hdr->b_freeze_lock); 707 if (buf->b_hdr->b_freeze_cksum != NULL) { 708 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 709 buf->b_hdr->b_freeze_cksum = NULL; 710 } 711 mutex_exit(&buf->b_hdr->b_freeze_lock); 712} 713 714void 715arc_buf_freeze(arc_buf_t *buf) 716{ 717 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 718 return; 719 720 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 721 buf->b_hdr->b_state == arc_anon); 722 arc_cksum_compute(buf); 723} 724 725static void 726add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 727{ 728 ASSERT(MUTEX_HELD(hash_lock)); 729 730 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 731 (ab->b_state != arc_anon)) { 732 uint64_t delta = ab->b_size * ab->b_datacnt; 733 734 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 735 mutex_enter(&ab->b_state->arcs_mtx); 736 ASSERT(list_link_active(&ab->b_arc_node)); 737 list_remove(&ab->b_state->arcs_list, ab); 738 if (GHOST_STATE(ab->b_state)) { 739 ASSERT3U(ab->b_datacnt, ==, 0); 740 ASSERT3P(ab->b_buf, ==, NULL); 741 delta = ab->b_size; 742 } 743 ASSERT(delta > 0); 744 ASSERT3U(ab->b_state->arcs_lsize, >=, delta); 745 atomic_add_64(&ab->b_state->arcs_lsize, -delta); 746 mutex_exit(&ab->b_state->arcs_mtx); 747 /* remove the prefetch flag is we 
get a reference */ 748 if (ab->b_flags & ARC_PREFETCH) 749 ab->b_flags &= ~ARC_PREFETCH; 750 } 751} 752 753static int 754remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 755{ 756 int cnt; 757 arc_state_t *state = ab->b_state; 758 759 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 760 ASSERT(!GHOST_STATE(state)); 761 762 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 763 (state != arc_anon)) { 764 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 765 mutex_enter(&state->arcs_mtx); 766 ASSERT(!list_link_active(&ab->b_arc_node)); 767 list_insert_head(&state->arcs_list, ab); 768 ASSERT(ab->b_datacnt > 0); 769 atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); 770 ASSERT3U(state->arcs_size, >=, state->arcs_lsize); 771 mutex_exit(&state->arcs_mtx); 772 } 773 return (cnt); 774} 775 776/* 777 * Move the supplied buffer to the indicated state. The mutex 778 * for the buffer must be held by the caller. 779 */ 780static void 781arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 782{ 783 arc_state_t *old_state = ab->b_state; 784 int64_t refcnt = refcount_count(&ab->b_refcnt); 785 uint64_t from_delta, to_delta; 786 787 ASSERT(MUTEX_HELD(hash_lock)); 788 ASSERT(new_state != old_state); 789 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 790 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 791 792 from_delta = to_delta = ab->b_datacnt * ab->b_size; 793 794 /* 795 * If this buffer is evictable, transfer it from the 796 * old state list to the new state list. 797 */ 798 if (refcnt == 0) { 799 if (old_state != arc_anon) { 800 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 801 802 if (use_mutex) 803 mutex_enter(&old_state->arcs_mtx); 804 805 ASSERT(list_link_active(&ab->b_arc_node)); 806 list_remove(&old_state->arcs_list, ab); 807 808 /* 809 * If prefetching out of the ghost cache, 810 * we will have a non-null datacnt. 
811 */ 812 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 813 /* ghost elements have a ghost size */ 814 ASSERT(ab->b_buf == NULL); 815 from_delta = ab->b_size; 816 } 817 ASSERT3U(old_state->arcs_lsize, >=, from_delta); 818 atomic_add_64(&old_state->arcs_lsize, -from_delta); 819 820 if (use_mutex) 821 mutex_exit(&old_state->arcs_mtx); 822 } 823 if (new_state != arc_anon) { 824 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 825 826 if (use_mutex) 827 mutex_enter(&new_state->arcs_mtx); 828 829 list_insert_head(&new_state->arcs_list, ab); 830 831 /* ghost elements have a ghost size */ 832 if (GHOST_STATE(new_state)) { 833 ASSERT(ab->b_datacnt == 0); 834 ASSERT(ab->b_buf == NULL); 835 to_delta = ab->b_size; 836 } 837 atomic_add_64(&new_state->arcs_lsize, to_delta); 838 ASSERT3U(new_state->arcs_size + to_delta, >=, 839 new_state->arcs_lsize); 840 841 if (use_mutex) 842 mutex_exit(&new_state->arcs_mtx); 843 } 844 } 845 846 ASSERT(!BUF_EMPTY(ab)); 847 if (new_state == arc_anon && old_state != arc_anon) { 848 buf_hash_remove(ab); 849 } 850 851 /* adjust state sizes */ 852 if (to_delta) 853 atomic_add_64(&new_state->arcs_size, to_delta); 854 if (from_delta) { 855 ASSERT3U(old_state->arcs_size, >=, from_delta); 856 atomic_add_64(&old_state->arcs_size, -from_delta); 857 } 858 ab->b_state = new_state; 859} 860 861arc_buf_t * 862arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 863{ 864 arc_buf_hdr_t *hdr; 865 arc_buf_t *buf; 866 867 ASSERT3U(size, >, 0); 868 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 869 ASSERT(BUF_EMPTY(hdr)); 870 hdr->b_size = size; 871 hdr->b_type = type; 872 hdr->b_spa = spa; 873 hdr->b_state = arc_anon; 874 hdr->b_arc_access = 0; 875 mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 876 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 877 buf->b_hdr = hdr; 878 buf->b_data = NULL; 879 buf->b_efunc = NULL; 880 buf->b_private = NULL; 881 buf->b_next = NULL; 882 hdr->b_buf = buf; 883 arc_get_data_buf(buf); 884 hdr->b_datacnt = 1; 885 hdr->b_flags = 0; 886 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 887 (void) refcount_add(&hdr->b_refcnt, tag); 888 889 return (buf); 890} 891 892static arc_buf_t * 893arc_buf_clone(arc_buf_t *from) 894{ 895 arc_buf_t *buf; 896 arc_buf_hdr_t *hdr = from->b_hdr; 897 uint64_t size = hdr->b_size; 898 899 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 900 buf->b_hdr = hdr; 901 buf->b_data = NULL; 902 buf->b_efunc = NULL; 903 buf->b_private = NULL; 904 buf->b_next = hdr->b_buf; 905 hdr->b_buf = buf; 906 arc_get_data_buf(buf); 907 bcopy(from->b_data, buf->b_data, size); 908 hdr->b_datacnt += 1; 909 return (buf); 910} 911 912void 913arc_buf_add_ref(arc_buf_t *buf, void* tag) 914{ 915 arc_buf_hdr_t *hdr; 916 kmutex_t *hash_lock; 917 918 /* 919 * Check to see if this buffer is currently being evicted via 920 * arc_do_user_evicts(). 921 */ 922 mutex_enter(&arc_eviction_mtx); 923 hdr = buf->b_hdr; 924 if (hdr == NULL) { 925 mutex_exit(&arc_eviction_mtx); 926 return; 927 } 928 hash_lock = HDR_LOCK(hdr); 929 mutex_exit(&arc_eviction_mtx); 930 931 mutex_enter(hash_lock); 932 if (buf->b_data == NULL) { 933 /* 934 * This buffer is evicted. 
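 * Its data was freed (e.g. by arc_evict()) before we could take the
 * hash lock, so there is nothing left to take a new reference on;
 * just drop the lock and return.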
935 */ 936 mutex_exit(hash_lock); 937 return; 938 } 939 940 ASSERT(buf->b_hdr == hdr); 941 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 942 add_reference(hdr, hash_lock, tag); 943 arc_access(hdr, hash_lock); 944 mutex_exit(hash_lock); 945 ARCSTAT_BUMP(arcstat_hits); 946 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 947 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 948 data, metadata, hits); 949} 950 951static void 952arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 953{ 954 arc_buf_t **bufp; 955 956 /* free up data associated with the buf */ 957 if (buf->b_data) { 958 arc_state_t *state = buf->b_hdr->b_state; 959 uint64_t size = buf->b_hdr->b_size; 960 arc_buf_contents_t type = buf->b_hdr->b_type; 961 962 arc_cksum_verify(buf); 963 if (!recycle) { 964 if (type == ARC_BUFC_METADATA) { 965 zio_buf_free(buf->b_data, size); 966 } else { 967 ASSERT(type == ARC_BUFC_DATA); 968 zio_data_buf_free(buf->b_data, size); 969 } 970 atomic_add_64(&arc_size, -size); 971 } 972 if (list_link_active(&buf->b_hdr->b_arc_node)) { 973 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 974 ASSERT(state != arc_anon); 975 ASSERT3U(state->arcs_lsize, >=, size); 976 atomic_add_64(&state->arcs_lsize, -size); 977 } 978 ASSERT3U(state->arcs_size, >=, size); 979 atomic_add_64(&state->arcs_size, -size); 980 buf->b_data = NULL; 981 ASSERT(buf->b_hdr->b_datacnt > 0); 982 buf->b_hdr->b_datacnt -= 1; 983 } 984 985 /* only remove the buf if requested */ 986 if (!all) 987 return; 988 989 /* remove the buf from the hdr list */ 990 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 991 continue; 992 *bufp = buf->b_next; 993 994 ASSERT(buf->b_efunc == NULL); 995 996 /* clean up the buf */ 997 buf->b_hdr = NULL; 998 kmem_cache_free(buf_cache, buf); 999} 1000 1001static void 1002arc_hdr_destroy(arc_buf_hdr_t *hdr) 1003{ 1004 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1005 ASSERT3P(hdr->b_state, ==, arc_anon); 1006 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1007 1008 if (!BUF_EMPTY(hdr)) { 1009 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1010 bzero(&hdr->b_dva, sizeof (dva_t)); 1011 hdr->b_birth = 0; 1012 hdr->b_cksum0 = 0; 1013 } 1014 while (hdr->b_buf) { 1015 arc_buf_t *buf = hdr->b_buf; 1016 1017 if (buf->b_efunc) { 1018 mutex_enter(&arc_eviction_mtx); 1019 ASSERT(buf->b_hdr != NULL); 1020 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1021 hdr->b_buf = buf->b_next; 1022 buf->b_hdr = &arc_eviction_hdr; 1023 buf->b_next = arc_eviction_list; 1024 arc_eviction_list = buf; 1025 mutex_exit(&arc_eviction_mtx); 1026 } else { 1027 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1028 } 1029 } 1030 if (hdr->b_freeze_cksum != NULL) { 1031 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1032 hdr->b_freeze_cksum = NULL; 1033 } 1034 mutex_destroy(&hdr->b_freeze_lock); 1035 1036 ASSERT(!list_link_active(&hdr->b_arc_node)); 1037 ASSERT3P(hdr->b_hash_next, ==, NULL); 1038 ASSERT3P(hdr->b_acb, ==, NULL); 1039 kmem_cache_free(hdr_cache, hdr); 1040} 1041 1042void 1043arc_buf_free(arc_buf_t *buf, void *tag) 1044{ 1045 arc_buf_hdr_t *hdr = buf->b_hdr; 1046 int hashed = hdr->b_state != arc_anon; 1047 1048 ASSERT(buf->b_efunc == NULL); 1049 ASSERT(buf->b_data != NULL); 1050 1051 if (hashed) { 1052 kmutex_t *hash_lock = HDR_LOCK(hdr); 1053 1054 mutex_enter(hash_lock); 1055 (void) remove_reference(hdr, hash_lock, tag); 1056 if (hdr->b_datacnt > 1) 1057 arc_buf_destroy(buf, FALSE, TRUE); 1058 else 1059 hdr->b_flags |= ARC_BUF_AVAILABLE; 1060 mutex_exit(hash_lock); 1061 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1062 int 
destroy_hdr; 1063 /* 1064 * We are in the middle of an async write. Don't destroy 1065 * this buffer unless the write completes before we finish 1066 * decrementing the reference count. 1067 */ 1068 mutex_enter(&arc_eviction_mtx); 1069 (void) remove_reference(hdr, NULL, tag); 1070 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1071 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1072 mutex_exit(&arc_eviction_mtx); 1073 if (destroy_hdr) 1074 arc_hdr_destroy(hdr); 1075 } else { 1076 if (remove_reference(hdr, NULL, tag) > 0) { 1077 ASSERT(HDR_IO_ERROR(hdr)); 1078 arc_buf_destroy(buf, FALSE, TRUE); 1079 } else { 1080 arc_hdr_destroy(hdr); 1081 } 1082 } 1083} 1084 1085int 1086arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1087{ 1088 arc_buf_hdr_t *hdr = buf->b_hdr; 1089 kmutex_t *hash_lock = HDR_LOCK(hdr); 1090 int no_callback = (buf->b_efunc == NULL); 1091 1092 if (hdr->b_state == arc_anon) { 1093 arc_buf_free(buf, tag); 1094 return (no_callback); 1095 } 1096 1097 mutex_enter(hash_lock); 1098 ASSERT(hdr->b_state != arc_anon); 1099 ASSERT(buf->b_data != NULL); 1100 1101 (void) remove_reference(hdr, hash_lock, tag); 1102 if (hdr->b_datacnt > 1) { 1103 if (no_callback) 1104 arc_buf_destroy(buf, FALSE, TRUE); 1105 } else if (no_callback) { 1106 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1107 hdr->b_flags |= ARC_BUF_AVAILABLE; 1108 } 1109 ASSERT(no_callback || hdr->b_datacnt > 1 || 1110 refcount_is_zero(&hdr->b_refcnt)); 1111 mutex_exit(hash_lock); 1112 return (no_callback); 1113} 1114 1115int 1116arc_buf_size(arc_buf_t *buf) 1117{ 1118 return (buf->b_hdr->b_size); 1119} 1120 1121/* 1122 * Evict buffers from list until we've removed the specified number of 1123 * bytes. Move the removed buffers to the appropriate evict state. 1124 * If the recycle flag is set, then attempt to "recycle" a buffer: 1125 * - look for a buffer to evict that is `bytes' long. 1126 * - return the data block from this buffer rather than freeing it. 1127 * This flag is used by callers that are trying to make space for a 1128 * new buffer in a full arc cache. 1129 */ 1130static void * 1131arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, 1132 arc_buf_contents_t type) 1133{ 1134 arc_state_t *evicted_state; 1135 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1136 arc_buf_hdr_t *ab, *ab_prev = NULL; 1137 kmutex_t *hash_lock; 1138 boolean_t have_lock; 1139 void *stolen = NULL; 1140 1141 ASSERT(state == arc_mru || state == arc_mfu); 1142 1143 evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 1144 1145 mutex_enter(&state->arcs_mtx); 1146 mutex_enter(&evicted_state->arcs_mtx); 1147 1148 for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { 1149 ab_prev = list_prev(&state->arcs_list, ab); 1150 /* prefetch buffers have a minimum lifespan */ 1151 if (HDR_IO_IN_PROGRESS(ab) || 1152 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1153 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1154 skipped++; 1155 continue; 1156 } 1157 /* "lookahead" for better eviction candidate */ 1158 if (recycle && ab->b_size != bytes && 1159 ab_prev && ab_prev->b_size == bytes) 1160 continue; 1161 hash_lock = HDR_LOCK(ab); 1162 have_lock = MUTEX_HELD(hash_lock); 1163 if (have_lock || mutex_tryenter(hash_lock)) { 1164 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1165 ASSERT(ab->b_datacnt > 0); 1166 while (ab->b_buf) { 1167 arc_buf_t *buf = ab->b_buf; 1168 if (buf->b_data) { 1169 bytes_evicted += ab->b_size; 1170 if (recycle && ab->b_type == type && 1171 ab->b_size == bytes) { 1172 stolen = buf->b_data; 1173 recycle = FALSE; 1174 } 1175 } 1176 if (buf->b_efunc) { 1177 mutex_enter(&arc_eviction_mtx); 1178 arc_buf_destroy(buf, 1179 buf->b_data == stolen, FALSE); 1180 ab->b_buf = buf->b_next; 1181 buf->b_hdr = &arc_eviction_hdr; 1182 buf->b_next = arc_eviction_list; 1183 arc_eviction_list = buf; 1184 mutex_exit(&arc_eviction_mtx); 1185 } else { 1186 arc_buf_destroy(buf, 1187 buf->b_data == stolen, TRUE); 1188 } 1189 } 1190 ASSERT(ab->b_datacnt == 0); 1191 arc_change_state(evicted_state, ab, hash_lock); 1192 ASSERT(HDR_IN_HASH_TABLE(ab)); 1193 ab->b_flags = ARC_IN_HASH_TABLE; 1194 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1195 if (!have_lock) 1196 mutex_exit(hash_lock); 1197 if (bytes >= 0 && bytes_evicted >= bytes) 1198 break; 1199 } else { 1200 missed += 1; 1201 } 1202 } 1203 1204 mutex_exit(&evicted_state->arcs_mtx); 1205 mutex_exit(&state->arcs_mtx); 1206 1207 if (bytes_evicted < bytes) 1208 dprintf("only evicted %lld bytes from %x", 1209 (longlong_t)bytes_evicted, state); 1210 1211 if (skipped) 1212 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1213 1214 if (missed) 1215 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1216 1217 return (stolen); 1218} 1219 1220/* 1221 * Remove buffers from list until we've removed the specified number of 1222 * bytes. Destroy the buffers that are removed. 
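 * A negative 'bytes' value (arc_flush() passes -1) means remove every
 * buffer on the list; the early-exit checks below only apply when
 * 'bytes' is non-negative.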
1223 */ 1224static void 1225arc_evict_ghost(arc_state_t *state, int64_t bytes) 1226{ 1227 arc_buf_hdr_t *ab, *ab_prev; 1228 kmutex_t *hash_lock; 1229 uint64_t bytes_deleted = 0; 1230 uint64_t bufs_skipped = 0; 1231 1232 ASSERT(GHOST_STATE(state)); 1233top: 1234 mutex_enter(&state->arcs_mtx); 1235 for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { 1236 ab_prev = list_prev(&state->arcs_list, ab); 1237 hash_lock = HDR_LOCK(ab); 1238 if (mutex_tryenter(hash_lock)) { 1239 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1240 ASSERT(ab->b_buf == NULL); 1241 arc_change_state(arc_anon, ab, hash_lock); 1242 mutex_exit(hash_lock); 1243 ARCSTAT_BUMP(arcstat_deleted); 1244 bytes_deleted += ab->b_size; 1245 arc_hdr_destroy(ab); 1246 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1247 if (bytes >= 0 && bytes_deleted >= bytes) 1248 break; 1249 } else { 1250 if (bytes < 0) { 1251 mutex_exit(&state->arcs_mtx); 1252 mutex_enter(hash_lock); 1253 mutex_exit(hash_lock); 1254 goto top; 1255 } 1256 bufs_skipped += 1; 1257 } 1258 } 1259 mutex_exit(&state->arcs_mtx); 1260 1261 if (bufs_skipped) { 1262 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1263 ASSERT(bytes >= 0); 1264 } 1265 1266 if (bytes_deleted < bytes) 1267 dprintf("only deleted %lld bytes from %p", 1268 (longlong_t)bytes_deleted, state); 1269} 1270 1271static void 1272arc_adjust(void) 1273{ 1274 int64_t top_sz, mru_over, arc_over, todelete; 1275 1276 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1277 1278 if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { 1279 int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); 1280 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); 1281 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1282 } 1283 1284 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; 1285 1286 if (mru_over > 0) { 1287 if (arc_mru_ghost->arcs_lsize > 0) { 1288 todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); 1289 arc_evict_ghost(arc_mru_ghost, todelete); 1290 } 1291 } 1292 1293 if ((arc_over = arc_size - arc_c) > 0) { 1294 int64_t tbl_over; 1295 1296 if (arc_mfu->arcs_lsize > 0) { 1297 int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); 1298 (void) arc_evict(arc_mfu, toevict, FALSE, 1299 ARC_BUFC_UNDEF); 1300 } 1301 1302 tbl_over = arc_size + arc_mru_ghost->arcs_lsize + 1303 arc_mfu_ghost->arcs_lsize - arc_c*2; 1304 1305 if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { 1306 todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); 1307 arc_evict_ghost(arc_mfu_ghost, todelete); 1308 } 1309 } 1310} 1311 1312static void 1313arc_do_user_evicts(void) 1314{ 1315 mutex_enter(&arc_eviction_mtx); 1316 while (arc_eviction_list != NULL) { 1317 arc_buf_t *buf = arc_eviction_list; 1318 arc_eviction_list = buf->b_next; 1319 buf->b_hdr = NULL; 1320 mutex_exit(&arc_eviction_mtx); 1321 1322 if (buf->b_efunc != NULL) 1323 VERIFY(buf->b_efunc(buf) == 0); 1324 1325 buf->b_efunc = NULL; 1326 buf->b_private = NULL; 1327 kmem_cache_free(buf_cache, buf); 1328 mutex_enter(&arc_eviction_mtx); 1329 } 1330 mutex_exit(&arc_eviction_mtx); 1331} 1332 1333/* 1334 * Flush all *evictable* data from the cache. 1335 * NOTE: this will not touch "active" (i.e. referenced) data. 
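 * Buffers with user eviction callbacks are queued on arc_eviction_list
 * as a side effect and are drained at the end via arc_do_user_evicts().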
1336 */ 1337void 1338arc_flush(void) 1339{ 1340 while (list_head(&arc_mru->arcs_list)) 1341 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); 1342 while (list_head(&arc_mfu->arcs_list)) 1343 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); 1344 1345 arc_evict_ghost(arc_mru_ghost, -1); 1346 arc_evict_ghost(arc_mfu_ghost, -1); 1347 1348 mutex_enter(&arc_reclaim_thr_lock); 1349 arc_do_user_evicts(); 1350 mutex_exit(&arc_reclaim_thr_lock); 1351 ASSERT(arc_eviction_list == NULL); 1352} 1353 1354int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1355 1356void 1357arc_shrink(void) 1358{ 1359 if (arc_c > arc_c_min) { 1360 uint64_t to_free; 1361 1362#ifdef _KERNEL 1363 to_free = arc_c >> arc_shrink_shift; 1364#else 1365 to_free = arc_c >> arc_shrink_shift; 1366#endif 1367 if (arc_c > arc_c_min + to_free) 1368 atomic_add_64(&arc_c, -to_free); 1369 else 1370 arc_c = arc_c_min; 1371 1372 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1373 if (arc_c > arc_size) 1374 arc_c = MAX(arc_size, arc_c_min); 1375 if (arc_p > arc_c) 1376 arc_p = (arc_c >> 1); 1377 ASSERT(arc_c >= arc_c_min); 1378 ASSERT((int64_t)arc_p >= 0); 1379 } 1380 1381 if (arc_size > arc_c) 1382 arc_adjust(); 1383} 1384 1385static int zfs_needfree = 0; 1386 1387static int 1388arc_reclaim_needed(void) 1389{ 1390#if 0 1391 uint64_t extra; 1392#endif 1393 1394#ifdef _KERNEL 1395 1396 if (zfs_needfree) 1397 return (1); 1398 1399#if 0 1400 /* 1401 * check to make sure that swapfs has enough space so that anon 1402 * reservations can still succeeed. anon_resvmem() checks that the 1403 * availrmem is greater than swapfs_minfree, and the number of reserved 1404 * swap pages. We also add a bit of extra here just to prevent 1405 * circumstances from getting really dire. 1406 */ 1407 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1408 return (1); 1409 1410 /* 1411 * If zio data pages are being allocated out of a separate heap segment, 1412 * then check that the size of available vmem for this area remains 1413 * above 1/4th free. This needs to be done when the size of the 1414 * non-default segment is smaller than physical memory, so we could 1415 * conceivably run out of VA in that segment before running out of 1416 * physical memory. 1417 */ 1418 if (zio_arena != NULL) { 1419 size_t arc_ziosize = 1420 btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); 1421 1422 if ((physmem > arc_ziosize) && 1423 (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) 1424 return (1); 1425 } 1426 1427#if defined(__i386) 1428 /* 1429 * If we're on an i386 platform, it's possible that we'll exhaust the 1430 * kernel heap space before we ever run out of available physical 1431 * memory. Most checks of the size of the heap_area compare against 1432 * tune.t_minarmem, which is the minimum available real memory that we 1433 * can have in the system. However, this is generally fixed at 25 pages 1434 * which is so low that it's useless. In this comparison, we seek to 1435 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1436 * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is 1437 * free) 1438 */ 1439 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1440 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1441 return (1); 1442#endif 1443#else 1444 if (kmem_map->size > (vm_kmem_size * 3) / 4) 1445 return (1); 1446#endif 1447 1448#else 1449 if (spa_get_random(100) == 0) 1450 return (1); 1451#endif 1452 return (0); 1453} 1454 1455static void 1456arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1457{ 1458#ifdef ZIO_USE_UMA 1459 size_t i; 1460 kmem_cache_t *prev_cache = NULL; 1461 kmem_cache_t *prev_data_cache = NULL; 1462 extern kmem_cache_t *zio_buf_cache[]; 1463 extern kmem_cache_t *zio_data_buf_cache[]; 1464#endif 1465 1466#ifdef _KERNEL 1467 /* 1468 * First purge some DNLC entries, in case the DNLC is using 1469 * up too much memory. 1470 */ 1471 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1472 1473#if defined(__i386) 1474 /* 1475 * Reclaim unused memory from all kmem caches. 1476 */ 1477 kmem_reap(); 1478#endif 1479#endif 1480 1481 /* 1482 * An agressive reclamation will shrink the cache size as well as 1483 * reap free buffers from the arc kmem caches. 1484 */ 1485 if (strat == ARC_RECLAIM_AGGR) 1486 arc_shrink(); 1487 1488#ifdef ZIO_USE_UMA 1489 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1490 if (zio_buf_cache[i] != prev_cache) { 1491 prev_cache = zio_buf_cache[i]; 1492 kmem_cache_reap_now(zio_buf_cache[i]); 1493 } 1494 if (zio_data_buf_cache[i] != prev_data_cache) { 1495 prev_data_cache = zio_data_buf_cache[i]; 1496 kmem_cache_reap_now(zio_data_buf_cache[i]); 1497 } 1498 } 1499#endif 1500 kmem_cache_reap_now(buf_cache); 1501 kmem_cache_reap_now(hdr_cache); 1502} 1503 1504static void 1505arc_reclaim_thread(void *dummy __unused) 1506{ 1507 clock_t growtime = 0; 1508 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1509 callb_cpr_t cpr; 1510 1511 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1512 1513 mutex_enter(&arc_reclaim_thr_lock); 1514 while (arc_thread_exit == 0) { 1515 if (arc_reclaim_needed()) { 1516 1517 if (arc_no_grow) { 1518 if (last_reclaim == ARC_RECLAIM_CONS) { 1519 last_reclaim = ARC_RECLAIM_AGGR; 1520 } else { 1521 last_reclaim = ARC_RECLAIM_CONS; 1522 } 1523 } else { 1524 arc_no_grow = TRUE; 1525 last_reclaim = ARC_RECLAIM_AGGR; 1526 membar_producer(); 1527 } 1528 1529 /* reset the growth delay for every reclaim */ 1530 growtime = lbolt + (arc_grow_retry * hz); 1531 ASSERT(growtime > 0); 1532 1533 if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { 1534 /* 1535 * If zfs_needfree is TRUE our vm_lowmem hook 1536 * was called and in that case we must free some 1537 * memory, so switch to aggressive mode. 
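 * (zfs_needfree is cleared further down, after the reclaim pass has
 * run, and any threads sleeping on it are woken via wakeup().)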
1538 */ 1539 arc_no_grow = TRUE; 1540 last_reclaim = ARC_RECLAIM_AGGR; 1541 } 1542 arc_kmem_reap_now(last_reclaim); 1543 } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { 1544 arc_no_grow = FALSE; 1545 } 1546 1547 if (zfs_needfree || 1548 (2 * arc_c < arc_size + 1549 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) 1550 arc_adjust(); 1551 1552 if (arc_eviction_list != NULL) 1553 arc_do_user_evicts(); 1554 1555 if (arc_reclaim_needed()) { 1556 zfs_needfree = 0; 1557#ifdef _KERNEL 1558 wakeup(&zfs_needfree); 1559#endif 1560 } 1561 1562 /* block until needed, or one second, whichever is shorter */ 1563 CALLB_CPR_SAFE_BEGIN(&cpr); 1564 (void) cv_timedwait(&arc_reclaim_thr_cv, 1565 &arc_reclaim_thr_lock, hz); 1566 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1567 } 1568 1569 arc_thread_exit = 0; 1570 cv_broadcast(&arc_reclaim_thr_cv); 1571 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1572 thread_exit(); 1573} 1574 1575/* 1576 * Adapt arc info given the number of bytes we are trying to add and 1577 * the state that we are comming from. This function is only called 1578 * when we are adding new content to the cache. 1579 */ 1580static void 1581arc_adapt(int bytes, arc_state_t *state) 1582{ 1583 int mult; 1584 1585 ASSERT(bytes > 0); 1586 /* 1587 * Adapt the target size of the MRU list: 1588 * - if we just hit in the MRU ghost list, then increase 1589 * the target size of the MRU list. 1590 * - if we just hit in the MFU ghost list, then increase 1591 * the target size of the MFU list by decreasing the 1592 * target size of the MRU list. 1593 */ 1594 if (state == arc_mru_ghost) { 1595 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1596 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1597 1598 arc_p = MIN(arc_c, arc_p + bytes * mult); 1599 } else if (state == arc_mfu_ghost) { 1600 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1601 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1602 1603 arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 1604 } 1605 ASSERT((int64_t)arc_p >= 0); 1606 1607 if (arc_reclaim_needed()) { 1608 cv_signal(&arc_reclaim_thr_cv); 1609 return; 1610 } 1611 1612 if (arc_no_grow) 1613 return; 1614 1615 if (arc_c >= arc_c_max) 1616 return; 1617 1618 /* 1619 * If we're within (2 * maxblocksize) bytes of the target 1620 * cache size, increment the target cache size 1621 */ 1622 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1623 atomic_add_64(&arc_c, (int64_t)bytes); 1624 if (arc_c > arc_c_max) 1625 arc_c = arc_c_max; 1626 else if (state == arc_anon) 1627 atomic_add_64(&arc_p, (int64_t)bytes); 1628 if (arc_p > arc_c) 1629 arc_p = arc_c; 1630 } 1631 ASSERT((int64_t)arc_p >= 0); 1632} 1633 1634/* 1635 * Check if the cache has reached its limits and eviction is required 1636 * prior to insert. 1637 */ 1638static int 1639arc_evict_needed() 1640{ 1641 if (arc_reclaim_needed()) 1642 return (1); 1643 1644 return (arc_size > arc_c); 1645} 1646 1647/* 1648 * The buffer, supplied as the first argument, needs a data block. 1649 * So, if we are at cache max, determine which cache should be victimized. 1650 * We have the following cases: 1651 * 1652 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 1653 * In this situation if we're out of space, but the resident size of the MFU is 1654 * under the limit, victimize the MFU cache to satisfy this insertion request. 1655 * 1656 * 2. 
Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 1657 * Here, we've used up all of the available space for the MRU, so we need to 1658 * evict from our own cache instead. Evict from the set of resident MRU 1659 * entries. 1660 * 1661 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 1662 * c minus p represents the MFU space in the cache, since p is the size of the 1663 * cache that is dedicated to the MRU. In this situation there's still space on 1664 * the MFU side, so the MRU side needs to be victimized. 1665 * 1666 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 1667 * MFU's resident set is consuming more space than it has been allotted. In 1668 * this situation, we must victimize our own cache, the MFU, for this insertion. 1669 */ 1670static void 1671arc_get_data_buf(arc_buf_t *buf) 1672{ 1673 arc_state_t *state = buf->b_hdr->b_state; 1674 uint64_t size = buf->b_hdr->b_size; 1675 arc_buf_contents_t type = buf->b_hdr->b_type; 1676 1677 arc_adapt(size, state); 1678 1679 /* 1680 * We have not yet reached cache maximum size, 1681 * just allocate a new buffer. 1682 */ 1683 if (!arc_evict_needed()) { 1684 if (type == ARC_BUFC_METADATA) { 1685 buf->b_data = zio_buf_alloc(size); 1686 } else { 1687 ASSERT(type == ARC_BUFC_DATA); 1688 buf->b_data = zio_data_buf_alloc(size); 1689 } 1690 atomic_add_64(&arc_size, size); 1691 goto out; 1692 } 1693 1694 /* 1695 * If we are prefetching from the mfu ghost list, this buffer 1696 * will end up on the mru list; so steal space from there. 1697 */ 1698 if (state == arc_mfu_ghost) 1699 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 1700 else if (state == arc_mru_ghost) 1701 state = arc_mru; 1702 1703 if (state == arc_mru || state == arc_anon) { 1704 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 1705 state = (arc_p > mru_used) ? arc_mfu : arc_mru; 1706 } else { 1707 /* MFU cases */ 1708 uint64_t mfu_space = arc_c - arc_p; 1709 state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 1710 } 1711 if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { 1712 if (type == ARC_BUFC_METADATA) { 1713 buf->b_data = zio_buf_alloc(size); 1714 } else { 1715 ASSERT(type == ARC_BUFC_DATA); 1716 buf->b_data = zio_data_buf_alloc(size); 1717 } 1718 atomic_add_64(&arc_size, size); 1719 ARCSTAT_BUMP(arcstat_recycle_miss); 1720 } 1721 ASSERT(buf->b_data != NULL); 1722out: 1723 /* 1724 * Update the state size. Note that ghost states have a 1725 * "ghost size" and so don't need to be updated. 1726 */ 1727 if (!GHOST_STATE(buf->b_hdr->b_state)) { 1728 arc_buf_hdr_t *hdr = buf->b_hdr; 1729 1730 atomic_add_64(&hdr->b_state->arcs_size, size); 1731 if (list_link_active(&hdr->b_arc_node)) { 1732 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1733 atomic_add_64(&hdr->b_state->arcs_lsize, size); 1734 } 1735 /* 1736 * If we are growing the cache, and we are adding anonymous 1737 * data, and we have outgrown arc_p, update arc_p 1738 */ 1739 if (arc_size < arc_c && hdr->b_state == arc_anon && 1740 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 1741 arc_p = MIN(arc_c, arc_p + size); 1742 } 1743} 1744 1745/* 1746 * This routine is called whenever a buffer is accessed. 1747 * NOTE: the hash lock is dropped in this function. 1748 */ 1749static void 1750arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1751{ 1752 ASSERT(MUTEX_HELD(hash_lock)); 1753 1754 if (buf->b_state == arc_anon) { 1755 /* 1756 * This buffer is not in the cache, and does not 1757 * appear in our "ghost" list. Add the new buffer 1758 * to the MRU state. 
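 * (Even a buffer brought in by a prefetch starts out in the MRU
 * state; whether it is later promoted to MFU depends on subsequent
 * accesses, handled in the cases below.)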
1759 */ 1760 1761 ASSERT(buf->b_arc_access == 0); 1762 buf->b_arc_access = lbolt; 1763 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1764 arc_change_state(arc_mru, buf, hash_lock); 1765 1766 } else if (buf->b_state == arc_mru) { 1767 /* 1768 * If this buffer is here because of a prefetch, then either: 1769 * - clear the flag if this is a "referencing" read 1770 * (any subsequent access will bump this into the MFU state). 1771 * or 1772 * - move the buffer to the head of the list if this is 1773 * another prefetch (to make it less likely to be evicted). 1774 */ 1775 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1776 if (refcount_count(&buf->b_refcnt) == 0) { 1777 ASSERT(list_link_active(&buf->b_arc_node)); 1778 mutex_enter(&arc_mru->arcs_mtx); 1779 list_remove(&arc_mru->arcs_list, buf); 1780 list_insert_head(&arc_mru->arcs_list, buf); 1781 mutex_exit(&arc_mru->arcs_mtx); 1782 } else { 1783 buf->b_flags &= ~ARC_PREFETCH; 1784 ARCSTAT_BUMP(arcstat_mru_hits); 1785 } 1786 buf->b_arc_access = lbolt; 1787 return; 1788 } 1789 1790 /* 1791 * This buffer has been "accessed" only once so far, 1792 * but it is still in the cache. Move it to the MFU 1793 * state. 1794 */ 1795 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1796 /* 1797 * More than 125ms have passed since we 1798 * instantiated this buffer. Move it to the 1799 * most frequently used state. 1800 */ 1801 buf->b_arc_access = lbolt; 1802 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1803 arc_change_state(arc_mfu, buf, hash_lock); 1804 } 1805 ARCSTAT_BUMP(arcstat_mru_hits); 1806 } else if (buf->b_state == arc_mru_ghost) { 1807 arc_state_t *new_state; 1808 /* 1809 * This buffer has been "accessed" recently, but 1810 * was evicted from the cache. Move it to the 1811 * MFU state. 1812 */ 1813 1814 if (buf->b_flags & ARC_PREFETCH) { 1815 new_state = arc_mru; 1816 if (refcount_count(&buf->b_refcnt) > 0) 1817 buf->b_flags &= ~ARC_PREFETCH; 1818 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1819 } else { 1820 new_state = arc_mfu; 1821 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1822 } 1823 1824 buf->b_arc_access = lbolt; 1825 arc_change_state(new_state, buf, hash_lock); 1826 1827 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 1828 } else if (buf->b_state == arc_mfu) { 1829 /* 1830 * This buffer has been accessed more than once and is 1831 * still in the cache. Keep it in the MFU state. 1832 * 1833 * NOTE: an add_reference() that occurred when we did 1834 * the arc_read() will have kicked this off the list. 1835 * If it was a prefetch, we will explicitly move it to 1836 * the head of the list now. 1837 */ 1838 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1839 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1840 ASSERT(list_link_active(&buf->b_arc_node)); 1841 mutex_enter(&arc_mfu->arcs_mtx); 1842 list_remove(&arc_mfu->arcs_list, buf); 1843 list_insert_head(&arc_mfu->arcs_list, buf); 1844 mutex_exit(&arc_mfu->arcs_mtx); 1845 } 1846 ARCSTAT_BUMP(arcstat_mfu_hits); 1847 buf->b_arc_access = lbolt; 1848 } else if (buf->b_state == arc_mfu_ghost) { 1849 arc_state_t *new_state = arc_mfu; 1850 /* 1851 * This buffer has been accessed more than once but has 1852 * been evicted from the cache. Move it back to the 1853 * MFU state. 1854 */ 1855 1856 if (buf->b_flags & ARC_PREFETCH) { 1857 /* 1858 * This is a prefetch access... 1859 * move this block back to the MRU state. 
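 * (If a demand access had taken a reference, add_reference() would
 * have cleared ARC_PREFETCH, so the refcount must still be zero here;
 * the ASSERT below checks this.)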
1860 */ 1861 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1862 new_state = arc_mru; 1863 } 1864 1865 buf->b_arc_access = lbolt; 1866 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1867 arc_change_state(new_state, buf, hash_lock); 1868 1869 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 1870 } else { 1871 ASSERT(!"invalid arc state"); 1872 } 1873} 1874 1875/* a generic arc_done_func_t which you can use */ 1876/* ARGSUSED */ 1877void 1878arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1879{ 1880 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1881 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1882} 1883 1884/* a generic arc_done_func_t which you can use */ 1885void 1886arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1887{ 1888 arc_buf_t **bufp = arg; 1889 if (zio && zio->io_error) { 1890 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1891 *bufp = NULL; 1892 } else { 1893 *bufp = buf; 1894 } 1895} 1896 1897static void 1898arc_read_done(zio_t *zio) 1899{ 1900 arc_buf_hdr_t *hdr, *found; 1901 arc_buf_t *buf; 1902 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1903 kmutex_t *hash_lock; 1904 arc_callback_t *callback_list, *acb; 1905 int freeable = FALSE; 1906 1907 buf = zio->io_private; 1908 hdr = buf->b_hdr; 1909 1910 /* 1911 * The hdr was inserted into hash-table and removed from lists 1912 * prior to starting I/O. We should find this header, since 1913 * it's in the hash table, and it should be legit since it's 1914 * not possible to evict it during the I/O. The only possible 1915 * reason for it not to be found is if we were freed during the 1916 * read. 1917 */ 1918 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1919 &hash_lock); 1920 1921 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1922 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1923 1924 /* byteswap if necessary */ 1925 callback_list = hdr->b_acb; 1926 ASSERT(callback_list != NULL); 1927 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1928 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1929 1930 arc_cksum_compute(buf); 1931 1932 /* create copies of the data buffer for the callers */ 1933 abuf = buf; 1934 for (acb = callback_list; acb; acb = acb->acb_next) { 1935 if (acb->acb_done) { 1936 if (abuf == NULL) 1937 abuf = arc_buf_clone(buf); 1938 acb->acb_buf = abuf; 1939 abuf = NULL; 1940 } 1941 } 1942 hdr->b_acb = NULL; 1943 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1944 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 1945 if (abuf == buf) 1946 hdr->b_flags |= ARC_BUF_AVAILABLE; 1947 1948 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 1949 1950 if (zio->io_error != 0) { 1951 hdr->b_flags |= ARC_IO_ERROR; 1952 if (hdr->b_state != arc_anon) 1953 arc_change_state(arc_anon, hdr, hash_lock); 1954 if (HDR_IN_HASH_TABLE(hdr)) 1955 buf_hash_remove(hdr); 1956 freeable = refcount_is_zero(&hdr->b_refcnt); 1957 /* convert checksum errors into IO errors */ 1958 if (zio->io_error == ECKSUM) 1959 zio->io_error = EIO; 1960 } 1961 1962 /* 1963 * Broadcast before we drop the hash_lock to avoid the possibility 1964 * that the hdr (and hence the cv) might be freed before we get to 1965 * the cv_broadcast(). 1966 */ 1967 cv_broadcast(&hdr->b_cv); 1968 1969 if (hash_lock) { 1970 /* 1971 * Only call arc_access on anonymous buffers. This is because 1972 * if we've issued an I/O for an evicted buffer, we've already 1973 * called arc_access (to prevent any simultaneous readers from 1974 * getting confused). 
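 *
 * (That earlier call is made in arc_read() itself: when a
 * ghost-list hit requires an I/O, arc_access() runs there,
 * before the hash lock is dropped and the read zio is issued.)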
1975 */ 1976 if (zio->io_error == 0 && hdr->b_state == arc_anon) 1977 arc_access(hdr, hash_lock); 1978 mutex_exit(hash_lock); 1979 } else { 1980 /* 1981 * This block was freed while we waited for the read to 1982 * complete. It has been removed from the hash table and 1983 * moved to the anonymous state (so that it won't show up 1984 * in the cache). 1985 */ 1986 ASSERT3P(hdr->b_state, ==, arc_anon); 1987 freeable = refcount_is_zero(&hdr->b_refcnt); 1988 } 1989 1990 /* execute each callback and free its structure */ 1991 while ((acb = callback_list) != NULL) { 1992 if (acb->acb_done) 1993 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 1994 1995 if (acb->acb_zio_dummy != NULL) { 1996 acb->acb_zio_dummy->io_error = zio->io_error; 1997 zio_nowait(acb->acb_zio_dummy); 1998 } 1999 2000 callback_list = acb->acb_next; 2001 kmem_free(acb, sizeof (arc_callback_t)); 2002 } 2003 2004 if (freeable) 2005 arc_hdr_destroy(hdr); 2006} 2007 2008/* 2009 * "Read" the block at the specified DVA (in bp) via the 2010 * cache. If the block is found in the cache, invoke the provided 2011 * callback immediately and return. Note that the `zio' parameter 2012 * in the callback will be NULL in this case, since no IO was 2013 * required. If the block is not in the cache pass the read request 2014 * on to the spa with a substitute callback function, so that the 2015 * requested block will be added to the cache. 2016 * 2017 * If a read request arrives for a block that has a read in-progress, 2018 * either wait for the in-progress read to complete (and return the 2019 * results); or, if this is a read with a "done" func, add a record 2020 * to the read to invoke the "done" func when the read completes, 2021 * and return; or just return. 2022 * 2023 * arc_read_done() will invoke all the requested "done" functions 2024 * for readers of this block. 2025 */ 2026int 2027arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 2028 arc_done_func_t *done, void *private, int priority, int flags, 2029 uint32_t *arc_flags, zbookmark_t *zb) 2030{ 2031 arc_buf_hdr_t *hdr; 2032 arc_buf_t *buf; 2033 kmutex_t *hash_lock; 2034 zio_t *rzio; 2035 2036top: 2037 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2038 if (hdr && hdr->b_datacnt > 0) { 2039 2040 *arc_flags |= ARC_CACHED; 2041 2042 if (HDR_IO_IN_PROGRESS(hdr)) { 2043 2044 if (*arc_flags & ARC_WAIT) { 2045 cv_wait(&hdr->b_cv, hash_lock); 2046 mutex_exit(hash_lock); 2047 goto top; 2048 } 2049 ASSERT(*arc_flags & ARC_NOWAIT); 2050 2051 if (done) { 2052 arc_callback_t *acb = NULL; 2053 2054 acb = kmem_zalloc(sizeof (arc_callback_t), 2055 KM_SLEEP); 2056 acb->acb_done = done; 2057 acb->acb_private = private; 2058 acb->acb_byteswap = swap; 2059 if (pio != NULL) 2060 acb->acb_zio_dummy = zio_null(pio, 2061 spa, NULL, NULL, flags); 2062 2063 ASSERT(acb->acb_done != NULL); 2064 acb->acb_next = hdr->b_acb; 2065 hdr->b_acb = acb; 2066 add_reference(hdr, hash_lock, private); 2067 mutex_exit(hash_lock); 2068 return (0); 2069 } 2070 mutex_exit(hash_lock); 2071 return (0); 2072 } 2073 2074 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2075 2076 if (done) { 2077 add_reference(hdr, hash_lock, private); 2078 /* 2079 * If this block is already in use, create a new 2080 * copy of the data so that we will be guaranteed 2081 * that arc_release() will always succeed.
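 *
 * (If ARC_BUF_AVAILABLE is set the existing buf has not yet been
 * handed out, so it can be returned directly; otherwise
 * arc_buf_clone() below copies the data into a fresh arc_buf_t
 * that shares this hdr.)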
2082 */ 2083 buf = hdr->b_buf; 2084 ASSERT(buf); 2085 ASSERT(buf->b_data); 2086 if (HDR_BUF_AVAILABLE(hdr)) { 2087 ASSERT(buf->b_efunc == NULL); 2088 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2089 } else { 2090 buf = arc_buf_clone(buf); 2091 } 2092 } else if (*arc_flags & ARC_PREFETCH && 2093 refcount_count(&hdr->b_refcnt) == 0) { 2094 hdr->b_flags |= ARC_PREFETCH; 2095 } 2096 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2097 arc_access(hdr, hash_lock); 2098 mutex_exit(hash_lock); 2099 ARCSTAT_BUMP(arcstat_hits); 2100 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2101 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2102 data, metadata, hits); 2103 2104 if (done) 2105 done(NULL, buf, private); 2106 } else { 2107 uint64_t size = BP_GET_LSIZE(bp); 2108 arc_callback_t *acb; 2109 2110 if (hdr == NULL) { 2111 /* this block is not in the cache */ 2112 arc_buf_hdr_t *exists; 2113 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2114 buf = arc_buf_alloc(spa, size, private, type); 2115 hdr = buf->b_hdr; 2116 hdr->b_dva = *BP_IDENTITY(bp); 2117 hdr->b_birth = bp->blk_birth; 2118 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2119 exists = buf_hash_insert(hdr, &hash_lock); 2120 if (exists) { 2121 /* somebody beat us to the hash insert */ 2122 mutex_exit(hash_lock); 2123 bzero(&hdr->b_dva, sizeof (dva_t)); 2124 hdr->b_birth = 0; 2125 hdr->b_cksum0 = 0; 2126 (void) arc_buf_remove_ref(buf, private); 2127 goto top; /* restart the IO request */ 2128 } 2129 /* if this is a prefetch, we don't have a reference */ 2130 if (*arc_flags & ARC_PREFETCH) { 2131 (void) remove_reference(hdr, hash_lock, 2132 private); 2133 hdr->b_flags |= ARC_PREFETCH; 2134 } 2135 if (BP_GET_LEVEL(bp) > 0) 2136 hdr->b_flags |= ARC_INDIRECT; 2137 } else { 2138 /* this block is in the ghost cache */ 2139 ASSERT(GHOST_STATE(hdr->b_state)); 2140 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2141 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2142 ASSERT(hdr->b_buf == NULL); 2143 2144 /* if this is a prefetch, we don't have a reference */ 2145 if (*arc_flags & ARC_PREFETCH) 2146 hdr->b_flags |= ARC_PREFETCH; 2147 else 2148 add_reference(hdr, hash_lock, private); 2149 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 2150 buf->b_hdr = hdr; 2151 buf->b_data = NULL; 2152 buf->b_efunc = NULL; 2153 buf->b_private = NULL; 2154 buf->b_next = NULL; 2155 hdr->b_buf = buf; 2156 arc_get_data_buf(buf); 2157 ASSERT(hdr->b_datacnt == 0); 2158 hdr->b_datacnt = 1; 2159 2160 } 2161 2162 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2163 acb->acb_done = done; 2164 acb->acb_private = private; 2165 acb->acb_byteswap = swap; 2166 2167 ASSERT(hdr->b_acb == NULL); 2168 hdr->b_acb = acb; 2169 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2170 2171 /* 2172 * If the buffer has been evicted, migrate it to a present state 2173 * before issuing the I/O. Once we drop the hash-table lock, 2174 * the header will be marked as I/O in progress and have an 2175 * attached buffer. At this point, anybody who finds this 2176 * buffer ought to notice that it's legit but has a pending I/O. 
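 *
 * (A concurrent arc_read() that finds this header will take the
 * HDR_IO_IN_PROGRESS branch above and either cv_wait() on b_cv
 * or chain its callback onto b_acb rather than issuing a second
 * read.)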
2177 */ 2178 2179 if (GHOST_STATE(hdr->b_state)) 2180 arc_access(hdr, hash_lock); 2181 mutex_exit(hash_lock); 2182 2183 ASSERT3U(hdr->b_size, ==, size); 2184 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 2185 zbookmark_t *, zb); 2186 ARCSTAT_BUMP(arcstat_misses); 2187 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2188 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2189 data, metadata, misses); 2190 2191 rzio = zio_read(pio, spa, bp, buf->b_data, size, 2192 arc_read_done, buf, priority, flags, zb); 2193 2194 if (*arc_flags & ARC_WAIT) 2195 return (zio_wait(rzio)); 2196 2197 ASSERT(*arc_flags & ARC_NOWAIT); 2198 zio_nowait(rzio); 2199 } 2200 return (0); 2201} 2202 2203/* 2204 * arc_read() variant to support pool traversal. If the block is already 2205 * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 2206 * The idea is that we don't want pool traversal filling up memory, but 2207 * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2208 */ 2209int 2210arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2211{ 2212 arc_buf_hdr_t *hdr; 2213 kmutex_t *hash_mtx; 2214 int rc = 0; 2215 2216 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2217 2218 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2219 arc_buf_t *buf = hdr->b_buf; 2220 2221 ASSERT(buf); 2222 while (buf->b_data == NULL) { 2223 buf = buf->b_next; 2224 ASSERT(buf); 2225 } 2226 bcopy(buf->b_data, data, hdr->b_size); 2227 } else { 2228 rc = ENOENT; 2229 } 2230 2231 if (hash_mtx) 2232 mutex_exit(hash_mtx); 2233 2234 return (rc); 2235} 2236 2237void 2238arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2239{ 2240 ASSERT(buf->b_hdr != NULL); 2241 ASSERT(buf->b_hdr->b_state != arc_anon); 2242 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2243 buf->b_efunc = func; 2244 buf->b_private = private; 2245} 2246 2247/* 2248 * This is used by the DMU to let the ARC know that a buffer is 2249 * being evicted, so the ARC should clean up. If this arc buf 2250 * is not yet in the evicted state, it will be put there. 2251 */ 2252int 2253arc_buf_evict(arc_buf_t *buf) 2254{ 2255 arc_buf_hdr_t *hdr; 2256 kmutex_t *hash_lock; 2257 arc_buf_t **bufp; 2258 2259 mutex_enter(&arc_eviction_mtx); 2260 hdr = buf->b_hdr; 2261 if (hdr == NULL) { 2262 /* 2263 * We are in arc_do_user_evicts(). 2264 */ 2265 ASSERT(buf->b_data == NULL); 2266 mutex_exit(&arc_eviction_mtx); 2267 return (0); 2268 } 2269 hash_lock = HDR_LOCK(hdr); 2270 mutex_exit(&arc_eviction_mtx); 2271 2272 mutex_enter(hash_lock); 2273 2274 if (buf->b_data == NULL) { 2275 /* 2276 * We are on the eviction list. 2277 */ 2278 mutex_exit(hash_lock); 2279 mutex_enter(&arc_eviction_mtx); 2280 if (buf->b_hdr == NULL) { 2281 /* 2282 * We are already in arc_do_user_evicts(). 2283 */ 2284 mutex_exit(&arc_eviction_mtx); 2285 return (0); 2286 } else { 2287 arc_buf_t copy = *buf; /* structure assignment */ 2288 /* 2289 * Process this buffer now 2290 * but let arc_do_user_evicts() do the reaping. 
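 *
 * The structure copy above lets us invoke the eviction callback
 * after arc_eviction_mtx has been dropped, while the original
 * buf stays on the eviction list for arc_do_user_evicts() to
 * reap.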
2291 */ 2292 buf->b_efunc = NULL; 2293 mutex_exit(&arc_eviction_mtx); 2294 VERIFY(copy.b_efunc(&copy) == 0); 2295 return (1); 2296 } 2297 } 2298 2299 ASSERT(buf->b_hdr == hdr); 2300 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2301 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2302 2303 /* 2304 * Pull this buffer off of the hdr 2305 */ 2306 bufp = &hdr->b_buf; 2307 while (*bufp != buf) 2308 bufp = &(*bufp)->b_next; 2309 *bufp = buf->b_next; 2310 2311 ASSERT(buf->b_data != NULL); 2312 arc_buf_destroy(buf, FALSE, FALSE); 2313 2314 if (hdr->b_datacnt == 0) { 2315 arc_state_t *old_state = hdr->b_state; 2316 arc_state_t *evicted_state; 2317 2318 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2319 2320 evicted_state = 2321 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2322 2323 mutex_enter(&old_state->arcs_mtx); 2324 mutex_enter(&evicted_state->arcs_mtx); 2325 2326 arc_change_state(evicted_state, hdr, hash_lock); 2327 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2328 hdr->b_flags = ARC_IN_HASH_TABLE; 2329 2330 mutex_exit(&evicted_state->arcs_mtx); 2331 mutex_exit(&old_state->arcs_mtx); 2332 } 2333 mutex_exit(hash_lock); 2334 2335 VERIFY(buf->b_efunc(buf) == 0); 2336 buf->b_efunc = NULL; 2337 buf->b_private = NULL; 2338 buf->b_hdr = NULL; 2339 kmem_cache_free(buf_cache, buf); 2340 return (1); 2341} 2342 2343/* 2344 * Release this buffer from the cache. This must be done 2345 * after a read and prior to modifying the buffer contents. 2346 * If the buffer has more than one reference, we must 2347 * make a new hdr for the buffer. 2348 */ 2349void 2350arc_release(arc_buf_t *buf, void *tag) 2351{ 2352 arc_buf_hdr_t *hdr = buf->b_hdr; 2353 kmutex_t *hash_lock = HDR_LOCK(hdr); 2354 2355 /* this buffer is not on any list */ 2356 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 2357 2358 if (hdr->b_state == arc_anon) { 2359 /* this buffer is already released */ 2360 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 2361 ASSERT(BUF_EMPTY(hdr)); 2362 ASSERT(buf->b_efunc == NULL); 2363 arc_buf_thaw(buf); 2364 return; 2365 } 2366 2367 mutex_enter(hash_lock); 2368 2369 /* 2370 * Do we have more than one buf? 2371 */ 2372 if (hdr->b_buf != buf || buf->b_next != NULL) { 2373 arc_buf_hdr_t *nhdr; 2374 arc_buf_t **bufp; 2375 uint64_t blksz = hdr->b_size; 2376 spa_t *spa = hdr->b_spa; 2377 arc_buf_contents_t type = hdr->b_type; 2378 2379 ASSERT(hdr->b_datacnt > 1); 2380 /* 2381 * Pull the data off of this buf and attach it to 2382 * a new anonymous buf.
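 *
 * (nhdr below is a fresh anonymous header for this buf alone; the
 * buf is unlinked from the shared hdr, the old state's size and
 * lsize accounting is reduced, and any remaining bufs continue to
 * use the original hdr.)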
2383 */ 2384 (void) remove_reference(hdr, hash_lock, tag); 2385 bufp = &hdr->b_buf; 2386 while (*bufp != buf) 2387 bufp = &(*bufp)->b_next; 2388 *bufp = (*bufp)->b_next; 2389 buf->b_next = NULL; 2390 2391 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 2392 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 2393 if (refcount_is_zero(&hdr->b_refcnt)) { 2394 ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); 2395 atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); 2396 } 2397 hdr->b_datacnt -= 1; 2398 arc_cksum_verify(buf); 2399 2400 mutex_exit(hash_lock); 2401 2402 nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 2403 nhdr->b_size = blksz; 2404 nhdr->b_spa = spa; 2405 nhdr->b_type = type; 2406 nhdr->b_buf = buf; 2407 nhdr->b_state = arc_anon; 2408 nhdr->b_arc_access = 0; 2409 nhdr->b_flags = 0; 2410 nhdr->b_datacnt = 1; 2411 nhdr->b_freeze_cksum = NULL; 2412 mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 2413 (void) refcount_add(&nhdr->b_refcnt, tag); 2414 buf->b_hdr = nhdr; 2415 atomic_add_64(&arc_anon->arcs_size, blksz); 2416 2417 hdr = nhdr; 2418 } else { 2419 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 2420 ASSERT(!list_link_active(&hdr->b_arc_node)); 2421 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2422 arc_change_state(arc_anon, hdr, hash_lock); 2423 hdr->b_arc_access = 0; 2424 mutex_exit(hash_lock); 2425 bzero(&hdr->b_dva, sizeof (dva_t)); 2426 hdr->b_birth = 0; 2427 hdr->b_cksum0 = 0; 2428 arc_buf_thaw(buf); 2429 } 2430 buf->b_efunc = NULL; 2431 buf->b_private = NULL; 2432} 2433 2434int 2435arc_released(arc_buf_t *buf) 2436{ 2437 return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 2438} 2439 2440int 2441arc_has_callback(arc_buf_t *buf) 2442{ 2443 return (buf->b_efunc != NULL); 2444} 2445 2446#ifdef ZFS_DEBUG 2447int 2448arc_referenced(arc_buf_t *buf) 2449{ 2450 return (refcount_count(&buf->b_hdr->b_refcnt)); 2451} 2452#endif 2453 2454static void 2455arc_write_ready(zio_t *zio) 2456{ 2457 arc_write_callback_t *callback = zio->io_private; 2458 arc_buf_t *buf = callback->awcb_buf; 2459 2460 if (callback->awcb_ready) { 2461 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 2462 callback->awcb_ready(zio, buf, callback->awcb_private); 2463 } 2464 arc_cksum_compute(buf); 2465} 2466 2467static void 2468arc_write_done(zio_t *zio) 2469{ 2470 arc_write_callback_t *callback = zio->io_private; 2471 arc_buf_t *buf = callback->awcb_buf; 2472 arc_buf_hdr_t *hdr = buf->b_hdr; 2473 2474 hdr->b_acb = NULL; 2475 2476 /* this buffer is on no lists and is not in the hash table */ 2477 ASSERT3P(hdr->b_state, ==, arc_anon); 2478 2479 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 2480 hdr->b_birth = zio->io_bp->blk_birth; 2481 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 2482 /* 2483 * If the block to be written was all-zero, we may have 2484 * compressed it away. In this case no write was performed 2485 * so there will be no dva/birth-date/checksum. The buffer 2486 * must therefor remain anonymous (and uncached). 2487 */ 2488 if (!BUF_EMPTY(hdr)) { 2489 arc_buf_hdr_t *exists; 2490 kmutex_t *hash_lock; 2491 2492 arc_cksum_verify(buf); 2493 2494 exists = buf_hash_insert(hdr, &hash_lock); 2495 if (exists) { 2496 /* 2497 * This can only happen if we overwrite for 2498 * sync-to-convergence, because we remove 2499 * buffers from the hash table when we arc_free(). 
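 *
 * (The ASSERTs below check that the stale header describes the
 * same DVA and birth txg as the block we just wrote; it is moved
 * to the anon state and destroyed, and our header is then
 * inserted into the hash table in its place.)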
2500 */ 2501 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 2502 BP_IDENTITY(zio->io_bp))); 2503 ASSERT3U(zio->io_bp_orig.blk_birth, ==, 2504 zio->io_bp->blk_birth); 2505 2506 ASSERT(refcount_is_zero(&exists->b_refcnt)); 2507 arc_change_state(arc_anon, exists, hash_lock); 2508 mutex_exit(hash_lock); 2509 arc_hdr_destroy(exists); 2510 exists = buf_hash_insert(hdr, &hash_lock); 2511 ASSERT3P(exists, ==, NULL); 2512 } 2513 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2514 arc_access(hdr, hash_lock); 2515 mutex_exit(hash_lock); 2516 } else if (callback->awcb_done == NULL) { 2517 int destroy_hdr; 2518 /* 2519 * This is an anonymous buffer with no user callback, 2520 * destroy it if there are no active references. 2521 */ 2522 mutex_enter(&arc_eviction_mtx); 2523 destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 2524 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2525 mutex_exit(&arc_eviction_mtx); 2526 if (destroy_hdr) 2527 arc_hdr_destroy(hdr); 2528 } else { 2529 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2530 } 2531 2532 if (callback->awcb_done) { 2533 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 2534 callback->awcb_done(zio, buf, callback->awcb_private); 2535 } 2536 2537 kmem_free(callback, sizeof (arc_write_callback_t)); 2538} 2539 2540zio_t * 2541arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 2542 uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 2543 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 2544 int flags, zbookmark_t *zb) 2545{ 2546 arc_buf_hdr_t *hdr = buf->b_hdr; 2547 arc_write_callback_t *callback; 2548 zio_t *zio; 2549 2550 /* this is a private buffer - no locking required */ 2551 ASSERT3P(hdr->b_state, ==, arc_anon); 2552 ASSERT(BUF_EMPTY(hdr)); 2553 ASSERT(!HDR_IO_ERROR(hdr)); 2554 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 2555 ASSERT(hdr->b_acb == 0); 2556 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 2557 callback->awcb_ready = ready; 2558 callback->awcb_done = done; 2559 callback->awcb_private = private; 2560 callback->awcb_buf = buf; 2561 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2562 zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, 2563 buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, 2564 priority, flags, zb); 2565 2566 return (zio); 2567} 2568 2569int 2570arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2571 zio_done_func_t *done, void *private, uint32_t arc_flags) 2572{ 2573 arc_buf_hdr_t *ab; 2574 kmutex_t *hash_lock; 2575 zio_t *zio; 2576 2577 /* 2578 * If this buffer is in the cache, release it, so it 2579 * can be re-used. 2580 */ 2581 ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2582 if (ab != NULL) { 2583 /* 2584 * The checksum of blocks to free is not always 2585 * preserved (eg. on the deadlist). However, if it is 2586 * nonzero, it should match what we have in the cache. 2587 */ 2588 ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2589 ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 2590 if (ab->b_state != arc_anon) 2591 arc_change_state(arc_anon, ab, hash_lock); 2592 if (HDR_IO_IN_PROGRESS(ab)) { 2593 /* 2594 * This should only happen when we prefetch. 
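 *
 * (Clearing the identity and setting ARC_FREED_IN_READ lets the
 * in-flight read finish normally; arc_read_done() will not find
 * this header in the hash table and can destroy it once the last
 * reference is dropped.)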
2595 */ 2596 ASSERT(ab->b_flags & ARC_PREFETCH); 2597 ASSERT3U(ab->b_datacnt, ==, 1); 2598 ab->b_flags |= ARC_FREED_IN_READ; 2599 if (HDR_IN_HASH_TABLE(ab)) 2600 buf_hash_remove(ab); 2601 ab->b_arc_access = 0; 2602 bzero(&ab->b_dva, sizeof (dva_t)); 2603 ab->b_birth = 0; 2604 ab->b_cksum0 = 0; 2605 ab->b_buf->b_efunc = NULL; 2606 ab->b_buf->b_private = NULL; 2607 mutex_exit(hash_lock); 2608 } else if (refcount_is_zero(&ab->b_refcnt)) { 2609 mutex_exit(hash_lock); 2610 arc_hdr_destroy(ab); 2611 ARCSTAT_BUMP(arcstat_deleted); 2612 } else { 2613 /* 2614 * We still have an active reference on this 2615 * buffer. This can happen, e.g., from 2616 * dbuf_unoverride(). 2617 */ 2618 ASSERT(!HDR_IN_HASH_TABLE(ab)); 2619 ab->b_arc_access = 0; 2620 bzero(&ab->b_dva, sizeof (dva_t)); 2621 ab->b_birth = 0; 2622 ab->b_cksum0 = 0; 2623 ab->b_buf->b_efunc = NULL; 2624 ab->b_buf->b_private = NULL; 2625 mutex_exit(hash_lock); 2626 } 2627 } 2628 2629 zio = zio_free(pio, spa, txg, bp, done, private); 2630 2631 if (arc_flags & ARC_WAIT) 2632 return (zio_wait(zio)); 2633 2634 ASSERT(arc_flags & ARC_NOWAIT); 2635 zio_nowait(zio); 2636 2637 return (0); 2638} 2639 2640void 2641arc_tempreserve_clear(uint64_t tempreserve) 2642{ 2643 atomic_add_64(&arc_tempreserve, -tempreserve); 2644 ASSERT((int64_t)arc_tempreserve >= 0); 2645} 2646 2647int 2648arc_tempreserve_space(uint64_t tempreserve) 2649{ 2650#ifdef ZFS_DEBUG 2651 /* 2652 * Once in a while, fail for no reason. Everything should cope. 2653 */ 2654 if (spa_get_random(10000) == 0) { 2655 dprintf("forcing random failure\n"); 2656 return (ERESTART); 2657 } 2658#endif 2659 if (tempreserve > arc_c/4 && !arc_no_grow) 2660 arc_c = MIN(arc_c_max, tempreserve * 4); 2661 if (tempreserve > arc_c) 2662 return (ENOMEM); 2663 2664 /* 2665 * Throttle writes when the amount of dirty data in the cache 2666 * gets too large. We try to keep the cache less than half full 2667 * of dirty blocks so that our sync times don't grow too large. 2668 * Note: if two requests come in concurrently, we might let them 2669 * both succeed, when one of them should fail. Not a huge deal. 2670 * 2671 * XXX The limit should be adjusted dynamically to keep the time 2672 * to sync a dataset fixed (around 1-5 seconds?). 
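 *
 * For example, with arc_c at 1GB the check below starts returning
 * ERESTART once the new reservation plus existing reservations plus
 * anonymous data would exceed 512MB (arc_c / 2) and existing
 * reservations plus anonymous data already exceed 256MB (arc_c / 4).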
2673 */ 2674 2675 if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 2676 arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { 2677 dprintf("failing, arc_tempreserve=%lluK anon=%lluK " 2678 "tempreserve=%lluK arc_c=%lluK\n", 2679 arc_tempreserve>>10, arc_anon->arcs_lsize>>10, 2680 tempreserve>>10, arc_c>>10); 2681 return (ERESTART); 2682 } 2683 atomic_add_64(&arc_tempreserve, tempreserve); 2684 return (0); 2685} 2686 2687#ifdef _KERNEL 2688static eventhandler_tag zfs_event_lowmem = NULL; 2689 2690static void 2691zfs_lowmem(void *arg __unused, int howto __unused) 2692{ 2693 2694 zfs_needfree = 1; 2695 cv_signal(&arc_reclaim_thr_cv); 2696 while (zfs_needfree) 2697 tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); 2698} 2699#endif 2700 2701void 2702arc_init(void) 2703{ 2704 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2705 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2706 2707 /* Convert seconds to clock ticks */ 2708 arc_min_prefetch_lifespan = 1 * hz; 2709 2710 /* Start out with 1/8 of all memory */ 2711 arc_c = physmem * PAGESIZE / 8; 2712#if 0 2713#ifdef _KERNEL 2714 /* 2715 * On architectures where the physical memory can be larger 2716 * than the addressable space (intel in 32-bit mode), we may 2717 * need to limit the cache to 1/8 of VM size. 2718 */ 2719 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2720#endif 2721#endif 2722 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 2723 arc_c_min = MAX(arc_c / 4, 64<<20); 2724 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 2725 if (arc_c * 8 >= 1<<30) 2726 arc_c_max = (arc_c * 8) - (1<<30); 2727 else 2728 arc_c_max = arc_c_min; 2729 arc_c_max = MAX(arc_c * 6, arc_c_max); 2730 /* 2731 * Allow the tunables to override our calculations if they are 2732 * reasonable (ie. 
over 64MB) 2733 */ 2734 if (zfs_arc_max > 64<<20 && zfs_arc_max < vm_kmem_size) 2735 arc_c_max = zfs_arc_max; 2736 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 2737 arc_c_min = zfs_arc_min; 2738 arc_c = arc_c_max; 2739 arc_p = (arc_c >> 1); 2740 2741 /* if kmem_flags are set, lets try to use less memory */ 2742 if (kmem_debugging()) 2743 arc_c = arc_c / 2; 2744 if (arc_c < arc_c_min) 2745 arc_c = arc_c_min; 2746 2747 zfs_arc_min = arc_c_min; 2748 zfs_arc_max = arc_c_max; 2749 2750 arc_anon = &ARC_anon; 2751 arc_mru = &ARC_mru; 2752 arc_mru_ghost = &ARC_mru_ghost; 2753 arc_mfu = &ARC_mfu; 2754 arc_mfu_ghost = &ARC_mfu_ghost; 2755 arc_size = 0; 2756 2757 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2758 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2759 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2760 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2761 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2762 2763 list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), 2764 offsetof(arc_buf_hdr_t, b_arc_node)); 2765 list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), 2766 offsetof(arc_buf_hdr_t, b_arc_node)); 2767 list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), 2768 offsetof(arc_buf_hdr_t, b_arc_node)); 2769 list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), 2770 offsetof(arc_buf_hdr_t, b_arc_node)); 2771 2772 buf_init(); 2773 2774 arc_thread_exit = 0; 2775 arc_eviction_list = NULL; 2776 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2777 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 2778 2779 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 2780 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 2781 2782 if (arc_ksp != NULL) { 2783 arc_ksp->ks_data = &arc_stats; 2784 kstat_install(arc_ksp); 2785 } 2786 2787 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2788 TS_RUN, minclsyspri); 2789 2790#ifdef _KERNEL 2791 zfs_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, zfs_lowmem, NULL, 2792 EVENTHANDLER_PRI_FIRST); 2793#endif 2794 2795 arc_dead = FALSE; 2796} 2797 2798void 2799arc_fini(void) 2800{ 2801 mutex_enter(&arc_reclaim_thr_lock); 2802 arc_thread_exit = 1; 2803 cv_signal(&arc_reclaim_thr_cv); 2804 while (arc_thread_exit != 0) 2805 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2806 mutex_exit(&arc_reclaim_thr_lock); 2807 2808 arc_flush(); 2809 2810 arc_dead = TRUE; 2811 2812 if (arc_ksp != NULL) { 2813 kstat_delete(arc_ksp); 2814 arc_ksp = NULL; 2815 } 2816 2817 mutex_destroy(&arc_eviction_mtx); 2818 mutex_destroy(&arc_reclaim_thr_lock); 2819 cv_destroy(&arc_reclaim_thr_cv); 2820 2821 list_destroy(&arc_mru->arcs_list); 2822 list_destroy(&arc_mru_ghost->arcs_list); 2823 list_destroy(&arc_mfu->arcs_list); 2824 list_destroy(&arc_mfu_ghost->arcs_list); 2825 2826 mutex_destroy(&arc_anon->arcs_mtx); 2827 mutex_destroy(&arc_mru->arcs_mtx); 2828 mutex_destroy(&arc_mru_ghost->arcs_mtx); 2829 mutex_destroy(&arc_mfu->arcs_mtx); 2830 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 2831 2832 buf_fini(); 2833 2834#ifdef _KERNEL 2835 if (zfs_event_lowmem != NULL) 2836 EVENTHANDLER_DEREGISTER(vm_lowmem, zfs_event_lowmem); 2837#endif 2838} 2839