arc.c revision 168481
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#pragma ident "%Z%%M% %I% %E% SMI" 27 28/* 29 * DVA-based Adjustable Replacement Cache 30 * 31 * While much of the theory of operation used here is 32 * based on the self-tuning, low overhead replacement cache 33 * presented by Megiddo and Modha at FAST 2003, there are some 34 * significant differences: 35 * 36 * 1. The Megiddo and Modha model assumes any page is evictable. 37 * Pages in its cache cannot be "locked" into memory. This makes 38 * the eviction algorithm simple: evict the last page in the list. 39 * This also makes the performance characteristics easy to reason 40 * about. Our cache is not so simple. At any given moment, some 41 * subset of the blocks in the cache are un-evictable because we 42 * have handed out a reference to them. Blocks are only evictable 43 * when there are no external references active. This makes 44 * eviction far more problematic: we choose to evict the evictable 45 * blocks that are the "lowest" in the list. 46 * 47 * There are times when it is not possible to evict the requested 48 * space. In these circumstances we are unable to adjust the cache 49 * size. To prevent the cache growing unbounded at these times we 50 * implement a "cache throttle" that slows the flow of new data 51 * into the cache until we can make space available. 52 * 53 * 2. The Megiddo and Modha model assumes a fixed cache size. 54 * Pages are evicted when the cache is full and there is a cache 55 * miss. Our model has a variable sized cache. It grows with 56 * high use, but also tries to react to memory pressure from the 57 * operating system: decreasing its size when system memory is 58 * tight. 59 * 60 * 3. The Megiddo and Modha model assumes a fixed page size. All 61 * elements of the cache are therefore exactly the same size. So 62 * when adjusting the cache size following a cache miss, it is simply 63 * a matter of choosing a single page to evict. In our model, we 64 * have variable sized cache blocks (ranging from 512 bytes to 65 * 128K bytes). We therefore choose a set of blocks to evict to make 66 * space for a cache miss that approximates as closely as possible 67 * the space used by the new block. 68 * 69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 70 * by N. Megiddo & D. Modha, FAST 2003 71 */ 72 73/* 74 * The locking model: 75 * 76 * A new reference to a cache buffer can be obtained in two 77 * ways: 1) via a hash table lookup using the DVA as a key, 78 * or 2) via one of the ARC lists.
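 *
 * For example (an illustrative sketch, not code from this file), a
 * method-1 lookup has this shape:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr;
 *
 *	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
 *	if (hdr != NULL) {
 *		... hdr's fields are protected by hash_lock here ...
 *		mutex_exit(hash_lock);
 *	}
 *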
The arc_read() interface 79 * uses method 1, while the internal arc algorithms for 80 * adjusting the cache use method 2. We therefore provide two 81 * types of locks: 1) the hash table lock array, and 2) the 82 * arc list locks. 83 * 84 * Buffers do not have their own mutexes, rather they rely on the 85 * hash table mutexes for the bulk of their protection (i.e. most 86 * fields in the arc_buf_hdr_t are protected by these mutexes). 87 * 88 * buf_hash_find() returns the appropriate mutex (held) when it 89 * locates the requested buffer in the hash table. It returns 90 * NULL for the mutex if the buffer was not in the table. 91 * 92 * buf_hash_remove() expects the appropriate hash mutex to be 93 * already held before it is invoked. 94 * 95 * Each arc state also has a mutex which is used to protect the 96 * buffer list associated with the state. When attempting to 97 * obtain a hash table lock while holding an arc list lock you 98 * must use mutex_tryenter() to avoid deadlock. Also note that 99 * the active state mutex must be held before the ghost state mutex. 100 * 101 * Arc buffers may have an associated eviction callback function. 102 * This function will be invoked prior to removing the buffer (e.g. 103 * in arc_do_user_evicts()). Note however that the data associated 104 * with the buffer may be evicted prior to the callback. The callback 105 * must be made with *no locks held* (to prevent deadlock). Additionally, 106 * the users of callbacks must ensure that their private data is 107 * protected from simultaneous callbacks from arc_buf_evict() 108 * and arc_do_user_evicts(). 109 * 110 * Note that the majority of the performance stats are manipulated 111 * with atomic operations. 112 */ 113 114#include <sys/spa.h> 115#include <sys/zio.h> 116#include <sys/zio_checksum.h> 117#include <sys/zfs_context.h> 118#include <sys/arc.h> 119#include <sys/refcount.h> 120#ifdef _KERNEL 121#include <sys/dnlc.h> 122#endif 123#include <sys/callb.h> 124#include <sys/kstat.h> 125#include <sys/sdt.h> 126 127#define ARC_FREE_AT_ONCE 4194304 128 129static kmutex_t arc_reclaim_thr_lock; 130static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 131static uint8_t arc_thread_exit; 132 133#define ARC_REDUCE_DNLC_PERCENT 3 134uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 135 136typedef enum arc_reclaim_strategy { 137 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 138 ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 139} arc_reclaim_strategy_t; 140 141/* number of seconds before growing cache again */ 142static int arc_grow_retry = 60; 143 144/* 145 * minimum lifespan of a prefetch block in clock ticks 146 * (initialized in arc_init()) 147 */ 148static int arc_min_prefetch_lifespan; 149 150static int arc_dead; 151 152/* 153 * These tunables are for performance analysis.
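 *
 * For example (illustrative values only), both limits can be set as
 * FreeBSD loader tunables, e.g. in /boot/loader.conf:
 *
 *	vfs.zfs.arc_max="536870912"
 *	vfs.zfs.arc_min="134217728"
 *
 * The current values are exported read-only via the vfs.zfs sysctl
 * tree declared below.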
154 */ 155 u_long zfs_arc_max; 156 u_long zfs_arc_min; 157 #ifdef _KERNEL 158 TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max); 159 TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min); 160 SYSCTL_DECL(_vfs_zfs); 161 SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RD, &zfs_arc_max, 0, 162 "Maximum ARC size"); 163 SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RD, &zfs_arc_min, 0, 164 "Minimum ARC size"); 165 #endif 166 167 /* 168 * Note that buffers can be in one of 5 states: 169 * ARC_anon - anonymous (discussed below) 170 * ARC_mru - recently used, currently cached 171 * ARC_mru_ghost - recently used, no longer in cache 172 * ARC_mfu - frequently used, currently cached 173 * ARC_mfu_ghost - frequently used, no longer in cache 174 * When there are no active references to a buffer, it is 175 * linked onto one of the lists in arc. These are the 176 * only buffers that can be evicted or deleted. 177 * 178 * Anonymous buffers are buffers that are not associated with 179 * a DVA. These are buffers that hold dirty block copies 180 * before they are written to stable storage. By definition, 181 * they are "ref'd" and are considered part of arc_mru 182 * that cannot be freed. Generally, they will acquire a DVA 183 * as they are written and migrate onto the arc_mru list. 184 */ 185 186 typedef struct arc_state { 187 list_t arcs_list; /* linked list of evictable buffers in state */ 188 uint64_t arcs_lsize; /* total size of buffers in the linked list */ 189 uint64_t arcs_size; /* total size of all buffers in this state */ 190 kmutex_t arcs_mtx; 191 } arc_state_t; 192 193 /* The 5 states: */ 194 static arc_state_t ARC_anon; 195 static arc_state_t ARC_mru; 196 static arc_state_t ARC_mru_ghost; 197 static arc_state_t ARC_mfu; 198 static arc_state_t ARC_mfu_ghost; 199 200 typedef struct arc_stats { 201 kstat_named_t arcstat_hits; 202 kstat_named_t arcstat_misses; 203 kstat_named_t arcstat_demand_data_hits; 204 kstat_named_t arcstat_demand_data_misses; 205 kstat_named_t arcstat_demand_metadata_hits; 206 kstat_named_t arcstat_demand_metadata_misses; 207 kstat_named_t arcstat_prefetch_data_hits; 208 kstat_named_t arcstat_prefetch_data_misses; 209 kstat_named_t arcstat_prefetch_metadata_hits; 210 kstat_named_t arcstat_prefetch_metadata_misses; 211 kstat_named_t arcstat_mru_hits; 212 kstat_named_t arcstat_mru_ghost_hits; 213 kstat_named_t arcstat_mfu_hits; 214 kstat_named_t arcstat_mfu_ghost_hits; 215 kstat_named_t arcstat_deleted; 216 kstat_named_t arcstat_recycle_miss; 217 kstat_named_t arcstat_mutex_miss; 218 kstat_named_t arcstat_evict_skip; 219 kstat_named_t arcstat_hash_elements; 220 kstat_named_t arcstat_hash_elements_max; 221 kstat_named_t arcstat_hash_collisions; 222 kstat_named_t arcstat_hash_chains; 223 kstat_named_t arcstat_hash_chain_max; 224 kstat_named_t arcstat_p; 225 kstat_named_t arcstat_c; 226 kstat_named_t arcstat_c_min; 227 kstat_named_t arcstat_c_max; 228 kstat_named_t arcstat_size; 229 } arc_stats_t; 230 231 static arc_stats_t arc_stats = { 232 { "hits", KSTAT_DATA_UINT64 }, 233 { "misses", KSTAT_DATA_UINT64 }, 234 { "demand_data_hits", KSTAT_DATA_UINT64 }, 235 { "demand_data_misses", KSTAT_DATA_UINT64 }, 236 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 237 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 238 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 239 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 240 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 241 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 242 { "mru_hits", KSTAT_DATA_UINT64 }, 243 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 244 {
"mfu_hits", KSTAT_DATA_UINT64 }, 245 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 246 { "deleted", KSTAT_DATA_UINT64 }, 247 { "recycle_miss", KSTAT_DATA_UINT64 }, 248 { "mutex_miss", KSTAT_DATA_UINT64 }, 249 { "evict_skip", KSTAT_DATA_UINT64 }, 250 { "hash_elements", KSTAT_DATA_UINT64 }, 251 { "hash_elements_max", KSTAT_DATA_UINT64 }, 252 { "hash_collisions", KSTAT_DATA_UINT64 }, 253 { "hash_chains", KSTAT_DATA_UINT64 }, 254 { "hash_chain_max", KSTAT_DATA_UINT64 }, 255 { "p", KSTAT_DATA_UINT64 }, 256 { "c", KSTAT_DATA_UINT64 }, 257 { "c_min", KSTAT_DATA_UINT64 }, 258 { "c_max", KSTAT_DATA_UINT64 }, 259 { "size", KSTAT_DATA_UINT64 } 260}; 261 262#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 263 264#define ARCSTAT_INCR(stat, val) \ 265 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 266 267#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 268#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 269 270#define ARCSTAT_MAX(stat, val) { \ 271 uint64_t m; \ 272 while ((val) > (m = arc_stats.stat.value.ui64) && \ 273 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 274 continue; \ 275} 276 277#define ARCSTAT_MAXSTAT(stat) \ 278 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 279 280/* 281 * We define a macro to allow ARC hits/misses to be easily broken down by 282 * two separate conditions, giving a total of four different subtypes for 283 * each of hits and misses (so eight statistics total). 284 */ 285#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 286 if (cond1) { \ 287 if (cond2) { \ 288 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 289 } else { \ 290 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 291 } \ 292 } else { \ 293 if (cond2) { \ 294 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 295 } else { \ 296 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 297 } \ 298 } 299 300kstat_t *arc_ksp; 301static arc_state_t *arc_anon; 302static arc_state_t *arc_mru; 303static arc_state_t *arc_mru_ghost; 304static arc_state_t *arc_mfu; 305static arc_state_t *arc_mfu_ghost; 306 307/* 308 * There are several ARC variables that are critical to export as kstats -- 309 * but we don't want to have to grovel around in the kstat whenever we wish to 310 * manipulate them. For these variables, we therefore define them to be in 311 * terms of the statistic variable. This assures that we are not introducing 312 * the possibility of inconsistency by having shadow copies of the variables, 313 * while still allowing the code to be readable. 
314 */ 315 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 316 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 317 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 318 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 319 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 320 321 static int arc_no_grow; /* Don't try to grow cache size */ 322 static uint64_t arc_tempreserve; 323 324 typedef struct arc_callback arc_callback_t; 325 326 struct arc_callback { 327 void *acb_private; 328 arc_done_func_t *acb_done; 329 arc_byteswap_func_t *acb_byteswap; 330 arc_buf_t *acb_buf; 331 zio_t *acb_zio_dummy; 332 arc_callback_t *acb_next; 333 }; 334 335 typedef struct arc_write_callback arc_write_callback_t; 336 337 struct arc_write_callback { 338 void *awcb_private; 339 arc_done_func_t *awcb_ready; 340 arc_done_func_t *awcb_done; 341 arc_buf_t *awcb_buf; 342 }; 343 344 struct arc_buf_hdr { 345 /* protected by hash lock */ 346 dva_t b_dva; 347 uint64_t b_birth; 348 uint64_t b_cksum0; 349 350 kmutex_t b_freeze_lock; 351 zio_cksum_t *b_freeze_cksum; 352 353 arc_buf_hdr_t *b_hash_next; 354 arc_buf_t *b_buf; 355 uint32_t b_flags; 356 uint32_t b_datacnt; 357 358 arc_callback_t *b_acb; 359 kcondvar_t b_cv; 360 361 /* immutable */ 362 arc_buf_contents_t b_type; 363 uint64_t b_size; 364 spa_t *b_spa; 365 366 /* protected by arc state mutex */ 367 arc_state_t *b_state; 368 list_node_t b_arc_node; 369 370 /* updated atomically */ 371 clock_t b_arc_access; 372 373 /* self protecting */ 374 refcount_t b_refcnt; 375 }; 376 377 static arc_buf_t *arc_eviction_list; 378 static kmutex_t arc_eviction_mtx; 379 static arc_buf_hdr_t arc_eviction_hdr; 380 static void arc_get_data_buf(arc_buf_t *buf); 381 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 382 383 #define GHOST_STATE(state) \ 384 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) 385 386 /* 387 * Private ARC flags. These flags are used only within the ARC and show up 388 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can 389 * be passed in as arc_flags in things like arc_read. However, these flags 390 * should never be passed and should only be set by ARC code. When adding new 391 * public flags, make sure not to smash the private ones.
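 *
 * For example (sketch only), a reader that finds a header with I/O
 * already in progress tests the private flag through its wrapper and
 * blocks on the header's condition variable, much as arc_read() does
 * further down:
 *
 *	if (HDR_IO_IN_PROGRESS(hdr)) {
 *		cv_wait(&hdr->b_cv, hash_lock);
 *		...
 *	}
 *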
392 */ 393 394#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 395#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 396#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 397#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 398#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 399#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 400 401#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 402#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 403#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 404#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 405#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 406 407/* 408 * Hash table routines 409 */ 410 411#define HT_LOCK_PAD 128 412 413struct ht_lock { 414 kmutex_t ht_lock; 415#ifdef _KERNEL 416 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 417#endif 418}; 419 420#define BUF_LOCKS 256 421typedef struct buf_hash_table { 422 uint64_t ht_mask; 423 arc_buf_hdr_t **ht_table; 424 struct ht_lock ht_locks[BUF_LOCKS]; 425} buf_hash_table_t; 426 427static buf_hash_table_t buf_hash_table; 428 429#define BUF_HASH_INDEX(spa, dva, birth) \ 430 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 431#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 432#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 433#define HDR_LOCK(buf) \ 434 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 435 436uint64_t zfs_crc64_table[256]; 437 438static uint64_t 439buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 440{ 441 uintptr_t spav = (uintptr_t)spa; 442 uint8_t *vdva = (uint8_t *)dva; 443 uint64_t crc = -1ULL; 444 int i; 445 446 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 447 448 for (i = 0; i < sizeof (dva_t); i++) 449 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 450 451 crc ^= (spav>>8) ^ birth; 452 453 return (crc); 454} 455 456#define BUF_EMPTY(buf) \ 457 ((buf)->b_dva.dva_word[0] == 0 && \ 458 (buf)->b_dva.dva_word[1] == 0 && \ 459 (buf)->b_birth == 0) 460 461#define BUF_EQUAL(spa, dva, birth, buf) \ 462 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 463 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 464 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 465 466static arc_buf_hdr_t * 467buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 468{ 469 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 470 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 471 arc_buf_hdr_t *buf; 472 473 mutex_enter(hash_lock); 474 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 475 buf = buf->b_hash_next) { 476 if (BUF_EQUAL(spa, dva, birth, buf)) { 477 *lockp = hash_lock; 478 return (buf); 479 } 480 } 481 mutex_exit(hash_lock); 482 *lockp = NULL; 483 return (NULL); 484} 485 486/* 487 * Insert an entry into the hash table. If there is already an element 488 * equal to elem in the hash table, then the already existing element 489 * will be returned and the new element will not be inserted. 490 * Otherwise returns NULL. 
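 *
 * A hypothetical caller sketch (for illustration only):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists;
 *
 *	exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {
 *		... an equal header was already present; use it instead ...
 *	} else {
 *		... hdr is now in the table ...
 *	}
 *	mutex_exit(hash_lock);	... the lock is held on return in either case ...
 *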
491 */ 492static arc_buf_hdr_t * 493buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 494{ 495 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 496 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 497 arc_buf_hdr_t *fbuf; 498 uint32_t i; 499 500 ASSERT(!HDR_IN_HASH_TABLE(buf)); 501 *lockp = hash_lock; 502 mutex_enter(hash_lock); 503 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 504 fbuf = fbuf->b_hash_next, i++) { 505 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 506 return (fbuf); 507 } 508 509 buf->b_hash_next = buf_hash_table.ht_table[idx]; 510 buf_hash_table.ht_table[idx] = buf; 511 buf->b_flags |= ARC_IN_HASH_TABLE; 512 513 /* collect some hash table performance data */ 514 if (i > 0) { 515 ARCSTAT_BUMP(arcstat_hash_collisions); 516 if (i == 1) 517 ARCSTAT_BUMP(arcstat_hash_chains); 518 519 ARCSTAT_MAX(arcstat_hash_chain_max, i); 520 } 521 522 ARCSTAT_BUMP(arcstat_hash_elements); 523 ARCSTAT_MAXSTAT(arcstat_hash_elements); 524 525 return (NULL); 526} 527 528static void 529buf_hash_remove(arc_buf_hdr_t *buf) 530{ 531 arc_buf_hdr_t *fbuf, **bufp; 532 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 533 534 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 535 ASSERT(HDR_IN_HASH_TABLE(buf)); 536 537 bufp = &buf_hash_table.ht_table[idx]; 538 while ((fbuf = *bufp) != buf) { 539 ASSERT(fbuf != NULL); 540 bufp = &fbuf->b_hash_next; 541 } 542 *bufp = buf->b_hash_next; 543 buf->b_hash_next = NULL; 544 buf->b_flags &= ~ARC_IN_HASH_TABLE; 545 546 /* collect some hash table performance data */ 547 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 548 549 if (buf_hash_table.ht_table[idx] && 550 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 551 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 552} 553 554/* 555 * Global data structures and functions for the buf kmem cache. 556 */ 557static kmem_cache_t *hdr_cache; 558static kmem_cache_t *buf_cache; 559 560static void 561buf_fini(void) 562{ 563 int i; 564 565 kmem_free(buf_hash_table.ht_table, 566 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 567 for (i = 0; i < BUF_LOCKS; i++) 568 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 569 kmem_cache_destroy(hdr_cache); 570 kmem_cache_destroy(buf_cache); 571} 572 573/* 574 * Constructor callback - called when the cache is empty 575 * and a new buf is requested. 576 */ 577/* ARGSUSED */ 578static int 579hdr_cons(void *vbuf, void *unused, int kmflag) 580{ 581 arc_buf_hdr_t *buf = vbuf; 582 583 bzero(buf, sizeof (arc_buf_hdr_t)); 584 refcount_create(&buf->b_refcnt); 585 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 586 return (0); 587} 588 589/* 590 * Destructor callback - called when a cached buf is 591 * no longer required. 592 */ 593/* ARGSUSED */ 594static void 595hdr_dest(void *vbuf, void *unused) 596{ 597 arc_buf_hdr_t *buf = vbuf; 598 599 refcount_destroy(&buf->b_refcnt); 600 cv_destroy(&buf->b_cv); 601} 602 603/* 604 * Reclaim callback -- invoked when memory is low. 605 */ 606/* ARGSUSED */ 607static void 608hdr_recl(void *unused) 609{ 610 dprintf("hdr_recl called\n"); 611 /* 612 * umem calls the reclaim func when we destroy the buf cache, 613 * which is after we do arc_fini(). 614 */ 615 if (!arc_dead) 616 cv_signal(&arc_reclaim_thr_cv); 617} 618 619static void 620buf_init(void) 621{ 622 uint64_t *ct; 623 uint64_t hsize = 1ULL << 12; 624 int i, j; 625 626 /* 627 * The hash table is big enough to fill all of physical memory 628 * with an average 64K block size. The table will take up 629 * totalmem*sizeof(void*)/64K (eg. 
128KB/GB with 8-byte pointers). 630 */ 631 while (hsize * 65536 < physmem * PAGESIZE) 632 hsize <<= 1; 633retry: 634 buf_hash_table.ht_mask = hsize - 1; 635 buf_hash_table.ht_table = 636 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 637 if (buf_hash_table.ht_table == NULL) { 638 ASSERT(hsize > (1ULL << 8)); 639 hsize >>= 1; 640 goto retry; 641 } 642 643 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 644 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 645 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 646 0, NULL, NULL, NULL, NULL, NULL, 0); 647 648 for (i = 0; i < 256; i++) 649 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 650 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 651 652 for (i = 0; i < BUF_LOCKS; i++) { 653 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 654 NULL, MUTEX_DEFAULT, NULL); 655 } 656} 657 658#define ARC_MINTIME (hz>>4) /* 62 ms */ 659 660static void 661arc_cksum_verify(arc_buf_t *buf) 662{ 663 zio_cksum_t zc; 664 665 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 666 return; 667 668 mutex_enter(&buf->b_hdr->b_freeze_lock); 669 if (buf->b_hdr->b_freeze_cksum == NULL || 670 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 671 mutex_exit(&buf->b_hdr->b_freeze_lock); 672 return; 673 } 674 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 675 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 676 panic("buffer modified while frozen!"); 677 mutex_exit(&buf->b_hdr->b_freeze_lock); 678} 679 680static void 681arc_cksum_compute(arc_buf_t *buf) 682{ 683 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 684 return; 685 686 mutex_enter(&buf->b_hdr->b_freeze_lock); 687 if (buf->b_hdr->b_freeze_cksum != NULL) { 688 mutex_exit(&buf->b_hdr->b_freeze_lock); 689 return; 690 } 691 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 692 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 693 buf->b_hdr->b_freeze_cksum); 694 mutex_exit(&buf->b_hdr->b_freeze_lock); 695} 696 697void 698arc_buf_thaw(arc_buf_t *buf) 699{ 700 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 701 return; 702 703 if (buf->b_hdr->b_state != arc_anon) 704 panic("modifying non-anon buffer!"); 705 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 706 panic("modifying buffer while i/o in progress!"); 707 arc_cksum_verify(buf); 708 mutex_enter(&buf->b_hdr->b_freeze_lock); 709 if (buf->b_hdr->b_freeze_cksum != NULL) { 710 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 711 buf->b_hdr->b_freeze_cksum = NULL; 712 } 713 mutex_exit(&buf->b_hdr->b_freeze_lock); 714} 715 716void 717arc_buf_freeze(arc_buf_t *buf) 718{ 719 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 720 return; 721 722 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 723 buf->b_hdr->b_state == arc_anon); 724 arc_cksum_compute(buf); 725} 726 727static void 728add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 729{ 730 ASSERT(MUTEX_HELD(hash_lock)); 731 732 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 733 (ab->b_state != arc_anon)) { 734 uint64_t delta = ab->b_size * ab->b_datacnt; 735 736 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 737 mutex_enter(&ab->b_state->arcs_mtx); 738 ASSERT(list_link_active(&ab->b_arc_node)); 739 list_remove(&ab->b_state->arcs_list, ab); 740 if (GHOST_STATE(ab->b_state)) { 741 ASSERT3U(ab->b_datacnt, ==, 0); 742 ASSERT3P(ab->b_buf, ==, NULL); 743 delta = ab->b_size; 744 } 745 ASSERT(delta > 0); 746 ASSERT3U(ab->b_state->arcs_lsize, >=, delta); 747 atomic_add_64(&ab->b_state->arcs_lsize, -delta); 748 mutex_exit(&ab->b_state->arcs_mtx); 749 /* remove the prefetch flag is we 
get a reference */ 750 if (ab->b_flags & ARC_PREFETCH) 751 ab->b_flags &= ~ARC_PREFETCH; 752 } 753} 754 755static int 756remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 757{ 758 int cnt; 759 arc_state_t *state = ab->b_state; 760 761 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 762 ASSERT(!GHOST_STATE(state)); 763 764 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 765 (state != arc_anon)) { 766 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 767 mutex_enter(&state->arcs_mtx); 768 ASSERT(!list_link_active(&ab->b_arc_node)); 769 list_insert_head(&state->arcs_list, ab); 770 ASSERT(ab->b_datacnt > 0); 771 atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); 772 ASSERT3U(state->arcs_size, >=, state->arcs_lsize); 773 mutex_exit(&state->arcs_mtx); 774 } 775 return (cnt); 776} 777 778/* 779 * Move the supplied buffer to the indicated state. The mutex 780 * for the buffer must be held by the caller. 781 */ 782static void 783arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 784{ 785 arc_state_t *old_state = ab->b_state; 786 int64_t refcnt = refcount_count(&ab->b_refcnt); 787 uint64_t from_delta, to_delta; 788 789 ASSERT(MUTEX_HELD(hash_lock)); 790 ASSERT(new_state != old_state); 791 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 792 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 793 794 from_delta = to_delta = ab->b_datacnt * ab->b_size; 795 796 /* 797 * If this buffer is evictable, transfer it from the 798 * old state list to the new state list. 799 */ 800 if (refcnt == 0) { 801 if (old_state != arc_anon) { 802 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 803 804 if (use_mutex) 805 mutex_enter(&old_state->arcs_mtx); 806 807 ASSERT(list_link_active(&ab->b_arc_node)); 808 list_remove(&old_state->arcs_list, ab); 809 810 /* 811 * If prefetching out of the ghost cache, 812 * we will have a non-null datacnt. 
813 */ 814 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 815 /* ghost elements have a ghost size */ 816 ASSERT(ab->b_buf == NULL); 817 from_delta = ab->b_size; 818 } 819 ASSERT3U(old_state->arcs_lsize, >=, from_delta); 820 atomic_add_64(&old_state->arcs_lsize, -from_delta); 821 822 if (use_mutex) 823 mutex_exit(&old_state->arcs_mtx); 824 } 825 if (new_state != arc_anon) { 826 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 827 828 if (use_mutex) 829 mutex_enter(&new_state->arcs_mtx); 830 831 list_insert_head(&new_state->arcs_list, ab); 832 833 /* ghost elements have a ghost size */ 834 if (GHOST_STATE(new_state)) { 835 ASSERT(ab->b_datacnt == 0); 836 ASSERT(ab->b_buf == NULL); 837 to_delta = ab->b_size; 838 } 839 atomic_add_64(&new_state->arcs_lsize, to_delta); 840 ASSERT3U(new_state->arcs_size + to_delta, >=, 841 new_state->arcs_lsize); 842 843 if (use_mutex) 844 mutex_exit(&new_state->arcs_mtx); 845 } 846 } 847 848 ASSERT(!BUF_EMPTY(ab)); 849 if (new_state == arc_anon && old_state != arc_anon) { 850 buf_hash_remove(ab); 851 } 852 853 /* adjust state sizes */ 854 if (to_delta) 855 atomic_add_64(&new_state->arcs_size, to_delta); 856 if (from_delta) { 857 ASSERT3U(old_state->arcs_size, >=, from_delta); 858 atomic_add_64(&old_state->arcs_size, -from_delta); 859 } 860 ab->b_state = new_state; 861} 862 863arc_buf_t * 864arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 865{ 866 arc_buf_hdr_t *hdr; 867 arc_buf_t *buf; 868 869 ASSERT3U(size, >, 0); 870 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 871 ASSERT(BUF_EMPTY(hdr)); 872 hdr->b_size = size; 873 hdr->b_type = type; 874 hdr->b_spa = spa; 875 hdr->b_state = arc_anon; 876 hdr->b_arc_access = 0; 877 mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 878 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 879 buf->b_hdr = hdr; 880 buf->b_data = NULL; 881 buf->b_efunc = NULL; 882 buf->b_private = NULL; 883 buf->b_next = NULL; 884 hdr->b_buf = buf; 885 arc_get_data_buf(buf); 886 hdr->b_datacnt = 1; 887 hdr->b_flags = 0; 888 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 889 (void) refcount_add(&hdr->b_refcnt, tag); 890 891 return (buf); 892} 893 894static arc_buf_t * 895arc_buf_clone(arc_buf_t *from) 896{ 897 arc_buf_t *buf; 898 arc_buf_hdr_t *hdr = from->b_hdr; 899 uint64_t size = hdr->b_size; 900 901 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 902 buf->b_hdr = hdr; 903 buf->b_data = NULL; 904 buf->b_efunc = NULL; 905 buf->b_private = NULL; 906 buf->b_next = hdr->b_buf; 907 hdr->b_buf = buf; 908 arc_get_data_buf(buf); 909 bcopy(from->b_data, buf->b_data, size); 910 hdr->b_datacnt += 1; 911 return (buf); 912} 913 914void 915arc_buf_add_ref(arc_buf_t *buf, void* tag) 916{ 917 arc_buf_hdr_t *hdr; 918 kmutex_t *hash_lock; 919 920 /* 921 * Check to see if this buffer is currently being evicted via 922 * arc_do_user_evicts(). 923 */ 924 mutex_enter(&arc_eviction_mtx); 925 hdr = buf->b_hdr; 926 if (hdr == NULL) { 927 mutex_exit(&arc_eviction_mtx); 928 return; 929 } 930 hash_lock = HDR_LOCK(hdr); 931 mutex_exit(&arc_eviction_mtx); 932 933 mutex_enter(hash_lock); 934 if (buf->b_data == NULL) { 935 /* 936 * This buffer is evicted. 
937 */ 938 mutex_exit(hash_lock); 939 return; 940 } 941 942 ASSERT(buf->b_hdr == hdr); 943 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 944 add_reference(hdr, hash_lock, tag); 945 arc_access(hdr, hash_lock); 946 mutex_exit(hash_lock); 947 ARCSTAT_BUMP(arcstat_hits); 948 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 949 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 950 data, metadata, hits); 951} 952 953static void 954arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 955{ 956 arc_buf_t **bufp; 957 958 /* free up data associated with the buf */ 959 if (buf->b_data) { 960 arc_state_t *state = buf->b_hdr->b_state; 961 uint64_t size = buf->b_hdr->b_size; 962 arc_buf_contents_t type = buf->b_hdr->b_type; 963 964 arc_cksum_verify(buf); 965 if (!recycle) { 966 if (type == ARC_BUFC_METADATA) { 967 zio_buf_free(buf->b_data, size); 968 } else { 969 ASSERT(type == ARC_BUFC_DATA); 970 zio_data_buf_free(buf->b_data, size); 971 } 972 atomic_add_64(&arc_size, -size); 973 } 974 if (list_link_active(&buf->b_hdr->b_arc_node)) { 975 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 976 ASSERT(state != arc_anon); 977 ASSERT3U(state->arcs_lsize, >=, size); 978 atomic_add_64(&state->arcs_lsize, -size); 979 } 980 ASSERT3U(state->arcs_size, >=, size); 981 atomic_add_64(&state->arcs_size, -size); 982 buf->b_data = NULL; 983 ASSERT(buf->b_hdr->b_datacnt > 0); 984 buf->b_hdr->b_datacnt -= 1; 985 } 986 987 /* only remove the buf if requested */ 988 if (!all) 989 return; 990 991 /* remove the buf from the hdr list */ 992 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 993 continue; 994 *bufp = buf->b_next; 995 996 ASSERT(buf->b_efunc == NULL); 997 998 /* clean up the buf */ 999 buf->b_hdr = NULL; 1000 kmem_cache_free(buf_cache, buf); 1001} 1002 1003static void 1004arc_hdr_destroy(arc_buf_hdr_t *hdr) 1005{ 1006 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1007 ASSERT3P(hdr->b_state, ==, arc_anon); 1008 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1009 1010 if (!BUF_EMPTY(hdr)) { 1011 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1012 bzero(&hdr->b_dva, sizeof (dva_t)); 1013 hdr->b_birth = 0; 1014 hdr->b_cksum0 = 0; 1015 } 1016 while (hdr->b_buf) { 1017 arc_buf_t *buf = hdr->b_buf; 1018 1019 if (buf->b_efunc) { 1020 mutex_enter(&arc_eviction_mtx); 1021 ASSERT(buf->b_hdr != NULL); 1022 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1023 hdr->b_buf = buf->b_next; 1024 buf->b_hdr = &arc_eviction_hdr; 1025 buf->b_next = arc_eviction_list; 1026 arc_eviction_list = buf; 1027 mutex_exit(&arc_eviction_mtx); 1028 } else { 1029 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1030 } 1031 } 1032 if (hdr->b_freeze_cksum != NULL) { 1033 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1034 hdr->b_freeze_cksum = NULL; 1035 } 1036 mutex_destroy(&hdr->b_freeze_lock); 1037 1038 ASSERT(!list_link_active(&hdr->b_arc_node)); 1039 ASSERT3P(hdr->b_hash_next, ==, NULL); 1040 ASSERT3P(hdr->b_acb, ==, NULL); 1041 kmem_cache_free(hdr_cache, hdr); 1042} 1043 1044void 1045arc_buf_free(arc_buf_t *buf, void *tag) 1046{ 1047 arc_buf_hdr_t *hdr = buf->b_hdr; 1048 int hashed = hdr->b_state != arc_anon; 1049 1050 ASSERT(buf->b_efunc == NULL); 1051 ASSERT(buf->b_data != NULL); 1052 1053 if (hashed) { 1054 kmutex_t *hash_lock = HDR_LOCK(hdr); 1055 1056 mutex_enter(hash_lock); 1057 (void) remove_reference(hdr, hash_lock, tag); 1058 if (hdr->b_datacnt > 1) 1059 arc_buf_destroy(buf, FALSE, TRUE); 1060 else 1061 hdr->b_flags |= ARC_BUF_AVAILABLE; 1062 mutex_exit(hash_lock); 1063 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1064 int 
destroy_hdr; 1065 /* 1066 * We are in the middle of an async write. Don't destroy 1067 * this buffer unless the write completes before we finish 1068 * decrementing the reference count. 1069 */ 1070 mutex_enter(&arc_eviction_mtx); 1071 (void) remove_reference(hdr, NULL, tag); 1072 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1073 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1074 mutex_exit(&arc_eviction_mtx); 1075 if (destroy_hdr) 1076 arc_hdr_destroy(hdr); 1077 } else { 1078 if (remove_reference(hdr, NULL, tag) > 0) { 1079 ASSERT(HDR_IO_ERROR(hdr)); 1080 arc_buf_destroy(buf, FALSE, TRUE); 1081 } else { 1082 arc_hdr_destroy(hdr); 1083 } 1084 } 1085} 1086 1087int 1088arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1089{ 1090 arc_buf_hdr_t *hdr = buf->b_hdr; 1091 kmutex_t *hash_lock = HDR_LOCK(hdr); 1092 int no_callback = (buf->b_efunc == NULL); 1093 1094 if (hdr->b_state == arc_anon) { 1095 arc_buf_free(buf, tag); 1096 return (no_callback); 1097 } 1098 1099 mutex_enter(hash_lock); 1100 ASSERT(hdr->b_state != arc_anon); 1101 ASSERT(buf->b_data != NULL); 1102 1103 (void) remove_reference(hdr, hash_lock, tag); 1104 if (hdr->b_datacnt > 1) { 1105 if (no_callback) 1106 arc_buf_destroy(buf, FALSE, TRUE); 1107 } else if (no_callback) { 1108 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1109 hdr->b_flags |= ARC_BUF_AVAILABLE; 1110 } 1111 ASSERT(no_callback || hdr->b_datacnt > 1 || 1112 refcount_is_zero(&hdr->b_refcnt)); 1113 mutex_exit(hash_lock); 1114 return (no_callback); 1115} 1116 1117int 1118arc_buf_size(arc_buf_t *buf) 1119{ 1120 return (buf->b_hdr->b_size); 1121} 1122 1123/* 1124 * Evict buffers from list until we've removed the specified number of 1125 * bytes. Move the removed buffers to the appropriate evict state. 1126 * If the recycle flag is set, then attempt to "recycle" a buffer: 1127 * - look for a buffer to evict that is `bytes' long. 1128 * - return the data block from this buffer rather than freeing it. 1129 * This flag is used by callers that are trying to make space for a 1130 * new buffer in a full arc cache. 1131 */ 1132static void * 1133arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, 1134 arc_buf_contents_t type) 1135{ 1136 arc_state_t *evicted_state; 1137 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1138 arc_buf_hdr_t *ab, *ab_prev = NULL; 1139 kmutex_t *hash_lock; 1140 boolean_t have_lock; 1141 void *stolen = NULL; 1142 1143 ASSERT(state == arc_mru || state == arc_mfu); 1144 1145 evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 1146 1147 mutex_enter(&state->arcs_mtx); 1148 mutex_enter(&evicted_state->arcs_mtx); 1149 1150 for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { 1151 ab_prev = list_prev(&state->arcs_list, ab); 1152 /* prefetch buffers have a minimum lifespan */ 1153 if (HDR_IO_IN_PROGRESS(ab) || 1154 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1155 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1156 skipped++; 1157 continue; 1158 } 1159 /* "lookahead" for better eviction candidate */ 1160 if (recycle && ab->b_size != bytes && 1161 ab_prev && ab_prev->b_size == bytes) 1162 continue; 1163 hash_lock = HDR_LOCK(ab); 1164 have_lock = MUTEX_HELD(hash_lock); 1165 if (have_lock || mutex_tryenter(hash_lock)) { 1166 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1167 ASSERT(ab->b_datacnt > 0); 1168 while (ab->b_buf) { 1169 arc_buf_t *buf = ab->b_buf; 1170 if (buf->b_data) { 1171 bytes_evicted += ab->b_size; 1172 if (recycle && ab->b_type == type && 1173 ab->b_size == bytes) { 1174 stolen = buf->b_data; 1175 recycle = FALSE; 1176 } 1177 } 1178 if (buf->b_efunc) { 1179 mutex_enter(&arc_eviction_mtx); 1180 arc_buf_destroy(buf, 1181 buf->b_data == stolen, FALSE); 1182 ab->b_buf = buf->b_next; 1183 buf->b_hdr = &arc_eviction_hdr; 1184 buf->b_next = arc_eviction_list; 1185 arc_eviction_list = buf; 1186 mutex_exit(&arc_eviction_mtx); 1187 } else { 1188 arc_buf_destroy(buf, 1189 buf->b_data == stolen, TRUE); 1190 } 1191 } 1192 ASSERT(ab->b_datacnt == 0); 1193 arc_change_state(evicted_state, ab, hash_lock); 1194 ASSERT(HDR_IN_HASH_TABLE(ab)); 1195 ab->b_flags = ARC_IN_HASH_TABLE; 1196 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1197 if (!have_lock) 1198 mutex_exit(hash_lock); 1199 if (bytes >= 0 && bytes_evicted >= bytes) 1200 break; 1201 } else { 1202 missed += 1; 1203 } 1204 } 1205 1206 mutex_exit(&evicted_state->arcs_mtx); 1207 mutex_exit(&state->arcs_mtx); 1208 1209 if (bytes_evicted < bytes) 1210 dprintf("only evicted %lld bytes from %x", 1211 (longlong_t)bytes_evicted, state); 1212 1213 if (skipped) 1214 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1215 1216 if (missed) 1217 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1218 1219 return (stolen); 1220} 1221 1222/* 1223 * Remove buffers from list until we've removed the specified number of 1224 * bytes. Destroy the buffers that are removed. 
1225 */ 1226static void 1227arc_evict_ghost(arc_state_t *state, int64_t bytes) 1228{ 1229 arc_buf_hdr_t *ab, *ab_prev; 1230 kmutex_t *hash_lock; 1231 uint64_t bytes_deleted = 0; 1232 uint64_t bufs_skipped = 0; 1233 1234 ASSERT(GHOST_STATE(state)); 1235top: 1236 mutex_enter(&state->arcs_mtx); 1237 for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { 1238 ab_prev = list_prev(&state->arcs_list, ab); 1239 hash_lock = HDR_LOCK(ab); 1240 if (mutex_tryenter(hash_lock)) { 1241 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1242 ASSERT(ab->b_buf == NULL); 1243 arc_change_state(arc_anon, ab, hash_lock); 1244 mutex_exit(hash_lock); 1245 ARCSTAT_BUMP(arcstat_deleted); 1246 bytes_deleted += ab->b_size; 1247 arc_hdr_destroy(ab); 1248 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1249 if (bytes >= 0 && bytes_deleted >= bytes) 1250 break; 1251 } else { 1252 if (bytes < 0) { 1253 mutex_exit(&state->arcs_mtx); 1254 mutex_enter(hash_lock); 1255 mutex_exit(hash_lock); 1256 goto top; 1257 } 1258 bufs_skipped += 1; 1259 } 1260 } 1261 mutex_exit(&state->arcs_mtx); 1262 1263 if (bufs_skipped) { 1264 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1265 ASSERT(bytes >= 0); 1266 } 1267 1268 if (bytes_deleted < bytes) 1269 dprintf("only deleted %lld bytes from %p", 1270 (longlong_t)bytes_deleted, state); 1271} 1272 1273static void 1274arc_adjust(void) 1275{ 1276 int64_t top_sz, mru_over, arc_over, todelete; 1277 1278 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1279 1280 if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { 1281 int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); 1282 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); 1283 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1284 } 1285 1286 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; 1287 1288 if (mru_over > 0) { 1289 if (arc_mru_ghost->arcs_lsize > 0) { 1290 todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); 1291 arc_evict_ghost(arc_mru_ghost, todelete); 1292 } 1293 } 1294 1295 if ((arc_over = arc_size - arc_c) > 0) { 1296 int64_t tbl_over; 1297 1298 if (arc_mfu->arcs_lsize > 0) { 1299 int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); 1300 (void) arc_evict(arc_mfu, toevict, FALSE, 1301 ARC_BUFC_UNDEF); 1302 } 1303 1304 tbl_over = arc_size + arc_mru_ghost->arcs_lsize + 1305 arc_mfu_ghost->arcs_lsize - arc_c*2; 1306 1307 if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { 1308 todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); 1309 arc_evict_ghost(arc_mfu_ghost, todelete); 1310 } 1311 } 1312} 1313 1314static void 1315arc_do_user_evicts(void) 1316{ 1317 mutex_enter(&arc_eviction_mtx); 1318 while (arc_eviction_list != NULL) { 1319 arc_buf_t *buf = arc_eviction_list; 1320 arc_eviction_list = buf->b_next; 1321 buf->b_hdr = NULL; 1322 mutex_exit(&arc_eviction_mtx); 1323 1324 if (buf->b_efunc != NULL) 1325 VERIFY(buf->b_efunc(buf) == 0); 1326 1327 buf->b_efunc = NULL; 1328 buf->b_private = NULL; 1329 kmem_cache_free(buf_cache, buf); 1330 mutex_enter(&arc_eviction_mtx); 1331 } 1332 mutex_exit(&arc_eviction_mtx); 1333} 1334 1335/* 1336 * Flush all *evictable* data from the cache. 1337 * NOTE: this will not touch "active" (i.e. referenced) data. 
1338 */ 1339void 1340arc_flush(void) 1341{ 1342 while (list_head(&arc_mru->arcs_list)) 1343 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); 1344 while (list_head(&arc_mfu->arcs_list)) 1345 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); 1346 1347 arc_evict_ghost(arc_mru_ghost, -1); 1348 arc_evict_ghost(arc_mfu_ghost, -1); 1349 1350 mutex_enter(&arc_reclaim_thr_lock); 1351 arc_do_user_evicts(); 1352 mutex_exit(&arc_reclaim_thr_lock); 1353 ASSERT(arc_eviction_list == NULL); 1354} 1355 1356int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1357 1358void 1359arc_shrink(void) 1360{ 1361 if (arc_c > arc_c_min) { 1362 uint64_t to_free; 1363 1364#ifdef _KERNEL 1365 to_free = arc_c >> arc_shrink_shift; 1366#else 1367 to_free = arc_c >> arc_shrink_shift; 1368#endif 1369 if (arc_c > arc_c_min + to_free) 1370 atomic_add_64(&arc_c, -to_free); 1371 else 1372 arc_c = arc_c_min; 1373 1374 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1375 if (arc_c > arc_size) 1376 arc_c = MAX(arc_size, arc_c_min); 1377 if (arc_p > arc_c) 1378 arc_p = (arc_c >> 1); 1379 ASSERT(arc_c >= arc_c_min); 1380 ASSERT((int64_t)arc_p >= 0); 1381 } 1382 1383 if (arc_size > arc_c) 1384 arc_adjust(); 1385} 1386 1387static int zfs_needfree = 0; 1388 1389static int 1390arc_reclaim_needed(void) 1391{ 1392#if 0 1393 uint64_t extra; 1394#endif 1395 1396#ifdef _KERNEL 1397 1398 if (zfs_needfree) 1399 return (1); 1400 1401#if 0 1402 /* 1403 * check to make sure that swapfs has enough space so that anon 1404 * reservations can still succeeed. anon_resvmem() checks that the 1405 * availrmem is greater than swapfs_minfree, and the number of reserved 1406 * swap pages. We also add a bit of extra here just to prevent 1407 * circumstances from getting really dire. 1408 */ 1409 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1410 return (1); 1411 1412 /* 1413 * If zio data pages are being allocated out of a separate heap segment, 1414 * then check that the size of available vmem for this area remains 1415 * above 1/4th free. This needs to be done when the size of the 1416 * non-default segment is smaller than physical memory, so we could 1417 * conceivably run out of VA in that segment before running out of 1418 * physical memory. 1419 */ 1420 if (zio_arena != NULL) { 1421 size_t arc_ziosize = 1422 btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); 1423 1424 if ((physmem > arc_ziosize) && 1425 (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) 1426 return (1); 1427 } 1428 1429#if defined(__i386) 1430 /* 1431 * If we're on an i386 platform, it's possible that we'll exhaust the 1432 * kernel heap space before we ever run out of available physical 1433 * memory. Most checks of the size of the heap_area compare against 1434 * tune.t_minarmem, which is the minimum available real memory that we 1435 * can have in the system. However, this is generally fixed at 25 pages 1436 * which is so low that it's useless. In this comparison, we seek to 1437 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1438 * heap is allocated. 
(Or, in the calculation, if less than 1/4th is 1439 * free) 1440 */ 1441 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1442 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1443 return (1); 1444 #endif 1445 #else 1446 if (kmem_map->size > (vm_kmem_size * 3) / 4) 1447 return (1); 1448 #endif 1449 1450 #else 1451 if (spa_get_random(100) == 0) 1452 return (1); 1453 #endif 1454 return (0); 1455 } 1456 1457 static void 1458 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1459 { 1460 #ifdef ZIO_USE_UMA 1461 size_t i; 1462 kmem_cache_t *prev_cache = NULL; 1463 kmem_cache_t *prev_data_cache = NULL; 1464 extern kmem_cache_t *zio_buf_cache[]; 1465 extern kmem_cache_t *zio_data_buf_cache[]; 1466 #endif 1467 1468 #ifdef _KERNEL 1469 /* 1470 * First purge some DNLC entries, in case the DNLC is using 1471 * up too much memory. 1472 */ 1473 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1474 1475 #if defined(__i386) 1476 /* 1477 * Reclaim unused memory from all kmem caches. 1478 */ 1479 kmem_reap(); 1480 #endif 1481 #endif 1482 1483 /* 1484 * An aggressive reclamation will shrink the cache size as well as 1485 * reap free buffers from the arc kmem caches. 1486 */ 1487 if (strat == ARC_RECLAIM_AGGR) 1488 arc_shrink(); 1489 1490 #ifdef ZIO_USE_UMA 1491 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1492 if (zio_buf_cache[i] != prev_cache) { 1493 prev_cache = zio_buf_cache[i]; 1494 kmem_cache_reap_now(zio_buf_cache[i]); 1495 } 1496 if (zio_data_buf_cache[i] != prev_data_cache) { 1497 prev_data_cache = zio_data_buf_cache[i]; 1498 kmem_cache_reap_now(zio_data_buf_cache[i]); 1499 } 1500 } 1501 #endif 1502 kmem_cache_reap_now(buf_cache); 1503 kmem_cache_reap_now(hdr_cache); 1504 } 1505 1506 static void 1507 arc_reclaim_thread(void *dummy __unused) 1508 { 1509 clock_t growtime = 0; 1510 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1511 callb_cpr_t cpr; 1512 1513 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1514 1515 mutex_enter(&arc_reclaim_thr_lock); 1516 while (arc_thread_exit == 0) { 1517 if (arc_reclaim_needed()) { 1518 1519 if (arc_no_grow) { 1520 if (last_reclaim == ARC_RECLAIM_CONS) { 1521 last_reclaim = ARC_RECLAIM_AGGR; 1522 } else { 1523 last_reclaim = ARC_RECLAIM_CONS; 1524 } 1525 } else { 1526 arc_no_grow = TRUE; 1527 last_reclaim = ARC_RECLAIM_AGGR; 1528 membar_producer(); 1529 } 1530 1531 /* reset the growth delay for every reclaim */ 1532 growtime = lbolt + (arc_grow_retry * hz); 1533 ASSERT(growtime > 0); 1534 1535 if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { 1536 /* 1537 * If zfs_needfree is TRUE our vm_lowmem hook 1538 * was called and in that case we must free some 1539 * memory, so switch to aggressive mode.
1540 */ 1541 arc_no_grow = TRUE; 1542 last_reclaim = ARC_RECLAIM_AGGR; 1543 } 1544 arc_kmem_reap_now(last_reclaim); 1545 } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { 1546 arc_no_grow = FALSE; 1547 } 1548 1549 if (zfs_needfree || 1550 (2 * arc_c < arc_size + 1551 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) 1552 arc_adjust(); 1553 1554 if (arc_eviction_list != NULL) 1555 arc_do_user_evicts(); 1556 1557 if (arc_reclaim_needed()) { 1558 zfs_needfree = 0; 1559#ifdef _KERNEL 1560 wakeup(&zfs_needfree); 1561#endif 1562 } 1563 1564 /* block until needed, or one second, whichever is shorter */ 1565 CALLB_CPR_SAFE_BEGIN(&cpr); 1566 (void) cv_timedwait(&arc_reclaim_thr_cv, 1567 &arc_reclaim_thr_lock, hz); 1568 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1569 } 1570 1571 arc_thread_exit = 0; 1572 cv_broadcast(&arc_reclaim_thr_cv); 1573 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1574 thread_exit(); 1575} 1576 1577/* 1578 * Adapt arc info given the number of bytes we are trying to add and 1579 * the state that we are comming from. This function is only called 1580 * when we are adding new content to the cache. 1581 */ 1582static void 1583arc_adapt(int bytes, arc_state_t *state) 1584{ 1585 int mult; 1586 1587 ASSERT(bytes > 0); 1588 /* 1589 * Adapt the target size of the MRU list: 1590 * - if we just hit in the MRU ghost list, then increase 1591 * the target size of the MRU list. 1592 * - if we just hit in the MFU ghost list, then increase 1593 * the target size of the MFU list by decreasing the 1594 * target size of the MRU list. 1595 */ 1596 if (state == arc_mru_ghost) { 1597 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1598 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1599 1600 arc_p = MIN(arc_c, arc_p + bytes * mult); 1601 } else if (state == arc_mfu_ghost) { 1602 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1603 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1604 1605 arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 1606 } 1607 ASSERT((int64_t)arc_p >= 0); 1608 1609 if (arc_reclaim_needed()) { 1610 cv_signal(&arc_reclaim_thr_cv); 1611 return; 1612 } 1613 1614 if (arc_no_grow) 1615 return; 1616 1617 if (arc_c >= arc_c_max) 1618 return; 1619 1620 /* 1621 * If we're within (2 * maxblocksize) bytes of the target 1622 * cache size, increment the target cache size 1623 */ 1624 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1625 atomic_add_64(&arc_c, (int64_t)bytes); 1626 if (arc_c > arc_c_max) 1627 arc_c = arc_c_max; 1628 else if (state == arc_anon) 1629 atomic_add_64(&arc_p, (int64_t)bytes); 1630 if (arc_p > arc_c) 1631 arc_p = arc_c; 1632 } 1633 ASSERT((int64_t)arc_p >= 0); 1634} 1635 1636/* 1637 * Check if the cache has reached its limits and eviction is required 1638 * prior to insert. 1639 */ 1640static int 1641arc_evict_needed() 1642{ 1643 if (arc_reclaim_needed()) 1644 return (1); 1645 1646 return (arc_size > arc_c); 1647} 1648 1649/* 1650 * The buffer, supplied as the first argument, needs a data block. 1651 * So, if we are at cache max, determine which cache should be victimized. 1652 * We have the following cases: 1653 * 1654 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 1655 * In this situation if we're out of space, but the resident size of the MFU is 1656 * under the limit, victimize the MFU cache to satisfy this insertion request. 1657 * 1658 * 2. 
Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 1659 * Here, we've used up all of the available space for the MRU, so we need to 1660 * evict from our own cache instead. Evict from the set of resident MRU 1661 * entries. 1662 * 1663 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 1664 * c minus p represents the MFU space in the cache, since p is the size of the 1665 * cache that is dedicated to the MRU. In this situation there's still space on 1666 * the MFU side, so the MRU side needs to be victimized. 1667 * 1668 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 1669 * MFU's resident set is consuming more space than it has been allotted. In 1670 * this situation, we must victimize our own cache, the MFU, for this insertion. 1671 */ 1672static void 1673arc_get_data_buf(arc_buf_t *buf) 1674{ 1675 arc_state_t *state = buf->b_hdr->b_state; 1676 uint64_t size = buf->b_hdr->b_size; 1677 arc_buf_contents_t type = buf->b_hdr->b_type; 1678 1679 arc_adapt(size, state); 1680 1681 /* 1682 * We have not yet reached cache maximum size, 1683 * just allocate a new buffer. 1684 */ 1685 if (!arc_evict_needed()) { 1686 if (type == ARC_BUFC_METADATA) { 1687 buf->b_data = zio_buf_alloc(size); 1688 } else { 1689 ASSERT(type == ARC_BUFC_DATA); 1690 buf->b_data = zio_data_buf_alloc(size); 1691 } 1692 atomic_add_64(&arc_size, size); 1693 goto out; 1694 } 1695 1696 /* 1697 * If we are prefetching from the mfu ghost list, this buffer 1698 * will end up on the mru list; so steal space from there. 1699 */ 1700 if (state == arc_mfu_ghost) 1701 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 1702 else if (state == arc_mru_ghost) 1703 state = arc_mru; 1704 1705 if (state == arc_mru || state == arc_anon) { 1706 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 1707 state = (arc_p > mru_used) ? arc_mfu : arc_mru; 1708 } else { 1709 /* MFU cases */ 1710 uint64_t mfu_space = arc_c - arc_p; 1711 state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 1712 } 1713 if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { 1714 if (type == ARC_BUFC_METADATA) { 1715 buf->b_data = zio_buf_alloc(size); 1716 } else { 1717 ASSERT(type == ARC_BUFC_DATA); 1718 buf->b_data = zio_data_buf_alloc(size); 1719 } 1720 atomic_add_64(&arc_size, size); 1721 ARCSTAT_BUMP(arcstat_recycle_miss); 1722 } 1723 ASSERT(buf->b_data != NULL); 1724out: 1725 /* 1726 * Update the state size. Note that ghost states have a 1727 * "ghost size" and so don't need to be updated. 1728 */ 1729 if (!GHOST_STATE(buf->b_hdr->b_state)) { 1730 arc_buf_hdr_t *hdr = buf->b_hdr; 1731 1732 atomic_add_64(&hdr->b_state->arcs_size, size); 1733 if (list_link_active(&hdr->b_arc_node)) { 1734 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1735 atomic_add_64(&hdr->b_state->arcs_lsize, size); 1736 } 1737 /* 1738 * If we are growing the cache, and we are adding anonymous 1739 * data, and we have outgrown arc_p, update arc_p 1740 */ 1741 if (arc_size < arc_c && hdr->b_state == arc_anon && 1742 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 1743 arc_p = MIN(arc_c, arc_p + size); 1744 } 1745} 1746 1747/* 1748 * This routine is called whenever a buffer is accessed. 1749 * NOTE: the hash lock is dropped in this function. 1750 */ 1751static void 1752arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1753{ 1754 ASSERT(MUTEX_HELD(hash_lock)); 1755 1756 if (buf->b_state == arc_anon) { 1757 /* 1758 * This buffer is not in the cache, and does not 1759 * appear in our "ghost" list. Add the new buffer 1760 * to the MRU state. 
1761 */ 1762 1763 ASSERT(buf->b_arc_access == 0); 1764 buf->b_arc_access = lbolt; 1765 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1766 arc_change_state(arc_mru, buf, hash_lock); 1767 1768 } else if (buf->b_state == arc_mru) { 1769 /* 1770 * If this buffer is here because of a prefetch, then either: 1771 * - clear the flag if this is a "referencing" read 1772 * (any subsequent access will bump this into the MFU state). 1773 * or 1774 * - move the buffer to the head of the list if this is 1775 * another prefetch (to make it less likely to be evicted). 1776 */ 1777 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1778 if (refcount_count(&buf->b_refcnt) == 0) { 1779 ASSERT(list_link_active(&buf->b_arc_node)); 1780 mutex_enter(&arc_mru->arcs_mtx); 1781 list_remove(&arc_mru->arcs_list, buf); 1782 list_insert_head(&arc_mru->arcs_list, buf); 1783 mutex_exit(&arc_mru->arcs_mtx); 1784 } else { 1785 buf->b_flags &= ~ARC_PREFETCH; 1786 ARCSTAT_BUMP(arcstat_mru_hits); 1787 } 1788 buf->b_arc_access = lbolt; 1789 return; 1790 } 1791 1792 /* 1793 * This buffer has been "accessed" only once so far, 1794 * but it is still in the cache. Move it to the MFU 1795 * state. 1796 */ 1797 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1798 /* 1799 * More than 125ms have passed since we 1800 * instantiated this buffer. Move it to the 1801 * most frequently used state. 1802 */ 1803 buf->b_arc_access = lbolt; 1804 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1805 arc_change_state(arc_mfu, buf, hash_lock); 1806 } 1807 ARCSTAT_BUMP(arcstat_mru_hits); 1808 } else if (buf->b_state == arc_mru_ghost) { 1809 arc_state_t *new_state; 1810 /* 1811 * This buffer has been "accessed" recently, but 1812 * was evicted from the cache. Move it to the 1813 * MFU state. 1814 */ 1815 1816 if (buf->b_flags & ARC_PREFETCH) { 1817 new_state = arc_mru; 1818 if (refcount_count(&buf->b_refcnt) > 0) 1819 buf->b_flags &= ~ARC_PREFETCH; 1820 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1821 } else { 1822 new_state = arc_mfu; 1823 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1824 } 1825 1826 buf->b_arc_access = lbolt; 1827 arc_change_state(new_state, buf, hash_lock); 1828 1829 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 1830 } else if (buf->b_state == arc_mfu) { 1831 /* 1832 * This buffer has been accessed more than once and is 1833 * still in the cache. Keep it in the MFU state. 1834 * 1835 * NOTE: an add_reference() that occurred when we did 1836 * the arc_read() will have kicked this off the list. 1837 * If it was a prefetch, we will explicitly move it to 1838 * the head of the list now. 1839 */ 1840 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1841 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1842 ASSERT(list_link_active(&buf->b_arc_node)); 1843 mutex_enter(&arc_mfu->arcs_mtx); 1844 list_remove(&arc_mfu->arcs_list, buf); 1845 list_insert_head(&arc_mfu->arcs_list, buf); 1846 mutex_exit(&arc_mfu->arcs_mtx); 1847 } 1848 ARCSTAT_BUMP(arcstat_mfu_hits); 1849 buf->b_arc_access = lbolt; 1850 } else if (buf->b_state == arc_mfu_ghost) { 1851 arc_state_t *new_state = arc_mfu; 1852 /* 1853 * This buffer has been accessed more than once but has 1854 * been evicted from the cache. Move it back to the 1855 * MFU state. 1856 */ 1857 1858 if (buf->b_flags & ARC_PREFETCH) { 1859 /* 1860 * This is a prefetch access... 1861 * move this block back to the MRU state. 
1862 */ 1863 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1864 new_state = arc_mru; 1865 } 1866 1867 buf->b_arc_access = lbolt; 1868 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1869 arc_change_state(new_state, buf, hash_lock); 1870 1871 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 1872 } else { 1873 ASSERT(!"invalid arc state"); 1874 } 1875} 1876 1877/* a generic arc_done_func_t which you can use */ 1878/* ARGSUSED */ 1879void 1880arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1881{ 1882 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1883 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1884} 1885 1886/* a generic arc_done_func_t which you can use */ 1887void 1888arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1889{ 1890 arc_buf_t **bufp = arg; 1891 if (zio && zio->io_error) { 1892 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1893 *bufp = NULL; 1894 } else { 1895 *bufp = buf; 1896 } 1897} 1898 1899static void 1900arc_read_done(zio_t *zio) 1901{ 1902 arc_buf_hdr_t *hdr, *found; 1903 arc_buf_t *buf; 1904 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1905 kmutex_t *hash_lock; 1906 arc_callback_t *callback_list, *acb; 1907 int freeable = FALSE; 1908 1909 buf = zio->io_private; 1910 hdr = buf->b_hdr; 1911 1912 /* 1913 * The hdr was inserted into hash-table and removed from lists 1914 * prior to starting I/O. We should find this header, since 1915 * it's in the hash table, and it should be legit since it's 1916 * not possible to evict it during the I/O. The only possible 1917 * reason for it not to be found is if we were freed during the 1918 * read. 1919 */ 1920 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1921 &hash_lock); 1922 1923 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1924 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1925 1926 /* byteswap if necessary */ 1927 callback_list = hdr->b_acb; 1928 ASSERT(callback_list != NULL); 1929 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1930 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1931 1932 arc_cksum_compute(buf); 1933 1934 /* create copies of the data buffer for the callers */ 1935 abuf = buf; 1936 for (acb = callback_list; acb; acb = acb->acb_next) { 1937 if (acb->acb_done) { 1938 if (abuf == NULL) 1939 abuf = arc_buf_clone(buf); 1940 acb->acb_buf = abuf; 1941 abuf = NULL; 1942 } 1943 } 1944 hdr->b_acb = NULL; 1945 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1946 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 1947 if (abuf == buf) 1948 hdr->b_flags |= ARC_BUF_AVAILABLE; 1949 1950 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 1951 1952 if (zio->io_error != 0) { 1953 hdr->b_flags |= ARC_IO_ERROR; 1954 if (hdr->b_state != arc_anon) 1955 arc_change_state(arc_anon, hdr, hash_lock); 1956 if (HDR_IN_HASH_TABLE(hdr)) 1957 buf_hash_remove(hdr); 1958 freeable = refcount_is_zero(&hdr->b_refcnt); 1959 /* convert checksum errors into IO errors */ 1960 if (zio->io_error == ECKSUM) 1961 zio->io_error = EIO; 1962 } 1963 1964 /* 1965 * Broadcast before we drop the hash_lock to avoid the possibility 1966 * that the hdr (and hence the cv) might be freed before we get to 1967 * the cv_broadcast(). 1968 */ 1969 cv_broadcast(&hdr->b_cv); 1970 1971 if (hash_lock) { 1972 /* 1973 * Only call arc_access on anonymous buffers. This is because 1974 * if we've issued an I/O for an evicted buffer, we've already 1975 * called arc_access (to prevent any simultaneous readers from 1976 * getting confused). 
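 * (For a buffer that was read out of a ghost state, arc_read() already
 * performed that arc_access() before issuing the I/O -- see the
 * GHOST_STATE() check near the end of the miss path there.)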
1977 */ 1978 if (zio->io_error == 0 && hdr->b_state == arc_anon) 1979 arc_access(hdr, hash_lock); 1980 mutex_exit(hash_lock); 1981 } else { 1982 /* 1983 * This block was freed while we waited for the read to 1984 * complete. It has been removed from the hash table and 1985 * moved to the anonymous state (so that it won't show up 1986 * in the cache). 1987 */ 1988 ASSERT3P(hdr->b_state, ==, arc_anon); 1989 freeable = refcount_is_zero(&hdr->b_refcnt); 1990 } 1991 1992 /* execute each callback and free its structure */ 1993 while ((acb = callback_list) != NULL) { 1994 if (acb->acb_done) 1995 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 1996 1997 if (acb->acb_zio_dummy != NULL) { 1998 acb->acb_zio_dummy->io_error = zio->io_error; 1999 zio_nowait(acb->acb_zio_dummy); 2000 } 2001 2002 callback_list = acb->acb_next; 2003 kmem_free(acb, sizeof (arc_callback_t)); 2004 } 2005 2006 if (freeable) 2007 arc_hdr_destroy(hdr); 2008} 2009 2010/* 2011 * "Read" the block at the specified DVA (in bp) via the 2012 * cache. If the block is found in the cache, invoke the provided 2013 * callback immediately and return. Note that the `zio' parameter 2014 * in the callback will be NULL in this case, since no IO was 2015 * required. If the block is not in the cache, pass the read request 2016 * on to the spa with a substitute callback function, so that the 2017 * requested block will be added to the cache. 2018 * 2019 * If a read request arrives for a block that has a read in-progress, 2020 * either wait for the in-progress read to complete (and return the 2021 * results); or, if this is a read with a "done" func, add a record 2022 * to the read to invoke the "done" func when the read completes, 2023 * and return; or just return. 2024 * 2025 * arc_read_done() will invoke all the requested "done" functions 2026 * for readers of this block. 2027 */ 2028int 2029arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 2030 arc_done_func_t *done, void *private, int priority, int flags, 2031 uint32_t *arc_flags, zbookmark_t *zb) 2032{ 2033 arc_buf_hdr_t *hdr; 2034 arc_buf_t *buf; 2035 kmutex_t *hash_lock; 2036 zio_t *rzio; 2037 2038top: 2039 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2040 if (hdr && hdr->b_datacnt > 0) { 2041 2042 *arc_flags |= ARC_CACHED; 2043 2044 if (HDR_IO_IN_PROGRESS(hdr)) { 2045 2046 if (*arc_flags & ARC_WAIT) { 2047 cv_wait(&hdr->b_cv, hash_lock); 2048 mutex_exit(hash_lock); 2049 goto top; 2050 } 2051 ASSERT(*arc_flags & ARC_NOWAIT); 2052 2053 if (done) { 2054 arc_callback_t *acb = NULL; 2055 2056 acb = kmem_zalloc(sizeof (arc_callback_t), 2057 KM_SLEEP); 2058 acb->acb_done = done; 2059 acb->acb_private = private; 2060 acb->acb_byteswap = swap; 2061 if (pio != NULL) 2062 acb->acb_zio_dummy = zio_null(pio, 2063 spa, NULL, NULL, flags); 2064 2065 ASSERT(acb->acb_done != NULL); 2066 acb->acb_next = hdr->b_acb; 2067 hdr->b_acb = acb; 2068 add_reference(hdr, hash_lock, private); 2069 mutex_exit(hash_lock); 2070 return (0); 2071 } 2072 mutex_exit(hash_lock); 2073 return (0); 2074 } 2075 2076 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2077 2078 if (done) { 2079 add_reference(hdr, hash_lock, private); 2080 /* 2081 * If this block is already in use, create a new 2082 * copy of the data so that we will be guaranteed 2083 * that arc_release() will always succeed.
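 * (arc_buf_clone() below gives this caller its own copy of b_data, so a
 * later arc_release() can detach this buf onto a new anonymous header
 * without disturbing the other readers of the block.)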
2084 */ 2085 buf = hdr->b_buf; 2086 ASSERT(buf); 2087 ASSERT(buf->b_data); 2088 if (HDR_BUF_AVAILABLE(hdr)) { 2089 ASSERT(buf->b_efunc == NULL); 2090 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2091 } else { 2092 buf = arc_buf_clone(buf); 2093 } 2094 } else if (*arc_flags & ARC_PREFETCH && 2095 refcount_count(&hdr->b_refcnt) == 0) { 2096 hdr->b_flags |= ARC_PREFETCH; 2097 } 2098 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2099 arc_access(hdr, hash_lock); 2100 mutex_exit(hash_lock); 2101 ARCSTAT_BUMP(arcstat_hits); 2102 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2103 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2104 data, metadata, hits); 2105 2106 if (done) 2107 done(NULL, buf, private); 2108 } else { 2109 uint64_t size = BP_GET_LSIZE(bp); 2110 arc_callback_t *acb; 2111 2112 if (hdr == NULL) { 2113 /* this block is not in the cache */ 2114 arc_buf_hdr_t *exists; 2115 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2116 buf = arc_buf_alloc(spa, size, private, type); 2117 hdr = buf->b_hdr; 2118 hdr->b_dva = *BP_IDENTITY(bp); 2119 hdr->b_birth = bp->blk_birth; 2120 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2121 exists = buf_hash_insert(hdr, &hash_lock); 2122 if (exists) { 2123 /* somebody beat us to the hash insert */ 2124 mutex_exit(hash_lock); 2125 bzero(&hdr->b_dva, sizeof (dva_t)); 2126 hdr->b_birth = 0; 2127 hdr->b_cksum0 = 0; 2128 (void) arc_buf_remove_ref(buf, private); 2129 goto top; /* restart the IO request */ 2130 } 2131 /* if this is a prefetch, we don't have a reference */ 2132 if (*arc_flags & ARC_PREFETCH) { 2133 (void) remove_reference(hdr, hash_lock, 2134 private); 2135 hdr->b_flags |= ARC_PREFETCH; 2136 } 2137 if (BP_GET_LEVEL(bp) > 0) 2138 hdr->b_flags |= ARC_INDIRECT; 2139 } else { 2140 /* this block is in the ghost cache */ 2141 ASSERT(GHOST_STATE(hdr->b_state)); 2142 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2143 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2144 ASSERT(hdr->b_buf == NULL); 2145 2146 /* if this is a prefetch, we don't have a reference */ 2147 if (*arc_flags & ARC_PREFETCH) 2148 hdr->b_flags |= ARC_PREFETCH; 2149 else 2150 add_reference(hdr, hash_lock, private); 2151 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 2152 buf->b_hdr = hdr; 2153 buf->b_data = NULL; 2154 buf->b_efunc = NULL; 2155 buf->b_private = NULL; 2156 buf->b_next = NULL; 2157 hdr->b_buf = buf; 2158 arc_get_data_buf(buf); 2159 ASSERT(hdr->b_datacnt == 0); 2160 hdr->b_datacnt = 1; 2161 2162 } 2163 2164 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2165 acb->acb_done = done; 2166 acb->acb_private = private; 2167 acb->acb_byteswap = swap; 2168 2169 ASSERT(hdr->b_acb == NULL); 2170 hdr->b_acb = acb; 2171 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2172 2173 /* 2174 * If the buffer has been evicted, migrate it to a present state 2175 * before issuing the I/O. Once we drop the hash-table lock, 2176 * the header will be marked as I/O in progress and have an 2177 * attached buffer. At this point, anybody who finds this 2178 * buffer ought to notice that it's legit but has a pending I/O. 
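 * (The arc_access() below moves a ghost-state header back onto the MRU or
 * MFU list; this is also why arc_read_done() skips its own arc_access()
 * call for non-anonymous headers.)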
2179 */ 2180 2181 if (GHOST_STATE(hdr->b_state)) 2182 arc_access(hdr, hash_lock); 2183 mutex_exit(hash_lock); 2184 2185 ASSERT3U(hdr->b_size, ==, size); 2186 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 2187 zbookmark_t *, zb); 2188 ARCSTAT_BUMP(arcstat_misses); 2189 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2190 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2191 data, metadata, misses); 2192 2193 rzio = zio_read(pio, spa, bp, buf->b_data, size, 2194 arc_read_done, buf, priority, flags, zb); 2195 2196 if (*arc_flags & ARC_WAIT) 2197 return (zio_wait(rzio)); 2198 2199 ASSERT(*arc_flags & ARC_NOWAIT); 2200 zio_nowait(rzio); 2201 } 2202 return (0); 2203} 2204 2205/* 2206 * arc_read() variant to support pool traversal. If the block is already 2207 * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 2208 * The idea is that we don't want pool traversal filling up memory, but 2209 * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2210 */ 2211int 2212arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2213{ 2214 arc_buf_hdr_t *hdr; 2215 kmutex_t *hash_mtx; 2216 int rc = 0; 2217 2218 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2219 2220 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2221 arc_buf_t *buf = hdr->b_buf; 2222 2223 ASSERT(buf); 2224 while (buf->b_data == NULL) { 2225 buf = buf->b_next; 2226 ASSERT(buf); 2227 } 2228 bcopy(buf->b_data, data, hdr->b_size); 2229 } else { 2230 rc = ENOENT; 2231 } 2232 2233 if (hash_mtx) 2234 mutex_exit(hash_mtx); 2235 2236 return (rc); 2237} 2238 2239void 2240arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2241{ 2242 ASSERT(buf->b_hdr != NULL); 2243 ASSERT(buf->b_hdr->b_state != arc_anon); 2244 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2245 buf->b_efunc = func; 2246 buf->b_private = private; 2247} 2248 2249/* 2250 * This is used by the DMU to let the ARC know that a buffer is 2251 * being evicted, so the ARC should clean up. If this arc buf 2252 * is not yet in the evicted state, it will be put there. 2253 */ 2254int 2255arc_buf_evict(arc_buf_t *buf) 2256{ 2257 arc_buf_hdr_t *hdr; 2258 kmutex_t *hash_lock; 2259 arc_buf_t **bufp; 2260 2261 mutex_enter(&arc_eviction_mtx); 2262 hdr = buf->b_hdr; 2263 if (hdr == NULL) { 2264 /* 2265 * We are in arc_do_user_evicts(). 2266 */ 2267 ASSERT(buf->b_data == NULL); 2268 mutex_exit(&arc_eviction_mtx); 2269 return (0); 2270 } 2271 hash_lock = HDR_LOCK(hdr); 2272 mutex_exit(&arc_eviction_mtx); 2273 2274 mutex_enter(hash_lock); 2275 2276 if (buf->b_data == NULL) { 2277 /* 2278 * We are on the eviction list. 2279 */ 2280 mutex_exit(hash_lock); 2281 mutex_enter(&arc_eviction_mtx); 2282 if (buf->b_hdr == NULL) { 2283 /* 2284 * We are already in arc_do_user_evicts(). 2285 */ 2286 mutex_exit(&arc_eviction_mtx); 2287 return (0); 2288 } else { 2289 arc_buf_t copy = *buf; /* structure assignment */ 2290 /* 2291 * Process this buffer now 2292 * but let arc_do_user_evicts() do the reaping. 
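 * We invoke the callback through the local copy and clear b_efunc on the
 * original, so arc_do_user_evicts() will simply free the buf rather than
 * invoke the callback a second time.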
2293 */ 2294 buf->b_efunc = NULL; 2295 mutex_exit(&arc_eviction_mtx); 2296 VERIFY(copy.b_efunc(&copy) == 0); 2297 return (1); 2298 } 2299 } 2300 2301 ASSERT(buf->b_hdr == hdr); 2302 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2303 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2304 2305 /* 2306 * Pull this buffer off of the hdr 2307 */ 2308 bufp = &hdr->b_buf; 2309 while (*bufp != buf) 2310 bufp = &(*bufp)->b_next; 2311 *bufp = buf->b_next; 2312 2313 ASSERT(buf->b_data != NULL); 2314 arc_buf_destroy(buf, FALSE, FALSE); 2315 2316 if (hdr->b_datacnt == 0) { 2317 arc_state_t *old_state = hdr->b_state; 2318 arc_state_t *evicted_state; 2319 2320 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2321 2322 evicted_state = 2323 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2324 2325 mutex_enter(&old_state->arcs_mtx); 2326 mutex_enter(&evicted_state->arcs_mtx); 2327 2328 arc_change_state(evicted_state, hdr, hash_lock); 2329 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2330 hdr->b_flags = ARC_IN_HASH_TABLE; 2331 2332 mutex_exit(&evicted_state->arcs_mtx); 2333 mutex_exit(&old_state->arcs_mtx); 2334 } 2335 mutex_exit(hash_lock); 2336 2337 VERIFY(buf->b_efunc(buf) == 0); 2338 buf->b_efunc = NULL; 2339 buf->b_private = NULL; 2340 buf->b_hdr = NULL; 2341 kmem_cache_free(buf_cache, buf); 2342 return (1); 2343} 2344 2345/* 2346 * Release this buffer from the cache. This must be done 2347 * after a read and prior to modifying the buffer contents. 2348 * If the buffer has more than one reference, we must make 2349 * a new hdr for the buffer. 2350 */ 2351void 2352arc_release(arc_buf_t *buf, void *tag) 2353{ 2354 arc_buf_hdr_t *hdr = buf->b_hdr; 2355 kmutex_t *hash_lock = HDR_LOCK(hdr); 2356 2357 /* this buffer is not on any list */ 2358 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 2359 2360 if (hdr->b_state == arc_anon) { 2361 /* this buffer is already released */ 2362 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 2363 ASSERT(BUF_EMPTY(hdr)); 2364 ASSERT(buf->b_efunc == NULL); 2365 arc_buf_thaw(buf); 2366 return; 2367 } 2368 2369 mutex_enter(hash_lock); 2370 2371 /* 2372 * Do we have more than one buf? 2373 */ 2374 if (hdr->b_buf != buf || buf->b_next != NULL) { 2375 arc_buf_hdr_t *nhdr; 2376 arc_buf_t **bufp; 2377 uint64_t blksz = hdr->b_size; 2378 spa_t *spa = hdr->b_spa; 2379 arc_buf_contents_t type = hdr->b_type; 2380 2381 ASSERT(hdr->b_datacnt > 1); 2382 /* 2383 * Pull the data off of this buf and attach it to 2384 * a new anonymous buf.
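 * (Any other bufs sharing the block stay with the existing header; only
 * this buf is rethreaded onto the freshly allocated anonymous header
 * below, so concurrent readers are unaffected.)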
2385 */ 2386 (void) remove_reference(hdr, hash_lock, tag); 2387 bufp = &hdr->b_buf; 2388 while (*bufp != buf) 2389 bufp = &(*bufp)->b_next; 2390 *bufp = (*bufp)->b_next; 2391 buf->b_next = NULL; 2392 2393 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 2394 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 2395 if (refcount_is_zero(&hdr->b_refcnt)) { 2396 ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); 2397 atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); 2398 } 2399 hdr->b_datacnt -= 1; 2400 arc_cksum_verify(buf); 2401 2402 mutex_exit(hash_lock); 2403 2404 nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 2405 nhdr->b_size = blksz; 2406 nhdr->b_spa = spa; 2407 nhdr->b_type = type; 2408 nhdr->b_buf = buf; 2409 nhdr->b_state = arc_anon; 2410 nhdr->b_arc_access = 0; 2411 nhdr->b_flags = 0; 2412 nhdr->b_datacnt = 1; 2413 nhdr->b_freeze_cksum = NULL; 2414 mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 2415 (void) refcount_add(&nhdr->b_refcnt, tag); 2416 buf->b_hdr = nhdr; 2417 atomic_add_64(&arc_anon->arcs_size, blksz); 2418 2419 hdr = nhdr; 2420 } else { 2421 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 2422 ASSERT(!list_link_active(&hdr->b_arc_node)); 2423 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2424 arc_change_state(arc_anon, hdr, hash_lock); 2425 hdr->b_arc_access = 0; 2426 mutex_exit(hash_lock); 2427 bzero(&hdr->b_dva, sizeof (dva_t)); 2428 hdr->b_birth = 0; 2429 hdr->b_cksum0 = 0; 2430 arc_buf_thaw(buf); 2431 } 2432 buf->b_efunc = NULL; 2433 buf->b_private = NULL; 2434} 2435 2436int 2437arc_released(arc_buf_t *buf) 2438{ 2439 return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 2440} 2441 2442int 2443arc_has_callback(arc_buf_t *buf) 2444{ 2445 return (buf->b_efunc != NULL); 2446} 2447 2448#ifdef ZFS_DEBUG 2449int 2450arc_referenced(arc_buf_t *buf) 2451{ 2452 return (refcount_count(&buf->b_hdr->b_refcnt)); 2453} 2454#endif 2455 2456static void 2457arc_write_ready(zio_t *zio) 2458{ 2459 arc_write_callback_t *callback = zio->io_private; 2460 arc_buf_t *buf = callback->awcb_buf; 2461 2462 if (callback->awcb_ready) { 2463 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 2464 callback->awcb_ready(zio, buf, callback->awcb_private); 2465 } 2466 arc_cksum_compute(buf); 2467} 2468 2469static void 2470arc_write_done(zio_t *zio) 2471{ 2472 arc_write_callback_t *callback = zio->io_private; 2473 arc_buf_t *buf = callback->awcb_buf; 2474 arc_buf_hdr_t *hdr = buf->b_hdr; 2475 2476 hdr->b_acb = NULL; 2477 2478 /* this buffer is on no lists and is not in the hash table */ 2479 ASSERT3P(hdr->b_state, ==, arc_anon); 2480 2481 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 2482 hdr->b_birth = zio->io_bp->blk_birth; 2483 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 2484 /* 2485 * If the block to be written was all-zero, we may have 2486 * compressed it away. In this case no write was performed 2487 * so there will be no dva/birth-date/checksum. The buffer 2488 * must therefore remain anonymous (and uncached). 2489 */ 2490 if (!BUF_EMPTY(hdr)) { 2491 arc_buf_hdr_t *exists; 2492 kmutex_t *hash_lock; 2493 2494 arc_cksum_verify(buf); 2495 2496 exists = buf_hash_insert(hdr, &hash_lock); 2497 if (exists) { 2498 /* 2499 * This can only happen if we overwrite for 2500 * sync-to-convergence, because we remove 2501 * buffers from the hash table when we arc_free().
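 * The asserts below verify that the pre-existing header describes the same
 * DVA and birth txg; the stale header is then destroyed and the hash
 * insert retried.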
2502 */ 2503 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 2504 BP_IDENTITY(zio->io_bp))); 2505 ASSERT3U(zio->io_bp_orig.blk_birth, ==, 2506 zio->io_bp->blk_birth); 2507 2508 ASSERT(refcount_is_zero(&exists->b_refcnt)); 2509 arc_change_state(arc_anon, exists, hash_lock); 2510 mutex_exit(hash_lock); 2511 arc_hdr_destroy(exists); 2512 exists = buf_hash_insert(hdr, &hash_lock); 2513 ASSERT3P(exists, ==, NULL); 2514 } 2515 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2516 arc_access(hdr, hash_lock); 2517 mutex_exit(hash_lock); 2518 } else if (callback->awcb_done == NULL) { 2519 int destroy_hdr; 2520 /* 2521 * This is an anonymous buffer with no user callback, 2522 * destroy it if there are no active references. 2523 */ 2524 mutex_enter(&arc_eviction_mtx); 2525 destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 2526 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2527 mutex_exit(&arc_eviction_mtx); 2528 if (destroy_hdr) 2529 arc_hdr_destroy(hdr); 2530 } else { 2531 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2532 } 2533 2534 if (callback->awcb_done) { 2535 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 2536 callback->awcb_done(zio, buf, callback->awcb_private); 2537 } 2538 2539 kmem_free(callback, sizeof (arc_write_callback_t)); 2540} 2541 2542zio_t * 2543arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 2544 uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 2545 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 2546 int flags, zbookmark_t *zb) 2547{ 2548 arc_buf_hdr_t *hdr = buf->b_hdr; 2549 arc_write_callback_t *callback; 2550 zio_t *zio; 2551 2552 /* this is a private buffer - no locking required */ 2553 ASSERT3P(hdr->b_state, ==, arc_anon); 2554 ASSERT(BUF_EMPTY(hdr)); 2555 ASSERT(!HDR_IO_ERROR(hdr)); 2556 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 2557 ASSERT(hdr->b_acb == 0); 2558 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 2559 callback->awcb_ready = ready; 2560 callback->awcb_done = done; 2561 callback->awcb_private = private; 2562 callback->awcb_buf = buf; 2563 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2564 zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, 2565 buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, 2566 priority, flags, zb); 2567 2568 return (zio); 2569} 2570 2571int 2572arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2573 zio_done_func_t *done, void *private, uint32_t arc_flags) 2574{ 2575 arc_buf_hdr_t *ab; 2576 kmutex_t *hash_lock; 2577 zio_t *zio; 2578 2579 /* 2580 * If this buffer is in the cache, release it, so it 2581 * can be re-used. 2582 */ 2583 ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2584 if (ab != NULL) { 2585 /* 2586 * The checksum of blocks to free is not always 2587 * preserved (eg. on the deadlist). However, if it is 2588 * nonzero, it should match what we have in the cache. 2589 */ 2590 ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2591 ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 2592 if (ab->b_state != arc_anon) 2593 arc_change_state(arc_anon, ab, hash_lock); 2594 if (HDR_IO_IN_PROGRESS(ab)) { 2595 /* 2596 * This should only happen when we prefetch. 
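 * A prefetch holds no reference, so a read can still be in flight when the
 * block is freed; mark the header ARC_FREED_IN_READ and clear its identity
 * so that arc_read_done() will clean it up when the I/O completes.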
2597 */ 2598 ASSERT(ab->b_flags & ARC_PREFETCH); 2599 ASSERT3U(ab->b_datacnt, ==, 1); 2600 ab->b_flags |= ARC_FREED_IN_READ; 2601 if (HDR_IN_HASH_TABLE(ab)) 2602 buf_hash_remove(ab); 2603 ab->b_arc_access = 0; 2604 bzero(&ab->b_dva, sizeof (dva_t)); 2605 ab->b_birth = 0; 2606 ab->b_cksum0 = 0; 2607 ab->b_buf->b_efunc = NULL; 2608 ab->b_buf->b_private = NULL; 2609 mutex_exit(hash_lock); 2610 } else if (refcount_is_zero(&ab->b_refcnt)) { 2611 mutex_exit(hash_lock); 2612 arc_hdr_destroy(ab); 2613 ARCSTAT_BUMP(arcstat_deleted); 2614 } else { 2615 /* 2616 * We still have an active reference on this 2617 * buffer. This can happen, e.g., from 2618 * dbuf_unoverride(). 2619 */ 2620 ASSERT(!HDR_IN_HASH_TABLE(ab)); 2621 ab->b_arc_access = 0; 2622 bzero(&ab->b_dva, sizeof (dva_t)); 2623 ab->b_birth = 0; 2624 ab->b_cksum0 = 0; 2625 ab->b_buf->b_efunc = NULL; 2626 ab->b_buf->b_private = NULL; 2627 mutex_exit(hash_lock); 2628 } 2629 } 2630 2631 zio = zio_free(pio, spa, txg, bp, done, private); 2632 2633 if (arc_flags & ARC_WAIT) 2634 return (zio_wait(zio)); 2635 2636 ASSERT(arc_flags & ARC_NOWAIT); 2637 zio_nowait(zio); 2638 2639 return (0); 2640} 2641 2642void 2643arc_tempreserve_clear(uint64_t tempreserve) 2644{ 2645 atomic_add_64(&arc_tempreserve, -tempreserve); 2646 ASSERT((int64_t)arc_tempreserve >= 0); 2647} 2648 2649int 2650arc_tempreserve_space(uint64_t tempreserve) 2651{ 2652#ifdef ZFS_DEBUG 2653 /* 2654 * Once in a while, fail for no reason. Everything should cope. 2655 */ 2656 if (spa_get_random(10000) == 0) { 2657 dprintf("forcing random failure\n"); 2658 return (ERESTART); 2659 } 2660#endif 2661 if (tempreserve > arc_c/4 && !arc_no_grow) 2662 arc_c = MIN(arc_c_max, tempreserve * 4); 2663 if (tempreserve > arc_c) 2664 return (ENOMEM); 2665 2666 /* 2667 * Throttle writes when the amount of dirty data in the cache 2668 * gets too large. We try to keep the cache less than half full 2669 * of dirty blocks so that our sync times don't grow too large. 2670 * Note: if two requests come in concurrently, we might let them 2671 * both succeed, when one of them should fail. Not a huge deal. 2672 * 2673 * XXX The limit should be adjusted dynamically to keep the time 2674 * to sync a dataset fixed (around 1-5 seconds?). 
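 * For example, with arc_c at 1GB the check below fails a reservation once
 * the requested bytes plus anonymous data plus outstanding reservations
 * exceed 512MB and anonymous data plus outstanding reservations alone
 * exceed 256MB.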
2675 */ 2676 2677 if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 2678 arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { 2679 dprintf("failing, arc_tempreserve=%lluK anon=%lluK " 2680 "tempreserve=%lluK arc_c=%lluK\n", 2681 arc_tempreserve>>10, arc_anon->arcs_lsize>>10, 2682 tempreserve>>10, arc_c>>10); 2683 return (ERESTART); 2684 } 2685 atomic_add_64(&arc_tempreserve, tempreserve); 2686 return (0); 2687} 2688 2689#ifdef _KERNEL 2690static eventhandler_tag zfs_event_lowmem = NULL; 2691 2692static void 2693zfs_lowmem(void *arg __unused, int howto __unused) 2694{ 2695 2696 zfs_needfree = 1; 2697 cv_signal(&arc_reclaim_thr_cv); 2698 while (zfs_needfree) 2699 tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); 2700} 2701#endif 2702 2703void 2704arc_init(void) 2705{ 2706 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2707 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2708 2709 /* Convert seconds to clock ticks */ 2710 arc_min_prefetch_lifespan = 1 * hz; 2711 2712 /* Start out with 1/8 of all memory */ 2713 arc_c = physmem * PAGESIZE / 8; 2714#if 0 2715#ifdef _KERNEL 2716 /* 2717 * On architectures where the physical memory can be larger 2718 * than the addressable space (intel in 32-bit mode), we may 2719 * need to limit the cache to 1/8 of VM size. 2720 */ 2721 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2722#endif 2723#endif 2724 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 2725 arc_c_min = MAX(arc_c / 4, 64<<20); 2726 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 2727 if (arc_c * 8 >= 1<<30) 2728 arc_c_max = (arc_c * 8) - (1<<30); 2729 else 2730 arc_c_max = arc_c_min; 2731 arc_c_max = MAX(arc_c * 6, arc_c_max); 2732#ifdef _KERNEL 2733 /* 2734 * Allow the tunables to override our calculations if they are 2735 * reasonable (ie. 
over 64MB) 2736 */ 2737 if (zfs_arc_max > 64<<20 && zfs_arc_max < vm_kmem_size) 2738 arc_c_max = zfs_arc_max; 2739 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 2740 arc_c_min = zfs_arc_min; 2741#endif 2742 arc_c = arc_c_max; 2743 arc_p = (arc_c >> 1); 2744 2745 /* if kmem_flags are set, lets try to use less memory */ 2746 if (kmem_debugging()) 2747 arc_c = arc_c / 2; 2748 if (arc_c < arc_c_min) 2749 arc_c = arc_c_min; 2750 2751 zfs_arc_min = arc_c_min; 2752 zfs_arc_max = arc_c_max; 2753 2754 arc_anon = &ARC_anon; 2755 arc_mru = &ARC_mru; 2756 arc_mru_ghost = &ARC_mru_ghost; 2757 arc_mfu = &ARC_mfu; 2758 arc_mfu_ghost = &ARC_mfu_ghost; 2759 arc_size = 0; 2760 2761 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2762 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2763 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2764 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2765 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2766 2767 list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), 2768 offsetof(arc_buf_hdr_t, b_arc_node)); 2769 list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), 2770 offsetof(arc_buf_hdr_t, b_arc_node)); 2771 list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), 2772 offsetof(arc_buf_hdr_t, b_arc_node)); 2773 list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), 2774 offsetof(arc_buf_hdr_t, b_arc_node)); 2775 2776 buf_init(); 2777 2778 arc_thread_exit = 0; 2779 arc_eviction_list = NULL; 2780 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2781 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 2782 2783 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 2784 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 2785 2786 if (arc_ksp != NULL) { 2787 arc_ksp->ks_data = &arc_stats; 2788 kstat_install(arc_ksp); 2789 } 2790 2791 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2792 TS_RUN, minclsyspri); 2793 2794#ifdef _KERNEL 2795 zfs_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, zfs_lowmem, NULL, 2796 EVENTHANDLER_PRI_FIRST); 2797#endif 2798 2799 arc_dead = FALSE; 2800} 2801 2802void 2803arc_fini(void) 2804{ 2805 mutex_enter(&arc_reclaim_thr_lock); 2806 arc_thread_exit = 1; 2807 cv_signal(&arc_reclaim_thr_cv); 2808 while (arc_thread_exit != 0) 2809 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2810 mutex_exit(&arc_reclaim_thr_lock); 2811 2812 arc_flush(); 2813 2814 arc_dead = TRUE; 2815 2816 if (arc_ksp != NULL) { 2817 kstat_delete(arc_ksp); 2818 arc_ksp = NULL; 2819 } 2820 2821 mutex_destroy(&arc_eviction_mtx); 2822 mutex_destroy(&arc_reclaim_thr_lock); 2823 cv_destroy(&arc_reclaim_thr_cv); 2824 2825 list_destroy(&arc_mru->arcs_list); 2826 list_destroy(&arc_mru_ghost->arcs_list); 2827 list_destroy(&arc_mfu->arcs_list); 2828 list_destroy(&arc_mfu_ghost->arcs_list); 2829 2830 mutex_destroy(&arc_anon->arcs_mtx); 2831 mutex_destroy(&arc_mru->arcs_mtx); 2832 mutex_destroy(&arc_mru_ghost->arcs_mtx); 2833 mutex_destroy(&arc_mfu->arcs_mtx); 2834 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 2835 2836 buf_fini(); 2837 2838#ifdef _KERNEL 2839 if (zfs_event_lowmem != NULL) 2840 EVENTHANDLER_DEREGISTER(vm_lowmem, zfs_event_lowmem); 2841#endif 2842} 2843