/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory.  This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about.  Our cache is not so simple.  At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them.  Blocks are only evictable
 *    when there are no external references active.  This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space.  In these circumstances we are unable to adjust the cache
 *    size.  To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss.  Our model has a variable sized cache.  It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 *    elements of the cache are therefore exactly the same size.  So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict.  In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes).  We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
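/*
 * To illustrate point 3 above: because cache blocks vary in size, making
 * room for a new block means choosing a set of victims rather than a
 * single page.  A minimal sketch of the idea (hypothetical helpers; the
 * real logic lives in arc_evict() below):
 *
 *	uint64_t freed = 0;
 *	while (freed < needed) {
 *		arc_buf_hdr_t *ab = pick_evictable_tail_buffer();
 *		if (ab == NULL)
 *			break;		(throttle instead of growing)
 *		freed += ab->b_size;
 *		evict(ab);
 *	}
 *
 * The real code walks a state list from the tail and accumulates evictable
 * buffers until the requested byte count is approximately satisfied.
 */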
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
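/*
 * A sketch of the lock-ordering rule above, modeled on what arc_evict()
 * does while walking a state list with that list's lock held (illustrative
 * fragment only):
 *
 *	kmutex_t *hash_lock = HDR_LOCK(ab);
 *
 *	if (MUTEX_HELD(hash_lock) || mutex_tryenter(hash_lock)) {
 *		(safe to touch most arc_buf_hdr_t fields here)
 *	} else {
 *		(skip this buffer: blocking on the hash lock while
 *		holding an arc list lock could deadlock)
 *	}
 */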
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <vm/vm_pageout.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;
extern int zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;

TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
    "Maximum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
    "Minimum ARC size");

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */
#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
struct arcs_lock {
	kmutex_t	arcs_lock;
#ifdef _KERNEL
	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

/*
 * must be power of two for mask use to work
 *
 */
#define	ARC_BUFC_NUMDATALISTS		16
#define	ARC_BUFC_NUMMETADATALISTS	16
#define	ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)

typedef struct arc_state {
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	list_t	arcs_lists[ARC_BUFC_NUMLISTS];	/* list of evictable buffers */
	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
} arc_state_t;

#define	ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_stolen;
	kstat_named_t arcstat_recycle_miss;
	kstat_named_t arcstat_mutex_miss;
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	kstat_named_t arcstat_hdr_size;
	kstat_named_t arcstat_data_size;
	kstat_named_t arcstat_other_size;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_hdr_miss;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_duplicate_buffers;
	kstat_named_t arcstat_duplicate_buffers_size;
	kstat_named_t arcstat_duplicate_reads;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "allocated",			KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "stolen",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
	{ "l2_write_full",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter",	KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val));

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
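/*
 * For example, arc_buf_add_ref() below records a cache hit with:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * A non-prefetch (demand) access to a metadata buffer therefore expands to
 * ARCSTAT_BUMP(arcstat_demand_metadata_hits); the other three condition
 * combinations select the matching demand/prefetch x data/metadata counter.
 */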
kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;
static uint64_t		arc_meta_used;
static uint64_t		arc_meta_limit;
static uint64_t		arc_meta_max = 0;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
    "ARC metadata used");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
    "ARC metadata limit");

typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;
	void			*b_thawed;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	uint32_t		b_flags;
	uint32_t		b_datacnt;

	arc_callback_t		*b_acb;
	kcondvar_t		b_cv;

	/* immutable */
	arc_buf_contents_t	b_type;
	uint64_t		b_size;
	uint64_t		b_spa;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	l2arc_buf_hdr_t		*b_l2hdr;
	list_node_t		b_l2node;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
#ifdef illumos
static void arc_buf_watch(arc_buf_t *buf);
#endif /* illumos */

static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
 */

#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
				    (hdr)->b_l2hdr != NULL)
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)

/*
 * Other sizes
 */

#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx)	(buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2		/* num of writes */
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/*
 * L2ARC Performance Tunables
 */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
    &ARC_anon.arcs_size, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
    &ARC_mru.arcs_size, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mru ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
    &ARC_mfu.arcs_size, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mfu ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
    &ARC_l2c_only.arcs_size, 0, "size of mru state");

/*
 * L2ARC Internals
 */
typedef struct l2arc_dev {
	vdev_t		*l2ad_vdev;	/* vdev */
	spa_t		*l2ad_spa;	/* spa */
	uint64_t	l2ad_hand;	/* next write location */
	uint64_t	l2ad_write;	/* desired write size, bytes */
	uint64_t	l2ad_boost;	/* warmup write boost, bytes */
	uint64_t	l2ad_start;	/* first addr on device */
	uint64_t	l2ad_end;	/* last addr on device */
	uint64_t	l2ad_evict;	/* last addr eviction reached */
	boolean_t	l2ad_first;	/* first sweep through */
	boolean_t	l2ad_writing;	/* currently writing */
	list_t		*l2ad_buflist;	/* buffer list */
	list_node_t	l2ad_node;	/* device list node */
} l2arc_dev_t;

static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_t	*l2rcb_buf;		/* read buffer */
	spa_t		*l2rcb_spa;		/* spa */
	blkptr_t	l2rcb_bp;		/* original blkptr */
	zbookmark_t	l2rcb_zb;		/* original bookmark */
	int		l2rcb_flags;		/* original flags */
} l2arc_read_callback_t;

typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;		/* device info */
	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
} l2arc_write_callback_t;

struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t	*b_dev;			/* L2ARC device */
	uint64_t	b_daddr;		/* disk address, offset byte */
};

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	void		*l2df_data;
	size_t		l2df_size;
	void		(*l2df_func)(void *, size_t);
	list_node_t	l2df_list_node;
} l2arc_data_free_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static void l2arc_read_done(zio_t *zio);
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);

static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spa>>8) ^ birth;

	return (crc);
}

#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0 &&			\
	(buf)->b_birth == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)				\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
	hdr->b_dva.dva_word[0] = 0;
	hdr->b_dva.dva_word[1] = 0;
	hdr->b_birth = 0;
	hdr->b_cksum0 = 0;
}

static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *buf;

	mutex_enter(hash_lock);
	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
	    buf = buf->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}
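/*
 * Sketch of the lookup pattern buf_hash_find() supports (hypothetical
 * caller; the real callers, e.g. arc_read(), are outside this excerpt):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, BP_IDENTITY(bp),
 *	    BP_PHYSICAL_BIRTH(bp), &hash_lock);
 *
 *	if (hdr != NULL) {
 *		(hash_lock is returned held; hdr fields protected by the
 *		hash lock may be examined until it is dropped)
 *		mutex_exit(hash_lock);
 *	}
 *
 * On a miss, NULL is returned and *lockp is set to NULL, so a caller must
 * not try to drop a lock it was never given.
 */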
/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	ASSERT(BUF_EMPTY(buf));
	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
	mutex_destroy(&buf->b_freeze_lock);
	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
	 */
	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}
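/*
 * Worked example of the sizing loop in buf_init() above (illustrative
 * numbers only): with 16 GiB of physical memory, hsize doubles from 2^12
 * until hsize * 64K >= 16 GiB, giving hsize = 2^18 = 262144 buckets, or
 * 262144 * 8 bytes = 2 MiB of pointers.  That matches the "128KB/GB"
 * estimate in the comment; if the allocation fails, the table is retried
 * at half the size.
 */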
#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL ||
	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

static int
arc_cksum_equal(arc_buf_t *buf)
{
	zio_cksum_t zc;
	int equal;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
	mutex_exit(&buf->b_hdr->b_freeze_lock);

	return (equal);
}

static void
arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_freeze_lock);
#ifdef illumos
	arc_buf_watch(buf);
#endif /* illumos */
}

#ifdef illumos
#ifndef _KERNEL
typedef struct procctl {
	long cmd;
	prwatch_t prwatch;
} procctl_t;
#endif

/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int result;
		procctl_t ctl;
		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = 0;
		ctl.prwatch.pr_wflags = 0;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	}
#endif
}

/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int result;
		procctl_t ctl;
		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = buf->b_hdr->b_size;
		ctl.prwatch.pr_wflags = WA_WRITE;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	}
#endif
}
#endif /* illumos */

void
arc_buf_thaw(arc_buf_t *buf)
{
	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (buf->b_hdr->b_state != arc_anon)
			panic("modifying non-anon buffer!");
		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
			panic("modifying buffer while i/o in progress!");
		arc_cksum_verify(buf);
	}

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}

	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (buf->b_hdr->b_thawed)
			kmem_free(buf->b_hdr->b_thawed, 1);
		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
	}

	mutex_exit(&buf->b_hdr->b_freeze_lock);

#ifdef illumos
	arc_buf_unwatch(buf);
#endif /* illumos */
}

void
arc_buf_freeze(arc_buf_t *buf)
{
	kmutex_t *hash_lock;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_state == arc_anon);
	arc_cksum_compute(buf, B_FALSE);
	mutex_exit(hash_lock);

}
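/*
 * The functions above implement a debug-only (ZFS_DEBUG_MODIFY) detector
 * for buffers that are modified after they should have become immutable.
 * A sketch of the intended life cycle, with a hypothetical caller:
 *
 *	arc_buf_freeze(buf);		(checksum the now-stable data)
 *	...
 *	arc_cksum_verify(buf);		(panics if the data changed)
 *
 *	arc_buf_thaw(buf);		(about to legitimately modify an
 *					anonymous buffer: drop the checksum)
 *
 * On illumos userland builds, arc_buf_watch()/arc_buf_unwatch() can also
 * place procfs watchpoints on the frozen region when arc_watch is set.
 */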
static void
get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
{
	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);

	if (ab->b_type == ARC_BUFC_METADATA)
		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
	else {
		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
	}

	*list = &state->arcs_lists[buf_hashid];
	*lock = ARCS_LOCK(state, buf_hashid);
}


static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc_anon)) {
		uint64_t delta = ab->b_size * ab->b_datacnt;
		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
		list_t *list;
		kmutex_t *lock;

		get_buf_info(ab, ab->b_state, &list, &lock);
		ASSERT(!MUTEX_HELD(lock));
		mutex_enter(lock);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(list, ab);
		if (GHOST_STATE(ab->b_state)) {
			ASSERT0(ab->b_datacnt);
			ASSERT3P(ab->b_buf, ==, NULL);
			delta = ab->b_size;
		}
		ASSERT(delta > 0);
		ASSERT3U(*size, >=, delta);
		atomic_add_64(size, -delta);
		mutex_exit(lock);
		/* remove the prefetch flag if we get a reference */
		if (ab->b_flags & ARC_PREFETCH)
			ab->b_flags &= ~ARC_PREFETCH;
	}
}

static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;
	arc_state_t *state = ab->b_state;

	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(state));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (state != arc_anon)) {
		uint64_t *size = &state->arcs_lsize[ab->b_type];
		list_t *list;
		kmutex_t *lock;

		get_buf_info(ab, state, &list, &lock);
		ASSERT(!MUTEX_HELD(lock));
		mutex_enter(lock);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(list, ab);
		ASSERT(ab->b_datacnt > 0);
		atomic_add_64(size, ab->b_size * ab->b_datacnt);
		mutex_exit(lock);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int64_t refcnt = refcount_count(&ab->b_refcnt);
	uint64_t from_delta, to_delta;
	list_t *list;
	kmutex_t *lock;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);

	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon) {
			int use_mutex;
			uint64_t *size = &old_state->arcs_lsize[ab->b_type];

			get_buf_info(ab, old_state, &list, &lock);
			use_mutex = !MUTEX_HELD(lock);
			if (use_mutex)
				mutex_enter(lock);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(list, ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-zero datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(*size, >=, from_delta);
			atomic_add_64(size, -from_delta);

			if (use_mutex)
				mutex_exit(lock);
		}
		if (new_state != arc_anon) {
			int use_mutex;
			uint64_t *size = &new_state->arcs_lsize[ab->b_type];

			get_buf_info(ab, new_state, &list, &lock);
			use_mutex = !MUTEX_HELD(lock);
			if (use_mutex)
				mutex_enter(lock);

			list_insert_head(list, ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(size, to_delta);

			if (use_mutex)
				mutex_exit(lock);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
		buf_hash_remove(ab);

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->arcs_size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->arcs_size, >=, from_delta);
		atomic_add_64(&old_state->arcs_size, -from_delta);
	}
	ab->b_state = new_state;

	/* adjust l2arc hdr stats */
	if (new_state == arc_l2c_only)
		l2arc_hdr_stat_add();
	else if (old_state == arc_l2c_only)
		l2arc_hdr_stat_remove();
}

void
arc_space_consume(uint64_t space, arc_space_type_t type)
{
	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);

	switch (type) {
	case ARC_SPACE_DATA:
		ARCSTAT_INCR(arcstat_data_size, space);
		break;
	case ARC_SPACE_OTHER:
		ARCSTAT_INCR(arcstat_other_size, space);
		break;
	case ARC_SPACE_HDRS:
		ARCSTAT_INCR(arcstat_hdr_size, space);
		break;
	case ARC_SPACE_L2HDRS:
		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
		break;
	}

	atomic_add_64(&arc_meta_used, space);
	atomic_add_64(&arc_size, space);
}

void
arc_space_return(uint64_t space, arc_space_type_t type)
{
	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);

	switch (type) {
	case ARC_SPACE_DATA:
		ARCSTAT_INCR(arcstat_data_size, -space);
		break;
	case ARC_SPACE_OTHER:
		ARCSTAT_INCR(arcstat_other_size, -space);
		break;
	case ARC_SPACE_HDRS:
		ARCSTAT_INCR(arcstat_hdr_size, -space);
		break;
	case ARC_SPACE_L2HDRS:
		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
		break;
	}

	ASSERT(arc_meta_used >= space);
	if (arc_meta_max < arc_meta_used)
		arc_meta_max = arc_meta_used;
	atomic_add_64(&arc_meta_used, -space);
	ASSERT(arc_size >= space);
	atomic_add_64(&arc_size, -space);
}

void *
arc_data_buf_alloc(uint64_t size)
{
	if (arc_evict_needed(ARC_BUFC_DATA))
		cv_signal(&arc_reclaim_thr_cv);
	atomic_add_64(&arc_size, size);
	return (zio_data_buf_alloc(size));
}

void
arc_data_buf_free(void *buf, uint64_t size)
{
	zio_data_buf_free(buf, size);
	ASSERT(arc_size >= size);
	atomic_add_64(&arc_size, -size);
}

arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_type = type;
	hdr->b_spa = spa_load_guid(spa);
	hdr->b_state = arc_anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	return (buf);
}

static char *arc_onloan_tag = "onloan";

/*
 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
 * flight data by arc_tempreserve_space() until they are "returned". Loaned
 * buffers must be returned to the arc before they can be used by the DMU or
 * freed.
 */
arc_buf_t *
arc_loan_buf(spa_t *spa, int size)
{
	arc_buf_t *buf;

	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);

	atomic_add_64(&arc_loaned_bytes, size);
	return (buf);
}

/*
 * Return a loaned arc buffer to the arc.
 */
void
arc_return_buf(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(buf->b_data != NULL);
	(void) refcount_add(&hdr->b_refcnt, tag);
	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);

	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}

/* Detach an arc_buf from a dbuf (tag) */
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr;

	ASSERT(buf->b_data != NULL);
	hdr = buf->b_hdr;
	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
	(void) refcount_remove(&hdr->b_refcnt, tag);
	buf->b_efunc = NULL;
	buf->b_private = NULL;

	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
}
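/*
 * Sketch of the loaned-buffer life cycle described above (hypothetical
 * caller; in practice the DMU borrows buffers this way to fill them before
 * assigning them to a dbuf):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, size);
 *	...				(fill abuf->b_data; the bytes are
 *					accounted in arc_loaned_bytes)
 *	arc_return_buf(abuf, tag);	(hand it back, now referenced by tag)
 *
 * arc_loan_inuse_buf() goes the other way: it converts a buffer already
 * referenced by "tag" back into a loaned buffer, e.g. when detaching an
 * arc_buf from a dbuf.
 */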
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t size = hdr->b_size;

	ASSERT(hdr->b_state != arc_anon);

	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = hdr->b_buf;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	bcopy(from->b_data, buf->b_data, size);

	/*
	 * This buffer already exists in the arc so create a duplicate
	 * copy for the caller.  If the buffer is associated with user data
	 * then track the size and number of duplicates.  These stats will be
	 * updated as duplicate buffers are created and destroyed.
	 */
	if (hdr->b_type == ARC_BUFC_DATA) {
		ARCSTAT_BUMP(arcstat_duplicate_buffers);
		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
	}
	hdr->b_datacnt += 1;
	return (buf);
}

void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	/*
	 * Check to see if this buffer is evicted.  Callers
	 * must verify b_data != NULL to know if the add_ref
	 * was successful.
	 */
	mutex_enter(&buf->b_evict_lock);
	if (buf->b_data == NULL) {
		mutex_exit(&buf->b_evict_lock);
		return;
	}
	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	mutex_exit(&buf->b_evict_lock);

	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
	add_reference(hdr, hash_lock, tag);
	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	ARCSTAT_BUMP(arcstat_hits);
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);
}

/*
 * Free the arc data buffer.  If it is an l2arc write in progress,
 * the buffer is placed on l2arc_free_on_write to be freed later.
 */
static void
arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	if (HDR_L2_WRITING(hdr)) {
		l2arc_data_free_t *df;
		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
		df->l2df_data = buf->b_data;
		df->l2df_size = hdr->b_size;
		df->l2df_func = free_func;
		mutex_enter(&l2arc_free_on_write_mtx);
		list_insert_head(l2arc_free_on_write, df);
		mutex_exit(&l2arc_free_on_write_mtx);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else {
		free_func(buf->b_data, hdr->b_size);
	}
}

static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);
#ifdef illumos
		arc_buf_unwatch(buf);
#endif /* illumos */

		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				arc_buf_data_free(buf, zio_buf_free);
				arc_space_return(size, ARC_SPACE_DATA);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				arc_buf_data_free(buf, zio_data_buf_free);
				ARCSTAT_INCR(arcstat_data_size, -size);
				atomic_add_64(&arc_size, -size);
			}
		}
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;

		/*
		 * If we're destroying a duplicate buffer make sure
		 * that the appropriate statistics are updated.
		 */
		if (buf->b_hdr->b_datacnt > 1 &&
		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
		}
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;
	buf->b_next = NULL;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;

	if (l2hdr != NULL) {
		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
		/*
		 * To prevent arc_free() and l2arc_evict() from
		 * attempting to free the same buffer at the same time,
		 * a FREE_IN_PROGRESS flag is given to arc_free() to
		 * give it priority.  l2arc_evict() can't destroy this
		 * header while we are waiting on l2arc_buflist_mtx.
		 *
		 * The hdr may be removed from l2ad_buflist before we
		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
		 */
		if (!buflist_held) {
			mutex_enter(&l2arc_buflist_mtx);
			l2hdr = hdr->b_l2hdr;
		}

		if (l2hdr != NULL) {
			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
			if (hdr->b_state == arc_l2c_only)
				l2arc_hdr_stat_remove();
			hdr->b_l2hdr = NULL;
		}

		if (!buflist_held)
			mutex_exit(&l2arc_buflist_mtx);
	}

	if (!BUF_EMPTY(hdr)) {
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		buf_discard_identity(hdr);
	}
	while (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		if (buf->b_efunc) {
			mutex_enter(&arc_eviction_mtx);
			mutex_enter(&buf->b_evict_lock);
			ASSERT(buf->b_hdr != NULL);
			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
			hdr->b_buf = buf->b_next;
			buf->b_hdr = &arc_eviction_hdr;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			mutex_exit(&buf->b_evict_lock);
			mutex_exit(&arc_eviction_mtx);
		} else {
			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
		}
	}
	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}
	if (hdr->b_thawed) {
		kmem_free(hdr->b_thawed, 1);
		hdr->b_thawed = NULL;
	}

	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		hdr = buf->b_hdr;
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1) {
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			ASSERT(buf == hdr->b_buf);
			ASSERT(buf->b_efunc == NULL);
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		}
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			arc_hdr_destroy(hdr);
	}
}

int
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int no_callback = (buf->b_efunc == NULL);

	if (hdr->b_state == arc_anon) {
		ASSERT(hdr->b_datacnt == 1);
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	ASSERT(hdr->b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, FALSE, TRUE);
	} else if (no_callback) {
		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
		ASSERT(buf->b_efunc == NULL);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Called from the DMU to determine if the current buffer should be
 * evicted. In order to ensure proper locking, the eviction must be initiated
 * from the DMU. Return true if the buffer is associated with user data and
 * duplicate buffers still exist.
 */
boolean_t
arc_buf_eviction_needed(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	boolean_t evict_needed = B_FALSE;

	if (zfs_disable_dup_eviction)
		return (B_FALSE);

	mutex_enter(&buf->b_evict_lock);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		/*
		 * We are in arc_do_user_evicts(); let that function
		 * perform the eviction.
		 */
		ASSERT(buf->b_data == NULL);
		mutex_exit(&buf->b_evict_lock);
		return (B_FALSE);
	} else if (buf->b_data == NULL) {
		/*
		 * We have already been added to the arc eviction list;
		 * recommend eviction.
		 */
		ASSERT3P(hdr, ==, &arc_eviction_hdr);
		mutex_exit(&buf->b_evict_lock);
		return (B_TRUE);
	}

	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
		evict_needed = B_TRUE;

	mutex_exit(&buf->b_evict_lock);
	return (evict_needed);
}
It skips over any buffers 1882 * it can't get a hash_lock on, and so may not catch all candidates. 1883 * It may also return without evicting as much space as requested. 1884 */ 1885static void * 1886arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 1887 arc_buf_contents_t type) 1888{ 1889 arc_state_t *evicted_state; 1890 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1891 int64_t bytes_remaining; 1892 arc_buf_hdr_t *ab, *ab_prev = NULL; 1893 list_t *evicted_list, *list, *evicted_list_start, *list_start; 1894 kmutex_t *lock, *evicted_lock; 1895 kmutex_t *hash_lock; 1896 boolean_t have_lock; 1897 void *stolen = NULL; 1898 static int evict_metadata_offset, evict_data_offset; 1899 int i, idx, offset, list_count, count; 1900 1901 ASSERT(state == arc_mru || state == arc_mfu); 1902 1903 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1904 1905 if (type == ARC_BUFC_METADATA) { 1906 offset = 0; 1907 list_count = ARC_BUFC_NUMMETADATALISTS; 1908 list_start = &state->arcs_lists[0]; 1909 evicted_list_start = &evicted_state->arcs_lists[0]; 1910 idx = evict_metadata_offset; 1911 } else { 1912 offset = ARC_BUFC_NUMMETADATALISTS; 1913 list_start = &state->arcs_lists[offset]; 1914 evicted_list_start = &evicted_state->arcs_lists[offset]; 1915 list_count = ARC_BUFC_NUMDATALISTS; 1916 idx = evict_data_offset; 1917 } 1918 bytes_remaining = evicted_state->arcs_lsize[type]; 1919 count = 0; 1920 1921evict_start: 1922 list = &list_start[idx]; 1923 evicted_list = &evicted_list_start[idx]; 1924 lock = ARCS_LOCK(state, (offset + idx)); 1925 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 1926 1927 mutex_enter(lock); 1928 mutex_enter(evicted_lock); 1929 1930 for (ab = list_tail(list); ab; ab = ab_prev) { 1931 ab_prev = list_prev(list, ab); 1932 bytes_remaining -= (ab->b_size * ab->b_datacnt); 1933 /* prefetch buffers have a minimum lifespan */ 1934 if (HDR_IO_IN_PROGRESS(ab) || 1935 (spa && ab->b_spa != spa) || 1936 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1937 ddi_get_lbolt() - ab->b_arc_access < 1938 arc_min_prefetch_lifespan)) { 1939 skipped++; 1940 continue; 1941 } 1942 /* "lookahead" for better eviction candidate */ 1943 if (recycle && ab->b_size != bytes && 1944 ab_prev && ab_prev->b_size == bytes) 1945 continue; 1946 hash_lock = HDR_LOCK(ab); 1947 have_lock = MUTEX_HELD(hash_lock); 1948 if (have_lock || mutex_tryenter(hash_lock)) { 1949 ASSERT0(refcount_count(&ab->b_refcnt)); 1950 ASSERT(ab->b_datacnt > 0); 1951 while (ab->b_buf) { 1952 arc_buf_t *buf = ab->b_buf; 1953 if (!mutex_tryenter(&buf->b_evict_lock)) { 1954 missed += 1; 1955 break; 1956 } 1957 if (buf->b_data) { 1958 bytes_evicted += ab->b_size; 1959 if (recycle && ab->b_type == type && 1960 ab->b_size == bytes && 1961 !HDR_L2_WRITING(ab)) { 1962 stolen = buf->b_data; 1963 recycle = FALSE; 1964 } 1965 } 1966 if (buf->b_efunc) { 1967 mutex_enter(&arc_eviction_mtx); 1968 arc_buf_destroy(buf, 1969 buf->b_data == stolen, FALSE); 1970 ab->b_buf = buf->b_next; 1971 buf->b_hdr = &arc_eviction_hdr; 1972 buf->b_next = arc_eviction_list; 1973 arc_eviction_list = buf; 1974 mutex_exit(&arc_eviction_mtx); 1975 mutex_exit(&buf->b_evict_lock); 1976 } else { 1977 mutex_exit(&buf->b_evict_lock); 1978 arc_buf_destroy(buf, 1979 buf->b_data == stolen, TRUE); 1980 } 1981 } 1982 1983 if (ab->b_l2hdr) { 1984 ARCSTAT_INCR(arcstat_evict_l2_cached, 1985 ab->b_size); 1986 } else { 1987 if (l2arc_write_eligible(ab->b_spa, ab)) { 1988 ARCSTAT_INCR(arcstat_evict_l2_eligible, 1989 ab->b_size); 1990 } else { 1991 
ARCSTAT_INCR( 1992 arcstat_evict_l2_ineligible, 1993 ab->b_size); 1994 } 1995 } 1996 1997 if (ab->b_datacnt == 0) { 1998 arc_change_state(evicted_state, ab, hash_lock); 1999 ASSERT(HDR_IN_HASH_TABLE(ab)); 2000 ab->b_flags |= ARC_IN_HASH_TABLE; 2001 ab->b_flags &= ~ARC_BUF_AVAILABLE; 2002 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 2003 } 2004 if (!have_lock) 2005 mutex_exit(hash_lock); 2006 if (bytes >= 0 && bytes_evicted >= bytes) 2007 break; 2008 if (bytes_remaining > 0) { 2009 mutex_exit(evicted_lock); 2010 mutex_exit(lock); 2011 idx = ((idx + 1) & (list_count - 1)); 2012 count++; 2013 goto evict_start; 2014 } 2015 } else { 2016 missed += 1; 2017 } 2018 } 2019 2020 mutex_exit(evicted_lock); 2021 mutex_exit(lock); 2022 2023 idx = ((idx + 1) & (list_count - 1)); 2024 count++; 2025 2026 if (bytes_evicted < bytes) { 2027 if (count < list_count) 2028 goto evict_start; 2029 else 2030 dprintf("only evicted %lld bytes from %x", 2031 (longlong_t)bytes_evicted, state); 2032 } 2033 if (type == ARC_BUFC_METADATA) 2034 evict_metadata_offset = idx; 2035 else 2036 evict_data_offset = idx; 2037 2038 if (skipped) 2039 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2040 2041 if (missed) 2042 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2043 2044 /* 2045 * We have just evicted some date into the ghost state, make 2046 * sure we also adjust the ghost state size if necessary. 2047 */ 2048 if (arc_no_grow && 2049 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 2050 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 2051 arc_mru_ghost->arcs_size - arc_c; 2052 2053 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 2054 int64_t todelete = 2055 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 2056 arc_evict_ghost(arc_mru_ghost, 0, todelete); 2057 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 2058 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 2059 arc_mru_ghost->arcs_size + 2060 arc_mfu_ghost->arcs_size - arc_c); 2061 arc_evict_ghost(arc_mfu_ghost, 0, todelete); 2062 } 2063 } 2064 if (stolen) 2065 ARCSTAT_BUMP(arcstat_stolen); 2066 2067 return (stolen); 2068} 2069 2070/* 2071 * Remove buffers from list until we've removed the specified number of 2072 * bytes. Destroy the buffers that are removed. 
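 *
 * Headers on a ghost list carry no data (their b_buf is NULL), so only
 * the headers themselves are destroyed or, for blocks still cached in
 * the L2ARC, moved to the arc_l2c_only state.  A spa of 0 means "any
 * pool" and a negative byte count means "evict everything evictable";
 * for example, arc_flush() calls arc_evict_ghost(arc_mru_ghost, guid, -1),
 * while arc_adjust() passes a bounded delta such as
 * arc_evict_ghost(arc_mru_ghost, 0, delta).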
2073 */ 2074static void 2075arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2076{ 2077 arc_buf_hdr_t *ab, *ab_prev; 2078 arc_buf_hdr_t marker = { 0 }; 2079 list_t *list, *list_start; 2080 kmutex_t *hash_lock, *lock; 2081 uint64_t bytes_deleted = 0; 2082 uint64_t bufs_skipped = 0; 2083 static int evict_offset; 2084 int list_count, idx = evict_offset; 2085 int offset, count = 0; 2086 2087 ASSERT(GHOST_STATE(state)); 2088 2089 /* 2090 * data lists come after metadata lists 2091 */ 2092 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2093 list_count = ARC_BUFC_NUMDATALISTS; 2094 offset = ARC_BUFC_NUMMETADATALISTS; 2095 2096evict_start: 2097 list = &list_start[idx]; 2098 lock = ARCS_LOCK(state, idx + offset); 2099 2100 mutex_enter(lock); 2101 for (ab = list_tail(list); ab; ab = ab_prev) { 2102 ab_prev = list_prev(list, ab); 2103 if (spa && ab->b_spa != spa) 2104 continue; 2105 2106 /* ignore markers */ 2107 if (ab->b_spa == 0) 2108 continue; 2109 2110 hash_lock = HDR_LOCK(ab); 2111 /* caller may be trying to modify this buffer, skip it */ 2112 if (MUTEX_HELD(hash_lock)) 2113 continue; 2114 if (mutex_tryenter(hash_lock)) { 2115 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 2116 ASSERT(ab->b_buf == NULL); 2117 ARCSTAT_BUMP(arcstat_deleted); 2118 bytes_deleted += ab->b_size; 2119 2120 if (ab->b_l2hdr != NULL) { 2121 /* 2122 * This buffer is cached on the 2nd Level ARC; 2123 * don't destroy the header. 2124 */ 2125 arc_change_state(arc_l2c_only, ab, hash_lock); 2126 mutex_exit(hash_lock); 2127 } else { 2128 arc_change_state(arc_anon, ab, hash_lock); 2129 mutex_exit(hash_lock); 2130 arc_hdr_destroy(ab); 2131 } 2132 2133 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 2134 if (bytes >= 0 && bytes_deleted >= bytes) 2135 break; 2136 } else if (bytes < 0) { 2137 /* 2138 * Insert a list marker and then wait for the 2139 * hash lock to become available. Once its 2140 * available, restart from where we left off. 
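 *
 * The marker is a stack-allocated header with b_spa == 0, which the
 * scan above deliberately skips; dropping the list lock here is
 * therefore safe, and once it is re-acquired we resume from
 * list_prev(list, &marker) and unlink the marker again.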
2141 */ 2142 list_insert_after(list, ab, &marker); 2143 mutex_exit(lock); 2144 mutex_enter(hash_lock); 2145 mutex_exit(hash_lock); 2146 mutex_enter(lock); 2147 ab_prev = list_prev(list, &marker); 2148 list_remove(list, &marker); 2149 } else 2150 bufs_skipped += 1; 2151 } 2152 mutex_exit(lock); 2153 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2154 count++; 2155 2156 if (count < list_count) 2157 goto evict_start; 2158 2159 evict_offset = idx; 2160 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2161 (bytes < 0 || bytes_deleted < bytes)) { 2162 list_start = &state->arcs_lists[0]; 2163 list_count = ARC_BUFC_NUMMETADATALISTS; 2164 offset = count = 0; 2165 goto evict_start; 2166 } 2167 2168 if (bufs_skipped) { 2169 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2170 ASSERT(bytes >= 0); 2171 } 2172 2173 if (bytes_deleted < bytes) 2174 dprintf("only deleted %lld bytes from %p", 2175 (longlong_t)bytes_deleted, state); 2176} 2177 2178static void 2179arc_adjust(void) 2180{ 2181 int64_t adjustment, delta; 2182 2183 /* 2184 * Adjust MRU size 2185 */ 2186 2187 adjustment = MIN((int64_t)(arc_size - arc_c), 2188 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2189 arc_p)); 2190 2191 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2192 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2193 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2194 adjustment -= delta; 2195 } 2196 2197 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2198 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2199 (void) arc_evict(arc_mru, 0, delta, FALSE, 2200 ARC_BUFC_METADATA); 2201 } 2202 2203 /* 2204 * Adjust MFU size 2205 */ 2206 2207 adjustment = arc_size - arc_c; 2208 2209 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2210 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2211 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2212 adjustment -= delta; 2213 } 2214 2215 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2216 int64_t delta = MIN(adjustment, 2217 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2218 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2219 ARC_BUFC_METADATA); 2220 } 2221 2222 /* 2223 * Adjust ghost lists 2224 */ 2225 2226 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2227 2228 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2229 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2230 arc_evict_ghost(arc_mru_ghost, 0, delta); 2231 } 2232 2233 adjustment = 2234 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2235 2236 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2237 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2238 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2239 } 2240} 2241 2242static void 2243arc_do_user_evicts(void) 2244{ 2245 static arc_buf_t *tmp_arc_eviction_list; 2246 2247 /* 2248 * Move list over to avoid LOR 2249 */ 2250restart: 2251 mutex_enter(&arc_eviction_mtx); 2252 tmp_arc_eviction_list = arc_eviction_list; 2253 arc_eviction_list = NULL; 2254 mutex_exit(&arc_eviction_mtx); 2255 2256 while (tmp_arc_eviction_list != NULL) { 2257 arc_buf_t *buf = tmp_arc_eviction_list; 2258 tmp_arc_eviction_list = buf->b_next; 2259 mutex_enter(&buf->b_evict_lock); 2260 buf->b_hdr = NULL; 2261 mutex_exit(&buf->b_evict_lock); 2262 2263 if (buf->b_efunc != NULL) 2264 VERIFY(buf->b_efunc(buf) == 0); 2265 2266 buf->b_efunc = NULL; 2267 buf->b_private = NULL; 2268 
kmem_cache_free(buf_cache, buf); 2269 } 2270 2271 if (arc_eviction_list != NULL) 2272 goto restart; 2273} 2274 2275/* 2276 * Flush all *evictable* data from the cache for the given spa. 2277 * NOTE: this will not touch "active" (i.e. referenced) data. 2278 */ 2279void 2280arc_flush(spa_t *spa) 2281{ 2282 uint64_t guid = 0; 2283 2284 if (spa) 2285 guid = spa_load_guid(spa); 2286 2287 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2288 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2289 if (spa) 2290 break; 2291 } 2292 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2293 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2294 if (spa) 2295 break; 2296 } 2297 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2298 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2299 if (spa) 2300 break; 2301 } 2302 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2303 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2304 if (spa) 2305 break; 2306 } 2307 2308 arc_evict_ghost(arc_mru_ghost, guid, -1); 2309 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2310 2311 mutex_enter(&arc_reclaim_thr_lock); 2312 arc_do_user_evicts(); 2313 mutex_exit(&arc_reclaim_thr_lock); 2314 ASSERT(spa || arc_eviction_list == NULL); 2315} 2316 2317void 2318arc_shrink(void) 2319{ 2320 if (arc_c > arc_c_min) { 2321 uint64_t to_free; 2322 2323#ifdef _KERNEL 2324 to_free = arc_c >> arc_shrink_shift; 2325#else 2326 to_free = arc_c >> arc_shrink_shift; 2327#endif 2328 if (arc_c > arc_c_min + to_free) 2329 atomic_add_64(&arc_c, -to_free); 2330 else 2331 arc_c = arc_c_min; 2332 2333 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2334 if (arc_c > arc_size) 2335 arc_c = MAX(arc_size, arc_c_min); 2336 if (arc_p > arc_c) 2337 arc_p = (arc_c >> 1); 2338 ASSERT(arc_c >= arc_c_min); 2339 ASSERT((int64_t)arc_p >= 0); 2340 } 2341 2342 if (arc_size > arc_c) 2343 arc_adjust(); 2344} 2345 2346static int needfree = 0; 2347 2348static int 2349arc_reclaim_needed(void) 2350{ 2351 2352#ifdef _KERNEL 2353 2354 if (needfree) 2355 return (1); 2356 2357 /* 2358 * Cooperate with pagedaemon when it's time for it to scan 2359 * and reclaim some pages. 2360 */ 2361 if (vm_paging_needed()) 2362 return (1); 2363 2364#ifdef sun 2365 /* 2366 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2367 */ 2368 extra = desfree; 2369 2370 /* 2371 * check that we're out of range of the pageout scanner. It starts to 2372 * schedule paging if freemem is less than lotsfree and needfree. 2373 * lotsfree is the high-water mark for pageout, and needfree is the 2374 * number of needed free pages. We add extra pages here to make sure 2375 * the scanner doesn't start up while we're freeing memory. 2376 */ 2377 if (freemem < lotsfree + needfree + extra) 2378 return (1); 2379 2380 /* 2381 * check to make sure that swapfs has enough space so that anon 2382 * reservations can still succeed. anon_resvmem() checks that the 2383 * availrmem is greater than swapfs_minfree, and the number of reserved 2384 * swap pages. We also add a bit of extra here just to prevent 2385 * circumstances from getting really dire. 2386 */ 2387 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2388 return (1); 2389 2390#if defined(__i386) 2391 /* 2392 * If we're on an i386 platform, it's possible that we'll exhaust the 2393 * kernel heap space before we ever run out of available physical 2394 * memory. 
Most checks of the size of the heap_area compare against 2395 * tune.t_minarmem, which is the minimum available real memory that we 2396 * can have in the system. However, this is generally fixed at 25 pages 2397 * which is so low that it's useless. In this comparison, we seek to 2398 * calculate the total heap-size, and reclaim if more than 3/4ths of the 2399 * heap is allocated. (Or, in the calculation, if less than 1/4th is 2400 * free) 2401 */ 2402 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 2403 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 2404 return (1); 2405#endif 2406#else /* !sun */ 2407 if (kmem_used() > (kmem_size() * 3) / 4) 2408 return (1); 2409#endif /* sun */ 2410 2411#else 2412 if (spa_get_random(100) == 0) 2413 return (1); 2414#endif 2415 return (0); 2416} 2417 2418extern kmem_cache_t *zio_buf_cache[]; 2419extern kmem_cache_t *zio_data_buf_cache[]; 2420 2421static void 2422arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2423{ 2424 size_t i; 2425 kmem_cache_t *prev_cache = NULL; 2426 kmem_cache_t *prev_data_cache = NULL; 2427 2428#ifdef _KERNEL 2429 if (arc_meta_used >= arc_meta_limit) { 2430 /* 2431 * We are exceeding our meta-data cache limit. 2432 * Purge some DNLC entries to release holds on meta-data. 2433 */ 2434 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 2435 } 2436#if defined(__i386) 2437 /* 2438 * Reclaim unused memory from all kmem caches. 2439 */ 2440 kmem_reap(); 2441#endif 2442#endif 2443 2444 /* 2445 * An aggressive reclamation will shrink the cache size as well as 2446 * reap free buffers from the arc kmem caches. 2447 */ 2448 if (strat == ARC_RECLAIM_AGGR) 2449 arc_shrink(); 2450 2451 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2452 if (zio_buf_cache[i] != prev_cache) { 2453 prev_cache = zio_buf_cache[i]; 2454 kmem_cache_reap_now(zio_buf_cache[i]); 2455 } 2456 if (zio_data_buf_cache[i] != prev_data_cache) { 2457 prev_data_cache = zio_data_buf_cache[i]; 2458 kmem_cache_reap_now(zio_data_buf_cache[i]); 2459 } 2460 } 2461 kmem_cache_reap_now(buf_cache); 2462 kmem_cache_reap_now(hdr_cache); 2463} 2464 2465static void 2466arc_reclaim_thread(void *dummy __unused) 2467{ 2468 clock_t growtime = 0; 2469 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2470 callb_cpr_t cpr; 2471 2472 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2473 2474 mutex_enter(&arc_reclaim_thr_lock); 2475 while (arc_thread_exit == 0) { 2476 if (arc_reclaim_needed()) { 2477 2478 if (arc_no_grow) { 2479 if (last_reclaim == ARC_RECLAIM_CONS) { 2480 last_reclaim = ARC_RECLAIM_AGGR; 2481 } else { 2482 last_reclaim = ARC_RECLAIM_CONS; 2483 } 2484 } else { 2485 arc_no_grow = TRUE; 2486 last_reclaim = ARC_RECLAIM_AGGR; 2487 membar_producer(); 2488 } 2489 2490 /* reset the growth delay for every reclaim */ 2491 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2492 2493 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 2494 /* 2495 * If needfree is TRUE our vm_lowmem hook 2496 * was called and in that case we must free some 2497 * memory, so switch to aggressive mode. 
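 * (Aggressive mode additionally makes arc_kmem_reap_now() call
 * arc_shrink() before reaping the kmem caches, so the target
 * size is reduced as well.)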
2498 */ 2499 arc_no_grow = TRUE; 2500 last_reclaim = ARC_RECLAIM_AGGR; 2501 } 2502 arc_kmem_reap_now(last_reclaim); 2503 arc_warm = B_TRUE; 2504 2505 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2506 arc_no_grow = FALSE; 2507 } 2508 2509 arc_adjust(); 2510 2511 if (arc_eviction_list != NULL) 2512 arc_do_user_evicts(); 2513 2514#ifdef _KERNEL 2515 if (needfree) { 2516 needfree = 0; 2517 wakeup(&needfree); 2518 } 2519#endif 2520 2521 /* block until needed, or one second, whichever is shorter */ 2522 CALLB_CPR_SAFE_BEGIN(&cpr); 2523 (void) cv_timedwait(&arc_reclaim_thr_cv, 2524 &arc_reclaim_thr_lock, hz); 2525 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2526 } 2527 2528 arc_thread_exit = 0; 2529 cv_broadcast(&arc_reclaim_thr_cv); 2530 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2531 thread_exit(); 2532} 2533 2534/* 2535 * Adapt arc info given the number of bytes we are trying to add and 2536 * the state that we are comming from. This function is only called 2537 * when we are adding new content to the cache. 2538 */ 2539static void 2540arc_adapt(int bytes, arc_state_t *state) 2541{ 2542 int mult; 2543 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2544 2545 if (state == arc_l2c_only) 2546 return; 2547 2548 ASSERT(bytes > 0); 2549 /* 2550 * Adapt the target size of the MRU list: 2551 * - if we just hit in the MRU ghost list, then increase 2552 * the target size of the MRU list. 2553 * - if we just hit in the MFU ghost list, then increase 2554 * the target size of the MFU list by decreasing the 2555 * target size of the MRU list. 2556 */ 2557 if (state == arc_mru_ghost) { 2558 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 2559 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2560 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 2561 2562 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2563 } else if (state == arc_mfu_ghost) { 2564 uint64_t delta; 2565 2566 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 2567 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2568 mult = MIN(mult, 10); 2569 2570 delta = MIN(bytes * mult, arc_p); 2571 arc_p = MAX(arc_p_min, arc_p - delta); 2572 } 2573 ASSERT((int64_t)arc_p >= 0); 2574 2575 if (arc_reclaim_needed()) { 2576 cv_signal(&arc_reclaim_thr_cv); 2577 return; 2578 } 2579 2580 if (arc_no_grow) 2581 return; 2582 2583 if (arc_c >= arc_c_max) 2584 return; 2585 2586 /* 2587 * If we're within (2 * maxblocksize) bytes of the target 2588 * cache size, increment the target cache size 2589 */ 2590 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2591 atomic_add_64(&arc_c, (int64_t)bytes); 2592 if (arc_c > arc_c_max) 2593 arc_c = arc_c_max; 2594 else if (state == arc_anon) 2595 atomic_add_64(&arc_p, (int64_t)bytes); 2596 if (arc_p > arc_c) 2597 arc_p = arc_c; 2598 } 2599 ASSERT((int64_t)arc_p >= 0); 2600} 2601 2602/* 2603 * Check if the cache has reached its limits and eviction is required 2604 * prior to insert. 2605 */ 2606static int 2607arc_evict_needed(arc_buf_contents_t type) 2608{ 2609 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2610 return (1); 2611 2612#ifdef sun 2613#ifdef _KERNEL 2614 /* 2615 * If zio data pages are being allocated out of a separate heap segment, 2616 * then enforce that the size of available vmem for this area remains 2617 * above about 1/32nd free. 
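 * As a rough worked example: with 32GB allocated from zio_arena, the
 * check below starts forcing eviction once less than about 1GB
 * (1/32nd of the allocated size) of that arena remains free.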
2618 */ 2619 if (type == ARC_BUFC_DATA && zio_arena != NULL && 2620 vmem_size(zio_arena, VMEM_FREE) < 2621 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 2622 return (1); 2623#endif 2624#endif /* sun */ 2625 2626 if (arc_reclaim_needed()) 2627 return (1); 2628 2629 return (arc_size > arc_c); 2630} 2631 2632/* 2633 * The buffer, supplied as the first argument, needs a data block. 2634 * So, if we are at cache max, determine which cache should be victimized. 2635 * We have the following cases: 2636 * 2637 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2638 * In this situation if we're out of space, but the resident size of the MFU is 2639 * under the limit, victimize the MFU cache to satisfy this insertion request. 2640 * 2641 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2642 * Here, we've used up all of the available space for the MRU, so we need to 2643 * evict from our own cache instead. Evict from the set of resident MRU 2644 * entries. 2645 * 2646 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2647 * c minus p represents the MFU space in the cache, since p is the size of the 2648 * cache that is dedicated to the MRU. In this situation there's still space on 2649 * the MFU side, so the MRU side needs to be victimized. 2650 * 2651 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2652 * MFU's resident set is consuming more space than it has been allotted. In 2653 * this situation, we must victimize our own cache, the MFU, for this insertion. 2654 */ 2655static void 2656arc_get_data_buf(arc_buf_t *buf) 2657{ 2658 arc_state_t *state = buf->b_hdr->b_state; 2659 uint64_t size = buf->b_hdr->b_size; 2660 arc_buf_contents_t type = buf->b_hdr->b_type; 2661 2662 arc_adapt(size, state); 2663 2664 /* 2665 * We have not yet reached cache maximum size, 2666 * just allocate a new buffer. 2667 */ 2668 if (!arc_evict_needed(type)) { 2669 if (type == ARC_BUFC_METADATA) { 2670 buf->b_data = zio_buf_alloc(size); 2671 arc_space_consume(size, ARC_SPACE_DATA); 2672 } else { 2673 ASSERT(type == ARC_BUFC_DATA); 2674 buf->b_data = zio_data_buf_alloc(size); 2675 ARCSTAT_INCR(arcstat_data_size, size); 2676 atomic_add_64(&arc_size, size); 2677 } 2678 goto out; 2679 } 2680 2681 /* 2682 * If we are prefetching from the mfu ghost list, this buffer 2683 * will end up on the mru list; so steal space from there. 2684 */ 2685 if (state == arc_mfu_ghost) 2686 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2687 else if (state == arc_mru_ghost) 2688 state = arc_mru; 2689 2690 if (state == arc_mru || state == arc_anon) { 2691 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2692 state = (arc_mfu->arcs_lsize[type] >= size && 2693 arc_p > mru_used) ? arc_mfu : arc_mru; 2694 } else { 2695 /* MFU cases */ 2696 uint64_t mfu_space = arc_c - arc_p; 2697 state = (arc_mru->arcs_lsize[type] >= size && 2698 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2699 } 2700 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 2701 if (type == ARC_BUFC_METADATA) { 2702 buf->b_data = zio_buf_alloc(size); 2703 arc_space_consume(size, ARC_SPACE_DATA); 2704 } else { 2705 ASSERT(type == ARC_BUFC_DATA); 2706 buf->b_data = zio_data_buf_alloc(size); 2707 ARCSTAT_INCR(arcstat_data_size, size); 2708 atomic_add_64(&arc_size, size); 2709 } 2710 ARCSTAT_BUMP(arcstat_recycle_miss); 2711 } 2712 ASSERT(buf->b_data != NULL); 2713out: 2714 /* 2715 * Update the state size. Note that ghost states have a 2716 * "ghost size" and so don't need to be updated. 
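 * (A header sitting in a ghost state has no resident data buffer, so
 * there would be nothing to account for here anyway.)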
2717 */ 2718 if (!GHOST_STATE(buf->b_hdr->b_state)) { 2719 arc_buf_hdr_t *hdr = buf->b_hdr; 2720 2721 atomic_add_64(&hdr->b_state->arcs_size, size); 2722 if (list_link_active(&hdr->b_arc_node)) { 2723 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2724 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2725 } 2726 /* 2727 * If we are growing the cache, and we are adding anonymous 2728 * data, and we have outgrown arc_p, update arc_p 2729 */ 2730 if (arc_size < arc_c && hdr->b_state == arc_anon && 2731 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2732 arc_p = MIN(arc_c, arc_p + size); 2733 } 2734 ARCSTAT_BUMP(arcstat_allocated); 2735} 2736 2737/* 2738 * This routine is called whenever a buffer is accessed. 2739 * NOTE: the hash lock is dropped in this function. 2740 */ 2741static void 2742arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2743{ 2744 clock_t now; 2745 2746 ASSERT(MUTEX_HELD(hash_lock)); 2747 2748 if (buf->b_state == arc_anon) { 2749 /* 2750 * This buffer is not in the cache, and does not 2751 * appear in our "ghost" list. Add the new buffer 2752 * to the MRU state. 2753 */ 2754 2755 ASSERT(buf->b_arc_access == 0); 2756 buf->b_arc_access = ddi_get_lbolt(); 2757 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2758 arc_change_state(arc_mru, buf, hash_lock); 2759 2760 } else if (buf->b_state == arc_mru) { 2761 now = ddi_get_lbolt(); 2762 2763 /* 2764 * If this buffer is here because of a prefetch, then either: 2765 * - clear the flag if this is a "referencing" read 2766 * (any subsequent access will bump this into the MFU state). 2767 * or 2768 * - move the buffer to the head of the list if this is 2769 * another prefetch (to make it less likely to be evicted). 2770 */ 2771 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2772 if (refcount_count(&buf->b_refcnt) == 0) { 2773 ASSERT(list_link_active(&buf->b_arc_node)); 2774 } else { 2775 buf->b_flags &= ~ARC_PREFETCH; 2776 ARCSTAT_BUMP(arcstat_mru_hits); 2777 } 2778 buf->b_arc_access = now; 2779 return; 2780 } 2781 2782 /* 2783 * This buffer has been "accessed" only once so far, 2784 * but it is still in the cache. Move it to the MFU 2785 * state. 2786 */ 2787 if (now > buf->b_arc_access + ARC_MINTIME) { 2788 /* 2789 * More than 125ms have passed since we 2790 * instantiated this buffer. Move it to the 2791 * most frequently used state. 2792 */ 2793 buf->b_arc_access = now; 2794 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2795 arc_change_state(arc_mfu, buf, hash_lock); 2796 } 2797 ARCSTAT_BUMP(arcstat_mru_hits); 2798 } else if (buf->b_state == arc_mru_ghost) { 2799 arc_state_t *new_state; 2800 /* 2801 * This buffer has been "accessed" recently, but 2802 * was evicted from the cache. Move it to the 2803 * MFU state. 2804 */ 2805 2806 if (buf->b_flags & ARC_PREFETCH) { 2807 new_state = arc_mru; 2808 if (refcount_count(&buf->b_refcnt) > 0) 2809 buf->b_flags &= ~ARC_PREFETCH; 2810 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2811 } else { 2812 new_state = arc_mfu; 2813 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2814 } 2815 2816 buf->b_arc_access = ddi_get_lbolt(); 2817 arc_change_state(new_state, buf, hash_lock); 2818 2819 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2820 } else if (buf->b_state == arc_mfu) { 2821 /* 2822 * This buffer has been accessed more than once and is 2823 * still in the cache. Keep it in the MFU state. 2824 * 2825 * NOTE: an add_reference() that occurred when we did 2826 * the arc_read() will have kicked this off the list. 
2827 * If it was a prefetch, we will explicitly move it to 2828 * the head of the list now. 2829 */ 2830 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2831 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2832 ASSERT(list_link_active(&buf->b_arc_node)); 2833 } 2834 ARCSTAT_BUMP(arcstat_mfu_hits); 2835 buf->b_arc_access = ddi_get_lbolt(); 2836 } else if (buf->b_state == arc_mfu_ghost) { 2837 arc_state_t *new_state = arc_mfu; 2838 /* 2839 * This buffer has been accessed more than once but has 2840 * been evicted from the cache. Move it back to the 2841 * MFU state. 2842 */ 2843 2844 if (buf->b_flags & ARC_PREFETCH) { 2845 /* 2846 * This is a prefetch access... 2847 * move this block back to the MRU state. 2848 */ 2849 ASSERT0(refcount_count(&buf->b_refcnt)); 2850 new_state = arc_mru; 2851 } 2852 2853 buf->b_arc_access = ddi_get_lbolt(); 2854 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2855 arc_change_state(new_state, buf, hash_lock); 2856 2857 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2858 } else if (buf->b_state == arc_l2c_only) { 2859 /* 2860 * This buffer is on the 2nd Level ARC. 2861 */ 2862 2863 buf->b_arc_access = ddi_get_lbolt(); 2864 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2865 arc_change_state(arc_mfu, buf, hash_lock); 2866 } else { 2867 ASSERT(!"invalid arc state"); 2868 } 2869} 2870 2871/* a generic arc_done_func_t which you can use */ 2872/* ARGSUSED */ 2873void 2874arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2875{ 2876 if (zio == NULL || zio->io_error == 0) 2877 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2878 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2879} 2880 2881/* a generic arc_done_func_t */ 2882void 2883arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2884{ 2885 arc_buf_t **bufp = arg; 2886 if (zio && zio->io_error) { 2887 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2888 *bufp = NULL; 2889 } else { 2890 *bufp = buf; 2891 ASSERT(buf->b_data); 2892 } 2893} 2894 2895static void 2896arc_read_done(zio_t *zio) 2897{ 2898 arc_buf_hdr_t *hdr, *found; 2899 arc_buf_t *buf; 2900 arc_buf_t *abuf; /* buffer we're assigning to callback */ 2901 kmutex_t *hash_lock; 2902 arc_callback_t *callback_list, *acb; 2903 int freeable = FALSE; 2904 2905 buf = zio->io_private; 2906 hdr = buf->b_hdr; 2907 2908 /* 2909 * The hdr was inserted into hash-table and removed from lists 2910 * prior to starting I/O. We should find this header, since 2911 * it's in the hash table, and it should be legit since it's 2912 * not possible to evict it during the I/O. The only possible 2913 * reason for it not to be found is if we were freed during the 2914 * read. 2915 */ 2916 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 2917 &hash_lock); 2918 2919 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2920 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2921 (found == hdr && HDR_L2_READING(hdr))); 2922 2923 hdr->b_flags &= ~ARC_L2_EVICTED; 2924 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2925 hdr->b_flags &= ~ARC_L2CACHE; 2926 2927 /* byteswap if necessary */ 2928 callback_list = hdr->b_acb; 2929 ASSERT(callback_list != NULL); 2930 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2931 dmu_object_byteswap_t bswap = 2932 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 2933 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 
2934 byteswap_uint64_array : 2935 dmu_ot_byteswap[bswap].ob_func; 2936 func(buf->b_data, hdr->b_size); 2937 } 2938 2939 arc_cksum_compute(buf, B_FALSE); 2940#ifdef illumos 2941 arc_buf_watch(buf); 2942#endif /* illumos */ 2943 2944 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2945 /* 2946 * Only call arc_access on anonymous buffers. This is because 2947 * if we've issued an I/O for an evicted buffer, we've already 2948 * called arc_access (to prevent any simultaneous readers from 2949 * getting confused). 2950 */ 2951 arc_access(hdr, hash_lock); 2952 } 2953 2954 /* create copies of the data buffer for the callers */ 2955 abuf = buf; 2956 for (acb = callback_list; acb; acb = acb->acb_next) { 2957 if (acb->acb_done) { 2958 if (abuf == NULL) { 2959 ARCSTAT_BUMP(arcstat_duplicate_reads); 2960 abuf = arc_buf_clone(buf); 2961 } 2962 acb->acb_buf = abuf; 2963 abuf = NULL; 2964 } 2965 } 2966 hdr->b_acb = NULL; 2967 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2968 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2969 if (abuf == buf) { 2970 ASSERT(buf->b_efunc == NULL); 2971 ASSERT(hdr->b_datacnt == 1); 2972 hdr->b_flags |= ARC_BUF_AVAILABLE; 2973 } 2974 2975 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2976 2977 if (zio->io_error != 0) { 2978 hdr->b_flags |= ARC_IO_ERROR; 2979 if (hdr->b_state != arc_anon) 2980 arc_change_state(arc_anon, hdr, hash_lock); 2981 if (HDR_IN_HASH_TABLE(hdr)) 2982 buf_hash_remove(hdr); 2983 freeable = refcount_is_zero(&hdr->b_refcnt); 2984 } 2985 2986 /* 2987 * Broadcast before we drop the hash_lock to avoid the possibility 2988 * that the hdr (and hence the cv) might be freed before we get to 2989 * the cv_broadcast(). 2990 */ 2991 cv_broadcast(&hdr->b_cv); 2992 2993 if (hash_lock) { 2994 mutex_exit(hash_lock); 2995 } else { 2996 /* 2997 * This block was freed while we waited for the read to 2998 * complete. It has been removed from the hash table and 2999 * moved to the anonymous state (so that it won't show up 3000 * in the cache). 3001 */ 3002 ASSERT3P(hdr->b_state, ==, arc_anon); 3003 freeable = refcount_is_zero(&hdr->b_refcnt); 3004 } 3005 3006 /* execute each callback and free its structure */ 3007 while ((acb = callback_list) != NULL) { 3008 if (acb->acb_done) 3009 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3010 3011 if (acb->acb_zio_dummy != NULL) { 3012 acb->acb_zio_dummy->io_error = zio->io_error; 3013 zio_nowait(acb->acb_zio_dummy); 3014 } 3015 3016 callback_list = acb->acb_next; 3017 kmem_free(acb, sizeof (arc_callback_t)); 3018 } 3019 3020 if (freeable) 3021 arc_hdr_destroy(hdr); 3022} 3023 3024/* 3025 * "Read" the block block at the specified DVA (in bp) via the 3026 * cache. If the block is found in the cache, invoke the provided 3027 * callback immediately and return. Note that the `zio' parameter 3028 * in the callback will be NULL in this case, since no IO was 3029 * required. If the block is not in the cache pass the read request 3030 * on to the spa with a substitute callback function, so that the 3031 * requested block will be added to the cache. 3032 * 3033 * If a read request arrives for a block that has a read in-progress, 3034 * either wait for the in-progress read to complete (and return the 3035 * results); or, if this is a read with a "done" func, add a record 3036 * to the read to invoke the "done" func when the read completes, 3037 * and return; or just return. 3038 * 3039 * arc_read_done() will invoke all the requested "done" functions 3040 * for readers of this block. 
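 *
 * For illustration only, a synchronous caller might look roughly like
 * the following (spa, bp, zb and abuf stand for the caller's own
 * locals):
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	if (error == 0 && abuf != NULL) {
 *		... use abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}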
3041 */ 3042int 3043arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3044 void *private, int priority, int zio_flags, uint32_t *arc_flags, 3045 const zbookmark_t *zb) 3046{ 3047 arc_buf_hdr_t *hdr;
(initialized in arc_init()) 174 */ 175 static int arc_min_prefetch_lifespan; 176 177 static int arc_dead; 178 extern int zfs_prefetch_disable; 179 180 /* 181 * The arc has filled available memory and has now warmed up. 182 */ 183 static boolean_t arc_warm; 184 185 /* 186 * These tunables are for performance analysis. 187 */ 188 uint64_t zfs_arc_max; 189 uint64_t zfs_arc_min; 190 uint64_t zfs_arc_meta_limit = 0; 191 int zfs_arc_grow_retry = 0; 192 int zfs_arc_shrink_shift = 0; 193 int zfs_arc_p_min_shift = 0; 194 int zfs_disable_dup_eviction = 0; 195 196 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 197 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); 198 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 199 SYSCTL_DECL(_vfs_zfs); 200 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 201 "Maximum ARC size"); 202 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 203 "Minimum ARC size"); 204 205 /* 206 * Note that buffers can be in one of 6 states: 207 * ARC_anon - anonymous (discussed below) 208 * ARC_mru - recently used, currently cached 209 * ARC_mru_ghost - recently used, no longer in cache 210 * ARC_mfu - frequently used, currently cached 211 * ARC_mfu_ghost - frequently used, no longer in cache 212 * ARC_l2c_only - exists in L2ARC but not other states 213 * When there are no active references to the buffer, they are 214 * linked onto a list in one of these arc states. These are 215 * the only buffers that can be evicted or deleted. Within each 216 * state there are multiple lists, one for meta-data and one for 217 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 218 * etc.) is tracked separately so that it can be managed more 219 * explicitly: favored over data, limited explicitly. 220 * 221 * Anonymous buffers are buffers that are not associated with 222 * a DVA. These are buffers that hold dirty block copies 223 * before they are written to stable storage. By definition, 224 * they are "ref'd" and are considered part of arc_mru 225 * that cannot be freed. Generally, they will acquire a DVA 226 * as they are written and migrate onto the arc_mru list. 227 * 228 * The ARC_l2c_only state is for buffers that are in the second 229 * level ARC but no longer in any of the ARC_m* lists. The second 230 * level ARC itself may also contain buffers that are in any of 231 * the ARC_m* states - meaning that a buffer can exist in two 232 * places. The reason for the ARC_l2c_only state is to keep the 233 * buffer header in the hash table, so that reads that hit the 234 * second level ARC benefit from these fast lookups. 
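 *
 * As a rough sketch of the common transitions (see arc_access()):
 *
 *	anon --first access--> mru --later access--> mfu
 *	mru --eviction--> mru_ghost --ghost hit--> mfu
 *	mfu --eviction--> mfu_ghost --ghost hit--> mfu
 *
 * Prefetch reads are the main exception: on a ghost hit they are
 * re-admitted to the MRU side rather than promoted to MFU.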
235 */ 236 237#define ARCS_LOCK_PAD CACHE_LINE_SIZE 238struct arcs_lock { 239 kmutex_t arcs_lock; 240#ifdef _KERNEL 241 unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; 242#endif 243}; 244 245/* 246 * must be power of two for mask use to work 247 * 248 */ 249#define ARC_BUFC_NUMDATALISTS 16 250#define ARC_BUFC_NUMMETADATALISTS 16 251#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) 252 253typedef struct arc_state { 254 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 255 uint64_t arcs_size; /* total amount of data in this state */ 256 list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ 257 struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); 258} arc_state_t; 259 260#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) 261 262/* The 6 states: */ 263static arc_state_t ARC_anon; 264static arc_state_t ARC_mru; 265static arc_state_t ARC_mru_ghost; 266static arc_state_t ARC_mfu; 267static arc_state_t ARC_mfu_ghost; 268static arc_state_t ARC_l2c_only; 269 270typedef struct arc_stats { 271 kstat_named_t arcstat_hits; 272 kstat_named_t arcstat_misses; 273 kstat_named_t arcstat_demand_data_hits; 274 kstat_named_t arcstat_demand_data_misses; 275 kstat_named_t arcstat_demand_metadata_hits; 276 kstat_named_t arcstat_demand_metadata_misses; 277 kstat_named_t arcstat_prefetch_data_hits; 278 kstat_named_t arcstat_prefetch_data_misses; 279 kstat_named_t arcstat_prefetch_metadata_hits; 280 kstat_named_t arcstat_prefetch_metadata_misses; 281 kstat_named_t arcstat_mru_hits; 282 kstat_named_t arcstat_mru_ghost_hits; 283 kstat_named_t arcstat_mfu_hits; 284 kstat_named_t arcstat_mfu_ghost_hits; 285 kstat_named_t arcstat_allocated; 286 kstat_named_t arcstat_deleted; 287 kstat_named_t arcstat_stolen; 288 kstat_named_t arcstat_recycle_miss; 289 kstat_named_t arcstat_mutex_miss; 290 kstat_named_t arcstat_evict_skip; 291 kstat_named_t arcstat_evict_l2_cached; 292 kstat_named_t arcstat_evict_l2_eligible; 293 kstat_named_t arcstat_evict_l2_ineligible; 294 kstat_named_t arcstat_hash_elements; 295 kstat_named_t arcstat_hash_elements_max; 296 kstat_named_t arcstat_hash_collisions; 297 kstat_named_t arcstat_hash_chains; 298 kstat_named_t arcstat_hash_chain_max; 299 kstat_named_t arcstat_p; 300 kstat_named_t arcstat_c; 301 kstat_named_t arcstat_c_min; 302 kstat_named_t arcstat_c_max; 303 kstat_named_t arcstat_size; 304 kstat_named_t arcstat_hdr_size; 305 kstat_named_t arcstat_data_size; 306 kstat_named_t arcstat_other_size; 307 kstat_named_t arcstat_l2_hits; 308 kstat_named_t arcstat_l2_misses; 309 kstat_named_t arcstat_l2_feeds; 310 kstat_named_t arcstat_l2_rw_clash; 311 kstat_named_t arcstat_l2_read_bytes; 312 kstat_named_t arcstat_l2_write_bytes; 313 kstat_named_t arcstat_l2_writes_sent; 314 kstat_named_t arcstat_l2_writes_done; 315 kstat_named_t arcstat_l2_writes_error; 316 kstat_named_t arcstat_l2_writes_hdr_miss; 317 kstat_named_t arcstat_l2_evict_lock_retry; 318 kstat_named_t arcstat_l2_evict_reading; 319 kstat_named_t arcstat_l2_free_on_write; 320 kstat_named_t arcstat_l2_abort_lowmem; 321 kstat_named_t arcstat_l2_cksum_bad; 322 kstat_named_t arcstat_l2_io_error; 323 kstat_named_t arcstat_l2_size; 324 kstat_named_t arcstat_l2_hdr_size; 325 kstat_named_t arcstat_l2_write_trylock_fail; 326 kstat_named_t arcstat_l2_write_passed_headroom; 327 kstat_named_t arcstat_l2_write_spa_mismatch; 328 kstat_named_t arcstat_l2_write_in_l2; 329 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 330 kstat_named_t 
arcstat_l2_write_not_cacheable; 331 kstat_named_t arcstat_l2_write_full; 332 kstat_named_t arcstat_l2_write_buffer_iter; 333 kstat_named_t arcstat_l2_write_pios; 334 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 335 kstat_named_t arcstat_l2_write_buffer_list_iter; 336 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 337 kstat_named_t arcstat_memory_throttle_count; 338 kstat_named_t arcstat_duplicate_buffers; 339 kstat_named_t arcstat_duplicate_buffers_size; 340 kstat_named_t arcstat_duplicate_reads; 341} arc_stats_t; 342 343static arc_stats_t arc_stats = { 344 { "hits", KSTAT_DATA_UINT64 }, 345 { "misses", KSTAT_DATA_UINT64 }, 346 { "demand_data_hits", KSTAT_DATA_UINT64 }, 347 { "demand_data_misses", KSTAT_DATA_UINT64 }, 348 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 349 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 350 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 351 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 352 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 353 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 354 { "mru_hits", KSTAT_DATA_UINT64 }, 355 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 356 { "mfu_hits", KSTAT_DATA_UINT64 }, 357 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 358 { "allocated", KSTAT_DATA_UINT64 }, 359 { "deleted", KSTAT_DATA_UINT64 }, 360 { "stolen", KSTAT_DATA_UINT64 }, 361 { "recycle_miss", KSTAT_DATA_UINT64 }, 362 { "mutex_miss", KSTAT_DATA_UINT64 }, 363 { "evict_skip", KSTAT_DATA_UINT64 }, 364 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 365 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 366 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 367 { "hash_elements", KSTAT_DATA_UINT64 }, 368 { "hash_elements_max", KSTAT_DATA_UINT64 }, 369 { "hash_collisions", KSTAT_DATA_UINT64 }, 370 { "hash_chains", KSTAT_DATA_UINT64 }, 371 { "hash_chain_max", KSTAT_DATA_UINT64 }, 372 { "p", KSTAT_DATA_UINT64 }, 373 { "c", KSTAT_DATA_UINT64 }, 374 { "c_min", KSTAT_DATA_UINT64 }, 375 { "c_max", KSTAT_DATA_UINT64 }, 376 { "size", KSTAT_DATA_UINT64 }, 377 { "hdr_size", KSTAT_DATA_UINT64 }, 378 { "data_size", KSTAT_DATA_UINT64 }, 379 { "other_size", KSTAT_DATA_UINT64 }, 380 { "l2_hits", KSTAT_DATA_UINT64 }, 381 { "l2_misses", KSTAT_DATA_UINT64 }, 382 { "l2_feeds", KSTAT_DATA_UINT64 }, 383 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 384 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 385 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 386 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 387 { "l2_writes_done", KSTAT_DATA_UINT64 }, 388 { "l2_writes_error", KSTAT_DATA_UINT64 }, 389 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 390 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 391 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 392 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 393 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 394 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 395 { "l2_io_error", KSTAT_DATA_UINT64 }, 396 { "l2_size", KSTAT_DATA_UINT64 }, 397 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 398 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 399 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 400 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 401 { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 402 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 403 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 404 { "l2_write_full", KSTAT_DATA_UINT64 }, 405 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 406 { "l2_write_pios", KSTAT_DATA_UINT64 }, 407 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 408 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 409 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 410 { 
"memory_throttle_count", KSTAT_DATA_UINT64 }, 411 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 412 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 413 { "duplicate_reads", KSTAT_DATA_UINT64 } 414}; 415 416#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 417 418#define ARCSTAT_INCR(stat, val) \ 419 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 420 421#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 422#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 423 424#define ARCSTAT_MAX(stat, val) { \ 425 uint64_t m; \ 426 while ((val) > (m = arc_stats.stat.value.ui64) && \ 427 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 428 continue; \ 429} 430 431#define ARCSTAT_MAXSTAT(stat) \ 432 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 433 434/* 435 * We define a macro to allow ARC hits/misses to be easily broken down by 436 * two separate conditions, giving a total of four different subtypes for 437 * each of hits and misses (so eight statistics total). 438 */ 439#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 440 if (cond1) { \ 441 if (cond2) { \ 442 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 443 } else { \ 444 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 445 } \ 446 } else { \ 447 if (cond2) { \ 448 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 449 } else { \ 450 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 451 } \ 452 } 453 454kstat_t *arc_ksp; 455static arc_state_t *arc_anon; 456static arc_state_t *arc_mru; 457static arc_state_t *arc_mru_ghost; 458static arc_state_t *arc_mfu; 459static arc_state_t *arc_mfu_ghost; 460static arc_state_t *arc_l2c_only; 461 462/* 463 * There are several ARC variables that are critical to export as kstats -- 464 * but we don't want to have to grovel around in the kstat whenever we wish to 465 * manipulate them. For these variables, we therefore define them to be in 466 * terms of the statistic variable. This assures that we are not introducing 467 * the possibility of inconsistency by having shadow copies of the variables, 468 * while still allowing the code to be readable. 
469 */ 470#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 471#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 472#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 473#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 474#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 475 476static int arc_no_grow; /* Don't try to grow cache size */ 477static uint64_t arc_tempreserve; 478static uint64_t arc_loaned_bytes; 479static uint64_t arc_meta_used; 480static uint64_t arc_meta_limit; 481static uint64_t arc_meta_max = 0; 482SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0, 483 "ARC metadata used"); 484SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0, 485 "ARC metadata limit"); 486 487typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 488 489typedef struct arc_callback arc_callback_t; 490 491struct arc_callback { 492 void *acb_private; 493 arc_done_func_t *acb_done; 494 arc_buf_t *acb_buf; 495 zio_t *acb_zio_dummy; 496 arc_callback_t *acb_next; 497}; 498 499typedef struct arc_write_callback arc_write_callback_t; 500 501struct arc_write_callback { 502 void *awcb_private; 503 arc_done_func_t *awcb_ready; 504 arc_done_func_t *awcb_done; 505 arc_buf_t *awcb_buf; 506}; 507 508struct arc_buf_hdr { 509 /* protected by hash lock */ 510 dva_t b_dva; 511 uint64_t b_birth; 512 uint64_t b_cksum0; 513 514 kmutex_t b_freeze_lock; 515 zio_cksum_t *b_freeze_cksum; 516 void *b_thawed; 517 518 arc_buf_hdr_t *b_hash_next; 519 arc_buf_t *b_buf; 520 uint32_t b_flags; 521 uint32_t b_datacnt; 522 523 arc_callback_t *b_acb; 524 kcondvar_t b_cv; 525 526 /* immutable */ 527 arc_buf_contents_t b_type; 528 uint64_t b_size; 529 uint64_t b_spa; 530 531 /* protected by arc state mutex */ 532 arc_state_t *b_state; 533 list_node_t b_arc_node; 534 535 /* updated atomically */ 536 clock_t b_arc_access; 537 538 /* self protecting */ 539 refcount_t b_refcnt; 540 541 l2arc_buf_hdr_t *b_l2hdr; 542 list_node_t b_l2node; 543}; 544 545static arc_buf_t *arc_eviction_list; 546static kmutex_t arc_eviction_mtx; 547static arc_buf_hdr_t arc_eviction_hdr; 548static void arc_get_data_buf(arc_buf_t *buf); 549static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 550static int arc_evict_needed(arc_buf_contents_t type); 551static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); 552#ifdef illumos 553static void arc_buf_watch(arc_buf_t *buf); 554#endif /* illumos */ 555 556static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); 557 558#define GHOST_STATE(state) \ 559 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 560 (state) == arc_l2c_only) 561 562/* 563 * Private ARC flags. These flags are private ARC only flags that will show up 564 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 565 * be passed in as arc_flags in things like arc_read. However, these flags 566 * should never be passed and should only be set by ARC code. When adding new 567 * public flags, make sure not to smash the private ones. 
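 *
 * For illustration only: ARC-internal code sets and clears these bits
 * directly (normally with the header's hash lock held), e.g.
 *
 *	hdr->b_flags |= ARC_IO_IN_PROGRESS;
 *	...
 *	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 *
 * while the rest of this file tests them through the HDR_* convenience
 * macros defined below.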
568 */ 569 570#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 571#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 572#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 573#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 574#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 575#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 576#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 577#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 578#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 579#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 580 581#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 582#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 583#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 584#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 585#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 586#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 587#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 588#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 589#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 590 (hdr)->b_l2hdr != NULL) 591#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 592#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 593#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 594 595/* 596 * Other sizes 597 */ 598 599#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 600#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 601 602/* 603 * Hash table routines 604 */ 605 606#define HT_LOCK_PAD CACHE_LINE_SIZE 607 608struct ht_lock { 609 kmutex_t ht_lock; 610#ifdef _KERNEL 611 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 612#endif 613}; 614 615#define BUF_LOCKS 256 616typedef struct buf_hash_table { 617 uint64_t ht_mask; 618 arc_buf_hdr_t **ht_table; 619 struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 620} buf_hash_table_t; 621 622static buf_hash_table_t buf_hash_table; 623 624#define BUF_HASH_INDEX(spa, dva, birth) \ 625 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 626#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 627#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 628#define HDR_LOCK(hdr) \ 629 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 630 631uint64_t zfs_crc64_table[256]; 632 633/* 634 * Level 2 ARC 635 */ 636 637#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 638#define L2ARC_HEADROOM 2 /* num of writes */ 639#define L2ARC_FEED_SECS 1 /* caching interval secs */ 640#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 641 642#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 643#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 644 645/* 646 * L2ARC Performance Tunables 647 */ 648uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 649uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 650uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 651uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 652uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 653boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 654boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 655boolean_t l2arc_norw = B_TRUE; /* no 
reads during writes */ 656 657SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 658 &l2arc_write_max, 0, "max write size"); 659SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 660 &l2arc_write_boost, 0, "extra write during warmup"); 661SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 662 &l2arc_headroom, 0, "number of dev writes"); 663SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 664 &l2arc_feed_secs, 0, "interval seconds"); 665SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 666 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 667 668SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 669 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 670SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 671 &l2arc_feed_again, 0, "turbo warmup"); 672SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 673 &l2arc_norw, 0, "no reads during writes"); 674 675SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 676 &ARC_anon.arcs_size, 0, "size of anonymous state"); 677SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 678 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in anonymous state"); 679SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 680 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state"); 681 682SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 683 &ARC_mru.arcs_size, 0, "size of mru state"); 684SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 685 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 686SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 687 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 688 689SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 690 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 691SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 692 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 693 "size of metadata in mru ghost state"); 694SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 695 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 696 "size of data in mru ghost state"); 697 698SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 699 &ARC_mfu.arcs_size, 0, "size of mfu state"); 700SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 701 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 702SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 703 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 704 705SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 706 &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 707SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 708 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 709 "size of metadata in mfu ghost state"); 710SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 711 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 712 "size of data in mfu ghost state"); 713 714SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 715 &ARC_l2c_only.arcs_size, 0, "size of l2c_only state"); 716 717/* 718 * L2ARC Internals 719 */ 720typedef struct l2arc_dev { 721 vdev_t *l2ad_vdev; /* vdev */ 722 spa_t *l2ad_spa; /* spa */ 723 uint64_t l2ad_hand; /* next write location */ 724 uint64_t l2ad_write; /* desired write size, bytes */ 725 uint64_t l2ad_boost; /* warmup write boost, bytes */ 726 uint64_t l2ad_start; /* first addr on device */ 727 uint64_t l2ad_end; /*
last addr on device */ 728 uint64_t l2ad_evict; /* last addr eviction reached */ 729 boolean_t l2ad_first; /* first sweep through */ 730 boolean_t l2ad_writing; /* currently writing */ 731 list_t *l2ad_buflist; /* buffer list */ 732 list_node_t l2ad_node; /* device list node */ 733} l2arc_dev_t; 734 735static list_t L2ARC_dev_list; /* device list */ 736static list_t *l2arc_dev_list; /* device list pointer */ 737static kmutex_t l2arc_dev_mtx; /* device list mutex */ 738static l2arc_dev_t *l2arc_dev_last; /* last device used */ 739static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 740static list_t L2ARC_free_on_write; /* free after write buf list */ 741static list_t *l2arc_free_on_write; /* free after write list ptr */ 742static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 743static uint64_t l2arc_ndev; /* number of devices */ 744 745typedef struct l2arc_read_callback { 746 arc_buf_t *l2rcb_buf; /* read buffer */ 747 spa_t *l2rcb_spa; /* spa */ 748 blkptr_t l2rcb_bp; /* original blkptr */ 749 zbookmark_t l2rcb_zb; /* original bookmark */ 750 int l2rcb_flags; /* original flags */ 751} l2arc_read_callback_t; 752 753typedef struct l2arc_write_callback { 754 l2arc_dev_t *l2wcb_dev; /* device info */ 755 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 756} l2arc_write_callback_t; 757 758struct l2arc_buf_hdr { 759 /* protected by arc_buf_hdr mutex */ 760 l2arc_dev_t *b_dev; /* L2ARC device */ 761 uint64_t b_daddr; /* disk address, offset byte */ 762}; 763 764typedef struct l2arc_data_free { 765 /* protected by l2arc_free_on_write_mtx */ 766 void *l2df_data; 767 size_t l2df_size; 768 void (*l2df_func)(void *, size_t); 769 list_node_t l2df_list_node; 770} l2arc_data_free_t; 771 772static kmutex_t l2arc_feed_thr_lock; 773static kcondvar_t l2arc_feed_thr_cv; 774static uint8_t l2arc_thread_exit; 775 776static void l2arc_read_done(zio_t *zio); 777static void l2arc_hdr_stat_add(void); 778static void l2arc_hdr_stat_remove(void); 779 780static uint64_t 781buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 782{ 783 uint8_t *vdva = (uint8_t *)dva; 784 uint64_t crc = -1ULL; 785 int i; 786 787 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 788 789 for (i = 0; i < sizeof (dva_t); i++) 790 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 791 792 crc ^= (spa>>8) ^ birth; 793 794 return (crc); 795} 796 797#define BUF_EMPTY(buf) \ 798 ((buf)->b_dva.dva_word[0] == 0 && \ 799 (buf)->b_dva.dva_word[1] == 0 && \ 800 (buf)->b_birth == 0) 801 802#define BUF_EQUAL(spa, dva, birth, buf) \ 803 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 804 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 805 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 806 807static void 808buf_discard_identity(arc_buf_hdr_t *hdr) 809{ 810 hdr->b_dva.dva_word[0] = 0; 811 hdr->b_dva.dva_word[1] = 0; 812 hdr->b_birth = 0; 813 hdr->b_cksum0 = 0; 814} 815 816static arc_buf_hdr_t * 817buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 818{ 819 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 820 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 821 arc_buf_hdr_t *buf; 822 823 mutex_enter(hash_lock); 824 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 825 buf = buf->b_hash_next) { 826 if (BUF_EQUAL(spa, dva, birth, buf)) { 827 *lockp = hash_lock; 828 return (buf); 829 } 830 } 831 mutex_exit(hash_lock); 832 *lockp = NULL; 833 return (NULL); 834} 835 836/* 837 * Insert an entry into the hash table. 
If there is already an element 838 * equal to elem in the hash table, then the already existing element 839 * will be returned and the new element will not be inserted. 840 * Otherwise returns NULL. 841 */ 842static arc_buf_hdr_t * 843buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 844{ 845 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 846 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 847 arc_buf_hdr_t *fbuf; 848 uint32_t i; 849 850 ASSERT(!HDR_IN_HASH_TABLE(buf)); 851 *lockp = hash_lock; 852 mutex_enter(hash_lock); 853 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 854 fbuf = fbuf->b_hash_next, i++) { 855 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 856 return (fbuf); 857 } 858 859 buf->b_hash_next = buf_hash_table.ht_table[idx]; 860 buf_hash_table.ht_table[idx] = buf; 861 buf->b_flags |= ARC_IN_HASH_TABLE; 862 863 /* collect some hash table performance data */ 864 if (i > 0) { 865 ARCSTAT_BUMP(arcstat_hash_collisions); 866 if (i == 1) 867 ARCSTAT_BUMP(arcstat_hash_chains); 868 869 ARCSTAT_MAX(arcstat_hash_chain_max, i); 870 } 871 872 ARCSTAT_BUMP(arcstat_hash_elements); 873 ARCSTAT_MAXSTAT(arcstat_hash_elements); 874 875 return (NULL); 876} 877 878static void 879buf_hash_remove(arc_buf_hdr_t *buf) 880{ 881 arc_buf_hdr_t *fbuf, **bufp; 882 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 883 884 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 885 ASSERT(HDR_IN_HASH_TABLE(buf)); 886 887 bufp = &buf_hash_table.ht_table[idx]; 888 while ((fbuf = *bufp) != buf) { 889 ASSERT(fbuf != NULL); 890 bufp = &fbuf->b_hash_next; 891 } 892 *bufp = buf->b_hash_next; 893 buf->b_hash_next = NULL; 894 buf->b_flags &= ~ARC_IN_HASH_TABLE; 895 896 /* collect some hash table performance data */ 897 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 898 899 if (buf_hash_table.ht_table[idx] && 900 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 901 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 902} 903 904/* 905 * Global data structures and functions for the buf kmem cache. 906 */ 907static kmem_cache_t *hdr_cache; 908static kmem_cache_t *buf_cache; 909 910static void 911buf_fini(void) 912{ 913 int i; 914 915 kmem_free(buf_hash_table.ht_table, 916 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 917 for (i = 0; i < BUF_LOCKS; i++) 918 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 919 kmem_cache_destroy(hdr_cache); 920 kmem_cache_destroy(buf_cache); 921} 922 923/* 924 * Constructor callback - called when the cache is empty 925 * and a new buf is requested. 926 */ 927/* ARGSUSED */ 928static int 929hdr_cons(void *vbuf, void *unused, int kmflag) 930{ 931 arc_buf_hdr_t *buf = vbuf; 932 933 bzero(buf, sizeof (arc_buf_hdr_t)); 934 refcount_create(&buf->b_refcnt); 935 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 936 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 937 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 938 939 return (0); 940} 941 942/* ARGSUSED */ 943static int 944buf_cons(void *vbuf, void *unused, int kmflag) 945{ 946 arc_buf_t *buf = vbuf; 947 948 bzero(buf, sizeof (arc_buf_t)); 949 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 950 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 951 952 return (0); 953} 954 955/* 956 * Destructor callback - called when a cached buf is 957 * no longer required. 
958 */ 959/* ARGSUSED */ 960static void 961hdr_dest(void *vbuf, void *unused) 962{ 963 arc_buf_hdr_t *buf = vbuf; 964 965 ASSERT(BUF_EMPTY(buf)); 966 refcount_destroy(&buf->b_refcnt); 967 cv_destroy(&buf->b_cv); 968 mutex_destroy(&buf->b_freeze_lock); 969 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 970} 971 972/* ARGSUSED */ 973static void 974buf_dest(void *vbuf, void *unused) 975{ 976 arc_buf_t *buf = vbuf; 977 978 mutex_destroy(&buf->b_evict_lock); 979 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 980} 981 982/* 983 * Reclaim callback -- invoked when memory is low. 984 */ 985/* ARGSUSED */ 986static void 987hdr_recl(void *unused) 988{ 989 dprintf("hdr_recl called\n"); 990 /* 991 * umem calls the reclaim func when we destroy the buf cache, 992 * which is after we do arc_fini(). 993 */ 994 if (!arc_dead) 995 cv_signal(&arc_reclaim_thr_cv); 996} 997 998static void 999buf_init(void) 1000{ 1001 uint64_t *ct; 1002 uint64_t hsize = 1ULL << 12; 1003 int i, j; 1004 1005 /* 1006 * The hash table is big enough to fill all of physical memory 1007 * with an average 64K block size. The table will take up 1008 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 1009 */ 1010 while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) 1011 hsize <<= 1; 1012retry: 1013 buf_hash_table.ht_mask = hsize - 1; 1014 buf_hash_table.ht_table = 1015 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1016 if (buf_hash_table.ht_table == NULL) { 1017 ASSERT(hsize > (1ULL << 8)); 1018 hsize >>= 1; 1019 goto retry; 1020 } 1021 1022 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 1023 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 1024 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1025 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1026 1027 for (i = 0; i < 256; i++) 1028 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1029 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1030 1031 for (i = 0; i < BUF_LOCKS; i++) { 1032 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1033 NULL, MUTEX_DEFAULT, NULL); 1034 } 1035} 1036 1037#define ARC_MINTIME (hz>>4) /* 62 ms */ 1038 1039static void 1040arc_cksum_verify(arc_buf_t *buf) 1041{ 1042 zio_cksum_t zc; 1043 1044 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1045 return; 1046 1047 mutex_enter(&buf->b_hdr->b_freeze_lock); 1048 if (buf->b_hdr->b_freeze_cksum == NULL || 1049 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 1050 mutex_exit(&buf->b_hdr->b_freeze_lock); 1051 return; 1052 } 1053 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1054 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1055 panic("buffer modified while frozen!"); 1056 mutex_exit(&buf->b_hdr->b_freeze_lock); 1057} 1058 1059static int 1060arc_cksum_equal(arc_buf_t *buf) 1061{ 1062 zio_cksum_t zc; 1063 int equal; 1064 1065 mutex_enter(&buf->b_hdr->b_freeze_lock); 1066 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1067 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1068 mutex_exit(&buf->b_hdr->b_freeze_lock); 1069 1070 return (equal); 1071} 1072 1073static void 1074arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1075{ 1076 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1077 return; 1078 1079 mutex_enter(&buf->b_hdr->b_freeze_lock); 1080 if (buf->b_hdr->b_freeze_cksum != NULL) { 1081 mutex_exit(&buf->b_hdr->b_freeze_lock); 1082 return; 1083 } 1084 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1085 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1086 buf->b_hdr->b_freeze_cksum); 
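	/*
	 * The buffer is now considered "frozen": until arc_buf_thaw()
	 * releases b_freeze_cksum, arc_cksum_verify() will panic if the
	 * contents change (when ZFS_DEBUG_MODIFY is set).
	 */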
1087 mutex_exit(&buf->b_hdr->b_freeze_lock); 1088#ifdef illumos 1089 arc_buf_watch(buf); 1090#endif /* illumos */ 1091} 1092 1093#ifdef illumos 1094#ifndef _KERNEL 1095typedef struct procctl { 1096 long cmd; 1097 prwatch_t prwatch; 1098} procctl_t; 1099#endif 1100 1101/* ARGSUSED */ 1102static void 1103arc_buf_unwatch(arc_buf_t *buf) 1104{ 1105#ifndef _KERNEL 1106 if (arc_watch) { 1107 int result; 1108 procctl_t ctl; 1109 ctl.cmd = PCWATCH; 1110 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1111 ctl.prwatch.pr_size = 0; 1112 ctl.prwatch.pr_wflags = 0; 1113 result = write(arc_procfd, &ctl, sizeof (ctl)); 1114 ASSERT3U(result, ==, sizeof (ctl)); 1115 } 1116#endif 1117} 1118 1119/* ARGSUSED */ 1120static void 1121arc_buf_watch(arc_buf_t *buf) 1122{ 1123#ifndef _KERNEL 1124 if (arc_watch) { 1125 int result; 1126 procctl_t ctl; 1127 ctl.cmd = PCWATCH; 1128 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1129 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1130 ctl.prwatch.pr_wflags = WA_WRITE; 1131 result = write(arc_procfd, &ctl, sizeof (ctl)); 1132 ASSERT3U(result, ==, sizeof (ctl)); 1133 } 1134#endif 1135} 1136#endif /* illumos */ 1137 1138void 1139arc_buf_thaw(arc_buf_t *buf) 1140{ 1141 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1142 if (buf->b_hdr->b_state != arc_anon) 1143 panic("modifying non-anon buffer!"); 1144 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1145 panic("modifying buffer while i/o in progress!"); 1146 arc_cksum_verify(buf); 1147 } 1148 1149 mutex_enter(&buf->b_hdr->b_freeze_lock); 1150 if (buf->b_hdr->b_freeze_cksum != NULL) { 1151 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1152 buf->b_hdr->b_freeze_cksum = NULL; 1153 } 1154 1155 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1156 if (buf->b_hdr->b_thawed) 1157 kmem_free(buf->b_hdr->b_thawed, 1); 1158 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 1159 } 1160 1161 mutex_exit(&buf->b_hdr->b_freeze_lock); 1162 1163#ifdef illumos 1164 arc_buf_unwatch(buf); 1165#endif /* illumos */ 1166} 1167 1168void 1169arc_buf_freeze(arc_buf_t *buf) 1170{ 1171 kmutex_t *hash_lock; 1172 1173 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1174 return; 1175 1176 hash_lock = HDR_LOCK(buf->b_hdr); 1177 mutex_enter(hash_lock); 1178 1179 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1180 buf->b_hdr->b_state == arc_anon); 1181 arc_cksum_compute(buf, B_FALSE); 1182 mutex_exit(hash_lock); 1183 1184} 1185 1186static void 1187get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) 1188{ 1189 uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); 1190 1191 if (ab->b_type == ARC_BUFC_METADATA) 1192 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1193 else { 1194 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1195 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1196 } 1197 1198 *list = &state->arcs_lists[buf_hashid]; 1199 *lock = ARCS_LOCK(state, buf_hashid); 1200} 1201 1202 1203static void 1204add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1205{ 1206 ASSERT(MUTEX_HELD(hash_lock)); 1207 1208 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1209 (ab->b_state != arc_anon)) { 1210 uint64_t delta = ab->b_size * ab->b_datacnt; 1211 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 1212 list_t *list; 1213 kmutex_t *lock; 1214 1215 get_buf_info(ab, ab->b_state, &list, &lock); 1216 ASSERT(!MUTEX_HELD(lock)); 1217 mutex_enter(lock); 1218 ASSERT(list_link_active(&ab->b_arc_node)); 1219 list_remove(list, ab); 1220 if (GHOST_STATE(ab->b_state)) { 1221 ASSERT0(ab->b_datacnt); 1222 ASSERT3P(ab->b_buf, ==, NULL); 1223 delta = 
ab->b_size; 1224 } 1225 ASSERT(delta > 0); 1226 ASSERT3U(*size, >=, delta); 1227 atomic_add_64(size, -delta); 1228 mutex_exit(lock); 1229 /* remove the prefetch flag if we get a reference */ 1230 if (ab->b_flags & ARC_PREFETCH) 1231 ab->b_flags &= ~ARC_PREFETCH; 1232 } 1233} 1234 1235static int 1236remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1237{ 1238 int cnt; 1239 arc_state_t *state = ab->b_state; 1240 1241 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1242 ASSERT(!GHOST_STATE(state)); 1243 1244 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1245 (state != arc_anon)) { 1246 uint64_t *size = &state->arcs_lsize[ab->b_type]; 1247 list_t *list; 1248 kmutex_t *lock; 1249 1250 get_buf_info(ab, state, &list, &lock); 1251 ASSERT(!MUTEX_HELD(lock)); 1252 mutex_enter(lock); 1253 ASSERT(!list_link_active(&ab->b_arc_node)); 1254 list_insert_head(list, ab); 1255 ASSERT(ab->b_datacnt > 0); 1256 atomic_add_64(size, ab->b_size * ab->b_datacnt); 1257 mutex_exit(lock); 1258 } 1259 return (cnt); 1260} 1261 1262/* 1263 * Move the supplied buffer to the indicated state. The mutex 1264 * for the buffer must be held by the caller. 1265 */ 1266static void 1267arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1268{ 1269 arc_state_t *old_state = ab->b_state; 1270 int64_t refcnt = refcount_count(&ab->b_refcnt); 1271 uint64_t from_delta, to_delta; 1272 list_t *list; 1273 kmutex_t *lock; 1274 1275 ASSERT(MUTEX_HELD(hash_lock)); 1276 ASSERT(new_state != old_state); 1277 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1278 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1279 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 1280 1281 from_delta = to_delta = ab->b_datacnt * ab->b_size; 1282 1283 /* 1284 * If this buffer is evictable, transfer it from the 1285 * old state list to the new state list. 1286 */ 1287 if (refcnt == 0) { 1288 if (old_state != arc_anon) { 1289 int use_mutex; 1290 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1291 1292 get_buf_info(ab, old_state, &list, &lock); 1293 use_mutex = !MUTEX_HELD(lock); 1294 if (use_mutex) 1295 mutex_enter(lock); 1296 1297 ASSERT(list_link_active(&ab->b_arc_node)); 1298 list_remove(list, ab); 1299 1300 /* 1301 * If prefetching out of the ghost cache, 1302 * we will have a non-zero datacnt. 
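 * In that case the (b_datacnt * b_size) delta computed above is already
 * what we want; only a true ghost header (b_datacnt == 0) is charged at
 * just b_size below.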
1303 */ 1304 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1305 /* ghost elements have a ghost size */ 1306 ASSERT(ab->b_buf == NULL); 1307 from_delta = ab->b_size; 1308 } 1309 ASSERT3U(*size, >=, from_delta); 1310 atomic_add_64(size, -from_delta); 1311 1312 if (use_mutex) 1313 mutex_exit(lock); 1314 } 1315 if (new_state != arc_anon) { 1316 int use_mutex; 1317 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1318 1319 get_buf_info(ab, new_state, &list, &lock); 1320 use_mutex = !MUTEX_HELD(lock); 1321 if (use_mutex) 1322 mutex_enter(lock); 1323 1324 list_insert_head(list, ab); 1325 1326 /* ghost elements have a ghost size */ 1327 if (GHOST_STATE(new_state)) { 1328 ASSERT(ab->b_datacnt == 0); 1329 ASSERT(ab->b_buf == NULL); 1330 to_delta = ab->b_size; 1331 } 1332 atomic_add_64(size, to_delta); 1333 1334 if (use_mutex) 1335 mutex_exit(lock); 1336 } 1337 } 1338 1339 ASSERT(!BUF_EMPTY(ab)); 1340 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 1341 buf_hash_remove(ab); 1342 1343 /* adjust state sizes */ 1344 if (to_delta) 1345 atomic_add_64(&new_state->arcs_size, to_delta); 1346 if (from_delta) { 1347 ASSERT3U(old_state->arcs_size, >=, from_delta); 1348 atomic_add_64(&old_state->arcs_size, -from_delta); 1349 } 1350 ab->b_state = new_state; 1351 1352 /* adjust l2arc hdr stats */ 1353 if (new_state == arc_l2c_only) 1354 l2arc_hdr_stat_add(); 1355 else if (old_state == arc_l2c_only) 1356 l2arc_hdr_stat_remove(); 1357} 1358 1359void 1360arc_space_consume(uint64_t space, arc_space_type_t type) 1361{ 1362 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1363 1364 switch (type) { 1365 case ARC_SPACE_DATA: 1366 ARCSTAT_INCR(arcstat_data_size, space); 1367 break; 1368 case ARC_SPACE_OTHER: 1369 ARCSTAT_INCR(arcstat_other_size, space); 1370 break; 1371 case ARC_SPACE_HDRS: 1372 ARCSTAT_INCR(arcstat_hdr_size, space); 1373 break; 1374 case ARC_SPACE_L2HDRS: 1375 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1376 break; 1377 } 1378 1379 atomic_add_64(&arc_meta_used, space); 1380 atomic_add_64(&arc_size, space); 1381} 1382 1383void 1384arc_space_return(uint64_t space, arc_space_type_t type) 1385{ 1386 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1387 1388 switch (type) { 1389 case ARC_SPACE_DATA: 1390 ARCSTAT_INCR(arcstat_data_size, -space); 1391 break; 1392 case ARC_SPACE_OTHER: 1393 ARCSTAT_INCR(arcstat_other_size, -space); 1394 break; 1395 case ARC_SPACE_HDRS: 1396 ARCSTAT_INCR(arcstat_hdr_size, -space); 1397 break; 1398 case ARC_SPACE_L2HDRS: 1399 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1400 break; 1401 } 1402 1403 ASSERT(arc_meta_used >= space); 1404 if (arc_meta_max < arc_meta_used) 1405 arc_meta_max = arc_meta_used; 1406 atomic_add_64(&arc_meta_used, -space); 1407 ASSERT(arc_size >= space); 1408 atomic_add_64(&arc_size, -space); 1409} 1410 1411void * 1412arc_data_buf_alloc(uint64_t size) 1413{ 1414 if (arc_evict_needed(ARC_BUFC_DATA)) 1415 cv_signal(&arc_reclaim_thr_cv); 1416 atomic_add_64(&arc_size, size); 1417 return (zio_data_buf_alloc(size)); 1418} 1419 1420void 1421arc_data_buf_free(void *buf, uint64_t size) 1422{ 1423 zio_data_buf_free(buf, size); 1424 ASSERT(arc_size >= size); 1425 atomic_add_64(&arc_size, -size); 1426} 1427 1428arc_buf_t * 1429arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1430{ 1431 arc_buf_hdr_t *hdr; 1432 arc_buf_t *buf; 1433 1434 ASSERT3U(size, >, 0); 1435 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1436 ASSERT(BUF_EMPTY(hdr)); 1437 hdr->b_size = size; 1438 hdr->b_type = type; 1439 hdr->b_spa = spa_load_guid(spa); 1440 
hdr->b_state = arc_anon; 1441 hdr->b_arc_access = 0; 1442 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1443 buf->b_hdr = hdr; 1444 buf->b_data = NULL; 1445 buf->b_efunc = NULL; 1446 buf->b_private = NULL; 1447 buf->b_next = NULL; 1448 hdr->b_buf = buf; 1449 arc_get_data_buf(buf); 1450 hdr->b_datacnt = 1; 1451 hdr->b_flags = 0; 1452 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1453 (void) refcount_add(&hdr->b_refcnt, tag); 1454 1455 return (buf); 1456} 1457 1458static char *arc_onloan_tag = "onloan"; 1459 1460/* 1461 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1462 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1463 * buffers must be returned to the arc before they can be used by the DMU or 1464 * freed. 1465 */ 1466arc_buf_t * 1467arc_loan_buf(spa_t *spa, int size) 1468{ 1469 arc_buf_t *buf; 1470 1471 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1472 1473 atomic_add_64(&arc_loaned_bytes, size); 1474 return (buf); 1475} 1476 1477/* 1478 * Return a loaned arc buffer to the arc. 1479 */ 1480void 1481arc_return_buf(arc_buf_t *buf, void *tag) 1482{ 1483 arc_buf_hdr_t *hdr = buf->b_hdr; 1484 1485 ASSERT(buf->b_data != NULL); 1486 (void) refcount_add(&hdr->b_refcnt, tag); 1487 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 1488 1489 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1490} 1491 1492/* Detach an arc_buf from a dbuf (tag) */ 1493void 1494arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1495{ 1496 arc_buf_hdr_t *hdr; 1497 1498 ASSERT(buf->b_data != NULL); 1499 hdr = buf->b_hdr; 1500 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1501 (void) refcount_remove(&hdr->b_refcnt, tag); 1502 buf->b_efunc = NULL; 1503 buf->b_private = NULL; 1504 1505 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1506} 1507 1508static arc_buf_t * 1509arc_buf_clone(arc_buf_t *from) 1510{ 1511 arc_buf_t *buf; 1512 arc_buf_hdr_t *hdr = from->b_hdr; 1513 uint64_t size = hdr->b_size; 1514 1515 ASSERT(hdr->b_state != arc_anon); 1516 1517 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1518 buf->b_hdr = hdr; 1519 buf->b_data = NULL; 1520 buf->b_efunc = NULL; 1521 buf->b_private = NULL; 1522 buf->b_next = hdr->b_buf; 1523 hdr->b_buf = buf; 1524 arc_get_data_buf(buf); 1525 bcopy(from->b_data, buf->b_data, size); 1526 1527 /* 1528 * This buffer already exists in the arc so create a duplicate 1529 * copy for the caller. If the buffer is associated with user data 1530 * then track the size and number of duplicates. These stats will be 1531 * updated as duplicate buffers are created and destroyed. 1532 */ 1533 if (hdr->b_type == ARC_BUFC_DATA) { 1534 ARCSTAT_BUMP(arcstat_duplicate_buffers); 1535 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 1536 } 1537 hdr->b_datacnt += 1; 1538 return (buf); 1539} 1540 1541void 1542arc_buf_add_ref(arc_buf_t *buf, void* tag) 1543{ 1544 arc_buf_hdr_t *hdr; 1545 kmutex_t *hash_lock; 1546 1547 /* 1548 * Check to see if this buffer is evicted. Callers 1549 * must verify b_data != NULL to know if the add_ref 1550 * was successful. 
1551 */ 1552 mutex_enter(&buf->b_evict_lock); 1553 if (buf->b_data == NULL) { 1554 mutex_exit(&buf->b_evict_lock); 1555 return; 1556 } 1557 hash_lock = HDR_LOCK(buf->b_hdr); 1558 mutex_enter(hash_lock); 1559 hdr = buf->b_hdr; 1560 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1561 mutex_exit(&buf->b_evict_lock); 1562 1563 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1564 add_reference(hdr, hash_lock, tag); 1565 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1566 arc_access(hdr, hash_lock); 1567 mutex_exit(hash_lock); 1568 ARCSTAT_BUMP(arcstat_hits); 1569 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1570 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1571 data, metadata, hits); 1572} 1573 1574/* 1575 * Free the arc data buffer. If it is an l2arc write in progress, 1576 * the buffer is placed on l2arc_free_on_write to be freed later. 1577 */ 1578static void 1579arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 1580{ 1581 arc_buf_hdr_t *hdr = buf->b_hdr; 1582 1583 if (HDR_L2_WRITING(hdr)) { 1584 l2arc_data_free_t *df; 1585 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1586 df->l2df_data = buf->b_data; 1587 df->l2df_size = hdr->b_size; 1588 df->l2df_func = free_func; 1589 mutex_enter(&l2arc_free_on_write_mtx); 1590 list_insert_head(l2arc_free_on_write, df); 1591 mutex_exit(&l2arc_free_on_write_mtx); 1592 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1593 } else { 1594 free_func(buf->b_data, hdr->b_size); 1595 } 1596} 1597 1598static void 1599arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1600{ 1601 arc_buf_t **bufp; 1602 1603 /* free up data associated with the buf */ 1604 if (buf->b_data) { 1605 arc_state_t *state = buf->b_hdr->b_state; 1606 uint64_t size = buf->b_hdr->b_size; 1607 arc_buf_contents_t type = buf->b_hdr->b_type; 1608 1609 arc_cksum_verify(buf); 1610#ifdef illumos 1611 arc_buf_unwatch(buf); 1612#endif /* illumos */ 1613 1614 if (!recycle) { 1615 if (type == ARC_BUFC_METADATA) { 1616 arc_buf_data_free(buf, zio_buf_free); 1617 arc_space_return(size, ARC_SPACE_DATA); 1618 } else { 1619 ASSERT(type == ARC_BUFC_DATA); 1620 arc_buf_data_free(buf, zio_data_buf_free); 1621 ARCSTAT_INCR(arcstat_data_size, -size); 1622 atomic_add_64(&arc_size, -size); 1623 } 1624 } 1625 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1626 uint64_t *cnt = &state->arcs_lsize[type]; 1627 1628 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1629 ASSERT(state != arc_anon); 1630 1631 ASSERT3U(*cnt, >=, size); 1632 atomic_add_64(cnt, -size); 1633 } 1634 ASSERT3U(state->arcs_size, >=, size); 1635 atomic_add_64(&state->arcs_size, -size); 1636 buf->b_data = NULL; 1637 1638 /* 1639 * If we're destroying a duplicate buffer make sure 1640 * that the appropriate statistics are updated. 
1641 */ 1642 if (buf->b_hdr->b_datacnt > 1 && 1643 buf->b_hdr->b_type == ARC_BUFC_DATA) { 1644 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 1645 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 1646 } 1647 ASSERT(buf->b_hdr->b_datacnt > 0); 1648 buf->b_hdr->b_datacnt -= 1; 1649 } 1650 1651 /* only remove the buf if requested */ 1652 if (!all) 1653 return; 1654 1655 /* remove the buf from the hdr list */ 1656 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1657 continue; 1658 *bufp = buf->b_next; 1659 buf->b_next = NULL; 1660 1661 ASSERT(buf->b_efunc == NULL); 1662 1663 /* clean up the buf */ 1664 buf->b_hdr = NULL; 1665 kmem_cache_free(buf_cache, buf); 1666} 1667 1668static void 1669arc_hdr_destroy(arc_buf_hdr_t *hdr) 1670{ 1671 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1672 ASSERT3P(hdr->b_state, ==, arc_anon); 1673 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1674 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 1675 1676 if (l2hdr != NULL) { 1677 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1678 /* 1679 * To prevent arc_free() and l2arc_evict() from 1680 * attempting to free the same buffer at the same time, 1681 * a FREE_IN_PROGRESS flag is given to arc_free() to 1682 * give it priority. l2arc_evict() can't destroy this 1683 * header while we are waiting on l2arc_buflist_mtx. 1684 * 1685 * The hdr may be removed from l2ad_buflist before we 1686 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 1687 */ 1688 if (!buflist_held) { 1689 mutex_enter(&l2arc_buflist_mtx); 1690 l2hdr = hdr->b_l2hdr; 1691 } 1692 1693 if (l2hdr != NULL) { 1694 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1695 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1696 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1697 if (hdr->b_state == arc_l2c_only) 1698 l2arc_hdr_stat_remove(); 1699 hdr->b_l2hdr = NULL; 1700 } 1701 1702 if (!buflist_held) 1703 mutex_exit(&l2arc_buflist_mtx); 1704 } 1705 1706 if (!BUF_EMPTY(hdr)) { 1707 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1708 buf_discard_identity(hdr); 1709 } 1710 while (hdr->b_buf) { 1711 arc_buf_t *buf = hdr->b_buf; 1712 1713 if (buf->b_efunc) { 1714 mutex_enter(&arc_eviction_mtx); 1715 mutex_enter(&buf->b_evict_lock); 1716 ASSERT(buf->b_hdr != NULL); 1717 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1718 hdr->b_buf = buf->b_next; 1719 buf->b_hdr = &arc_eviction_hdr; 1720 buf->b_next = arc_eviction_list; 1721 arc_eviction_list = buf; 1722 mutex_exit(&buf->b_evict_lock); 1723 mutex_exit(&arc_eviction_mtx); 1724 } else { 1725 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1726 } 1727 } 1728 if (hdr->b_freeze_cksum != NULL) { 1729 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1730 hdr->b_freeze_cksum = NULL; 1731 } 1732 if (hdr->b_thawed) { 1733 kmem_free(hdr->b_thawed, 1); 1734 hdr->b_thawed = NULL; 1735 } 1736 1737 ASSERT(!list_link_active(&hdr->b_arc_node)); 1738 ASSERT3P(hdr->b_hash_next, ==, NULL); 1739 ASSERT3P(hdr->b_acb, ==, NULL); 1740 kmem_cache_free(hdr_cache, hdr); 1741} 1742 1743void 1744arc_buf_free(arc_buf_t *buf, void *tag) 1745{ 1746 arc_buf_hdr_t *hdr = buf->b_hdr; 1747 int hashed = hdr->b_state != arc_anon; 1748 1749 ASSERT(buf->b_efunc == NULL); 1750 ASSERT(buf->b_data != NULL); 1751 1752 if (hashed) { 1753 kmutex_t *hash_lock = HDR_LOCK(hdr); 1754 1755 mutex_enter(hash_lock); 1756 hdr = buf->b_hdr; 1757 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1758 1759 (void) remove_reference(hdr, hash_lock, tag); 1760 if (hdr->b_datacnt > 1) { 1761 arc_buf_destroy(buf, FALSE, TRUE); 1762 } else { 1763 ASSERT(buf == hdr->b_buf); 1764 ASSERT(buf->b_efunc == NULL); 1765 
hdr->b_flags |= ARC_BUF_AVAILABLE; 1766 } 1767 mutex_exit(hash_lock); 1768 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1769 int destroy_hdr; 1770 /* 1771 * We are in the middle of an async write. Don't destroy 1772 * this buffer unless the write completes before we finish 1773 * decrementing the reference count. 1774 */ 1775 mutex_enter(&arc_eviction_mtx); 1776 (void) remove_reference(hdr, NULL, tag); 1777 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1778 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1779 mutex_exit(&arc_eviction_mtx); 1780 if (destroy_hdr) 1781 arc_hdr_destroy(hdr); 1782 } else { 1783 if (remove_reference(hdr, NULL, tag) > 0) 1784 arc_buf_destroy(buf, FALSE, TRUE); 1785 else 1786 arc_hdr_destroy(hdr); 1787 } 1788} 1789 1790int 1791arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1792{ 1793 arc_buf_hdr_t *hdr = buf->b_hdr; 1794 kmutex_t *hash_lock = HDR_LOCK(hdr); 1795 int no_callback = (buf->b_efunc == NULL); 1796 1797 if (hdr->b_state == arc_anon) { 1798 ASSERT(hdr->b_datacnt == 1); 1799 arc_buf_free(buf, tag); 1800 return (no_callback); 1801 } 1802 1803 mutex_enter(hash_lock); 1804 hdr = buf->b_hdr; 1805 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1806 ASSERT(hdr->b_state != arc_anon); 1807 ASSERT(buf->b_data != NULL); 1808 1809 (void) remove_reference(hdr, hash_lock, tag); 1810 if (hdr->b_datacnt > 1) { 1811 if (no_callback) 1812 arc_buf_destroy(buf, FALSE, TRUE); 1813 } else if (no_callback) { 1814 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1815 ASSERT(buf->b_efunc == NULL); 1816 hdr->b_flags |= ARC_BUF_AVAILABLE; 1817 } 1818 ASSERT(no_callback || hdr->b_datacnt > 1 || 1819 refcount_is_zero(&hdr->b_refcnt)); 1820 mutex_exit(hash_lock); 1821 return (no_callback); 1822} 1823 1824int 1825arc_buf_size(arc_buf_t *buf) 1826{ 1827 return (buf->b_hdr->b_size); 1828} 1829 1830/* 1831 * Called from the DMU to determine if the current buffer should be 1832 * evicted. In order to ensure proper locking, the eviction must be initiated 1833 * from the DMU. Return true if the buffer is associated with user data and 1834 * duplicate buffers still exist. 1835 */ 1836boolean_t 1837arc_buf_eviction_needed(arc_buf_t *buf) 1838{ 1839 arc_buf_hdr_t *hdr; 1840 boolean_t evict_needed = B_FALSE; 1841 1842 if (zfs_disable_dup_eviction) 1843 return (B_FALSE); 1844 1845 mutex_enter(&buf->b_evict_lock); 1846 hdr = buf->b_hdr; 1847 if (hdr == NULL) { 1848 /* 1849 * We are in arc_do_user_evicts(); let that function 1850 * perform the eviction. 1851 */ 1852 ASSERT(buf->b_data == NULL); 1853 mutex_exit(&buf->b_evict_lock); 1854 return (B_FALSE); 1855 } else if (buf->b_data == NULL) { 1856 /* 1857 * We have already been added to the arc eviction list; 1858 * recommend eviction. 1859 */ 1860 ASSERT3P(hdr, ==, &arc_eviction_hdr); 1861 mutex_exit(&buf->b_evict_lock); 1862 return (B_TRUE); 1863 } 1864 1865 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 1866 evict_needed = B_TRUE; 1867 1868 mutex_exit(&buf->b_evict_lock); 1869 return (evict_needed); 1870} 1871 1872/* 1873 * Evict buffers from list until we've removed the specified number of 1874 * bytes. Move the removed buffers to the appropriate evict state. 1875 * If the recycle flag is set, then attempt to "recycle" a buffer: 1876 * - look for a buffer to evict that is `bytes' long. 1877 * - return the data block from this buffer rather than freeing it. 1878 * This flag is used by callers that are trying to make space for a 1879 * new buffer in a full arc cache. 1880 * 1881 * This function makes a "best effort". 
It skips over any buffers 1882 * it can't get a hash_lock on, and so may not catch all candidates. 1883 * It may also return without evicting as much space as requested. 1884 */ 1885static void * 1886arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 1887 arc_buf_contents_t type) 1888{ 1889 arc_state_t *evicted_state; 1890 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1891 int64_t bytes_remaining; 1892 arc_buf_hdr_t *ab, *ab_prev = NULL; 1893 list_t *evicted_list, *list, *evicted_list_start, *list_start; 1894 kmutex_t *lock, *evicted_lock; 1895 kmutex_t *hash_lock; 1896 boolean_t have_lock; 1897 void *stolen = NULL; 1898 static int evict_metadata_offset, evict_data_offset; 1899 int i, idx, offset, list_count, count; 1900 1901 ASSERT(state == arc_mru || state == arc_mfu); 1902 1903 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1904 1905 if (type == ARC_BUFC_METADATA) { 1906 offset = 0; 1907 list_count = ARC_BUFC_NUMMETADATALISTS; 1908 list_start = &state->arcs_lists[0]; 1909 evicted_list_start = &evicted_state->arcs_lists[0]; 1910 idx = evict_metadata_offset; 1911 } else { 1912 offset = ARC_BUFC_NUMMETADATALISTS; 1913 list_start = &state->arcs_lists[offset]; 1914 evicted_list_start = &evicted_state->arcs_lists[offset]; 1915 list_count = ARC_BUFC_NUMDATALISTS; 1916 idx = evict_data_offset; 1917 } 1918 bytes_remaining = evicted_state->arcs_lsize[type]; 1919 count = 0; 1920 1921evict_start: 1922 list = &list_start[idx]; 1923 evicted_list = &evicted_list_start[idx]; 1924 lock = ARCS_LOCK(state, (offset + idx)); 1925 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 1926 1927 mutex_enter(lock); 1928 mutex_enter(evicted_lock); 1929 1930 for (ab = list_tail(list); ab; ab = ab_prev) { 1931 ab_prev = list_prev(list, ab); 1932 bytes_remaining -= (ab->b_size * ab->b_datacnt); 1933 /* prefetch buffers have a minimum lifespan */ 1934 if (HDR_IO_IN_PROGRESS(ab) || 1935 (spa && ab->b_spa != spa) || 1936 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1937 ddi_get_lbolt() - ab->b_arc_access < 1938 arc_min_prefetch_lifespan)) { 1939 skipped++; 1940 continue; 1941 } 1942 /* "lookahead" for better eviction candidate */ 1943 if (recycle && ab->b_size != bytes && 1944 ab_prev && ab_prev->b_size == bytes) 1945 continue; 1946 hash_lock = HDR_LOCK(ab); 1947 have_lock = MUTEX_HELD(hash_lock); 1948 if (have_lock || mutex_tryenter(hash_lock)) { 1949 ASSERT0(refcount_count(&ab->b_refcnt)); 1950 ASSERT(ab->b_datacnt > 0); 1951 while (ab->b_buf) { 1952 arc_buf_t *buf = ab->b_buf; 1953 if (!mutex_tryenter(&buf->b_evict_lock)) { 1954 missed += 1; 1955 break; 1956 } 1957 if (buf->b_data) { 1958 bytes_evicted += ab->b_size; 1959 if (recycle && ab->b_type == type && 1960 ab->b_size == bytes && 1961 !HDR_L2_WRITING(ab)) { 1962 stolen = buf->b_data; 1963 recycle = FALSE; 1964 } 1965 } 1966 if (buf->b_efunc) { 1967 mutex_enter(&arc_eviction_mtx); 1968 arc_buf_destroy(buf, 1969 buf->b_data == stolen, FALSE); 1970 ab->b_buf = buf->b_next; 1971 buf->b_hdr = &arc_eviction_hdr; 1972 buf->b_next = arc_eviction_list; 1973 arc_eviction_list = buf; 1974 mutex_exit(&arc_eviction_mtx); 1975 mutex_exit(&buf->b_evict_lock); 1976 } else { 1977 mutex_exit(&buf->b_evict_lock); 1978 arc_buf_destroy(buf, 1979 buf->b_data == stolen, TRUE); 1980 } 1981 } 1982 1983 if (ab->b_l2hdr) { 1984 ARCSTAT_INCR(arcstat_evict_l2_cached, 1985 ab->b_size); 1986 } else { 1987 if (l2arc_write_eligible(ab->b_spa, ab)) { 1988 ARCSTAT_INCR(arcstat_evict_l2_eligible, 1989 ab->b_size); 1990 } else { 1991 
ARCSTAT_INCR( 1992 arcstat_evict_l2_ineligible, 1993 ab->b_size); 1994 } 1995 } 1996 1997 if (ab->b_datacnt == 0) { 1998 arc_change_state(evicted_state, ab, hash_lock); 1999 ASSERT(HDR_IN_HASH_TABLE(ab)); 2000 ab->b_flags |= ARC_IN_HASH_TABLE; 2001 ab->b_flags &= ~ARC_BUF_AVAILABLE; 2002 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 2003 } 2004 if (!have_lock) 2005 mutex_exit(hash_lock); 2006 if (bytes >= 0 && bytes_evicted >= bytes) 2007 break; 2008 if (bytes_remaining > 0) { 2009 mutex_exit(evicted_lock); 2010 mutex_exit(lock); 2011 idx = ((idx + 1) & (list_count - 1)); 2012 count++; 2013 goto evict_start; 2014 } 2015 } else { 2016 missed += 1; 2017 } 2018 } 2019 2020 mutex_exit(evicted_lock); 2021 mutex_exit(lock); 2022 2023 idx = ((idx + 1) & (list_count - 1)); 2024 count++; 2025 2026 if (bytes_evicted < bytes) { 2027 if (count < list_count) 2028 goto evict_start; 2029 else 2030 dprintf("only evicted %lld bytes from %x", 2031 (longlong_t)bytes_evicted, state); 2032 } 2033 if (type == ARC_BUFC_METADATA) 2034 evict_metadata_offset = idx; 2035 else 2036 evict_data_offset = idx; 2037 2038 if (skipped) 2039 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2040 2041 if (missed) 2042 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2043 2044 /* 2045 * We have just evicted some data into the ghost state; make 2046 * sure we also adjust the ghost state size if necessary. 2047 */ 2048 if (arc_no_grow && 2049 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 2050 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 2051 arc_mru_ghost->arcs_size - arc_c; 2052 2053 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 2054 int64_t todelete = 2055 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 2056 arc_evict_ghost(arc_mru_ghost, 0, todelete); 2057 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 2058 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 2059 arc_mru_ghost->arcs_size + 2060 arc_mfu_ghost->arcs_size - arc_c); 2061 arc_evict_ghost(arc_mfu_ghost, 0, todelete); 2062 } 2063 } 2064 if (stolen) 2065 ARCSTAT_BUMP(arcstat_stolen); 2066 2067 return (stolen); 2068} 2069 2070/* 2071 * Remove buffers from list until we've removed the specified number of 2072 * bytes. Destroy the buffers that are removed.
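 * A negative 'bytes' means evict with no byte limit; this is how
 * arc_flush() empties the ghost states.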
2073 */ 2074static void 2075arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2076{ 2077 arc_buf_hdr_t *ab, *ab_prev; 2078 arc_buf_hdr_t marker = { 0 }; 2079 list_t *list, *list_start; 2080 kmutex_t *hash_lock, *lock; 2081 uint64_t bytes_deleted = 0; 2082 uint64_t bufs_skipped = 0; 2083 static int evict_offset; 2084 int list_count, idx = evict_offset; 2085 int offset, count = 0; 2086 2087 ASSERT(GHOST_STATE(state)); 2088 2089 /* 2090 * data lists come after metadata lists 2091 */ 2092 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2093 list_count = ARC_BUFC_NUMDATALISTS; 2094 offset = ARC_BUFC_NUMMETADATALISTS; 2095 2096evict_start: 2097 list = &list_start[idx]; 2098 lock = ARCS_LOCK(state, idx + offset); 2099 2100 mutex_enter(lock); 2101 for (ab = list_tail(list); ab; ab = ab_prev) { 2102 ab_prev = list_prev(list, ab); 2103 if (spa && ab->b_spa != spa) 2104 continue; 2105 2106 /* ignore markers */ 2107 if (ab->b_spa == 0) 2108 continue; 2109 2110 hash_lock = HDR_LOCK(ab); 2111 /* caller may be trying to modify this buffer, skip it */ 2112 if (MUTEX_HELD(hash_lock)) 2113 continue; 2114 if (mutex_tryenter(hash_lock)) { 2115 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 2116 ASSERT(ab->b_buf == NULL); 2117 ARCSTAT_BUMP(arcstat_deleted); 2118 bytes_deleted += ab->b_size; 2119 2120 if (ab->b_l2hdr != NULL) { 2121 /* 2122 * This buffer is cached on the 2nd Level ARC; 2123 * don't destroy the header. 2124 */ 2125 arc_change_state(arc_l2c_only, ab, hash_lock); 2126 mutex_exit(hash_lock); 2127 } else { 2128 arc_change_state(arc_anon, ab, hash_lock); 2129 mutex_exit(hash_lock); 2130 arc_hdr_destroy(ab); 2131 } 2132 2133 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 2134 if (bytes >= 0 && bytes_deleted >= bytes) 2135 break; 2136 } else if (bytes < 0) { 2137 /* 2138 * Insert a list marker and then wait for the 2139 * hash lock to become available. Once it's 2140 * available, restart from where we left off.
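 * The marker is recognized (and skipped) by the b_spa == 0 check at
 * the top of this loop.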
2141 */ 2142 list_insert_after(list, ab, &marker); 2143 mutex_exit(lock); 2144 mutex_enter(hash_lock); 2145 mutex_exit(hash_lock); 2146 mutex_enter(lock); 2147 ab_prev = list_prev(list, &marker); 2148 list_remove(list, &marker); 2149 } else 2150 bufs_skipped += 1; 2151 } 2152 mutex_exit(lock); 2153 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2154 count++; 2155 2156 if (count < list_count) 2157 goto evict_start; 2158 2159 evict_offset = idx; 2160 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2161 (bytes < 0 || bytes_deleted < bytes)) { 2162 list_start = &state->arcs_lists[0]; 2163 list_count = ARC_BUFC_NUMMETADATALISTS; 2164 offset = count = 0; 2165 goto evict_start; 2166 } 2167 2168 if (bufs_skipped) { 2169 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2170 ASSERT(bytes >= 0); 2171 } 2172 2173 if (bytes_deleted < bytes) 2174 dprintf("only deleted %lld bytes from %p", 2175 (longlong_t)bytes_deleted, state); 2176} 2177 2178static void 2179arc_adjust(void) 2180{ 2181 int64_t adjustment, delta; 2182 2183 /* 2184 * Adjust MRU size 2185 */ 2186 2187 adjustment = MIN((int64_t)(arc_size - arc_c), 2188 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2189 arc_p)); 2190 2191 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2192 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2193 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2194 adjustment -= delta; 2195 } 2196 2197 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2198 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2199 (void) arc_evict(arc_mru, 0, delta, FALSE, 2200 ARC_BUFC_METADATA); 2201 } 2202 2203 /* 2204 * Adjust MFU size 2205 */ 2206 2207 adjustment = arc_size - arc_c; 2208 2209 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2210 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2211 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2212 adjustment -= delta; 2213 } 2214 2215 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2216 int64_t delta = MIN(adjustment, 2217 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2218 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2219 ARC_BUFC_METADATA); 2220 } 2221 2222 /* 2223 * Adjust ghost lists 2224 */ 2225 2226 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2227 2228 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2229 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2230 arc_evict_ghost(arc_mru_ghost, 0, delta); 2231 } 2232 2233 adjustment = 2234 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2235 2236 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2237 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2238 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2239 } 2240} 2241 2242static void 2243arc_do_user_evicts(void) 2244{ 2245 static arc_buf_t *tmp_arc_eviction_list; 2246 2247 /* 2248 * Move list over to avoid LOR 2249 */ 2250restart: 2251 mutex_enter(&arc_eviction_mtx); 2252 tmp_arc_eviction_list = arc_eviction_list; 2253 arc_eviction_list = NULL; 2254 mutex_exit(&arc_eviction_mtx); 2255 2256 while (tmp_arc_eviction_list != NULL) { 2257 arc_buf_t *buf = tmp_arc_eviction_list; 2258 tmp_arc_eviction_list = buf->b_next; 2259 mutex_enter(&buf->b_evict_lock); 2260 buf->b_hdr = NULL; 2261 mutex_exit(&buf->b_evict_lock); 2262 2263 if (buf->b_efunc != NULL) 2264 VERIFY(buf->b_efunc(buf) == 0); 2265 2266 buf->b_efunc = NULL; 2267 buf->b_private = NULL; 2268 
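		/*
		 * The eviction callback (if any) has run and the header
		 * link was cleared above; only the arc_buf_t itself is
		 * left to free.
		 */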
kmem_cache_free(buf_cache, buf); 2269 } 2270 2271 if (arc_eviction_list != NULL) 2272 goto restart; 2273} 2274 2275/* 2276 * Flush all *evictable* data from the cache for the given spa. 2277 * NOTE: this will not touch "active" (i.e. referenced) data. 2278 */ 2279void 2280arc_flush(spa_t *spa) 2281{ 2282 uint64_t guid = 0; 2283 2284 if (spa) 2285 guid = spa_load_guid(spa); 2286 2287 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2288 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2289 if (spa) 2290 break; 2291 } 2292 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2293 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2294 if (spa) 2295 break; 2296 } 2297 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2298 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2299 if (spa) 2300 break; 2301 } 2302 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2303 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2304 if (spa) 2305 break; 2306 } 2307 2308 arc_evict_ghost(arc_mru_ghost, guid, -1); 2309 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2310 2311 mutex_enter(&arc_reclaim_thr_lock); 2312 arc_do_user_evicts(); 2313 mutex_exit(&arc_reclaim_thr_lock); 2314 ASSERT(spa || arc_eviction_list == NULL); 2315} 2316 2317void 2318arc_shrink(void) 2319{ 2320 if (arc_c > arc_c_min) { 2321 uint64_t to_free; 2322 2323#ifdef _KERNEL 2324 to_free = arc_c >> arc_shrink_shift; 2325#else 2326 to_free = arc_c >> arc_shrink_shift; 2327#endif 2328 if (arc_c > arc_c_min + to_free) 2329 atomic_add_64(&arc_c, -to_free); 2330 else 2331 arc_c = arc_c_min; 2332 2333 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2334 if (arc_c > arc_size) 2335 arc_c = MAX(arc_size, arc_c_min); 2336 if (arc_p > arc_c) 2337 arc_p = (arc_c >> 1); 2338 ASSERT(arc_c >= arc_c_min); 2339 ASSERT((int64_t)arc_p >= 0); 2340 } 2341 2342 if (arc_size > arc_c) 2343 arc_adjust(); 2344} 2345 2346static int needfree = 0; 2347 2348static int 2349arc_reclaim_needed(void) 2350{ 2351 2352#ifdef _KERNEL 2353 2354 if (needfree) 2355 return (1); 2356 2357 /* 2358 * Cooperate with pagedaemon when it's time for it to scan 2359 * and reclaim some pages. 2360 */ 2361 if (vm_paging_needed()) 2362 return (1); 2363 2364#ifdef sun 2365 /* 2366 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2367 */ 2368 extra = desfree; 2369 2370 /* 2371 * check that we're out of range of the pageout scanner. It starts to 2372 * schedule paging if freemem is less than lotsfree and needfree. 2373 * lotsfree is the high-water mark for pageout, and needfree is the 2374 * number of needed free pages. We add extra pages here to make sure 2375 * the scanner doesn't start up while we're freeing memory. 2376 */ 2377 if (freemem < lotsfree + needfree + extra) 2378 return (1); 2379 2380 /* 2381 * check to make sure that swapfs has enough space so that anon 2382 * reservations can still succeed. anon_resvmem() checks that the 2383 * availrmem is greater than swapfs_minfree, and the number of reserved 2384 * swap pages. We also add a bit of extra here just to prevent 2385 * circumstances from getting really dire. 2386 */ 2387 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2388 return (1); 2389 2390#if defined(__i386) 2391 /* 2392 * If we're on an i386 platform, it's possible that we'll exhaust the 2393 * kernel heap space before we ever run out of available physical 2394 * memory. 
Most checks of the size of the heap_area compare against 2395 * tune.t_minarmem, which is the minimum available real memory that we 2396 * can have in the system. However, this is generally fixed at 25 pages 2397 * which is so low that it's useless. In this comparison, we seek to 2398 * calculate the total heap-size, and reclaim if more than 3/4ths of the 2399 * heap is allocated. (Or, in the calculation, if less than 1/4th is 2400 * free) 2401 */ 2402 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 2403 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 2404 return (1); 2405#endif 2406#else /* !sun */ 2407 if (kmem_used() > (kmem_size() * 3) / 4) 2408 return (1); 2409#endif /* sun */ 2410 2411#else 2412 if (spa_get_random(100) == 0) 2413 return (1); 2414#endif 2415 return (0); 2416} 2417 2418extern kmem_cache_t *zio_buf_cache[]; 2419extern kmem_cache_t *zio_data_buf_cache[]; 2420 2421static void 2422arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2423{ 2424 size_t i; 2425 kmem_cache_t *prev_cache = NULL; 2426 kmem_cache_t *prev_data_cache = NULL; 2427 2428#ifdef _KERNEL 2429 if (arc_meta_used >= arc_meta_limit) { 2430 /* 2431 * We are exceeding our meta-data cache limit. 2432 * Purge some DNLC entries to release holds on meta-data. 2433 */ 2434 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 2435 } 2436#if defined(__i386) 2437 /* 2438 * Reclaim unused memory from all kmem caches. 2439 */ 2440 kmem_reap(); 2441#endif 2442#endif 2443 2444 /* 2445 * An aggressive reclamation will shrink the cache size as well as 2446 * reap free buffers from the arc kmem caches. 2447 */ 2448 if (strat == ARC_RECLAIM_AGGR) 2449 arc_shrink(); 2450 2451 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2452 if (zio_buf_cache[i] != prev_cache) { 2453 prev_cache = zio_buf_cache[i]; 2454 kmem_cache_reap_now(zio_buf_cache[i]); 2455 } 2456 if (zio_data_buf_cache[i] != prev_data_cache) { 2457 prev_data_cache = zio_data_buf_cache[i]; 2458 kmem_cache_reap_now(zio_data_buf_cache[i]); 2459 } 2460 } 2461 kmem_cache_reap_now(buf_cache); 2462 kmem_cache_reap_now(hdr_cache); 2463} 2464 2465static void 2466arc_reclaim_thread(void *dummy __unused) 2467{ 2468 clock_t growtime = 0; 2469 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2470 callb_cpr_t cpr; 2471 2472 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2473 2474 mutex_enter(&arc_reclaim_thr_lock); 2475 while (arc_thread_exit == 0) { 2476 if (arc_reclaim_needed()) { 2477 2478 if (arc_no_grow) { 2479 if (last_reclaim == ARC_RECLAIM_CONS) { 2480 last_reclaim = ARC_RECLAIM_AGGR; 2481 } else { 2482 last_reclaim = ARC_RECLAIM_CONS; 2483 } 2484 } else { 2485 arc_no_grow = TRUE; 2486 last_reclaim = ARC_RECLAIM_AGGR; 2487 membar_producer(); 2488 } 2489 2490 /* reset the growth delay for every reclaim */ 2491 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2492 2493 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 2494 /* 2495 * If needfree is TRUE our vm_lowmem hook 2496 * was called and in that case we must free some 2497 * memory, so switch to aggressive mode. 
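				 * The hook (arc_lowmem()) only sets needfree
				 * and signals this thread; the memory is
				 * actually freed here, and needfree is
				 * cleared further below, waking any sleeper
				 * in arc_lowmem().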
2498 */ 2499 arc_no_grow = TRUE; 2500 last_reclaim = ARC_RECLAIM_AGGR; 2501 } 2502 arc_kmem_reap_now(last_reclaim); 2503 arc_warm = B_TRUE; 2504 2505 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2506 arc_no_grow = FALSE; 2507 } 2508 2509 arc_adjust(); 2510 2511 if (arc_eviction_list != NULL) 2512 arc_do_user_evicts(); 2513 2514#ifdef _KERNEL 2515 if (needfree) { 2516 needfree = 0; 2517 wakeup(&needfree); 2518 } 2519#endif 2520 2521 /* block until needed, or one second, whichever is shorter */ 2522 CALLB_CPR_SAFE_BEGIN(&cpr); 2523 (void) cv_timedwait(&arc_reclaim_thr_cv, 2524 &arc_reclaim_thr_lock, hz); 2525 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2526 } 2527 2528 arc_thread_exit = 0; 2529 cv_broadcast(&arc_reclaim_thr_cv); 2530 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2531 thread_exit(); 2532} 2533 2534/* 2535 * Adapt arc info given the number of bytes we are trying to add and 2536 * the state that we are comming from. This function is only called 2537 * when we are adding new content to the cache. 2538 */ 2539static void 2540arc_adapt(int bytes, arc_state_t *state) 2541{ 2542 int mult; 2543 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2544 2545 if (state == arc_l2c_only) 2546 return; 2547 2548 ASSERT(bytes > 0); 2549 /* 2550 * Adapt the target size of the MRU list: 2551 * - if we just hit in the MRU ghost list, then increase 2552 * the target size of the MRU list. 2553 * - if we just hit in the MFU ghost list, then increase 2554 * the target size of the MFU list by decreasing the 2555 * target size of the MRU list. 2556 */ 2557 if (state == arc_mru_ghost) { 2558 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 2559 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2560 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 2561 2562 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2563 } else if (state == arc_mfu_ghost) { 2564 uint64_t delta; 2565 2566 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 2567 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2568 mult = MIN(mult, 10); 2569 2570 delta = MIN(bytes * mult, arc_p); 2571 arc_p = MAX(arc_p_min, arc_p - delta); 2572 } 2573 ASSERT((int64_t)arc_p >= 0); 2574 2575 if (arc_reclaim_needed()) { 2576 cv_signal(&arc_reclaim_thr_cv); 2577 return; 2578 } 2579 2580 if (arc_no_grow) 2581 return; 2582 2583 if (arc_c >= arc_c_max) 2584 return; 2585 2586 /* 2587 * If we're within (2 * maxblocksize) bytes of the target 2588 * cache size, increment the target cache size 2589 */ 2590 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2591 atomic_add_64(&arc_c, (int64_t)bytes); 2592 if (arc_c > arc_c_max) 2593 arc_c = arc_c_max; 2594 else if (state == arc_anon) 2595 atomic_add_64(&arc_p, (int64_t)bytes); 2596 if (arc_p > arc_c) 2597 arc_p = arc_c; 2598 } 2599 ASSERT((int64_t)arc_p >= 0); 2600} 2601 2602/* 2603 * Check if the cache has reached its limits and eviction is required 2604 * prior to insert. 2605 */ 2606static int 2607arc_evict_needed(arc_buf_contents_t type) 2608{ 2609 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2610 return (1); 2611 2612#ifdef sun 2613#ifdef _KERNEL 2614 /* 2615 * If zio data pages are being allocated out of a separate heap segment, 2616 * then enforce that the size of available vmem for this area remains 2617 * above about 1/32nd free. 
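	 * In other words, once VMEM_FREE in the zio_arena drops below
	 * VMEM_ALLOC / 32 (roughly 3% of the allocated arena), treat the
	 * cache as full and evict before inserting.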
2618 */ 2619 if (type == ARC_BUFC_DATA && zio_arena != NULL && 2620 vmem_size(zio_arena, VMEM_FREE) < 2621 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 2622 return (1); 2623#endif 2624#endif /* sun */ 2625 2626 if (arc_reclaim_needed()) 2627 return (1); 2628 2629 return (arc_size > arc_c); 2630} 2631 2632/* 2633 * The buffer, supplied as the first argument, needs a data block. 2634 * So, if we are at cache max, determine which cache should be victimized. 2635 * We have the following cases: 2636 * 2637 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2638 * In this situation if we're out of space, but the resident size of the MFU is 2639 * under the limit, victimize the MFU cache to satisfy this insertion request. 2640 * 2641 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2642 * Here, we've used up all of the available space for the MRU, so we need to 2643 * evict from our own cache instead. Evict from the set of resident MRU 2644 * entries. 2645 * 2646 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2647 * c minus p represents the MFU space in the cache, since p is the size of the 2648 * cache that is dedicated to the MRU. In this situation there's still space on 2649 * the MFU side, so the MRU side needs to be victimized. 2650 * 2651 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2652 * MFU's resident set is consuming more space than it has been allotted. In 2653 * this situation, we must victimize our own cache, the MFU, for this insertion. 2654 */ 2655static void 2656arc_get_data_buf(arc_buf_t *buf) 2657{ 2658 arc_state_t *state = buf->b_hdr->b_state; 2659 uint64_t size = buf->b_hdr->b_size; 2660 arc_buf_contents_t type = buf->b_hdr->b_type; 2661 2662 arc_adapt(size, state); 2663 2664 /* 2665 * We have not yet reached cache maximum size, 2666 * just allocate a new buffer. 2667 */ 2668 if (!arc_evict_needed(type)) { 2669 if (type == ARC_BUFC_METADATA) { 2670 buf->b_data = zio_buf_alloc(size); 2671 arc_space_consume(size, ARC_SPACE_DATA); 2672 } else { 2673 ASSERT(type == ARC_BUFC_DATA); 2674 buf->b_data = zio_data_buf_alloc(size); 2675 ARCSTAT_INCR(arcstat_data_size, size); 2676 atomic_add_64(&arc_size, size); 2677 } 2678 goto out; 2679 } 2680 2681 /* 2682 * If we are prefetching from the mfu ghost list, this buffer 2683 * will end up on the mru list; so steal space from there. 2684 */ 2685 if (state == arc_mfu_ghost) 2686 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2687 else if (state == arc_mru_ghost) 2688 state = arc_mru; 2689 2690 if (state == arc_mru || state == arc_anon) { 2691 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2692 state = (arc_mfu->arcs_lsize[type] >= size && 2693 arc_p > mru_used) ? arc_mfu : arc_mru; 2694 } else { 2695 /* MFU cases */ 2696 uint64_t mfu_space = arc_c - arc_p; 2697 state = (arc_mru->arcs_lsize[type] >= size && 2698 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2699 } 2700 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 2701 if (type == ARC_BUFC_METADATA) { 2702 buf->b_data = zio_buf_alloc(size); 2703 arc_space_consume(size, ARC_SPACE_DATA); 2704 } else { 2705 ASSERT(type == ARC_BUFC_DATA); 2706 buf->b_data = zio_data_buf_alloc(size); 2707 ARCSTAT_INCR(arcstat_data_size, size); 2708 atomic_add_64(&arc_size, size); 2709 } 2710 ARCSTAT_BUMP(arcstat_recycle_miss); 2711 } 2712 ASSERT(buf->b_data != NULL); 2713out: 2714 /* 2715 * Update the state size. Note that ghost states have a 2716 * "ghost size" and so don't need to be updated. 
2717 */ 2718 if (!GHOST_STATE(buf->b_hdr->b_state)) { 2719 arc_buf_hdr_t *hdr = buf->b_hdr; 2720 2721 atomic_add_64(&hdr->b_state->arcs_size, size); 2722 if (list_link_active(&hdr->b_arc_node)) { 2723 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2724 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2725 } 2726 /* 2727 * If we are growing the cache, and we are adding anonymous 2728 * data, and we have outgrown arc_p, update arc_p 2729 */ 2730 if (arc_size < arc_c && hdr->b_state == arc_anon && 2731 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2732 arc_p = MIN(arc_c, arc_p + size); 2733 } 2734 ARCSTAT_BUMP(arcstat_allocated); 2735} 2736 2737/* 2738 * This routine is called whenever a buffer is accessed. 2739 * NOTE: the hash lock is dropped in this function. 2740 */ 2741static void 2742arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2743{ 2744 clock_t now; 2745 2746 ASSERT(MUTEX_HELD(hash_lock)); 2747 2748 if (buf->b_state == arc_anon) { 2749 /* 2750 * This buffer is not in the cache, and does not 2751 * appear in our "ghost" list. Add the new buffer 2752 * to the MRU state. 2753 */ 2754 2755 ASSERT(buf->b_arc_access == 0); 2756 buf->b_arc_access = ddi_get_lbolt(); 2757 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2758 arc_change_state(arc_mru, buf, hash_lock); 2759 2760 } else if (buf->b_state == arc_mru) { 2761 now = ddi_get_lbolt(); 2762 2763 /* 2764 * If this buffer is here because of a prefetch, then either: 2765 * - clear the flag if this is a "referencing" read 2766 * (any subsequent access will bump this into the MFU state). 2767 * or 2768 * - move the buffer to the head of the list if this is 2769 * another prefetch (to make it less likely to be evicted). 2770 */ 2771 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2772 if (refcount_count(&buf->b_refcnt) == 0) { 2773 ASSERT(list_link_active(&buf->b_arc_node)); 2774 } else { 2775 buf->b_flags &= ~ARC_PREFETCH; 2776 ARCSTAT_BUMP(arcstat_mru_hits); 2777 } 2778 buf->b_arc_access = now; 2779 return; 2780 } 2781 2782 /* 2783 * This buffer has been "accessed" only once so far, 2784 * but it is still in the cache. Move it to the MFU 2785 * state. 2786 */ 2787 if (now > buf->b_arc_access + ARC_MINTIME) { 2788 /* 2789 * More than 125ms have passed since we 2790 * instantiated this buffer. Move it to the 2791 * most frequently used state. 2792 */ 2793 buf->b_arc_access = now; 2794 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2795 arc_change_state(arc_mfu, buf, hash_lock); 2796 } 2797 ARCSTAT_BUMP(arcstat_mru_hits); 2798 } else if (buf->b_state == arc_mru_ghost) { 2799 arc_state_t *new_state; 2800 /* 2801 * This buffer has been "accessed" recently, but 2802 * was evicted from the cache. Move it to the 2803 * MFU state. 2804 */ 2805 2806 if (buf->b_flags & ARC_PREFETCH) { 2807 new_state = arc_mru; 2808 if (refcount_count(&buf->b_refcnt) > 0) 2809 buf->b_flags &= ~ARC_PREFETCH; 2810 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2811 } else { 2812 new_state = arc_mfu; 2813 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2814 } 2815 2816 buf->b_arc_access = ddi_get_lbolt(); 2817 arc_change_state(new_state, buf, hash_lock); 2818 2819 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2820 } else if (buf->b_state == arc_mfu) { 2821 /* 2822 * This buffer has been accessed more than once and is 2823 * still in the cache. Keep it in the MFU state. 2824 * 2825 * NOTE: an add_reference() that occurred when we did 2826 * the arc_read() will have kicked this off the list. 
2827 * If it was a prefetch, we will explicitly move it to 2828 * the head of the list now. 2829 */ 2830 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2831 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2832 ASSERT(list_link_active(&buf->b_arc_node)); 2833 } 2834 ARCSTAT_BUMP(arcstat_mfu_hits); 2835 buf->b_arc_access = ddi_get_lbolt(); 2836 } else if (buf->b_state == arc_mfu_ghost) { 2837 arc_state_t *new_state = arc_mfu; 2838 /* 2839 * This buffer has been accessed more than once but has 2840 * been evicted from the cache. Move it back to the 2841 * MFU state. 2842 */ 2843 2844 if (buf->b_flags & ARC_PREFETCH) { 2845 /* 2846 * This is a prefetch access... 2847 * move this block back to the MRU state. 2848 */ 2849 ASSERT0(refcount_count(&buf->b_refcnt)); 2850 new_state = arc_mru; 2851 } 2852 2853 buf->b_arc_access = ddi_get_lbolt(); 2854 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2855 arc_change_state(new_state, buf, hash_lock); 2856 2857 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2858 } else if (buf->b_state == arc_l2c_only) { 2859 /* 2860 * This buffer is on the 2nd Level ARC. 2861 */ 2862 2863 buf->b_arc_access = ddi_get_lbolt(); 2864 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2865 arc_change_state(arc_mfu, buf, hash_lock); 2866 } else { 2867 ASSERT(!"invalid arc state"); 2868 } 2869} 2870 2871/* a generic arc_done_func_t which you can use */ 2872/* ARGSUSED */ 2873void 2874arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2875{ 2876 if (zio == NULL || zio->io_error == 0) 2877 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2878 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2879} 2880 2881/* a generic arc_done_func_t */ 2882void 2883arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2884{ 2885 arc_buf_t **bufp = arg; 2886 if (zio && zio->io_error) { 2887 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2888 *bufp = NULL; 2889 } else { 2890 *bufp = buf; 2891 ASSERT(buf->b_data); 2892 } 2893} 2894 2895static void 2896arc_read_done(zio_t *zio) 2897{ 2898 arc_buf_hdr_t *hdr, *found; 2899 arc_buf_t *buf; 2900 arc_buf_t *abuf; /* buffer we're assigning to callback */ 2901 kmutex_t *hash_lock; 2902 arc_callback_t *callback_list, *acb; 2903 int freeable = FALSE; 2904 2905 buf = zio->io_private; 2906 hdr = buf->b_hdr; 2907 2908 /* 2909 * The hdr was inserted into hash-table and removed from lists 2910 * prior to starting I/O. We should find this header, since 2911 * it's in the hash table, and it should be legit since it's 2912 * not possible to evict it during the I/O. The only possible 2913 * reason for it not to be found is if we were freed during the 2914 * read. 2915 */ 2916 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 2917 &hash_lock); 2918 2919 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2920 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2921 (found == hdr && HDR_L2_READING(hdr))); 2922 2923 hdr->b_flags &= ~ARC_L2_EVICTED; 2924 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2925 hdr->b_flags &= ~ARC_L2CACHE; 2926 2927 /* byteswap if necessary */ 2928 callback_list = hdr->b_acb; 2929 ASSERT(callback_list != NULL); 2930 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2931 dmu_object_byteswap_t bswap = 2932 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 2933 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 
2934 byteswap_uint64_array : 2935 dmu_ot_byteswap[bswap].ob_func; 2936 func(buf->b_data, hdr->b_size); 2937 } 2938 2939 arc_cksum_compute(buf, B_FALSE); 2940#ifdef illumos 2941 arc_buf_watch(buf); 2942#endif /* illumos */ 2943 2944 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2945 /* 2946 * Only call arc_access on anonymous buffers. This is because 2947 * if we've issued an I/O for an evicted buffer, we've already 2948 * called arc_access (to prevent any simultaneous readers from 2949 * getting confused). 2950 */ 2951 arc_access(hdr, hash_lock); 2952 } 2953 2954 /* create copies of the data buffer for the callers */ 2955 abuf = buf; 2956 for (acb = callback_list; acb; acb = acb->acb_next) { 2957 if (acb->acb_done) { 2958 if (abuf == NULL) { 2959 ARCSTAT_BUMP(arcstat_duplicate_reads); 2960 abuf = arc_buf_clone(buf); 2961 } 2962 acb->acb_buf = abuf; 2963 abuf = NULL; 2964 } 2965 } 2966 hdr->b_acb = NULL; 2967 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2968 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2969 if (abuf == buf) { 2970 ASSERT(buf->b_efunc == NULL); 2971 ASSERT(hdr->b_datacnt == 1); 2972 hdr->b_flags |= ARC_BUF_AVAILABLE; 2973 } 2974 2975 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2976 2977 if (zio->io_error != 0) { 2978 hdr->b_flags |= ARC_IO_ERROR; 2979 if (hdr->b_state != arc_anon) 2980 arc_change_state(arc_anon, hdr, hash_lock); 2981 if (HDR_IN_HASH_TABLE(hdr)) 2982 buf_hash_remove(hdr); 2983 freeable = refcount_is_zero(&hdr->b_refcnt); 2984 } 2985 2986 /* 2987 * Broadcast before we drop the hash_lock to avoid the possibility 2988 * that the hdr (and hence the cv) might be freed before we get to 2989 * the cv_broadcast(). 2990 */ 2991 cv_broadcast(&hdr->b_cv); 2992 2993 if (hash_lock) { 2994 mutex_exit(hash_lock); 2995 } else { 2996 /* 2997 * This block was freed while we waited for the read to 2998 * complete. It has been removed from the hash table and 2999 * moved to the anonymous state (so that it won't show up 3000 * in the cache). 3001 */ 3002 ASSERT3P(hdr->b_state, ==, arc_anon); 3003 freeable = refcount_is_zero(&hdr->b_refcnt); 3004 } 3005 3006 /* execute each callback and free its structure */ 3007 while ((acb = callback_list) != NULL) { 3008 if (acb->acb_done) 3009 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3010 3011 if (acb->acb_zio_dummy != NULL) { 3012 acb->acb_zio_dummy->io_error = zio->io_error; 3013 zio_nowait(acb->acb_zio_dummy); 3014 } 3015 3016 callback_list = acb->acb_next; 3017 kmem_free(acb, sizeof (arc_callback_t)); 3018 } 3019 3020 if (freeable) 3021 arc_hdr_destroy(hdr); 3022} 3023 3024/* 3025 * "Read" the block block at the specified DVA (in bp) via the 3026 * cache. If the block is found in the cache, invoke the provided 3027 * callback immediately and return. Note that the `zio' parameter 3028 * in the callback will be NULL in this case, since no IO was 3029 * required. If the block is not in the cache pass the read request 3030 * on to the spa with a substitute callback function, so that the 3031 * requested block will be added to the cache. 3032 * 3033 * If a read request arrives for a block that has a read in-progress, 3034 * either wait for the in-progress read to complete (and return the 3035 * results); or, if this is a read with a "done" func, add a record 3036 * to the read to invoke the "done" func when the read completes, 3037 * and return; or just return. 3038 * 3039 * arc_read_done() will invoke all the requested "done" functions 3040 * for readers of this block. 
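 *
 * A caller-side sketch (not taken from this file: "my_done", "my_arg"
 * and the bookmark "zb" are placeholders, and error/flag handling is
 * omitted) looks roughly like:
 *
 *	uint32_t aflags = 0;
 *	(void) arc_read(NULL, spa, bp, my_done, my_arg,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * On a cache hit my_done() is invoked immediately with a NULL zio;
 * on a miss it is invoked from arc_read_done() once the I/O completes.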
3041 */ 3042int 3043arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3044 void *private, int priority, int zio_flags, uint32_t *arc_flags, 3045 const zbookmark_t *zb) 3046{ 3047 arc_buf_hdr_t *hdr;
|
3445 3446 /* 3447 * Do we have more than one buf? 3448 */ 3449 if (hdr->b_datacnt > 1) { 3450 arc_buf_hdr_t *nhdr; 3451 arc_buf_t **bufp; 3452 uint64_t blksz = hdr->b_size; 3453 uint64_t spa = hdr->b_spa; 3454 arc_buf_contents_t type = hdr->b_type; 3455 uint32_t flags = hdr->b_flags; 3456 3457 ASSERT(hdr->b_buf != buf || buf->b_next != NULL); 3458 /* 3459 * Pull the data off of this hdr and attach it to 3460 * a new anonymous hdr. 3461 */ 3462 (void) remove_reference(hdr, hash_lock, tag); 3463 bufp = &hdr->b_buf; 3464 while (*bufp != buf) 3465 bufp = &(*bufp)->b_next; 3466 *bufp = buf->b_next; 3467 buf->b_next = NULL; 3468 3469 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3470 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3471 if (refcount_is_zero(&hdr->b_refcnt)) { 3472 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3473 ASSERT3U(*size, >=, hdr->b_size); 3474 atomic_add_64(size, -hdr->b_size); 3475 } 3476 3477 /* 3478 * We're releasing a duplicate user data buffer, update 3479 * our statistics accordingly. 3480 */ 3481 if (hdr->b_type == ARC_BUFC_DATA) { 3482 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 3483 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 3484 -hdr->b_size); 3485 } 3486 hdr->b_datacnt -= 1; 3487 arc_cksum_verify(buf); 3488#ifdef illumos 3489 arc_buf_unwatch(buf); 3490#endif /* illumos */ 3491 3492 mutex_exit(hash_lock); 3493 3494 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3495 nhdr->b_size = blksz; 3496 nhdr->b_spa = spa; 3497 nhdr->b_type = type; 3498 nhdr->b_buf = buf; 3499 nhdr->b_state = arc_anon; 3500 nhdr->b_arc_access = 0; 3501 nhdr->b_flags = flags & ARC_L2_WRITING; 3502 nhdr->b_l2hdr = NULL; 3503 nhdr->b_datacnt = 1; 3504 nhdr->b_freeze_cksum = NULL; 3505 (void) refcount_add(&nhdr->b_refcnt, tag); 3506 buf->b_hdr = nhdr; 3507 mutex_exit(&buf->b_evict_lock); 3508 atomic_add_64(&arc_anon->arcs_size, blksz); 3509 } else { 3510 mutex_exit(&buf->b_evict_lock); 3511 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3512 ASSERT(!list_link_active(&hdr->b_arc_node)); 3513 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3514 if (hdr->b_state != arc_anon) 3515 arc_change_state(arc_anon, hdr, hash_lock); 3516 hdr->b_arc_access = 0; 3517 if (hash_lock) 3518 mutex_exit(hash_lock); 3519 3520 buf_discard_identity(hdr); 3521 arc_buf_thaw(buf); 3522 } 3523 buf->b_efunc = NULL; 3524 buf->b_private = NULL; 3525 3526 if (l2hdr) { 3527 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3528 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3529 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3530 mutex_exit(&l2arc_buflist_mtx); 3531 } 3532} 3533 3534int 3535arc_released(arc_buf_t *buf) 3536{ 3537 int released; 3538 3539 mutex_enter(&buf->b_evict_lock); 3540 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3541 mutex_exit(&buf->b_evict_lock); 3542 return (released); 3543} 3544 3545int 3546arc_has_callback(arc_buf_t *buf) 3547{ 3548 int callback; 3549 3550 mutex_enter(&buf->b_evict_lock); 3551 callback = (buf->b_efunc != NULL); 3552 mutex_exit(&buf->b_evict_lock); 3553 return (callback); 3554} 3555 3556#ifdef ZFS_DEBUG 3557int 3558arc_referenced(arc_buf_t *buf) 3559{ 3560 int referenced; 3561 3562 mutex_enter(&buf->b_evict_lock); 3563 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3564 mutex_exit(&buf->b_evict_lock); 3565 return (referenced); 3566} 3567#endif 3568 3569static void 3570arc_write_ready(zio_t *zio) 3571{ 3572 arc_write_callback_t *callback = zio->io_private; 3573 arc_buf_t *buf = callback->awcb_buf; 3574 arc_buf_hdr_t *hdr = buf->b_hdr; 3575 3576 
ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3577 callback->awcb_ready(zio, buf, callback->awcb_private); 3578 3579 /* 3580 * If the IO is already in progress, then this is a re-write 3581 * attempt, so we need to thaw and re-compute the cksum. 3582 * It is the responsibility of the callback to handle the 3583 * accounting for any re-write attempt. 3584 */ 3585 if (HDR_IO_IN_PROGRESS(hdr)) { 3586 mutex_enter(&hdr->b_freeze_lock); 3587 if (hdr->b_freeze_cksum != NULL) { 3588 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3589 hdr->b_freeze_cksum = NULL; 3590 } 3591 mutex_exit(&hdr->b_freeze_lock); 3592 } 3593 arc_cksum_compute(buf, B_FALSE); 3594 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3595} 3596 3597static void 3598arc_write_done(zio_t *zio) 3599{ 3600 arc_write_callback_t *callback = zio->io_private; 3601 arc_buf_t *buf = callback->awcb_buf; 3602 arc_buf_hdr_t *hdr = buf->b_hdr; 3603 3604 ASSERT(hdr->b_acb == NULL); 3605 3606 if (zio->io_error == 0) { 3607 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3608 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3609 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3610 } else { 3611 ASSERT(BUF_EMPTY(hdr)); 3612 } 3613 3614 /* 3615 * If the block to be written was all-zero, we may have 3616 * compressed it away. In this case no write was performed 3617 * so there will be no dva/birth/checksum. The buffer must 3618 * therefore remain anonymous (and uncached). 3619 */ 3620 if (!BUF_EMPTY(hdr)) { 3621 arc_buf_hdr_t *exists; 3622 kmutex_t *hash_lock; 3623 3624 ASSERT(zio->io_error == 0); 3625 3626 arc_cksum_verify(buf); 3627 3628 exists = buf_hash_insert(hdr, &hash_lock); 3629 if (exists) { 3630 /* 3631 * This can only happen if we overwrite for 3632 * sync-to-convergence, because we remove 3633 * buffers from the hash table when we arc_free(). 
3634 */ 3635 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3636 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3637 panic("bad overwrite, hdr=%p exists=%p", 3638 (void *)hdr, (void *)exists); 3639 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3640 arc_change_state(arc_anon, exists, hash_lock); 3641 mutex_exit(hash_lock); 3642 arc_hdr_destroy(exists); 3643 exists = buf_hash_insert(hdr, &hash_lock); 3644 ASSERT3P(exists, ==, NULL); 3645 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 3646 /* nopwrite */ 3647 ASSERT(zio->io_prop.zp_nopwrite); 3648 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3649 panic("bad nopwrite, hdr=%p exists=%p", 3650 (void *)hdr, (void *)exists); 3651 } else { 3652 /* Dedup */ 3653 ASSERT(hdr->b_datacnt == 1); 3654 ASSERT(hdr->b_state == arc_anon); 3655 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3656 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3657 } 3658 } 3659 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3660 /* if it's not anon, we are doing a scrub */ 3661 if (!exists && hdr->b_state == arc_anon) 3662 arc_access(hdr, hash_lock); 3663 mutex_exit(hash_lock); 3664 } else { 3665 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3666 } 3667 3668 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3669 callback->awcb_done(zio, buf, callback->awcb_private); 3670 3671 kmem_free(callback, sizeof (arc_write_callback_t)); 3672} 3673 3674zio_t * 3675arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 3676 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, 3677 arc_done_func_t *ready, arc_done_func_t *done, void *private, 3678 int priority, int zio_flags, const zbookmark_t *zb) 3679{ 3680 arc_buf_hdr_t *hdr = buf->b_hdr; 3681 arc_write_callback_t *callback; 3682 zio_t *zio; 3683 3684 ASSERT(ready != NULL); 3685 ASSERT(done != NULL); 3686 ASSERT(!HDR_IO_ERROR(hdr)); 3687 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3688 ASSERT(hdr->b_acb == NULL); 3689 if (l2arc) 3690 hdr->b_flags |= ARC_L2CACHE; 3691 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3692 callback->awcb_ready = ready; 3693 callback->awcb_done = done; 3694 callback->awcb_private = private; 3695 callback->awcb_buf = buf; 3696 3697 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3698 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 3699 3700 return (zio); 3701} 3702 3703static int 3704arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) 3705{ 3706#ifdef _KERNEL 3707 uint64_t available_memory = 3708 ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count); 3709 static uint64_t page_load = 0; 3710 static uint64_t last_txg = 0; 3711 3712#ifdef sun 3713#if defined(__i386) 3714 available_memory = 3715 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3716#endif 3717#endif /* sun */ 3718 if (available_memory >= zfs_write_limit_max) 3719 return (0); 3720 3721 if (txg > last_txg) { 3722 last_txg = txg; 3723 page_load = 0; 3724 } 3725 /* 3726 * If we are in pageout, we know that memory is already tight, 3727 * the arc is already going to be evicting, so we just want to 3728 * continue to let page writes occur as quickly as possible. 
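	 * The bookkeeping below charges 1/8th of each (inflated) reserve
	 * against page_load and pushes back with ERESTART once that
	 * running total exceeds a quarter of available memory.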
3729 */ 3730 if (curproc == pageproc) { 3731 if (page_load > available_memory / 4) 3732 return (ERESTART); 3733 /* Note: reserve is inflated, so we deflate */ 3734 page_load += reserve / 8; 3735 return (0); 3736 } else if (page_load > 0 && arc_reclaim_needed()) { 3737 /* memory is low, delay before restarting */ 3738 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3739 return (EAGAIN); 3740 } 3741 page_load = 0; 3742 3743 if (arc_size > arc_c_min) { 3744 uint64_t evictable_memory = 3745 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3746 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3747 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3748 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3749 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3750 } 3751 3752 if (inflight_data > available_memory / 4) { 3753 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3754 return (ERESTART); 3755 } 3756#endif 3757 return (0); 3758} 3759 3760void 3761arc_tempreserve_clear(uint64_t reserve) 3762{ 3763 atomic_add_64(&arc_tempreserve, -reserve); 3764 ASSERT((int64_t)arc_tempreserve >= 0); 3765} 3766 3767int 3768arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3769{ 3770 int error; 3771 uint64_t anon_size; 3772 3773#ifdef ZFS_DEBUG 3774 /* 3775 * Once in a while, fail for no reason. Everything should cope. 3776 */ 3777 if (spa_get_random(10000) == 0) { 3778 dprintf("forcing random failure\n"); 3779 return (ERESTART); 3780 } 3781#endif 3782 if (reserve > arc_c/4 && !arc_no_grow) 3783 arc_c = MIN(arc_c_max, reserve * 4); 3784 if (reserve > arc_c) 3785 return (ENOMEM); 3786 3787 /* 3788 * Don't count loaned bufs as in flight dirty data to prevent long 3789 * network delays from blocking transactions that are ready to be 3790 * assigned to a txg. 3791 */ 3792 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3793 3794 /* 3795 * Writes will, almost always, require additional memory allocations 3796 * in order to compress/encrypt/etc the data. We therefor need to 3797 * make sure that there is sufficient available memory for this. 3798 */ 3799 if (error = arc_memory_throttle(reserve, anon_size, txg)) 3800 return (error); 3801 3802 /* 3803 * Throttle writes when the amount of dirty data in the cache 3804 * gets too large. We try to keep the cache less than half full 3805 * of dirty blocks so that our sync times don't grow too large. 3806 * Note: if two requests come in concurrently, we might let them 3807 * both succeed, when one of them should fail. Not a huge deal. 3808 */ 3809 3810 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3811 anon_size > arc_c / 4) { 3812 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3813 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3814 arc_tempreserve>>10, 3815 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3816 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3817 reserve>>10, arc_c>>10); 3818 return (ERESTART); 3819 } 3820 atomic_add_64(&arc_tempreserve, reserve); 3821 return (0); 3822} 3823 3824static kmutex_t arc_lowmem_lock; 3825#ifdef _KERNEL 3826static eventhandler_tag arc_event_lowmem = NULL; 3827 3828static void 3829arc_lowmem(void *arg __unused, int howto __unused) 3830{ 3831 3832 /* Serialize access via arc_lowmem_lock. */ 3833 mutex_enter(&arc_lowmem_lock); 3834 mutex_enter(&arc_reclaim_thr_lock); 3835 needfree = 1; 3836 cv_signal(&arc_reclaim_thr_cv); 3837 3838 /* 3839 * It is unsafe to block here in arbitrary threads, because we can come 3840 * here from ARC itself and may hold ARC locks and thus risk a deadlock 3841 * with ARC reclaim thread. 
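	 * Only the pagedaemon waits below, until the reclaim thread has
	 * cleared needfree and issued a wakeup; every other caller just
	 * signals the reclaim thread and returns.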
3842 */ 3843 if (curproc == pageproc) { 3844 while (needfree) 3845 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 3846 } 3847 mutex_exit(&arc_reclaim_thr_lock); 3848 mutex_exit(&arc_lowmem_lock); 3849} 3850#endif 3851 3852void 3853arc_init(void) 3854{ 3855 int i, prefetch_tunable_set = 0; 3856 3857 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3858 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3859 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 3860 3861 /* Convert seconds to clock ticks */ 3862 arc_min_prefetch_lifespan = 1 * hz; 3863 3864 /* Start out with 1/8 of all memory */ 3865 arc_c = kmem_size() / 8; 3866 3867#ifdef sun 3868#ifdef _KERNEL 3869 /* 3870 * On architectures where the physical memory can be larger 3871 * than the addressable space (intel in 32-bit mode), we may 3872 * need to limit the cache to 1/8 of VM size. 3873 */ 3874 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3875#endif 3876#endif /* sun */ 3877 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 3878 arc_c_min = MAX(arc_c / 4, 64<<18); 3879 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 3880 if (arc_c * 8 >= 1<<30) 3881 arc_c_max = (arc_c * 8) - (1<<30); 3882 else 3883 arc_c_max = arc_c_min; 3884 arc_c_max = MAX(arc_c * 5, arc_c_max); 3885 3886#ifdef _KERNEL 3887 /* 3888 * Allow the tunables to override our calculations if they are 3889 * reasonable (ie. over 16MB) 3890 */ 3891 if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size()) 3892 arc_c_max = zfs_arc_max; 3893 if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max) 3894 arc_c_min = zfs_arc_min; 3895#endif 3896 3897 arc_c = arc_c_max; 3898 arc_p = (arc_c >> 1); 3899 3900 /* limit meta-data to 1/4 of the arc capacity */ 3901 arc_meta_limit = arc_c_max / 4; 3902 3903 /* Allow the tunable to override if it is reasonable */ 3904 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3905 arc_meta_limit = zfs_arc_meta_limit; 3906 3907 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3908 arc_c_min = arc_meta_limit / 2; 3909 3910 if (zfs_arc_grow_retry > 0) 3911 arc_grow_retry = zfs_arc_grow_retry; 3912 3913 if (zfs_arc_shrink_shift > 0) 3914 arc_shrink_shift = zfs_arc_shrink_shift; 3915 3916 if (zfs_arc_p_min_shift > 0) 3917 arc_p_min_shift = zfs_arc_p_min_shift; 3918 3919 /* if kmem_flags are set, lets try to use less memory */ 3920 if (kmem_debugging()) 3921 arc_c = arc_c / 2; 3922 if (arc_c < arc_c_min) 3923 arc_c = arc_c_min; 3924 3925 zfs_arc_min = arc_c_min; 3926 zfs_arc_max = arc_c_max; 3927 3928 arc_anon = &ARC_anon; 3929 arc_mru = &ARC_mru; 3930 arc_mru_ghost = &ARC_mru_ghost; 3931 arc_mfu = &ARC_mfu; 3932 arc_mfu_ghost = &ARC_mfu_ghost; 3933 arc_l2c_only = &ARC_l2c_only; 3934 arc_size = 0; 3935 3936 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 3937 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 3938 NULL, MUTEX_DEFAULT, NULL); 3939 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 3940 NULL, MUTEX_DEFAULT, NULL); 3941 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 3942 NULL, MUTEX_DEFAULT, NULL); 3943 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 3944 NULL, MUTEX_DEFAULT, NULL); 3945 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 3946 NULL, MUTEX_DEFAULT, NULL); 3947 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 3948 NULL, MUTEX_DEFAULT, NULL); 3949 3950 list_create(&arc_mru->arcs_lists[i], 3951 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3952 list_create(&arc_mru_ghost->arcs_lists[i], 3953 sizeof 
(arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3954 list_create(&arc_mfu->arcs_lists[i], 3955 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3956 list_create(&arc_mfu_ghost->arcs_lists[i], 3957 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3958 list_create(&arc_mfu_ghost->arcs_lists[i], 3959 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3960 list_create(&arc_l2c_only->arcs_lists[i], 3961 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3962 } 3963 3964 buf_init(); 3965 3966 arc_thread_exit = 0; 3967 arc_eviction_list = NULL; 3968 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3969 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3970 3971 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3972 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3973 3974 if (arc_ksp != NULL) { 3975 arc_ksp->ks_data = &arc_stats; 3976 kstat_install(arc_ksp); 3977 } 3978 3979 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3980 TS_RUN, minclsyspri); 3981 3982#ifdef _KERNEL 3983 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 3984 EVENTHANDLER_PRI_FIRST); 3985#endif 3986 3987 arc_dead = FALSE; 3988 arc_warm = B_FALSE; 3989 3990 if (zfs_write_limit_max == 0) 3991 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 3992 else 3993 zfs_write_limit_shift = 0; 3994 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 3995 3996#ifdef _KERNEL 3997 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 3998 prefetch_tunable_set = 1; 3999 4000#ifdef __i386__ 4001 if (prefetch_tunable_set == 0) { 4002 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4003 "-- to enable,\n"); 4004 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4005 "to /boot/loader.conf.\n"); 4006 zfs_prefetch_disable = 1; 4007 } 4008#else 4009 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4010 prefetch_tunable_set == 0) { 4011 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4012 "than 4GB of RAM is present;\n" 4013 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4014 "to /boot/loader.conf.\n"); 4015 zfs_prefetch_disable = 1; 4016 } 4017#endif 4018 /* Warn about ZFS memory and address space requirements. 
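	 * The thresholds below work out to 448MB of physical memory and
	 * 512MB of kmem; if either is not met, a warning is printed at
	 * arc_init() time.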
*/ 4019 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4020 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4021 "expect unstable behavior.\n"); 4022 } 4023 if (kmem_size() < 512 * (1 << 20)) { 4024 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4025 "expect unstable behavior.\n"); 4026 printf(" Consider tuning vm.kmem_size and " 4027 "vm.kmem_size_max\n"); 4028 printf(" in /boot/loader.conf.\n"); 4029 } 4030#endif 4031} 4032 4033void 4034arc_fini(void) 4035{ 4036 int i; 4037 4038 mutex_enter(&arc_reclaim_thr_lock); 4039 arc_thread_exit = 1; 4040 cv_signal(&arc_reclaim_thr_cv); 4041 while (arc_thread_exit != 0) 4042 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4043 mutex_exit(&arc_reclaim_thr_lock); 4044 4045 arc_flush(NULL); 4046 4047 arc_dead = TRUE; 4048 4049 if (arc_ksp != NULL) { 4050 kstat_delete(arc_ksp); 4051 arc_ksp = NULL; 4052 } 4053 4054 mutex_destroy(&arc_eviction_mtx); 4055 mutex_destroy(&arc_reclaim_thr_lock); 4056 cv_destroy(&arc_reclaim_thr_cv); 4057 4058 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4059 list_destroy(&arc_mru->arcs_lists[i]); 4060 list_destroy(&arc_mru_ghost->arcs_lists[i]); 4061 list_destroy(&arc_mfu->arcs_lists[i]); 4062 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 4063 list_destroy(&arc_l2c_only->arcs_lists[i]); 4064 4065 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 4066 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 4067 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 4068 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 4069 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 4070 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 4071 } 4072 4073 mutex_destroy(&zfs_write_limit_lock); 4074 4075 buf_fini(); 4076 4077 ASSERT(arc_loaned_bytes == 0); 4078 4079 mutex_destroy(&arc_lowmem_lock); 4080#ifdef _KERNEL 4081 if (arc_event_lowmem != NULL) 4082 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4083#endif 4084} 4085 4086/* 4087 * Level 2 ARC 4088 * 4089 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4090 * It uses dedicated storage devices to hold cached data, which are populated 4091 * using large infrequent writes. The main role of this cache is to boost 4092 * the performance of random read workloads. The intended L2ARC devices 4093 * include short-stroked disks, solid state disks, and other media with 4094 * substantially faster read latency than disk. 4095 * 4096 * +-----------------------+ 4097 * | ARC | 4098 * +-----------------------+ 4099 * | ^ ^ 4100 * | | | 4101 * l2arc_feed_thread() arc_read() 4102 * | | | 4103 * | l2arc read | 4104 * V | | 4105 * +---------------+ | 4106 * | L2ARC | | 4107 * +---------------+ | 4108 * | ^ | 4109 * l2arc_write() | | 4110 * | | | 4111 * V | | 4112 * +-------+ +-------+ 4113 * | vdev | | vdev | 4114 * | cache | | cache | 4115 * +-------+ +-------+ 4116 * +=========+ .-----. 4117 * : L2ARC : |-_____-| 4118 * : devices : | Disks | 4119 * +=========+ `-_____-' 4120 * 4121 * Read requests are satisfied from the following sources, in order: 4122 * 4123 * 1) ARC 4124 * 2) vdev cache of L2ARC devices 4125 * 3) L2ARC devices 4126 * 4) vdev cache of disks 4127 * 5) disks 4128 * 4129 * Some L2ARC device types exhibit extremely slow write performance. 4130 * To accommodate for this there are some significant differences between 4131 * the L2ARC and traditional cache design: 4132 * 4133 * 1. There is no eviction path from the ARC to the L2ARC. 
Evictions from 4134 * the ARC behave as usual, freeing buffers and placing headers on ghost 4135 * lists. The ARC does not send buffers to the L2ARC during eviction as 4136 * this would add inflated write latencies for all ARC memory pressure. 4137 * 4138 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 4139 * It does this by periodically scanning buffers from the eviction-end of 4140 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 4141 * not already there. It scans until a headroom of buffers is satisfied, 4142 * which itself is a buffer for ARC eviction. The thread that does this is 4143 * l2arc_feed_thread(), illustrated below; example sizes are included to 4144 * provide a better sense of ratio than this diagram: 4145 * 4146 * head --> tail 4147 * +---------------------+----------+ 4148 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 4149 * +---------------------+----------+ | o L2ARC eligible 4150 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 4151 * +---------------------+----------+ | 4152 * 15.9 Gbytes ^ 32 Mbytes | 4153 * headroom | 4154 * l2arc_feed_thread() 4155 * | 4156 * l2arc write hand <--[oooo]--' 4157 * | 8 Mbyte 4158 * | write max 4159 * V 4160 * +==============================+ 4161 * L2ARC dev |####|#|###|###| |####| ... | 4162 * +==============================+ 4163 * 32 Gbytes 4164 * 4165 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 4166 * evicted, then the L2ARC has cached a buffer much sooner than it probably 4167 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 4168 * safe to say that this is an uncommon case, since buffers at the end of 4169 * the ARC lists have moved there due to inactivity. 4170 * 4171 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 4172 * then the L2ARC simply misses copying some buffers. This serves as a 4173 * pressure valve to prevent heavy read workloads from both stalling the ARC 4174 * with waits and clogging the L2ARC with writes. This also helps prevent 4175 * the potential for the L2ARC to churn if it attempts to cache content too 4176 * quickly, such as during backups of the entire pool. 4177 * 4178 * 5. After system boot and before the ARC has filled main memory, there are 4179 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 4180 * lists can remain mostly static. Instead of searching from tail of these 4181 * lists as pictured, the l2arc_feed_thread() will search from the list heads 4182 * for eligible buffers, greatly increasing its chance of finding them. 4183 * 4184 * The L2ARC device write speed is also boosted during this time so that 4185 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 4186 * there are no L2ARC reads, and no fear of degrading read performance 4187 * through increased writes. 4188 * 4189 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 4190 * the vdev queue can aggregate them into larger and fewer writes. Each 4191 * device is written to in a rotor fashion, sweeping writes through 4192 * available space then repeating. 4193 * 4194 * 7. The L2ARC does not store dirty content. It never needs to flush 4195 * write buffers back to disk based storage. 4196 * 4197 * 8. If an ARC buffer is written (and dirtied) which also exists in the 4198 * L2ARC, the now stale L2ARC buffer is immediately dropped. 
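 *
 * As a worked example of the write sizing described above: while
 * arc_warm is still B_FALSE, l2arc_write_size() below returns
 * l2ad_write + l2ad_boost (so, with the 8 Mbyte write max pictured,
 * warmup writes may be up to twice that large, assuming an equal
 * boost value); once the ARC has warmed up the boost is dropped.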
4199 * 4200 * The performance of the L2ARC can be tweaked by a number of tunables, which 4201 * may be necessary for different workloads: 4202 * 4203 * l2arc_write_max max write bytes per interval 4204 * l2arc_write_boost extra write bytes during device warmup 4205 * l2arc_noprefetch skip caching prefetched buffers 4206 * l2arc_headroom number of max device writes to precache 4207 * l2arc_feed_secs seconds between L2ARC writing 4208 * 4209 * Tunables may be removed or added as future performance improvements are 4210 * integrated, and also may become zpool properties. 4211 * 4212 * There are three key functions that control how the L2ARC warms up: 4213 * 4214 * l2arc_write_eligible() check if a buffer is eligible to cache 4215 * l2arc_write_size() calculate how much to write 4216 * l2arc_write_interval() calculate sleep delay between writes 4217 * 4218 * These three functions determine what to write, how much, and how quickly 4219 * to send writes. 4220 */ 4221 4222static boolean_t 4223l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 4224{ 4225 /* 4226 * A buffer is *not* eligible for the L2ARC if it: 4227 * 1. belongs to a different spa. 4228 * 2. is already cached on the L2ARC. 4229 * 3. has an I/O in progress (it may be an incomplete read). 4230 * 4. is flagged not eligible (zfs property). 4231 */ 4232 if (ab->b_spa != spa_guid) { 4233 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 4234 return (B_FALSE); 4235 } 4236 if (ab->b_l2hdr != NULL) { 4237 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 4238 return (B_FALSE); 4239 } 4240 if (HDR_IO_IN_PROGRESS(ab)) { 4241 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 4242 return (B_FALSE); 4243 } 4244 if (!HDR_L2CACHE(ab)) { 4245 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 4246 return (B_FALSE); 4247 } 4248 4249 return (B_TRUE); 4250} 4251 4252static uint64_t 4253l2arc_write_size(l2arc_dev_t *dev) 4254{ 4255 uint64_t size; 4256 4257 size = dev->l2ad_write; 4258 4259 if (arc_warm == B_FALSE) 4260 size += dev->l2ad_boost; 4261 4262 return (size); 4263 4264} 4265 4266static clock_t 4267l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4268{ 4269 clock_t interval, next, now; 4270 4271 /* 4272 * If the ARC lists are busy, increase our write rate; if the 4273 * lists are stale, idle back. This is achieved by checking 4274 * how much we previously wrote - if it was more than half of 4275 * what we wanted, schedule the next write much sooner. 4276 */ 4277 if (l2arc_feed_again && wrote > (wanted / 2)) 4278 interval = (hz * l2arc_feed_min_ms) / 1000; 4279 else 4280 interval = hz * l2arc_feed_secs; 4281 4282 now = ddi_get_lbolt(); 4283 next = MAX(now, MIN(now + interval, began + interval)); 4284 4285 return (next); 4286} 4287 4288static void 4289l2arc_hdr_stat_add(void) 4290{ 4291 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4292 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4293} 4294 4295static void 4296l2arc_hdr_stat_remove(void) 4297{ 4298 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4299 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4300} 4301 4302/* 4303 * Cycle through L2ARC devices. This is how L2ARC load balances. 4304 * If a device is returned, this also returns holding the spa config lock. 4305 */ 4306static l2arc_dev_t * 4307l2arc_dev_get_next(void) 4308{ 4309 l2arc_dev_t *first, *next = NULL; 4310 4311 /* 4312 * Lock out the removal of spas (spa_namespace_lock), then removal 4313 * of cache devices (l2arc_dev_mtx). 
Once a device has been selected, 4314 * both locks will be dropped and a spa config lock held instead. 4315 */ 4316 mutex_enter(&spa_namespace_lock); 4317 mutex_enter(&l2arc_dev_mtx); 4318 4319 /* if there are no vdevs, there is nothing to do */ 4320 if (l2arc_ndev == 0) 4321 goto out; 4322 4323 first = NULL; 4324 next = l2arc_dev_last; 4325 do { 4326 /* loop around the list looking for a non-faulted vdev */ 4327 if (next == NULL) { 4328 next = list_head(l2arc_dev_list); 4329 } else { 4330 next = list_next(l2arc_dev_list, next); 4331 if (next == NULL) 4332 next = list_head(l2arc_dev_list); 4333 } 4334 4335 /* if we have come back to the start, bail out */ 4336 if (first == NULL) 4337 first = next; 4338 else if (next == first) 4339 break; 4340 4341 } while (vdev_is_dead(next->l2ad_vdev)); 4342 4343 /* if we were unable to find any usable vdevs, return NULL */ 4344 if (vdev_is_dead(next->l2ad_vdev)) 4345 next = NULL; 4346 4347 l2arc_dev_last = next; 4348 4349out: 4350 mutex_exit(&l2arc_dev_mtx); 4351 4352 /* 4353 * Grab the config lock to prevent the 'next' device from being 4354 * removed while we are writing to it. 4355 */ 4356 if (next != NULL) 4357 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4358 mutex_exit(&spa_namespace_lock); 4359 4360 return (next); 4361} 4362 4363/* 4364 * Free buffers that were tagged for destruction. 4365 */ 4366static void 4367l2arc_do_free_on_write() 4368{ 4369 list_t *buflist; 4370 l2arc_data_free_t *df, *df_prev; 4371 4372 mutex_enter(&l2arc_free_on_write_mtx); 4373 buflist = l2arc_free_on_write; 4374 4375 for (df = list_tail(buflist); df; df = df_prev) { 4376 df_prev = list_prev(buflist, df); 4377 ASSERT(df->l2df_data != NULL); 4378 ASSERT(df->l2df_func != NULL); 4379 df->l2df_func(df->l2df_data, df->l2df_size); 4380 list_remove(buflist, df); 4381 kmem_free(df, sizeof (l2arc_data_free_t)); 4382 } 4383 4384 mutex_exit(&l2arc_free_on_write_mtx); 4385} 4386 4387/* 4388 * A write to a cache device has completed. Update all headers to allow 4389 * reads from these buffers to begin. 4390 */ 4391static void 4392l2arc_write_done(zio_t *zio) 4393{ 4394 l2arc_write_callback_t *cb; 4395 l2arc_dev_t *dev; 4396 list_t *buflist; 4397 arc_buf_hdr_t *head, *ab, *ab_prev; 4398 l2arc_buf_hdr_t *abl2; 4399 kmutex_t *hash_lock; 4400 4401 cb = zio->io_private; 4402 ASSERT(cb != NULL); 4403 dev = cb->l2wcb_dev; 4404 ASSERT(dev != NULL); 4405 head = cb->l2wcb_head; 4406 ASSERT(head != NULL); 4407 buflist = dev->l2ad_buflist; 4408 ASSERT(buflist != NULL); 4409 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 4410 l2arc_write_callback_t *, cb); 4411 4412 if (zio->io_error != 0) 4413 ARCSTAT_BUMP(arcstat_l2_writes_error); 4414 4415 mutex_enter(&l2arc_buflist_mtx); 4416 4417 /* 4418 * All writes completed, or an error was hit. 4419 */ 4420 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4421 ab_prev = list_prev(buflist, ab); 4422 4423 hash_lock = HDR_LOCK(ab); 4424 if (!mutex_tryenter(hash_lock)) { 4425 /* 4426 * This buffer misses out. It may be in a stage 4427 * of eviction. Its ARC_L2_WRITING flag will be 4428 * left set, denying reads to this buffer. 4429 */ 4430 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4431 continue; 4432 } 4433 4434 if (zio->io_error != 0) { 4435 /* 4436 * Error - drop L2ARC entry. 4437 */ 4438 list_remove(buflist, ab); 4439 abl2 = ab->b_l2hdr; 4440 ab->b_l2hdr = NULL; 4441 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4442 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4443 } 4444 4445 /* 4446 * Allow ARC to begin reads to this L2ARC entry. 
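	 * ARC_L2_WRITING was set on this header when it was queued for
	 * the device write; clearing it below is what makes the L2ARC
	 * copy visible to readers again.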
4447 */ 4448 ab->b_flags &= ~ARC_L2_WRITING; 4449 4450 mutex_exit(hash_lock); 4451 } 4452 4453 atomic_inc_64(&l2arc_writes_done); 4454 list_remove(buflist, head); 4455 kmem_cache_free(hdr_cache, head); 4456 mutex_exit(&l2arc_buflist_mtx); 4457 4458 l2arc_do_free_on_write(); 4459 4460 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4461} 4462 4463/* 4464 * A read to a cache device completed. Validate buffer contents before 4465 * handing over to the regular ARC routines. 4466 */ 4467static void 4468l2arc_read_done(zio_t *zio) 4469{ 4470 l2arc_read_callback_t *cb; 4471 arc_buf_hdr_t *hdr; 4472 arc_buf_t *buf; 4473 kmutex_t *hash_lock; 4474 int equal; 4475 4476 ASSERT(zio->io_vd != NULL); 4477 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4478 4479 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4480 4481 cb = zio->io_private; 4482 ASSERT(cb != NULL); 4483 buf = cb->l2rcb_buf; 4484 ASSERT(buf != NULL); 4485 4486 hash_lock = HDR_LOCK(buf->b_hdr); 4487 mutex_enter(hash_lock); 4488 hdr = buf->b_hdr; 4489 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4490 4491 /* 4492 * Check this survived the L2ARC journey. 4493 */ 4494 equal = arc_cksum_equal(buf); 4495 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4496 mutex_exit(hash_lock); 4497 zio->io_private = buf; 4498 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 4499 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 4500 arc_read_done(zio); 4501 } else { 4502 mutex_exit(hash_lock); 4503 /* 4504 * Buffer didn't survive caching. Increment stats and 4505 * reissue to the original storage device. 4506 */ 4507 if (zio->io_error != 0) { 4508 ARCSTAT_BUMP(arcstat_l2_io_error); 4509 } else { 4510 zio->io_error = EIO; 4511 } 4512 if (!equal) 4513 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4514 4515 /* 4516 * If there's no waiter, issue an async i/o to the primary 4517 * storage now. If there *is* a waiter, the caller must 4518 * issue the i/o in a context where it's OK to block. 4519 */ 4520 if (zio->io_waiter == NULL) { 4521 zio_t *pio = zio_unique_parent(zio); 4522 4523 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 4524 4525 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4526 buf->b_data, zio->io_size, arc_read_done, buf, 4527 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4528 } 4529 } 4530 4531 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4532} 4533 4534/* 4535 * This is the list priority from which the L2ARC will search for pages to 4536 * cache. This is used within loops (0..3) to cycle through lists in the 4537 * desired order. This order can have a significant effect on cache 4538 * performance. 4539 * 4540 * Currently the metadata lists are hit first, MFU then MRU, followed by 4541 * the data lists. This function returns a locked list, and also returns 4542 * the lock pointer. 4543 */ 4544static list_t * 4545l2arc_list_locked(int list_num, kmutex_t **lock) 4546{
| 3449 3450 /* 3451 * Do we have more than one buf? 3452 */ 3453 if (hdr->b_datacnt > 1) { 3454 arc_buf_hdr_t *nhdr; 3455 arc_buf_t **bufp; 3456 uint64_t blksz = hdr->b_size; 3457 uint64_t spa = hdr->b_spa; 3458 arc_buf_contents_t type = hdr->b_type; 3459 uint32_t flags = hdr->b_flags; 3460 3461 ASSERT(hdr->b_buf != buf || buf->b_next != NULL); 3462 /* 3463 * Pull the data off of this hdr and attach it to 3464 * a new anonymous hdr. 3465 */ 3466 (void) remove_reference(hdr, hash_lock, tag); 3467 bufp = &hdr->b_buf; 3468 while (*bufp != buf) 3469 bufp = &(*bufp)->b_next; 3470 *bufp = buf->b_next; 3471 buf->b_next = NULL; 3472 3473 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3474 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3475 if (refcount_is_zero(&hdr->b_refcnt)) { 3476 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3477 ASSERT3U(*size, >=, hdr->b_size); 3478 atomic_add_64(size, -hdr->b_size); 3479 } 3480 3481 /* 3482 * We're releasing a duplicate user data buffer, update 3483 * our statistics accordingly. 3484 */ 3485 if (hdr->b_type == ARC_BUFC_DATA) { 3486 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 3487 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 3488 -hdr->b_size); 3489 } 3490 hdr->b_datacnt -= 1; 3491 arc_cksum_verify(buf); 3492#ifdef illumos 3493 arc_buf_unwatch(buf); 3494#endif /* illumos */ 3495 3496 mutex_exit(hash_lock); 3497 3498 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3499 nhdr->b_size = blksz; 3500 nhdr->b_spa = spa; 3501 nhdr->b_type = type; 3502 nhdr->b_buf = buf; 3503 nhdr->b_state = arc_anon; 3504 nhdr->b_arc_access = 0; 3505 nhdr->b_flags = flags & ARC_L2_WRITING; 3506 nhdr->b_l2hdr = NULL; 3507 nhdr->b_datacnt = 1; 3508 nhdr->b_freeze_cksum = NULL; 3509 (void) refcount_add(&nhdr->b_refcnt, tag); 3510 buf->b_hdr = nhdr; 3511 mutex_exit(&buf->b_evict_lock); 3512 atomic_add_64(&arc_anon->arcs_size, blksz); 3513 } else { 3514 mutex_exit(&buf->b_evict_lock); 3515 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3516 ASSERT(!list_link_active(&hdr->b_arc_node)); 3517 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3518 if (hdr->b_state != arc_anon) 3519 arc_change_state(arc_anon, hdr, hash_lock); 3520 hdr->b_arc_access = 0; 3521 if (hash_lock) 3522 mutex_exit(hash_lock); 3523 3524 buf_discard_identity(hdr); 3525 arc_buf_thaw(buf); 3526 } 3527 buf->b_efunc = NULL; 3528 buf->b_private = NULL; 3529 3530 if (l2hdr) { 3531 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3532 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3533 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3534 mutex_exit(&l2arc_buflist_mtx); 3535 } 3536} 3537 3538int 3539arc_released(arc_buf_t *buf) 3540{ 3541 int released; 3542 3543 mutex_enter(&buf->b_evict_lock); 3544 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3545 mutex_exit(&buf->b_evict_lock); 3546 return (released); 3547} 3548 3549int 3550arc_has_callback(arc_buf_t *buf) 3551{ 3552 int callback; 3553 3554 mutex_enter(&buf->b_evict_lock); 3555 callback = (buf->b_efunc != NULL); 3556 mutex_exit(&buf->b_evict_lock); 3557 return (callback); 3558} 3559 3560#ifdef ZFS_DEBUG 3561int 3562arc_referenced(arc_buf_t *buf) 3563{ 3564 int referenced; 3565 3566 mutex_enter(&buf->b_evict_lock); 3567 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3568 mutex_exit(&buf->b_evict_lock); 3569 return (referenced); 3570} 3571#endif 3572 3573static void 3574arc_write_ready(zio_t *zio) 3575{ 3576 arc_write_callback_t *callback = zio->io_private; 3577 arc_buf_t *buf = callback->awcb_buf; 3578 arc_buf_hdr_t *hdr = buf->b_hdr; 3579 3580 
ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3581 callback->awcb_ready(zio, buf, callback->awcb_private); 3582 3583 /* 3584 * If the IO is already in progress, then this is a re-write 3585 * attempt, so we need to thaw and re-compute the cksum. 3586 * It is the responsibility of the callback to handle the 3587 * accounting for any re-write attempt. 3588 */ 3589 if (HDR_IO_IN_PROGRESS(hdr)) { 3590 mutex_enter(&hdr->b_freeze_lock); 3591 if (hdr->b_freeze_cksum != NULL) { 3592 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3593 hdr->b_freeze_cksum = NULL; 3594 } 3595 mutex_exit(&hdr->b_freeze_lock); 3596 } 3597 arc_cksum_compute(buf, B_FALSE); 3598 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3599} 3600 3601static void 3602arc_write_done(zio_t *zio) 3603{ 3604 arc_write_callback_t *callback = zio->io_private; 3605 arc_buf_t *buf = callback->awcb_buf; 3606 arc_buf_hdr_t *hdr = buf->b_hdr; 3607 3608 ASSERT(hdr->b_acb == NULL); 3609 3610 if (zio->io_error == 0) { 3611 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3612 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3613 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3614 } else { 3615 ASSERT(BUF_EMPTY(hdr)); 3616 } 3617 3618 /* 3619 * If the block to be written was all-zero, we may have 3620 * compressed it away. In this case no write was performed 3621 * so there will be no dva/birth/checksum. The buffer must 3622 * therefore remain anonymous (and uncached). 3623 */ 3624 if (!BUF_EMPTY(hdr)) { 3625 arc_buf_hdr_t *exists; 3626 kmutex_t *hash_lock; 3627 3628 ASSERT(zio->io_error == 0); 3629 3630 arc_cksum_verify(buf); 3631 3632 exists = buf_hash_insert(hdr, &hash_lock); 3633 if (exists) { 3634 /* 3635 * This can only happen if we overwrite for 3636 * sync-to-convergence, because we remove 3637 * buffers from the hash table when we arc_free(). 
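			 * (A later sync pass of the same txg can rewrite a
			 * block that an earlier pass already wrote, so the
			 * header from the earlier pass may still be present
			 * in the hash table.)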
3638 */ 3639 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3640 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3641 panic("bad overwrite, hdr=%p exists=%p", 3642 (void *)hdr, (void *)exists); 3643 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3644 arc_change_state(arc_anon, exists, hash_lock); 3645 mutex_exit(hash_lock); 3646 arc_hdr_destroy(exists); 3647 exists = buf_hash_insert(hdr, &hash_lock); 3648 ASSERT3P(exists, ==, NULL); 3649 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 3650 /* nopwrite */ 3651 ASSERT(zio->io_prop.zp_nopwrite); 3652 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3653 panic("bad nopwrite, hdr=%p exists=%p", 3654 (void *)hdr, (void *)exists); 3655 } else { 3656 /* Dedup */ 3657 ASSERT(hdr->b_datacnt == 1); 3658 ASSERT(hdr->b_state == arc_anon); 3659 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3660 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3661 } 3662 } 3663 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3664 /* if it's not anon, we are doing a scrub */ 3665 if (!exists && hdr->b_state == arc_anon) 3666 arc_access(hdr, hash_lock); 3667 mutex_exit(hash_lock); 3668 } else { 3669 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3670 } 3671 3672 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3673 callback->awcb_done(zio, buf, callback->awcb_private); 3674 3675 kmem_free(callback, sizeof (arc_write_callback_t)); 3676} 3677 3678zio_t * 3679arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 3680 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, 3681 arc_done_func_t *ready, arc_done_func_t *done, void *private, 3682 int priority, int zio_flags, const zbookmark_t *zb) 3683{ 3684 arc_buf_hdr_t *hdr = buf->b_hdr; 3685 arc_write_callback_t *callback; 3686 zio_t *zio; 3687 3688 ASSERT(ready != NULL); 3689 ASSERT(done != NULL); 3690 ASSERT(!HDR_IO_ERROR(hdr)); 3691 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3692 ASSERT(hdr->b_acb == NULL); 3693 if (l2arc) 3694 hdr->b_flags |= ARC_L2CACHE; 3695 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3696 callback->awcb_ready = ready; 3697 callback->awcb_done = done; 3698 callback->awcb_private = private; 3699 callback->awcb_buf = buf; 3700 3701 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3702 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 3703 3704 return (zio); 3705} 3706 3707static int 3708arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) 3709{ 3710#ifdef _KERNEL 3711 uint64_t available_memory = 3712 ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count); 3713 static uint64_t page_load = 0; 3714 static uint64_t last_txg = 0; 3715 3716#ifdef sun 3717#if defined(__i386) 3718 available_memory = 3719 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3720#endif 3721#endif /* sun */ 3722 if (available_memory >= zfs_write_limit_max) 3723 return (0); 3724 3725 if (txg > last_txg) { 3726 last_txg = txg; 3727 page_load = 0; 3728 } 3729 /* 3730 * If we are in pageout, we know that memory is already tight, 3731 * the arc is already going to be evicting, so we just want to 3732 * continue to let page writes occur as quickly as possible. 
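	 * Once the page writes queued in this txg (page_load) exceed a
	 * quarter of the remaining free memory, ERESTART is returned below
	 * so that pageout backs off and retries instead of queueing more.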
3733 */ 3734 if (curproc == pageproc) { 3735 if (page_load > available_memory / 4) 3736 return (ERESTART); 3737 /* Note: reserve is inflated, so we deflate */ 3738 page_load += reserve / 8; 3739 return (0); 3740 } else if (page_load > 0 && arc_reclaim_needed()) { 3741 /* memory is low, delay before restarting */ 3742 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3743 return (EAGAIN); 3744 } 3745 page_load = 0; 3746 3747 if (arc_size > arc_c_min) { 3748 uint64_t evictable_memory = 3749 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3750 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3751 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3752 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3753 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3754 } 3755 3756 if (inflight_data > available_memory / 4) { 3757 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3758 return (ERESTART); 3759 } 3760#endif 3761 return (0); 3762} 3763 3764void 3765arc_tempreserve_clear(uint64_t reserve) 3766{ 3767 atomic_add_64(&arc_tempreserve, -reserve); 3768 ASSERT((int64_t)arc_tempreserve >= 0); 3769} 3770 3771int 3772arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3773{ 3774 int error; 3775 uint64_t anon_size; 3776 3777#ifdef ZFS_DEBUG 3778 /* 3779 * Once in a while, fail for no reason. Everything should cope. 3780 */ 3781 if (spa_get_random(10000) == 0) { 3782 dprintf("forcing random failure\n"); 3783 return (ERESTART); 3784 } 3785#endif 3786 if (reserve > arc_c/4 && !arc_no_grow) 3787 arc_c = MIN(arc_c_max, reserve * 4); 3788 if (reserve > arc_c) 3789 return (ENOMEM); 3790 3791 /* 3792 * Don't count loaned bufs as in flight dirty data to prevent long 3793 * network delays from blocking transactions that are ready to be 3794 * assigned to a txg. 3795 */ 3796 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3797 3798 /* 3799 * Writes will, almost always, require additional memory allocations 3800 * in order to compress/encrypt/etc the data. We therefor need to 3801 * make sure that there is sufficient available memory for this. 3802 */ 3803 if (error = arc_memory_throttle(reserve, anon_size, txg)) 3804 return (error); 3805 3806 /* 3807 * Throttle writes when the amount of dirty data in the cache 3808 * gets too large. We try to keep the cache less than half full 3809 * of dirty blocks so that our sync times don't grow too large. 3810 * Note: if two requests come in concurrently, we might let them 3811 * both succeed, when one of them should fail. Not a huge deal. 3812 */ 3813 3814 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3815 anon_size > arc_c / 4) { 3816 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3817 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3818 arc_tempreserve>>10, 3819 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3820 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3821 reserve>>10, arc_c>>10); 3822 return (ERESTART); 3823 } 3824 atomic_add_64(&arc_tempreserve, reserve); 3825 return (0); 3826} 3827 3828static kmutex_t arc_lowmem_lock; 3829#ifdef _KERNEL 3830static eventhandler_tag arc_event_lowmem = NULL; 3831 3832static void 3833arc_lowmem(void *arg __unused, int howto __unused) 3834{ 3835 3836 /* Serialize access via arc_lowmem_lock. */ 3837 mutex_enter(&arc_lowmem_lock); 3838 mutex_enter(&arc_reclaim_thr_lock); 3839 needfree = 1; 3840 cv_signal(&arc_reclaim_thr_cv); 3841 3842 /* 3843 * It is unsafe to block here in arbitrary threads, because we can come 3844 * here from ARC itself and may hold ARC locks and thus risk a deadlock 3845 * with ARC reclaim thread. 
3846 */ 3847 if (curproc == pageproc) { 3848 while (needfree) 3849 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 3850 } 3851 mutex_exit(&arc_reclaim_thr_lock); 3852 mutex_exit(&arc_lowmem_lock); 3853} 3854#endif 3855 3856void 3857arc_init(void) 3858{ 3859 int i, prefetch_tunable_set = 0; 3860 3861 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3862 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3863 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 3864 3865 /* Convert seconds to clock ticks */ 3866 arc_min_prefetch_lifespan = 1 * hz; 3867 3868 /* Start out with 1/8 of all memory */ 3869 arc_c = kmem_size() / 8; 3870 3871#ifdef sun 3872#ifdef _KERNEL 3873 /* 3874 * On architectures where the physical memory can be larger 3875 * than the addressable space (intel in 32-bit mode), we may 3876 * need to limit the cache to 1/8 of VM size. 3877 */ 3878 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3879#endif 3880#endif /* sun */ 3881 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 3882 arc_c_min = MAX(arc_c / 4, 64<<18); 3883 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 3884 if (arc_c * 8 >= 1<<30) 3885 arc_c_max = (arc_c * 8) - (1<<30); 3886 else 3887 arc_c_max = arc_c_min; 3888 arc_c_max = MAX(arc_c * 5, arc_c_max); 3889 3890#ifdef _KERNEL 3891 /* 3892 * Allow the tunables to override our calculations if they are 3893 * reasonable (ie. over 16MB) 3894 */ 3895 if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size()) 3896 arc_c_max = zfs_arc_max; 3897 if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max) 3898 arc_c_min = zfs_arc_min; 3899#endif 3900 3901 arc_c = arc_c_max; 3902 arc_p = (arc_c >> 1); 3903 3904 /* limit meta-data to 1/4 of the arc capacity */ 3905 arc_meta_limit = arc_c_max / 4; 3906 3907 /* Allow the tunable to override if it is reasonable */ 3908 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3909 arc_meta_limit = zfs_arc_meta_limit; 3910 3911 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3912 arc_c_min = arc_meta_limit / 2; 3913 3914 if (zfs_arc_grow_retry > 0) 3915 arc_grow_retry = zfs_arc_grow_retry; 3916 3917 if (zfs_arc_shrink_shift > 0) 3918 arc_shrink_shift = zfs_arc_shrink_shift; 3919 3920 if (zfs_arc_p_min_shift > 0) 3921 arc_p_min_shift = zfs_arc_p_min_shift; 3922 3923 /* if kmem_flags are set, lets try to use less memory */ 3924 if (kmem_debugging()) 3925 arc_c = arc_c / 2; 3926 if (arc_c < arc_c_min) 3927 arc_c = arc_c_min; 3928 3929 zfs_arc_min = arc_c_min; 3930 zfs_arc_max = arc_c_max; 3931 3932 arc_anon = &ARC_anon; 3933 arc_mru = &ARC_mru; 3934 arc_mru_ghost = &ARC_mru_ghost; 3935 arc_mfu = &ARC_mfu; 3936 arc_mfu_ghost = &ARC_mfu_ghost; 3937 arc_l2c_only = &ARC_l2c_only; 3938 arc_size = 0; 3939 3940 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 3941 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 3942 NULL, MUTEX_DEFAULT, NULL); 3943 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 3944 NULL, MUTEX_DEFAULT, NULL); 3945 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 3946 NULL, MUTEX_DEFAULT, NULL); 3947 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 3948 NULL, MUTEX_DEFAULT, NULL); 3949 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 3950 NULL, MUTEX_DEFAULT, NULL); 3951 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 3952 NULL, MUTEX_DEFAULT, NULL); 3953 3954 list_create(&arc_mru->arcs_lists[i], 3955 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3956 list_create(&arc_mru_ghost->arcs_lists[i], 3957 sizeof 
(arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3958 list_create(&arc_mfu->arcs_lists[i], 3959 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3960 list_create(&arc_mfu_ghost->arcs_lists[i], 3961 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3962 list_create(&arc_mfu_ghost->arcs_lists[i], 3963 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3964 list_create(&arc_l2c_only->arcs_lists[i], 3965 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3966 } 3967 3968 buf_init(); 3969 3970 arc_thread_exit = 0; 3971 arc_eviction_list = NULL; 3972 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3973 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3974 3975 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3976 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3977 3978 if (arc_ksp != NULL) { 3979 arc_ksp->ks_data = &arc_stats; 3980 kstat_install(arc_ksp); 3981 } 3982 3983 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3984 TS_RUN, minclsyspri); 3985 3986#ifdef _KERNEL 3987 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 3988 EVENTHANDLER_PRI_FIRST); 3989#endif 3990 3991 arc_dead = FALSE; 3992 arc_warm = B_FALSE; 3993 3994 if (zfs_write_limit_max == 0) 3995 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 3996 else 3997 zfs_write_limit_shift = 0; 3998 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 3999 4000#ifdef _KERNEL 4001 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4002 prefetch_tunable_set = 1; 4003 4004#ifdef __i386__ 4005 if (prefetch_tunable_set == 0) { 4006 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4007 "-- to enable,\n"); 4008 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4009 "to /boot/loader.conf.\n"); 4010 zfs_prefetch_disable = 1; 4011 } 4012#else 4013 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4014 prefetch_tunable_set == 0) { 4015 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4016 "than 4GB of RAM is present;\n" 4017 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4018 "to /boot/loader.conf.\n"); 4019 zfs_prefetch_disable = 1; 4020 } 4021#endif 4022 /* Warn about ZFS memory and address space requirements. 
*/ 4023 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4024 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4025 "expect unstable behavior.\n"); 4026 } 4027 if (kmem_size() < 512 * (1 << 20)) { 4028 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4029 "expect unstable behavior.\n"); 4030 printf(" Consider tuning vm.kmem_size and " 4031 "vm.kmem_size_max\n"); 4032 printf(" in /boot/loader.conf.\n"); 4033 } 4034#endif 4035} 4036 4037void 4038arc_fini(void) 4039{ 4040 int i; 4041 4042 mutex_enter(&arc_reclaim_thr_lock); 4043 arc_thread_exit = 1; 4044 cv_signal(&arc_reclaim_thr_cv); 4045 while (arc_thread_exit != 0) 4046 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4047 mutex_exit(&arc_reclaim_thr_lock); 4048 4049 arc_flush(NULL); 4050 4051 arc_dead = TRUE; 4052 4053 if (arc_ksp != NULL) { 4054 kstat_delete(arc_ksp); 4055 arc_ksp = NULL; 4056 } 4057 4058 mutex_destroy(&arc_eviction_mtx); 4059 mutex_destroy(&arc_reclaim_thr_lock); 4060 cv_destroy(&arc_reclaim_thr_cv); 4061 4062 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4063 list_destroy(&arc_mru->arcs_lists[i]); 4064 list_destroy(&arc_mru_ghost->arcs_lists[i]); 4065 list_destroy(&arc_mfu->arcs_lists[i]); 4066 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 4067 list_destroy(&arc_l2c_only->arcs_lists[i]); 4068 4069 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 4070 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 4071 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 4072 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 4073 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 4074 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 4075 } 4076 4077 mutex_destroy(&zfs_write_limit_lock); 4078 4079 buf_fini(); 4080 4081 ASSERT(arc_loaned_bytes == 0); 4082 4083 mutex_destroy(&arc_lowmem_lock); 4084#ifdef _KERNEL 4085 if (arc_event_lowmem != NULL) 4086 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4087#endif 4088} 4089 4090/* 4091 * Level 2 ARC 4092 * 4093 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4094 * It uses dedicated storage devices to hold cached data, which are populated 4095 * using large infrequent writes. The main role of this cache is to boost 4096 * the performance of random read workloads. The intended L2ARC devices 4097 * include short-stroked disks, solid state disks, and other media with 4098 * substantially faster read latency than disk. 4099 * 4100 * +-----------------------+ 4101 * | ARC | 4102 * +-----------------------+ 4103 * | ^ ^ 4104 * | | | 4105 * l2arc_feed_thread() arc_read() 4106 * | | | 4107 * | l2arc read | 4108 * V | | 4109 * +---------------+ | 4110 * | L2ARC | | 4111 * +---------------+ | 4112 * | ^ | 4113 * l2arc_write() | | 4114 * | | | 4115 * V | | 4116 * +-------+ +-------+ 4117 * | vdev | | vdev | 4118 * | cache | | cache | 4119 * +-------+ +-------+ 4120 * +=========+ .-----. 4121 * : L2ARC : |-_____-| 4122 * : devices : | Disks | 4123 * +=========+ `-_____-' 4124 * 4125 * Read requests are satisfied from the following sources, in order: 4126 * 4127 * 1) ARC 4128 * 2) vdev cache of L2ARC devices 4129 * 3) L2ARC devices 4130 * 4) vdev cache of disks 4131 * 5) disks 4132 * 4133 * Some L2ARC device types exhibit extremely slow write performance. 4134 * To accommodate for this there are some significant differences between 4135 * the L2ARC and traditional cache design: 4136 * 4137 * 1. There is no eviction path from the ARC to the L2ARC. 
Evictions from 4138 * the ARC behave as usual, freeing buffers and placing headers on ghost 4139 * lists. The ARC does not send buffers to the L2ARC during eviction as 4140 * this would add inflated write latencies for all ARC memory pressure. 4141 * 4142 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 4143 * It does this by periodically scanning buffers from the eviction-end of 4144 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 4145 * not already there. It scans until a headroom of buffers is satisfied, 4146 * which itself is a buffer for ARC eviction. The thread that does this is 4147 * l2arc_feed_thread(), illustrated below; example sizes are included to 4148 * provide a better sense of ratio than this diagram: 4149 * 4150 * head --> tail 4151 * +---------------------+----------+ 4152 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 4153 * +---------------------+----------+ | o L2ARC eligible 4154 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 4155 * +---------------------+----------+ | 4156 * 15.9 Gbytes ^ 32 Mbytes | 4157 * headroom | 4158 * l2arc_feed_thread() 4159 * | 4160 * l2arc write hand <--[oooo]--' 4161 * | 8 Mbyte 4162 * | write max 4163 * V 4164 * +==============================+ 4165 * L2ARC dev |####|#|###|###| |####| ... | 4166 * +==============================+ 4167 * 32 Gbytes 4168 * 4169 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 4170 * evicted, then the L2ARC has cached a buffer much sooner than it probably 4171 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 4172 * safe to say that this is an uncommon case, since buffers at the end of 4173 * the ARC lists have moved there due to inactivity. 4174 * 4175 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 4176 * then the L2ARC simply misses copying some buffers. This serves as a 4177 * pressure valve to prevent heavy read workloads from both stalling the ARC 4178 * with waits and clogging the L2ARC with writes. This also helps prevent 4179 * the potential for the L2ARC to churn if it attempts to cache content too 4180 * quickly, such as during backups of the entire pool. 4181 * 4182 * 5. After system boot and before the ARC has filled main memory, there are 4183 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 4184 * lists can remain mostly static. Instead of searching from tail of these 4185 * lists as pictured, the l2arc_feed_thread() will search from the list heads 4186 * for eligible buffers, greatly increasing its chance of finding them. 4187 * 4188 * The L2ARC device write speed is also boosted during this time so that 4189 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 4190 * there are no L2ARC reads, and no fear of degrading read performance 4191 * through increased writes. 4192 * 4193 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 4194 * the vdev queue can aggregate them into larger and fewer writes. Each 4195 * device is written to in a rotor fashion, sweeping writes through 4196 * available space then repeating. 4197 * 4198 * 7. The L2ARC does not store dirty content. It never needs to flush 4199 * write buffers back to disk based storage. 4200 * 4201 * 8. If an ARC buffer is written (and dirtied) which also exists in the 4202 * L2ARC, the now stale L2ARC buffer is immediately dropped. 
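 *    That teardown happens in arc_release(), when the buffer is made
 *    anonymous in preparation for the re-write.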
4203 * 4204 * The performance of the L2ARC can be tweaked by a number of tunables, which 4205 * may be necessary for different workloads: 4206 * 4207 * l2arc_write_max max write bytes per interval 4208 * l2arc_write_boost extra write bytes during device warmup 4209 * l2arc_noprefetch skip caching prefetched buffers 4210 * l2arc_headroom number of max device writes to precache 4211 * l2arc_feed_secs seconds between L2ARC writing 4212 * 4213 * Tunables may be removed or added as future performance improvements are 4214 * integrated, and also may become zpool properties. 4215 * 4216 * There are three key functions that control how the L2ARC warms up: 4217 * 4218 * l2arc_write_eligible() check if a buffer is eligible to cache 4219 * l2arc_write_size() calculate how much to write 4220 * l2arc_write_interval() calculate sleep delay between writes 4221 * 4222 * These three functions determine what to write, how much, and how quickly 4223 * to send writes. 4224 */ 4225 4226static boolean_t 4227l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 4228{ 4229 /* 4230 * A buffer is *not* eligible for the L2ARC if it: 4231 * 1. belongs to a different spa. 4232 * 2. is already cached on the L2ARC. 4233 * 3. has an I/O in progress (it may be an incomplete read). 4234 * 4. is flagged not eligible (zfs property). 4235 */ 4236 if (ab->b_spa != spa_guid) { 4237 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 4238 return (B_FALSE); 4239 } 4240 if (ab->b_l2hdr != NULL) { 4241 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 4242 return (B_FALSE); 4243 } 4244 if (HDR_IO_IN_PROGRESS(ab)) { 4245 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 4246 return (B_FALSE); 4247 } 4248 if (!HDR_L2CACHE(ab)) { 4249 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 4250 return (B_FALSE); 4251 } 4252 4253 return (B_TRUE); 4254} 4255 4256static uint64_t 4257l2arc_write_size(l2arc_dev_t *dev) 4258{ 4259 uint64_t size; 4260 4261 size = dev->l2ad_write; 4262 4263 if (arc_warm == B_FALSE) 4264 size += dev->l2ad_boost; 4265 4266 return (size); 4267 4268} 4269 4270static clock_t 4271l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4272{ 4273 clock_t interval, next, now; 4274 4275 /* 4276 * If the ARC lists are busy, increase our write rate; if the 4277 * lists are stale, idle back. This is achieved by checking 4278 * how much we previously wrote - if it was more than half of 4279 * what we wanted, schedule the next write much sooner. 4280 */ 4281 if (l2arc_feed_again && wrote > (wanted / 2)) 4282 interval = (hz * l2arc_feed_min_ms) / 1000; 4283 else 4284 interval = hz * l2arc_feed_secs; 4285 4286 now = ddi_get_lbolt(); 4287 next = MAX(now, MIN(now + interval, began + interval)); 4288 4289 return (next); 4290} 4291 4292static void 4293l2arc_hdr_stat_add(void) 4294{ 4295 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4296 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4297} 4298 4299static void 4300l2arc_hdr_stat_remove(void) 4301{ 4302 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4303 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4304} 4305 4306/* 4307 * Cycle through L2ARC devices. This is how L2ARC load balances. 4308 * If a device is returned, this also returns holding the spa config lock. 4309 */ 4310static l2arc_dev_t * 4311l2arc_dev_get_next(void) 4312{ 4313 l2arc_dev_t *first, *next = NULL; 4314 4315 /* 4316 * Lock out the removal of spas (spa_namespace_lock), then removal 4317 * of cache devices (l2arc_dev_mtx). 
Once a device has been selected, 4318 * both locks will be dropped and a spa config lock held instead. 4319 */ 4320 mutex_enter(&spa_namespace_lock); 4321 mutex_enter(&l2arc_dev_mtx); 4322 4323 /* if there are no vdevs, there is nothing to do */ 4324 if (l2arc_ndev == 0) 4325 goto out; 4326 4327 first = NULL; 4328 next = l2arc_dev_last; 4329 do { 4330 /* loop around the list looking for a non-faulted vdev */ 4331 if (next == NULL) { 4332 next = list_head(l2arc_dev_list); 4333 } else { 4334 next = list_next(l2arc_dev_list, next); 4335 if (next == NULL) 4336 next = list_head(l2arc_dev_list); 4337 } 4338 4339 /* if we have come back to the start, bail out */ 4340 if (first == NULL) 4341 first = next; 4342 else if (next == first) 4343 break; 4344 4345 } while (vdev_is_dead(next->l2ad_vdev)); 4346 4347 /* if we were unable to find any usable vdevs, return NULL */ 4348 if (vdev_is_dead(next->l2ad_vdev)) 4349 next = NULL; 4350 4351 l2arc_dev_last = next; 4352 4353out: 4354 mutex_exit(&l2arc_dev_mtx); 4355 4356 /* 4357 * Grab the config lock to prevent the 'next' device from being 4358 * removed while we are writing to it. 4359 */ 4360 if (next != NULL) 4361 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4362 mutex_exit(&spa_namespace_lock); 4363 4364 return (next); 4365} 4366 4367/* 4368 * Free buffers that were tagged for destruction. 4369 */ 4370static void 4371l2arc_do_free_on_write() 4372{ 4373 list_t *buflist; 4374 l2arc_data_free_t *df, *df_prev; 4375 4376 mutex_enter(&l2arc_free_on_write_mtx); 4377 buflist = l2arc_free_on_write; 4378 4379 for (df = list_tail(buflist); df; df = df_prev) { 4380 df_prev = list_prev(buflist, df); 4381 ASSERT(df->l2df_data != NULL); 4382 ASSERT(df->l2df_func != NULL); 4383 df->l2df_func(df->l2df_data, df->l2df_size); 4384 list_remove(buflist, df); 4385 kmem_free(df, sizeof (l2arc_data_free_t)); 4386 } 4387 4388 mutex_exit(&l2arc_free_on_write_mtx); 4389} 4390 4391/* 4392 * A write to a cache device has completed. Update all headers to allow 4393 * reads from these buffers to begin. 4394 */ 4395static void 4396l2arc_write_done(zio_t *zio) 4397{ 4398 l2arc_write_callback_t *cb; 4399 l2arc_dev_t *dev; 4400 list_t *buflist; 4401 arc_buf_hdr_t *head, *ab, *ab_prev; 4402 l2arc_buf_hdr_t *abl2; 4403 kmutex_t *hash_lock; 4404 4405 cb = zio->io_private; 4406 ASSERT(cb != NULL); 4407 dev = cb->l2wcb_dev; 4408 ASSERT(dev != NULL); 4409 head = cb->l2wcb_head; 4410 ASSERT(head != NULL); 4411 buflist = dev->l2ad_buflist; 4412 ASSERT(buflist != NULL); 4413 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 4414 l2arc_write_callback_t *, cb); 4415 4416 if (zio->io_error != 0) 4417 ARCSTAT_BUMP(arcstat_l2_writes_error); 4418 4419 mutex_enter(&l2arc_buflist_mtx); 4420 4421 /* 4422 * All writes completed, or an error was hit. 4423 */ 4424 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4425 ab_prev = list_prev(buflist, ab); 4426 4427 hash_lock = HDR_LOCK(ab); 4428 if (!mutex_tryenter(hash_lock)) { 4429 /* 4430 * This buffer misses out. It may be in a stage 4431 * of eviction. Its ARC_L2_WRITING flag will be 4432 * left set, denying reads to this buffer. 4433 */ 4434 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4435 continue; 4436 } 4437 4438 if (zio->io_error != 0) { 4439 /* 4440 * Error - drop L2ARC entry. 4441 */ 4442 list_remove(buflist, ab); 4443 abl2 = ab->b_l2hdr; 4444 ab->b_l2hdr = NULL; 4445 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4446 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4447 } 4448 4449 /* 4450 * Allow ARC to begin reads to this L2ARC entry. 
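		 * While ARC_L2_WRITING was set, arc_read() would bypass the
		 * partially written L2ARC copy and read from the original
		 * device instead.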
4451 */ 4452 ab->b_flags &= ~ARC_L2_WRITING; 4453 4454 mutex_exit(hash_lock); 4455 } 4456 4457 atomic_inc_64(&l2arc_writes_done); 4458 list_remove(buflist, head); 4459 kmem_cache_free(hdr_cache, head); 4460 mutex_exit(&l2arc_buflist_mtx); 4461 4462 l2arc_do_free_on_write(); 4463 4464 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4465} 4466 4467/* 4468 * A read to a cache device completed. Validate buffer contents before 4469 * handing over to the regular ARC routines. 4470 */ 4471static void 4472l2arc_read_done(zio_t *zio) 4473{ 4474 l2arc_read_callback_t *cb; 4475 arc_buf_hdr_t *hdr; 4476 arc_buf_t *buf; 4477 kmutex_t *hash_lock; 4478 int equal; 4479 4480 ASSERT(zio->io_vd != NULL); 4481 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4482 4483 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4484 4485 cb = zio->io_private; 4486 ASSERT(cb != NULL); 4487 buf = cb->l2rcb_buf; 4488 ASSERT(buf != NULL); 4489 4490 hash_lock = HDR_LOCK(buf->b_hdr); 4491 mutex_enter(hash_lock); 4492 hdr = buf->b_hdr; 4493 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4494 4495 /* 4496 * Check this survived the L2ARC journey. 4497 */ 4498 equal = arc_cksum_equal(buf); 4499 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4500 mutex_exit(hash_lock); 4501 zio->io_private = buf; 4502 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 4503 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 4504 arc_read_done(zio); 4505 } else { 4506 mutex_exit(hash_lock); 4507 /* 4508 * Buffer didn't survive caching. Increment stats and 4509 * reissue to the original storage device. 4510 */ 4511 if (zio->io_error != 0) { 4512 ARCSTAT_BUMP(arcstat_l2_io_error); 4513 } else { 4514 zio->io_error = EIO; 4515 } 4516 if (!equal) 4517 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4518 4519 /* 4520 * If there's no waiter, issue an async i/o to the primary 4521 * storage now. If there *is* a waiter, the caller must 4522 * issue the i/o in a context where it's OK to block. 4523 */ 4524 if (zio->io_waiter == NULL) { 4525 zio_t *pio = zio_unique_parent(zio); 4526 4527 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 4528 4529 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4530 buf->b_data, zio->io_size, arc_read_done, buf, 4531 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4532 } 4533 } 4534 4535 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4536} 4537 4538/* 4539 * This is the list priority from which the L2ARC will search for pages to 4540 * cache. This is used within loops (0..3) to cycle through lists in the 4541 * desired order. This order can have a significant effect on cache 4542 * performance. 4543 * 4544 * Currently the metadata lists are hit first, MFU then MRU, followed by 4545 * the data lists. This function returns a locked list, and also returns 4546 * the lock pointer. 4547 */ 4548static list_t * 4549l2arc_list_locked(int list_num, kmutex_t **lock) 4550{