Cross Reference: /freebsd-11.0-release/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c

Deleted Added

sdiff udiff text old ( 246666 ) new ( 247187 )

full compact

arc.c (246666)	arc.c (247187)
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 / 21/ 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2011 by Delphix. All rights reserved. 25 / 26 27/ 28 * DVA-based Adjustable Replacement Cache 29 * 30 * While much of the theory of operation used here is 31 * based on the self-tuning, low overhead replacement cache 32 * presented by Megiddo and Modha at FAST 2003, there are some 33 * significant differences: 34 * 35 * 1. The Megiddo and Modha model assumes any page is evictable. 36 * Pages in its cache cannot be "locked" into memory. This makes 37 * the eviction algorithm simple: evict the last page in the list. 38 * This also make the performance characteristics easy to reason 39 * about. Our cache is not so simple. At any given moment, some 40 * subset of the blocks in the cache are un-evictable because we 41 * have handed out a reference to them. Blocks are only evictable 42 * when there are no external references active. This makes 43 * eviction far more problematic: we choose to evict the evictable 44 * blocks that are the "lowest" in the list. 
45 * 46 * There are times when it is not possible to evict the requested 47 * space. In these circumstances we are unable to adjust the cache 48 * size. To prevent the cache growing unbounded at these times we 49 * implement a "cache throttle" that slows the flow of new data 50 * into the cache until we can make space available. 51 * 52 * 2. The Megiddo and Modha model assumes a fixed cache size. 53 * Pages are evicted when the cache is full and there is a cache 54 * miss. Our model has a variable sized cache. It grows with 55 * high use, but also tries to react to memory pressure from the 56 * operating system: decreasing its size when system memory is 57 * tight. 58 * 59 * 3. The Megiddo and Modha model assumes a fixed page size. All 60 * elements of the cache are therefor exactly the same size. So 61 * when adjusting the cache size following a cache miss, its simply 62 * a matter of choosing a single page to evict. In our model, we 63 * have variable sized cache blocks (rangeing from 512 bytes to 64 * 128K bytes). We therefor choose a set of blocks to evict to make 65 * space for a cache miss that approximates as closely as possible 66 * the space used by the new block. 67 * 68 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 69 * by N. Megiddo & D. Modha, FAST 2003 70 / 71 72/ 73 * The locking model: 74 * 75 * A new reference to a cache buffer can be obtained in two 76 * ways: 1) via a hash table lookup using the DVA as a key, 77 * or 2) via one of the ARC lists. The arc_read() interface 78 * uses method 1, while the internal arc algorithms for 79 * adjusting the cache use method 2. We therefor provide two 80 * types of locks: 1) the hash table lock array, and 2) the 81 * arc list locks. 82 * 83 * Buffers do not have their own mutexs, rather they rely on the 84 * hash table mutexs for the bulk of their protection (i.e. most 85 * fields in the arc_buf_hdr_t are protected by these mutexs). 
86 * 87 * buf_hash_find() returns the appropriate mutex (held) when it 88 * locates the requested buffer in the hash table. It returns 89 * NULL for the mutex if the buffer was not in the table. 90 * 91 * buf_hash_remove() expects the appropriate hash mutex to be 92 * already held before it is invoked. 93 * 94 * Each arc state also has a mutex which is used to protect the 95 * buffer list associated with the state. When attempting to 96 * obtain a hash table lock while holding an arc list lock you 97 * must use: mutex_tryenter() to avoid deadlock. Also note that 98 * the active state mutex must be held before the ghost state mutex. 99 * 100 * Arc buffers may have an associated eviction callback function. 101 * This function will be invoked prior to removing the buffer (e.g. 102 * in arc_do_user_evicts()). Note however that the data associated 103 * with the buffer may be evicted prior to the callback. The callback 104 * must be made with no locks held (to prevent deadlock). Additionally, 105 * the users of callbacks must ensure that their private data is 106 * protected from simultaneous callbacks from arc_buf_evict() 107 * and arc_do_user_evicts(). 108 * 109 * Note that the majority of the performance stats are manipulated 110 * with atomic operations. 
111 * 112 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: 113 * 114 * - L2ARC buflist creation 115 * - L2ARC buflist eviction 116 * - L2ARC write completion, which walks L2ARC buflists 117 * - ARC header destruction, as it removes from L2ARC buflists 118 * - ARC header release, as it removes from L2ARC buflists 119 / 120* 121#include <sys/spa.h> 122#include <sys/zio.h> 123#include <sys/zfs_context.h> 124#include <sys/arc.h> 125#include <sys/refcount.h> 126#include <sys/vdev.h> 127#include <sys/vdev_impl.h> 128#ifdef _KERNEL 129#include <sys/dnlc.h> 130#endif 131#include <sys/callb.h> 132#include <sys/kstat.h> 133#include <zfs_fletcher.h> 134#include <sys/sdt.h> 135 136#include <vm/vm_pageout.h> 137 138#ifdef illumos 139#ifndef _KERNEL 140/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers / 141boolean_t arc_watch = B_FALSE; 142int arc_procfd; 143#endif 144#endif / illumos / 145* 146static kmutex_t arc_reclaim_thr_lock; 147static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr / 148static uint8_t arc_thread_exit; 149* 150extern int zfs_write_limit_shift; 151extern uint64_t zfs_write_limit_max; 152extern kmutex_t zfs_write_limit_lock; 153 154#define ARC_REDUCE_DNLC_PERCENT 3 155uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 156 157typedef enum arc_reclaim_strategy { 158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy / 159* ARC_RECLAIM_CONS /* Conservative reclaim strategy / 160} arc_reclaim_strategy_t; 161* 162/* number of seconds before growing cache again / 163static int arc_grow_retry = 60; 164* 165/* shift of arc_c for calculating both min and max arc_p / 166static int arc_p_min_shift = 4; 167* 168/* log2(fraction of arc to reclaim) / 169static int arc_shrink_shift = 5; 170* 171/* 172 * minimum lifespan of a prefetch block in clock ticks 173 * (initialized in arc_init()) 174 / 175static int arc_min_prefetch_lifespan; 176* 177static int arc_dead; 178extern int zfs_prefetch_disable; 179 180/* 181 * 
The arc has filled available memory and has now warmed up. 182 / 183static boolean_t arc_warm; 184* 185/* 186 * These tunables are for performance analysis. 187 / 188uint64_t zfs_arc_max; 189uint64_t zfs_arc_min; 190uint64_t zfs_arc_meta_limit = 0; 191int zfs_arc_grow_retry = 0; 192int zfs_arc_shrink_shift = 0; 193int zfs_arc_p_min_shift = 0; 194int zfs_disable_dup_eviction = 0; 195* 196TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 197TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); 198TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 199SYSCTL_DECL(_vfs_zfs); 200SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 201 "Maximum ARC size"); 202SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 203 "Minimum ARC size"); 204 205/* 206 * Note that buffers can be in one of 6 states: 207 * ARC_anon - anonymous (discussed below) 208 * ARC_mru - recently used, currently cached 209 * ARC_mru_ghost - recentely used, no longer in cache 210 * ARC_mfu - frequently used, currently cached 211 * ARC_mfu_ghost - frequently used, no longer in cache 212 * ARC_l2c_only - exists in L2ARC but not other states 213 * When there are no active references to the buffer, they are 214 * are linked onto a list in one of these arc states. These are 215 * the only buffers that can be evicted or deleted. Within each 216 * state there are multiple lists, one for meta-data and one for 217 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 218 * etc.) is tracked separately so that it can be managed more 219 * explicitly: favored over data, limited explicitly. 220 * 221 * Anonymous buffers are buffers that are not associated with 222 * a DVA. These are buffers that hold dirty block copies 223 * before they are written to stable storage. By definition, 224 * they are "ref'd" and are considered part of arc_mru 225 * that cannot be freed. Generally, they will aquire a DVA 226 * as they are written and migrate onto the arc_mru list. 
227 * 228 * The ARC_l2c_only state is for buffers that are in the second 229 * level ARC but no longer in any of the ARC_m* lists. The second 230 * level ARC itself may also contain buffers that are in any of 231 * the ARC_m* states - meaning that a buffer can exist in two 232 * places. The reason for the ARC_l2c_only state is to keep the 233 * buffer header in the hash table, so that reads that hit the 234 * second level ARC benefit from these fast lookups. 235 / 236* 237#define ARCS_LOCK_PAD CACHE_LINE_SIZE 238struct arcs_lock { 239 kmutex_t arcs_lock; 240#ifdef _KERNEL 241 unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; 242#endif 243}; 244 245/* 246 * must be power of two for mask use to work 247 * 248 / 249#define ARC_BUFC_NUMDATALISTS 16 250#define ARC_BUFC_NUMMETADATALISTS 16 251#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) 252* 253typedef struct arc_state { 254 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data / 255* uint64_t arcs_size; /* total amount of data in this state / 256* list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers / 257* struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); 258} arc_state_t; 259 260#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) 261 262/* The 6 states: / 263static arc_state_t ARC_anon; 264static arc_state_t ARC_mru; 265static arc_state_t ARC_mru_ghost; 266static arc_state_t ARC_mfu; 267static arc_state_t ARC_mfu_ghost; 268static arc_state_t ARC_l2c_only; 269* 270typedef struct arc_stats { 271 kstat_named_t arcstat_hits; 272 kstat_named_t arcstat_misses; 273 kstat_named_t arcstat_demand_data_hits; 274 kstat_named_t arcstat_demand_data_misses; 275 kstat_named_t arcstat_demand_metadata_hits; 276 kstat_named_t arcstat_demand_metadata_misses; 277 kstat_named_t arcstat_prefetch_data_hits; 278 kstat_named_t arcstat_prefetch_data_misses; 279 kstat_named_t arcstat_prefetch_metadata_hits; 280 kstat_named_t 
arcstat_prefetch_metadata_misses; 281 kstat_named_t arcstat_mru_hits; 282 kstat_named_t arcstat_mru_ghost_hits; 283 kstat_named_t arcstat_mfu_hits; 284 kstat_named_t arcstat_mfu_ghost_hits; 285 kstat_named_t arcstat_allocated; 286 kstat_named_t arcstat_deleted; 287 kstat_named_t arcstat_stolen; 288 kstat_named_t arcstat_recycle_miss; 289 kstat_named_t arcstat_mutex_miss; 290 kstat_named_t arcstat_evict_skip; 291 kstat_named_t arcstat_evict_l2_cached; 292 kstat_named_t arcstat_evict_l2_eligible; 293 kstat_named_t arcstat_evict_l2_ineligible; 294 kstat_named_t arcstat_hash_elements; 295 kstat_named_t arcstat_hash_elements_max; 296 kstat_named_t arcstat_hash_collisions; 297 kstat_named_t arcstat_hash_chains; 298 kstat_named_t arcstat_hash_chain_max; 299 kstat_named_t arcstat_p; 300 kstat_named_t arcstat_c; 301 kstat_named_t arcstat_c_min; 302 kstat_named_t arcstat_c_max; 303 kstat_named_t arcstat_size; 304 kstat_named_t arcstat_hdr_size; 305 kstat_named_t arcstat_data_size; 306 kstat_named_t arcstat_other_size; 307 kstat_named_t arcstat_l2_hits; 308 kstat_named_t arcstat_l2_misses; 309 kstat_named_t arcstat_l2_feeds; 310 kstat_named_t arcstat_l2_rw_clash; 311 kstat_named_t arcstat_l2_read_bytes; 312 kstat_named_t arcstat_l2_write_bytes; 313 kstat_named_t arcstat_l2_writes_sent; 314 kstat_named_t arcstat_l2_writes_done; 315 kstat_named_t arcstat_l2_writes_error; 316 kstat_named_t arcstat_l2_writes_hdr_miss; 317 kstat_named_t arcstat_l2_evict_lock_retry; 318 kstat_named_t arcstat_l2_evict_reading; 319 kstat_named_t arcstat_l2_free_on_write; 320 kstat_named_t arcstat_l2_abort_lowmem; 321 kstat_named_t arcstat_l2_cksum_bad; 322 kstat_named_t arcstat_l2_io_error; 323 kstat_named_t arcstat_l2_size; 324 kstat_named_t arcstat_l2_hdr_size; 325 kstat_named_t arcstat_l2_write_trylock_fail; 326 kstat_named_t arcstat_l2_write_passed_headroom; 327 kstat_named_t arcstat_l2_write_spa_mismatch; 328 kstat_named_t arcstat_l2_write_in_l2; 329 kstat_named_t 
arcstat_l2_write_hdr_io_in_progress; 330 kstat_named_t arcstat_l2_write_not_cacheable; 331 kstat_named_t arcstat_l2_write_full; 332 kstat_named_t arcstat_l2_write_buffer_iter; 333 kstat_named_t arcstat_l2_write_pios; 334 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 335 kstat_named_t arcstat_l2_write_buffer_list_iter; 336 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 337 kstat_named_t arcstat_memory_throttle_count; 338 kstat_named_t arcstat_duplicate_buffers; 339 kstat_named_t arcstat_duplicate_buffers_size; 340 kstat_named_t arcstat_duplicate_reads; 341} arc_stats_t; 342 343static arc_stats_t arc_stats = { 344 { "hits", KSTAT_DATA_UINT64 }, 345 { "misses", KSTAT_DATA_UINT64 }, 346 { "demand_data_hits", KSTAT_DATA_UINT64 }, 347 { "demand_data_misses", KSTAT_DATA_UINT64 }, 348 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 349 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 350 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 351 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 352 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 353 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 354 { "mru_hits", KSTAT_DATA_UINT64 }, 355 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 356 { "mfu_hits", KSTAT_DATA_UINT64 }, 357 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 358 { "allocated", KSTAT_DATA_UINT64 }, 359 { "deleted", KSTAT_DATA_UINT64 }, 360 { "stolen", KSTAT_DATA_UINT64 }, 361 { "recycle_miss", KSTAT_DATA_UINT64 }, 362 { "mutex_miss", KSTAT_DATA_UINT64 }, 363 { "evict_skip", KSTAT_DATA_UINT64 }, 364 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 365 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 366 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 367 { "hash_elements", KSTAT_DATA_UINT64 }, 368 { "hash_elements_max", KSTAT_DATA_UINT64 }, 369 { "hash_collisions", KSTAT_DATA_UINT64 }, 370 { "hash_chains", KSTAT_DATA_UINT64 }, 371 { "hash_chain_max", KSTAT_DATA_UINT64 }, 372 { "p", KSTAT_DATA_UINT64 }, 373 { "c", KSTAT_DATA_UINT64 }, 374 { "c_min", KSTAT_DATA_UINT64 }, 375 { 
"c_max", KSTAT_DATA_UINT64 }, 376 { "size", KSTAT_DATA_UINT64 }, 377 { "hdr_size", KSTAT_DATA_UINT64 }, 378 { "data_size", KSTAT_DATA_UINT64 }, 379 { "other_size", KSTAT_DATA_UINT64 }, 380 { "l2_hits", KSTAT_DATA_UINT64 }, 381 { "l2_misses", KSTAT_DATA_UINT64 }, 382 { "l2_feeds", KSTAT_DATA_UINT64 }, 383 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 384 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 385 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 386 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 387 { "l2_writes_done", KSTAT_DATA_UINT64 }, 388 { "l2_writes_error", KSTAT_DATA_UINT64 }, 389 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 390 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 391 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 392 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 393 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 394 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 395 { "l2_io_error", KSTAT_DATA_UINT64 }, 396 { "l2_size", KSTAT_DATA_UINT64 }, 397 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 398 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 399 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 400 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 401 { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 402 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 403 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 404 { "l2_write_full", KSTAT_DATA_UINT64 }, 405 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 406 { "l2_write_pios", KSTAT_DATA_UINT64 }, 407 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 408 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 409 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 410 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 411 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 412 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 413 { "duplicate_reads", KSTAT_DATA_UINT64 } 414}; 415 416#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 417 418#define ARCSTAT_INCR(stat, val) \ 419 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 420 421#define ARCSTAT_BUMP(stat) 
ARCSTAT_INCR(stat, 1) 422#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 423 424#define ARCSTAT_MAX(stat, val) { \ 425 uint64_t m; \ 426 while ((val) > (m = arc_stats.stat.value.ui64) && \ 427 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 428 continue; \ 429} 430 431#define ARCSTAT_MAXSTAT(stat) \ 432 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 433 434/* 435 * We define a macro to allow ARC hits/misses to be easily broken down by 436 * two separate conditions, giving a total of four different subtypes for 437 * each of hits and misses (so eight statistics total). 438 / 439#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 440* if (cond1) { \ 441 if (cond2) { \ 442 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 443 } else { \ 444 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 445 } \ 446 } else { \ 447 if (cond2) { \ 448 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 449 } else { \ 450 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 451 } \ 452 } 453 454kstat_t arc_ksp; 455static arc_state_t arc_anon; 456static arc_state_t arc_mru; 457static arc_state_t arc_mru_ghost; 458static arc_state_t arc_mfu; 459static arc_state_t arc_mfu_ghost; 460static arc_state_t arc_l2c_only; 461* 462/* 463 * There are several ARC variables that are critical to export as kstats -- 464 * but we don't want to have to grovel around in the kstat whenever we wish to 465 * manipulate them. For these variables, we therefore define them to be in 466 * terms of the statistic variable. This assures that we are not introducing 467 * the possibility of inconsistency by having shadow copies of the variables, 468 * while still allowing the code to be readable. 
469 / 470#define arc_size ARCSTAT(arcstat_size) / actual total arc size / 471#define arc_p ARCSTAT(arcstat_p) / target size of MRU / 472#define arc_c ARCSTAT(arcstat_c) / target size of cache / 473#define arc_c_min ARCSTAT(arcstat_c_min) / min target cache size / 474#define arc_c_max ARCSTAT(arcstat_c_max) / max target cache size / 475* 476static int arc_no_grow; /* Don't try to grow cache size / 477static uint64_t arc_tempreserve; 478static uint64_t arc_loaned_bytes; 479static uint64_t arc_meta_used; 480static uint64_t arc_meta_limit; 481static uint64_t arc_meta_max = 0; 482SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0, 483* "ARC metadata used"); 484SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0, 485 "ARC metadata limit"); 486 487typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 488 489typedef struct arc_callback arc_callback_t; 490 491struct arc_callback { 492 void acb_private; 493* arc_done_func_t acb_done; 494* arc_buf_t acb_buf; 495* zio_t acb_zio_dummy; 496* arc_callback_t acb_next; 497}; 498* 499typedef struct arc_write_callback arc_write_callback_t; 500 501struct arc_write_callback { 502 void awcb_private; 503* arc_done_func_t awcb_ready; 504* arc_done_func_t awcb_done; 505* arc_buf_t awcb_buf; 506}; 507* 508struct arc_buf_hdr { 509 /* protected by hash lock / 510* dva_t b_dva; 511 uint64_t b_birth; 512 uint64_t b_cksum0; 513 514 kmutex_t b_freeze_lock; 515 zio_cksum_t b_freeze_cksum; 516* void b_thawed; 517* 518 arc_buf_hdr_t b_hash_next; 519* arc_buf_t b_buf; 520* uint32_t b_flags; 521 uint32_t b_datacnt; 522 523 arc_callback_t b_acb; 524* kcondvar_t b_cv; 525 526 /* immutable / 527* arc_buf_contents_t b_type; 528 uint64_t b_size; 529 uint64_t b_spa; 530 531 /* protected by arc state mutex / 532* arc_state_t b_state; 533* list_node_t b_arc_node; 534 535 /* updated atomically / 536* clock_t b_arc_access; 537 538 /* self protecting / 539* refcount_t b_refcnt; 540 541 l2arc_buf_hdr_t b_l2hdr; 542* 
list_node_t b_l2node; 543}; 544 545static arc_buf_t arc_eviction_list; 546static kmutex_t arc_eviction_mtx; 547static arc_buf_hdr_t arc_eviction_hdr; 548static void arc_get_data_buf(arc_buf_t buf); 549static void arc_access(arc_buf_hdr_t buf, kmutex_t hash_lock); 550static int arc_evict_needed(arc_buf_contents_t type); 551static void arc_evict_ghost(arc_state_t state, uint64_t spa, int64_t bytes); 552#ifdef illumos 553static void arc_buf_watch(arc_buf_t buf); 554#endif /* illumos / 555* 556static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t ab); 557* 558#define GHOST_STATE(state) \ 559 ((state) == arc_mru_ghost \|\| (state) == arc_mfu_ghost \|\| \ 560 (state) == arc_l2c_only) 561 562/* 563 * Private ARC flags. These flags are private ARC only flags that will show up 564 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 565 * be passed in as arc_flags in things like arc_read. However, these flags 566 * should never be passed and should only be set by ARC code. When adding new 567 * public flags, make sure not to smash the private ones. 
568 / 569* 570#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed / 571#define ARC_IO_IN_PROGRESS (1 << 10) / I/O in progress for buf / 572#define ARC_IO_ERROR (1 << 11) / I/O failed for buf / 573#define ARC_FREED_IN_READ (1 << 12) / buf freed while in read / 574#define ARC_BUF_AVAILABLE (1 << 13) / block not in active use / 575#define ARC_INDIRECT (1 << 14) / this is an indirect block / 576#define ARC_FREE_IN_PROGRESS (1 << 15) / hdr about to be freed / 577#define ARC_L2_WRITING (1 << 16) / L2ARC write in progress / 578#define ARC_L2_EVICTED (1 << 17) / evicted during I/O / 579#define ARC_L2_WRITE_HEAD (1 << 18) / head of write list / 580* 581#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 582#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 583#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 584#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 585#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 586#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 587#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 588#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 589#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 590 (hdr)->b_l2hdr != NULL) 591#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 592#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 593#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 594 595/* 596 * Other sizes 597 / 598* 599#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 600#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 601 602/* 603 * Hash table routines 604 / 605* 606#define HT_LOCK_PAD CACHE_LINE_SIZE 607 608struct ht_lock { 609 kmutex_t ht_lock; 610#ifdef _KERNEL 611 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 612#endif 613}; 614 615#define BUF_LOCKS 256 616typedef struct buf_hash_table { 617 uint64_t ht_mask; 618 arc_buf_hdr_t *ht_table; 619* 
struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 620} buf_hash_table_t; 621 622static buf_hash_table_t buf_hash_table; 623 624#define BUF_HASH_INDEX(spa, dva, birth) \ 625 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 626#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 627#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 628#define HDR_LOCK(hdr) \ 629 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 630 631uint64_t zfs_crc64_table[256]; 632 633/* 634 * Level 2 ARC 635 / 636* 637#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max / 638#define L2ARC_HEADROOM 2 / num of writes / 639#define L2ARC_FEED_SECS 1 / caching interval secs / 640#define L2ARC_FEED_MIN_MS 200 / min caching interval ms / 641* 642#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 643#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 644 645/* 646 * L2ARC Performance Tunables 647 / 648uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; / default max write size / 649uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; / extra write during warmup / 650uint64_t l2arc_headroom = L2ARC_HEADROOM; / number of dev writes / 651uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; / interval seconds / 652uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; / min interval milliseconds / 653boolean_t l2arc_noprefetch = B_TRUE; / don't cache prefetch bufs / 654boolean_t l2arc_feed_again = B_TRUE; / turbo warmup / 655boolean_t l2arc_norw = B_TRUE; / no reads during writes / 656* 657SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 658 &l2arc_write_max, 0, "max write size"); 659SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 660 &l2arc_write_boost, 0, "extra write during warmup"); 661SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 662 &l2arc_headroom, 0, "number of dev writes"); 663SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 664 &l2arc_feed_secs, 0, "interval seconds"); 
665SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 666 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 667 668SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 669 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 670SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 671 &l2arc_feed_again, 0, "turbo warmup"); 672SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 673 &l2arc_norw, 0, "no reads during writes"); 674 675SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 676 &ARC_anon.arcs_size, 0, "size of anonymous state"); 677SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 678 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 679SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 680 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 681 682SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 683 &ARC_mru.arcs_size, 0, "size of mru state"); 684SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 685 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 686SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 687 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 688 689SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 690 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 691SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 692 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 693 "size of metadata in mru ghost state"); 694SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 695 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 696 "size of data in mru ghost state"); 697 698SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 699 &ARC_mfu.arcs_size, 0, "size of mfu state"); 700SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 701 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 
702SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 703 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 704 705SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 706 &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 707SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 708 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 709 "size of metadata in mfu ghost state"); 710SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 711 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 712 "size of data in mfu ghost state"); 713 714SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 715 &ARC_l2c_only.arcs_size, 0, "size of mru state"); 716 717/* 718 * L2ARC Internals 719 / 720typedef struct l2arc_dev { 721* vdev_t l2ad_vdev; / vdev / 722* spa_t l2ad_spa; / spa / 723* uint64_t l2ad_hand; /* next write location / 724* uint64_t l2ad_write; /* desired write size, bytes / 725* uint64_t l2ad_boost; /* warmup write boost, bytes / 726* uint64_t l2ad_start; /* first addr on device / 727* uint64_t l2ad_end; /* last addr on device / 728* uint64_t l2ad_evict; /* last addr eviction reached / 729* boolean_t l2ad_first; /* first sweep through / 730* boolean_t l2ad_writing; /* currently writing / 731* list_t l2ad_buflist; / buffer list / 732* list_node_t l2ad_node; /* device list node / 733} l2arc_dev_t; 734* 735static list_t L2ARC_dev_list; /* device list / 736static list_t l2arc_dev_list; /* device list pointer / 737static kmutex_t l2arc_dev_mtx; / device list mutex / 738static l2arc_dev_t l2arc_dev_last; /* last device used / 739static kmutex_t l2arc_buflist_mtx; / mutex for all buflists / 740static list_t L2ARC_free_on_write; / free after write buf list / 741static list_t l2arc_free_on_write; /* free after write list ptr / 742static kmutex_t l2arc_free_on_write_mtx; / mutex for list / 743static uint64_t l2arc_ndev; / number of devices / 744* 745typedef struct l2arc_read_callback { 746 arc_buf_t l2rcb_buf; 
/ read buffer / 747* spa_t l2rcb_spa; / spa / 748* blkptr_t l2rcb_bp; /* original blkptr / 749* zbookmark_t l2rcb_zb; /* original bookmark / 750* int l2rcb_flags; /* original flags / 751} l2arc_read_callback_t; 752* 753typedef struct l2arc_write_callback { 754 l2arc_dev_t l2wcb_dev; / device info / 755* arc_buf_hdr_t l2wcb_head; / head of write buflist / 756} l2arc_write_callback_t; 757* 758struct l2arc_buf_hdr { 759 /* protected by arc_buf_hdr mutex / 760* l2arc_dev_t b_dev; / L2ARC device / 761* uint64_t b_daddr; /* disk address, offset byte / 762}; 763* 764typedef struct l2arc_data_free { 765 /* protected by l2arc_free_on_write_mtx / 766* void l2df_data; 767* size_t l2df_size; 768 void (l2df_func)(void , size_t); 769 list_node_t l2df_list_node; 770} l2arc_data_free_t; 771 772static kmutex_t l2arc_feed_thr_lock; 773static kcondvar_t l2arc_feed_thr_cv; 774static uint8_t l2arc_thread_exit; 775 776static void l2arc_read_done(zio_t zio); 777static void l2arc_hdr_stat_add(void); 778static void l2arc_hdr_stat_remove(void); 779* 780static uint64_t 781buf_hash(uint64_t spa, const dva_t dva, uint64_t birth) 782{ 783* uint8_t vdva = (uint8_t )dva; 784 uint64_t crc = -1ULL; 785 int i; 786 787 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 788 789 for (i = 0; i < sizeof (dva_t); i++) 790 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 791 792 crc ^= (spa>>8) ^ birth; 793 794 return (crc); 795} 796 797#define BUF_EMPTY(buf) \ 798 ((buf)->b_dva.dva_word[0] == 0 && \ 799 (buf)->b_dva.dva_word[1] == 0 && \ 800 (buf)->b_birth == 0) 801 802#define BUF_EQUAL(spa, dva, birth, buf) \ 803 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 804 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 805 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 806 807static void 808buf_discard_identity(arc_buf_hdr_t hdr) 809{ 810* hdr->b_dva.dva_word[0] = 0; 811 hdr->b_dva.dva_word[1] = 0; 812 hdr->b_birth = 0; 813 hdr->b_cksum0 = 0; 814} 815 816static arc_buf_hdr_t * 
817buf_hash_find(uint64_t spa, const dva_t dva, uint64_t birth, kmutex_t lockp) 818{ 819* uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 820 kmutex_t hash_lock = BUF_HASH_LOCK(idx); 821* arc_buf_hdr_t buf; 822* 823 mutex_enter(hash_lock); 824 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 825 buf = buf->b_hash_next) { 826 if (BUF_EQUAL(spa, dva, birth, buf)) { 827 lockp = hash_lock; 828* return (buf); 829 } 830 } 831 mutex_exit(hash_lock); 832 lockp = NULL; 833* return (NULL); 834} 835 836/* 837 * Insert an entry into the hash table. If there is already an element 838 * equal to elem in the hash table, then the already existing element 839 * will be returned and the new element will not be inserted. 840 * Otherwise returns NULL. 841 / 842static arc_buf_hdr_t 843buf_hash_insert(arc_buf_hdr_t buf, kmutex_t lockp) 844{ 845* uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 846 kmutex_t hash_lock = BUF_HASH_LOCK(idx); 847* arc_buf_hdr_t fbuf; 848* uint32_t i; 849 850 ASSERT(!HDR_IN_HASH_TABLE(buf)); 851 lockp = hash_lock; 852* mutex_enter(hash_lock); 853 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 854 fbuf = fbuf->b_hash_next, i++) { 855 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 856 return (fbuf); 857 } 858 859 buf->b_hash_next = buf_hash_table.ht_table[idx]; 860 buf_hash_table.ht_table[idx] = buf; 861 buf->b_flags \|= ARC_IN_HASH_TABLE; 862 863 /* collect some hash table performance data / 864* if (i > 0) { 865 ARCSTAT_BUMP(arcstat_hash_collisions); 866 if (i == 1) 867 ARCSTAT_BUMP(arcstat_hash_chains); 868 869 ARCSTAT_MAX(arcstat_hash_chain_max, i); 870 } 871 872 ARCSTAT_BUMP(arcstat_hash_elements); 873 ARCSTAT_MAXSTAT(arcstat_hash_elements); 874 875 return (NULL); 876} 877 878static void 879buf_hash_remove(arc_buf_hdr_t buf) 880{ 881* arc_buf_hdr_t fbuf, bufp; 882* uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 883 884 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 885 
ASSERT(HDR_IN_HASH_TABLE(buf)); 886 887 bufp = &buf_hash_table.ht_table[idx]; 888 while ((fbuf = bufp) != buf) { 889* ASSERT(fbuf != NULL); 890 bufp = &fbuf->b_hash_next; 891 } 892 bufp = buf->b_hash_next; 893* buf->b_hash_next = NULL; 894 buf->b_flags &= ~ARC_IN_HASH_TABLE; 895 896 /* collect some hash table performance data / 897* ARCSTAT_BUMPDOWN(arcstat_hash_elements); 898 899 if (buf_hash_table.ht_table[idx] && 900 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 901 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 902} 903 904/* 905 * Global data structures and functions for the buf kmem cache. 906 / 907static kmem_cache_t hdr_cache; 908static kmem_cache_t buf_cache; 909* 910static void 911buf_fini(void) 912{ 913 int i; 914 915 kmem_free(buf_hash_table.ht_table, 916 (buf_hash_table.ht_mask + 1) * sizeof (void )); 917* for (i = 0; i < BUF_LOCKS; i++) 918 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 919 kmem_cache_destroy(hdr_cache); 920 kmem_cache_destroy(buf_cache); 921} 922 923/* 924 * Constructor callback - called when the cache is empty 925 * and a new buf is requested. 926 / 927/ ARGSUSED / 928static int 929hdr_cons(void vbuf, void unused, int kmflag) 930{ 931* arc_buf_hdr_t buf = vbuf; 932* 933 bzero(buf, sizeof (arc_buf_hdr_t)); 934 refcount_create(&buf->b_refcnt); 935 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 936 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 937 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 938 939 return (0); 940} 941 942/* ARGSUSED / 943static int 944buf_cons(void vbuf, void unused, int kmflag) 945{ 946* arc_buf_t buf = vbuf; 947* 948 bzero(buf, sizeof (arc_buf_t)); 949 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 950 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 951 952 return (0); 953} 954 955/* 956 * Destructor callback - called when a cached buf is 957 * no longer required. 
958 / 959/ ARGSUSED / 960static void 961hdr_dest(void vbuf, void unused) 962{ 963* arc_buf_hdr_t buf = vbuf; 964* 965 ASSERT(BUF_EMPTY(buf)); 966 refcount_destroy(&buf->b_refcnt); 967 cv_destroy(&buf->b_cv); 968 mutex_destroy(&buf->b_freeze_lock); 969 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 970} 971 972/* ARGSUSED / 973static void 974buf_dest(void vbuf, void unused) 975{ 976* arc_buf_t buf = vbuf; 977* 978 mutex_destroy(&buf->b_evict_lock); 979 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 980} 981 982/* 983 * Reclaim callback -- invoked when memory is low. 984 / 985/ ARGSUSED / 986static void 987hdr_recl(void unused) 988{ 989 dprintf("hdr_recl called\n"); 990 /* 991 * umem calls the reclaim func when we destroy the buf cache, 992 * which is after we do arc_fini(). 993 / 994* if (!arc_dead) 995 cv_signal(&arc_reclaim_thr_cv); 996} 997 998static void 999buf_init(void) 1000{ 1001 uint64_t ct; 1002* uint64_t hsize = 1ULL << 12; 1003 int i, j; 1004 1005 /* 1006 * The hash table is big enough to fill all of physical memory 1007 * with an average 64K block size. The table will take up 1008 * totalmemsizeof(void)/64K (eg. 128KB/GB with 8-byte pointers). 
1009 / 1010* while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) 1011 hsize <<= 1; 1012retry: 1013 buf_hash_table.ht_mask = hsize - 1; 1014 buf_hash_table.ht_table = 1015 kmem_zalloc(hsize * sizeof (void), KM_NOSLEEP); 1016* if (buf_hash_table.ht_table == NULL) { 1017 ASSERT(hsize > (1ULL << 8)); 1018 hsize >>= 1; 1019 goto retry; 1020 } 1021 1022 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 1023 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 1024 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1025 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1026 1027 for (i = 0; i < 256; i++) 1028 for (ct = zfs_crc64_table + i, ct = i, j = 8; j > 0; j--) 1029* ct = (ct >> 1) ^ (-(ct & 1) & ZFS_CRC64_POLY); 1030* 1031 for (i = 0; i < BUF_LOCKS; i++) { 1032 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1033 NULL, MUTEX_DEFAULT, NULL); 1034 } 1035} 1036 1037#define ARC_MINTIME (hz>>4) /* 62 ms / 1038* 1039static void 1040arc_cksum_verify(arc_buf_t buf) 1041{ 1042* zio_cksum_t zc; 1043 1044 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1045 return; 1046 1047 mutex_enter(&buf->b_hdr->b_freeze_lock); 1048 if (buf->b_hdr->b_freeze_cksum == NULL \|\| 1049 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 1050 mutex_exit(&buf->b_hdr->b_freeze_lock); 1051 return; 1052 } 1053 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1054 if (!ZIO_CHECKSUM_EQUAL(buf->b_hdr->b_freeze_cksum, zc)) 1055* panic("buffer modified while frozen!"); 1056 mutex_exit(&buf->b_hdr->b_freeze_lock); 1057} 1058 1059static int 1060arc_cksum_equal(arc_buf_t buf) 1061{ 1062* zio_cksum_t zc; 1063 int equal; 1064 1065 mutex_enter(&buf->b_hdr->b_freeze_lock); 1066 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1067 equal = ZIO_CHECKSUM_EQUAL(buf->b_hdr->b_freeze_cksum, zc); 1068* mutex_exit(&buf->b_hdr->b_freeze_lock); 1069 1070 return (equal); 1071} 1072 1073static void 1074arc_cksum_compute(arc_buf_t buf, boolean_t force) 1075{ 1076* if (!force && !(zfs_flags & 
ZFS_DEBUG_MODIFY)) 1077 return; 1078 1079 mutex_enter(&buf->b_hdr->b_freeze_lock); 1080 if (buf->b_hdr->b_freeze_cksum != NULL) { 1081 mutex_exit(&buf->b_hdr->b_freeze_lock); 1082 return; 1083 } 1084 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1085 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1086 buf->b_hdr->b_freeze_cksum); 1087 mutex_exit(&buf->b_hdr->b_freeze_lock); 1088#ifdef illumos 1089 arc_buf_watch(buf); 1090#endif /* illumos / 1091} 1092* 1093#ifdef illumos 1094#ifndef _KERNEL 1095typedef struct procctl { 1096 long cmd; 1097 prwatch_t prwatch; 1098} procctl_t; 1099#endif 1100 1101/* ARGSUSED / 1102static void 1103arc_buf_unwatch(arc_buf_t buf) 1104{ 1105#ifndef _KERNEL 1106 if (arc_watch) { 1107 int result; 1108 procctl_t ctl; 1109 ctl.cmd = PCWATCH; 1110 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1111 ctl.prwatch.pr_size = 0; 1112 ctl.prwatch.pr_wflags = 0; 1113 result = write(arc_procfd, &ctl, sizeof (ctl)); 1114 ASSERT3U(result, ==, sizeof (ctl)); 1115 } 1116#endif 1117} 1118 1119/* ARGSUSED / 1120static void 1121arc_buf_watch(arc_buf_t buf) 1122{ 1123#ifndef _KERNEL 1124 if (arc_watch) { 1125 int result; 1126 procctl_t ctl; 1127 ctl.cmd = PCWATCH; 1128 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1129 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1130 ctl.prwatch.pr_wflags = WA_WRITE; 1131 result = write(arc_procfd, &ctl, sizeof (ctl)); 1132 ASSERT3U(result, ==, sizeof (ctl)); 1133 } 1134#endif 1135} 1136#endif /* illumos / 1137* 1138void 1139arc_buf_thaw(arc_buf_t buf) 1140{ 1141* if (zfs_flags & ZFS_DEBUG_MODIFY) { 1142 if (buf->b_hdr->b_state != arc_anon) 1143 panic("modifying non-anon buffer!"); 1144 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1145 panic("modifying buffer while i/o in progress!"); 1146 arc_cksum_verify(buf); 1147 } 1148 1149 mutex_enter(&buf->b_hdr->b_freeze_lock); 1150 if (buf->b_hdr->b_freeze_cksum != NULL) { 1151 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1152 
buf->b_hdr->b_freeze_cksum = NULL; 1153 } 1154 1155 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1156 if (buf->b_hdr->b_thawed) 1157 kmem_free(buf->b_hdr->b_thawed, 1); 1158 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 1159 } 1160 1161 mutex_exit(&buf->b_hdr->b_freeze_lock); 1162 1163#ifdef illumos 1164 arc_buf_unwatch(buf); 1165#endif /* illumos / 1166} 1167* 1168void 1169arc_buf_freeze(arc_buf_t buf) 1170{ 1171* kmutex_t hash_lock; 1172* 1173 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1174 return; 1175 1176 hash_lock = HDR_LOCK(buf->b_hdr); 1177 mutex_enter(hash_lock); 1178 1179 ASSERT(buf->b_hdr->b_freeze_cksum != NULL \|\| 1180 buf->b_hdr->b_state == arc_anon); 1181 arc_cksum_compute(buf, B_FALSE); 1182 mutex_exit(hash_lock); 1183 1184} 1185 1186static void 1187get_buf_info(arc_buf_hdr_t ab, arc_state_t state, list_t list, kmutex_t lock) 1188{ 1189 uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); 1190 1191 if (ab->b_type == ARC_BUFC_METADATA) 1192 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1193 else { 1194 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1195 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1196 } 1197 1198 list = &state->arcs_lists[buf_hashid]; 1199* lock = ARCS_LOCK(state, buf_hashid); 1200} 1201* 1202 1203static void 1204add_reference(arc_buf_hdr_t ab, kmutex_t hash_lock, void tag) 1205{ 1206* ASSERT(MUTEX_HELD(hash_lock)); 1207 1208 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1209 (ab->b_state != arc_anon)) { 1210 uint64_t delta = ab->b_size * ab->b_datacnt; 1211 uint64_t size = &ab->b_state->arcs_lsize[ab->b_type]; 1212* list_t list; 1213* kmutex_t lock; 1214* 1215 get_buf_info(ab, ab->b_state, &list, &lock); 1216 ASSERT(!MUTEX_HELD(lock)); 1217 mutex_enter(lock); 1218 ASSERT(list_link_active(&ab->b_arc_node)); 1219 list_remove(list, ab); 1220 if (GHOST_STATE(ab->b_state)) { 1221 ASSERT0(ab->b_datacnt); 1222 ASSERT3P(ab->b_buf, ==, NULL); 1223 delta = ab->b_size; 1224 } 1225 ASSERT(delta > 0); 1226 ASSERT3U(size, >=, delta); 1227* 
atomic_add_64(size, -delta); 1228 mutex_exit(lock); 1229 /* remove the prefetch flag if we get a reference / 1230* if (ab->b_flags & ARC_PREFETCH) 1231 ab->b_flags &= ~ARC_PREFETCH; 1232 } 1233} 1234 1235static int 1236remove_reference(arc_buf_hdr_t ab, kmutex_t hash_lock, void tag) 1237{ 1238* int cnt; 1239 arc_state_t state = ab->b_state; 1240* 1241 ASSERT(state == arc_anon \|\| MUTEX_HELD(hash_lock)); 1242 ASSERT(!GHOST_STATE(state)); 1243 1244 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1245 (state != arc_anon)) { 1246 uint64_t size = &state->arcs_lsize[ab->b_type]; 1247* list_t list; 1248* kmutex_t lock; 1249* 1250 get_buf_info(ab, state, &list, &lock); 1251 ASSERT(!MUTEX_HELD(lock)); 1252 mutex_enter(lock); 1253 ASSERT(!list_link_active(&ab->b_arc_node)); 1254 list_insert_head(list, ab); 1255 ASSERT(ab->b_datacnt > 0); 1256 atomic_add_64(size, ab->b_size * ab->b_datacnt); 1257 mutex_exit(lock); 1258 } 1259 return (cnt); 1260} 1261 1262/* 1263 * Move the supplied buffer to the indicated state. The mutex 1264 * for the buffer must be held by the caller. 1265 / 1266static void 1267arc_change_state(arc_state_t new_state, arc_buf_hdr_t ab, kmutex_t hash_lock) 1268{ 1269 arc_state_t old_state = ab->b_state; 1270* int64_t refcnt = refcount_count(&ab->b_refcnt); 1271 uint64_t from_delta, to_delta; 1272 list_t list; 1273* kmutex_t lock; 1274* 1275 ASSERT(MUTEX_HELD(hash_lock)); 1276 ASSERT(new_state != old_state); 1277 ASSERT(refcnt == 0 \|\| ab->b_datacnt > 0); 1278 ASSERT(ab->b_datacnt == 0 \|\| !GHOST_STATE(new_state)); 1279 ASSERT(ab->b_datacnt <= 1 \|\| old_state != arc_anon); 1280 1281 from_delta = to_delta = ab->b_datacnt * ab->b_size; 1282 1283 /* 1284 * If this buffer is evictable, transfer it from the 1285 * old state list to the new state list. 
1286 / 1287* if (refcnt == 0) { 1288 if (old_state != arc_anon) { 1289 int use_mutex; 1290 uint64_t size = &old_state->arcs_lsize[ab->b_type]; 1291* 1292 get_buf_info(ab, old_state, &list, &lock); 1293 use_mutex = !MUTEX_HELD(lock); 1294 if (use_mutex) 1295 mutex_enter(lock); 1296 1297 ASSERT(list_link_active(&ab->b_arc_node)); 1298 list_remove(list, ab); 1299 1300 /* 1301 * If prefetching out of the ghost cache, 1302 * we will have a non-zero datacnt. 1303 / 1304* if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1305 /* ghost elements have a ghost size / 1306* ASSERT(ab->b_buf == NULL); 1307 from_delta = ab->b_size; 1308 } 1309 ASSERT3U(size, >=, from_delta); 1310* atomic_add_64(size, -from_delta); 1311 1312 if (use_mutex) 1313 mutex_exit(lock); 1314 } 1315 if (new_state != arc_anon) { 1316 int use_mutex; 1317 uint64_t size = &new_state->arcs_lsize[ab->b_type]; 1318* 1319 get_buf_info(ab, new_state, &list, &lock); 1320 use_mutex = !MUTEX_HELD(lock); 1321 if (use_mutex) 1322 mutex_enter(lock); 1323 1324 list_insert_head(list, ab); 1325 1326 /* ghost elements have a ghost size / 1327* if (GHOST_STATE(new_state)) { 1328 ASSERT(ab->b_datacnt == 0); 1329 ASSERT(ab->b_buf == NULL); 1330 to_delta = ab->b_size; 1331 } 1332 atomic_add_64(size, to_delta); 1333 1334 if (use_mutex) 1335 mutex_exit(lock); 1336 } 1337 } 1338 1339 ASSERT(!BUF_EMPTY(ab)); 1340 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 1341 buf_hash_remove(ab); 1342 1343 /* adjust state sizes / 1344* if (to_delta) 1345 atomic_add_64(&new_state->arcs_size, to_delta); 1346 if (from_delta) { 1347 ASSERT3U(old_state->arcs_size, >=, from_delta); 1348 atomic_add_64(&old_state->arcs_size, -from_delta); 1349 } 1350 ab->b_state = new_state; 1351 1352 /* adjust l2arc hdr stats / 1353* if (new_state == arc_l2c_only) 1354 l2arc_hdr_stat_add(); 1355 else if (old_state == arc_l2c_only) 1356 l2arc_hdr_stat_remove(); 1357} 1358 1359void 1360arc_space_consume(uint64_t space, arc_space_type_t type) 1361{ 1362 
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1363 1364 switch (type) { 1365 case ARC_SPACE_DATA: 1366 ARCSTAT_INCR(arcstat_data_size, space); 1367 break; 1368 case ARC_SPACE_OTHER: 1369 ARCSTAT_INCR(arcstat_other_size, space); 1370 break; 1371 case ARC_SPACE_HDRS: 1372 ARCSTAT_INCR(arcstat_hdr_size, space); 1373 break; 1374 case ARC_SPACE_L2HDRS: 1375 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1376 break; 1377 } 1378 1379 atomic_add_64(&arc_meta_used, space); 1380 atomic_add_64(&arc_size, space); 1381} 1382 1383void 1384arc_space_return(uint64_t space, arc_space_type_t type) 1385{ 1386 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1387 1388 switch (type) { 1389 case ARC_SPACE_DATA: 1390 ARCSTAT_INCR(arcstat_data_size, -space); 1391 break; 1392 case ARC_SPACE_OTHER: 1393 ARCSTAT_INCR(arcstat_other_size, -space); 1394 break; 1395 case ARC_SPACE_HDRS: 1396 ARCSTAT_INCR(arcstat_hdr_size, -space); 1397 break; 1398 case ARC_SPACE_L2HDRS: 1399 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1400 break; 1401 } 1402 1403 ASSERT(arc_meta_used >= space); 1404 if (arc_meta_max < arc_meta_used) 1405 arc_meta_max = arc_meta_used; 1406 atomic_add_64(&arc_meta_used, -space); 1407 ASSERT(arc_size >= space); 1408 atomic_add_64(&arc_size, -space); 1409} 1410 1411void * 1412arc_data_buf_alloc(uint64_t size) 1413{ 1414 if (arc_evict_needed(ARC_BUFC_DATA)) 1415 cv_signal(&arc_reclaim_thr_cv); 1416 atomic_add_64(&arc_size, size); 1417 return (zio_data_buf_alloc(size)); 1418} 1419 1420void 1421arc_data_buf_free(void buf, uint64_t size) 1422{ 1423* zio_data_buf_free(buf, size); 1424 ASSERT(arc_size >= size); 1425 atomic_add_64(&arc_size, -size); 1426} 1427 1428arc_buf_t * 1429arc_buf_alloc(spa_t spa, int size, void tag, arc_buf_contents_t type) 1430{ 1431 arc_buf_hdr_t hdr; 1432* arc_buf_t buf; 1433* 1434 ASSERT3U(size, >, 0); 1435 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1436 ASSERT(BUF_EMPTY(hdr)); 1437 hdr->b_size = size; 1438 hdr->b_type = type; 1439 hdr->b_spa = 
spa_load_guid(spa); 1440 hdr->b_state = arc_anon; 1441 hdr->b_arc_access = 0; 1442 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1443 buf->b_hdr = hdr; 1444 buf->b_data = NULL; 1445 buf->b_efunc = NULL; 1446 buf->b_private = NULL; 1447 buf->b_next = NULL; 1448 hdr->b_buf = buf; 1449 arc_get_data_buf(buf); 1450 hdr->b_datacnt = 1; 1451 hdr->b_flags = 0; 1452 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1453 (void) refcount_add(&hdr->b_refcnt, tag); 1454 1455 return (buf); 1456} 1457 1458static char arc_onloan_tag = "onloan"; 1459* 1460/* 1461 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1462 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1463 * buffers must be returned to the arc before they can be used by the DMU or 1464 * freed. 1465 / 1466arc_buf_t 1467arc_loan_buf(spa_t spa, int size) 1468{ 1469* arc_buf_t buf; 1470* 1471 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1472 1473 atomic_add_64(&arc_loaned_bytes, size); 1474 return (buf); 1475} 1476 1477/* 1478 * Return a loaned arc buffer to the arc. 
1479 / 1480void 1481arc_return_buf(arc_buf_t buf, void tag) 1482{ 1483* arc_buf_hdr_t hdr = buf->b_hdr; 1484* 1485 ASSERT(buf->b_data != NULL); 1486 (void) refcount_add(&hdr->b_refcnt, tag); 1487 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 1488 1489 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1490} 1491 1492/* Detach an arc_buf from a dbuf (tag) / 1493void 1494arc_loan_inuse_buf(arc_buf_t buf, void tag) 1495{ 1496* arc_buf_hdr_t hdr; 1497* 1498 ASSERT(buf->b_data != NULL); 1499 hdr = buf->b_hdr; 1500 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1501 (void) refcount_remove(&hdr->b_refcnt, tag); 1502 buf->b_efunc = NULL; 1503 buf->b_private = NULL; 1504 1505 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1506} 1507 1508static arc_buf_t * 1509arc_buf_clone(arc_buf_t from) 1510{ 1511* arc_buf_t buf; 1512* arc_buf_hdr_t hdr = from->b_hdr; 1513* uint64_t size = hdr->b_size; 1514 1515 ASSERT(hdr->b_state != arc_anon); 1516 1517 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1518 buf->b_hdr = hdr; 1519 buf->b_data = NULL; 1520 buf->b_efunc = NULL; 1521 buf->b_private = NULL; 1522 buf->b_next = hdr->b_buf; 1523 hdr->b_buf = buf; 1524 arc_get_data_buf(buf); 1525 bcopy(from->b_data, buf->b_data, size); 1526 1527 /* 1528 * This buffer already exists in the arc so create a duplicate 1529 * copy for the caller. If the buffer is associated with user data 1530 * then track the size and number of duplicates. These stats will be 1531 * updated as duplicate buffers are created and destroyed. 1532 / 1533* if (hdr->b_type == ARC_BUFC_DATA) { 1534 ARCSTAT_BUMP(arcstat_duplicate_buffers); 1535 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 1536 } 1537 hdr->b_datacnt += 1; 1538 return (buf); 1539} 1540 1541void 1542arc_buf_add_ref(arc_buf_t buf, void tag) 1543{ 1544 arc_buf_hdr_t hdr; 1545* kmutex_t hash_lock; 1546* 1547 /* 1548 * Check to see if this buffer is evicted. Callers 1549 * must verify b_data != NULL to know if the add_ref 1550 * was successful. 
1551 / 1552* mutex_enter(&buf->b_evict_lock); 1553 if (buf->b_data == NULL) { 1554 mutex_exit(&buf->b_evict_lock); 1555 return; 1556 } 1557 hash_lock = HDR_LOCK(buf->b_hdr); 1558 mutex_enter(hash_lock); 1559 hdr = buf->b_hdr; 1560 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1561 mutex_exit(&buf->b_evict_lock); 1562 1563 ASSERT(hdr->b_state == arc_mru \|\| hdr->b_state == arc_mfu); 1564 add_reference(hdr, hash_lock, tag); 1565 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t , hdr); 1566* arc_access(hdr, hash_lock); 1567 mutex_exit(hash_lock); 1568 ARCSTAT_BUMP(arcstat_hits); 1569 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1570 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1571 data, metadata, hits); 1572} 1573 1574/* 1575 * Free the arc data buffer. If it is an l2arc write in progress, 1576 * the buffer is placed on l2arc_free_on_write to be freed later. 1577 / 1578static void 1579arc_buf_data_free(arc_buf_t buf, void (free_func)(void , size_t)) 1580{ 1581 arc_buf_hdr_t hdr = buf->b_hdr; 1582* 1583 if (HDR_L2_WRITING(hdr)) { 1584 l2arc_data_free_t df; 1585* df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1586 df->l2df_data = buf->b_data; 1587 df->l2df_size = hdr->b_size; 1588 df->l2df_func = free_func; 1589 mutex_enter(&l2arc_free_on_write_mtx); 1590 list_insert_head(l2arc_free_on_write, df); 1591 mutex_exit(&l2arc_free_on_write_mtx); 1592 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1593 } else { 1594 free_func(buf->b_data, hdr->b_size); 1595 } 1596} 1597 1598static void 1599arc_buf_destroy(arc_buf_t buf, boolean_t recycle, boolean_t all) 1600{ 1601* arc_buf_t *bufp; 1602* 1603 /* free up data associated with the buf / 1604* if (buf->b_data) { 1605 arc_state_t state = buf->b_hdr->b_state; 1606* uint64_t size = buf->b_hdr->b_size; 1607 arc_buf_contents_t type = buf->b_hdr->b_type; 1608 1609 arc_cksum_verify(buf); 1610#ifdef illumos 1611 arc_buf_unwatch(buf); 1612#endif /* illumos / 1613* 1614 if (!recycle) { 1615 if (type == ARC_BUFC_METADATA) { 1616 
arc_buf_data_free(buf, zio_buf_free); 1617 arc_space_return(size, ARC_SPACE_DATA); 1618 } else { 1619 ASSERT(type == ARC_BUFC_DATA); 1620 arc_buf_data_free(buf, zio_data_buf_free); 1621 ARCSTAT_INCR(arcstat_data_size, -size); 1622 atomic_add_64(&arc_size, -size); 1623 } 1624 } 1625 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1626 uint64_t cnt = &state->arcs_lsize[type]; 1627* 1628 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1629 ASSERT(state != arc_anon); 1630 1631 ASSERT3U(cnt, >=, size); 1632* atomic_add_64(cnt, -size); 1633 } 1634 ASSERT3U(state->arcs_size, >=, size); 1635 atomic_add_64(&state->arcs_size, -size); 1636 buf->b_data = NULL; 1637 1638 /* 1639 * If we're destroying a duplicate buffer make sure 1640 * that the appropriate statistics are updated. 1641 / 1642* if (buf->b_hdr->b_datacnt > 1 && 1643 buf->b_hdr->b_type == ARC_BUFC_DATA) { 1644 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 1645 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 1646 } 1647 ASSERT(buf->b_hdr->b_datacnt > 0); 1648 buf->b_hdr->b_datacnt -= 1; 1649 } 1650 1651 /* only remove the buf if requested / 1652* if (!all) 1653 return; 1654 1655 /* remove the buf from the hdr list / 1656* for (bufp = &buf->b_hdr->b_buf; bufp != buf; bufp = &(bufp)->b_next) 1657 continue; 1658 bufp = buf->b_next; 1659* buf->b_next = NULL; 1660 1661 ASSERT(buf->b_efunc == NULL); 1662 1663 /* clean up the buf / 1664* buf->b_hdr = NULL; 1665 kmem_cache_free(buf_cache, buf); 1666} 1667 1668static void 1669arc_hdr_destroy(arc_buf_hdr_t hdr) 1670{ 1671* ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1672 ASSERT3P(hdr->b_state, ==, arc_anon); 1673 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1674 l2arc_buf_hdr_t l2hdr = hdr->b_l2hdr; 1675* 1676 if (l2hdr != NULL) { 1677 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1678 /* 1679 * To prevent arc_free() and l2arc_evict() from 1680 * attempting to free the same buffer at the same time, 1681 * a FREE_IN_PROGRESS flag is given to arc_free() to 1682 * give it 
priority. l2arc_evict() can't destroy this 1683 * header while we are waiting on l2arc_buflist_mtx. 1684 * 1685 * The hdr may be removed from l2ad_buflist before we 1686 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 1687 / 1688* if (!buflist_held) { 1689 mutex_enter(&l2arc_buflist_mtx); 1690 l2hdr = hdr->b_l2hdr; 1691 } 1692 1693 if (l2hdr != NULL) { 1694 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1695 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1696 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1697 if (hdr->b_state == arc_l2c_only) 1698 l2arc_hdr_stat_remove(); 1699 hdr->b_l2hdr = NULL; 1700 } 1701 1702 if (!buflist_held) 1703 mutex_exit(&l2arc_buflist_mtx); 1704 } 1705 1706 if (!BUF_EMPTY(hdr)) { 1707 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1708 buf_discard_identity(hdr); 1709 } 1710 while (hdr->b_buf) { 1711 arc_buf_t buf = hdr->b_buf; 1712* 1713 if (buf->b_efunc) { 1714 mutex_enter(&arc_eviction_mtx); 1715 mutex_enter(&buf->b_evict_lock); 1716 ASSERT(buf->b_hdr != NULL); 1717 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1718 hdr->b_buf = buf->b_next; 1719 buf->b_hdr = &arc_eviction_hdr; 1720 buf->b_next = arc_eviction_list; 1721 arc_eviction_list = buf; 1722 mutex_exit(&buf->b_evict_lock); 1723 mutex_exit(&arc_eviction_mtx); 1724 } else { 1725 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1726 } 1727 } 1728 if (hdr->b_freeze_cksum != NULL) { 1729 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1730 hdr->b_freeze_cksum = NULL; 1731 } 1732 if (hdr->b_thawed) { 1733 kmem_free(hdr->b_thawed, 1); 1734 hdr->b_thawed = NULL; 1735 } 1736 1737 ASSERT(!list_link_active(&hdr->b_arc_node)); 1738 ASSERT3P(hdr->b_hash_next, ==, NULL); 1739 ASSERT3P(hdr->b_acb, ==, NULL); 1740 kmem_cache_free(hdr_cache, hdr); 1741} 1742 1743void 1744arc_buf_free(arc_buf_t buf, void tag) 1745{ 1746 arc_buf_hdr_t hdr = buf->b_hdr; 1747* int hashed = hdr->b_state != arc_anon; 1748 1749 ASSERT(buf->b_efunc == NULL); 1750 ASSERT(buf->b_data != NULL); 1751 1752 if (hashed) { 1753 kmutex_t 
hash_lock = HDR_LOCK(hdr); 1754* 1755 mutex_enter(hash_lock); 1756 hdr = buf->b_hdr; 1757 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1758 1759 (void) remove_reference(hdr, hash_lock, tag); 1760 if (hdr->b_datacnt > 1) { 1761 arc_buf_destroy(buf, FALSE, TRUE); 1762 } else { 1763 ASSERT(buf == hdr->b_buf); 1764 ASSERT(buf->b_efunc == NULL); 1765 hdr->b_flags \|= ARC_BUF_AVAILABLE; 1766 } 1767 mutex_exit(hash_lock); 1768 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1769 int destroy_hdr; 1770 /* 1771 * We are in the middle of an async write. Don't destroy 1772 * this buffer unless the write completes before we finish 1773 * decrementing the reference count. 1774 / 1775* mutex_enter(&arc_eviction_mtx); 1776 (void) remove_reference(hdr, NULL, tag); 1777 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1778 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1779 mutex_exit(&arc_eviction_mtx); 1780 if (destroy_hdr) 1781 arc_hdr_destroy(hdr); 1782 } else { 1783 if (remove_reference(hdr, NULL, tag) > 0) 1784 arc_buf_destroy(buf, FALSE, TRUE); 1785 else 1786 arc_hdr_destroy(hdr); 1787 } 1788} 1789 1790int 1791arc_buf_remove_ref(arc_buf_t buf, void tag) 1792{ 1793 arc_buf_hdr_t hdr = buf->b_hdr; 1794* kmutex_t hash_lock = HDR_LOCK(hdr); 1795* int no_callback = (buf->b_efunc == NULL); 1796 1797 if (hdr->b_state == arc_anon) { 1798 ASSERT(hdr->b_datacnt == 1); 1799 arc_buf_free(buf, tag); 1800 return (no_callback); 1801 } 1802 1803 mutex_enter(hash_lock); 1804 hdr = buf->b_hdr; 1805 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1806 ASSERT(hdr->b_state != arc_anon); 1807 ASSERT(buf->b_data != NULL); 1808 1809 (void) remove_reference(hdr, hash_lock, tag); 1810 if (hdr->b_datacnt > 1) { 1811 if (no_callback) 1812 arc_buf_destroy(buf, FALSE, TRUE); 1813 } else if (no_callback) { 1814 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1815 ASSERT(buf->b_efunc == NULL); 1816 hdr->b_flags \|= ARC_BUF_AVAILABLE; 1817 } 1818 ASSERT(no_callback \|\| hdr->b_datacnt > 1 \|\| 1819 refcount_is_zero(&hdr->b_refcnt)); 1820 
mutex_exit(hash_lock); 1821 return (no_callback); 1822} 1823 1824int 1825arc_buf_size(arc_buf_t buf) 1826{ 1827* return (buf->b_hdr->b_size); 1828} 1829 1830/* 1831 * Called from the DMU to determine if the current buffer should be 1832 * evicted. In order to ensure proper locking, the eviction must be initiated 1833 * from the DMU. Return true if the buffer is associated with user data and 1834 * duplicate buffers still exist. 1835 / 1836boolean_t 1837arc_buf_eviction_needed(arc_buf_t buf) 1838{ 1839 arc_buf_hdr_t hdr; 1840* boolean_t evict_needed = B_FALSE; 1841 1842 if (zfs_disable_dup_eviction) 1843 return (B_FALSE); 1844 1845 mutex_enter(&buf->b_evict_lock); 1846 hdr = buf->b_hdr; 1847 if (hdr == NULL) { 1848 /* 1849 * We are in arc_do_user_evicts(); let that function 1850 * perform the eviction. 1851 / 1852* ASSERT(buf->b_data == NULL); 1853 mutex_exit(&buf->b_evict_lock); 1854 return (B_FALSE); 1855 } else if (buf->b_data == NULL) { 1856 /* 1857 * We have already been added to the arc eviction list; 1858 * recommend eviction. 1859 / 1860* ASSERT3P(hdr, ==, &arc_eviction_hdr); 1861 mutex_exit(&buf->b_evict_lock); 1862 return (B_TRUE); 1863 } 1864 1865 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 1866 evict_needed = B_TRUE; 1867 1868 mutex_exit(&buf->b_evict_lock); 1869 return (evict_needed); 1870} 1871 1872/* 1873 * Evict buffers from list until we've removed the specified number of 1874 * bytes. Move the removed buffers to the appropriate evict state. 1875 * If the recycle flag is set, then attempt to "recycle" a buffer: 1876 * - look for a buffer to evict that is `bytes' long. 1877 * - return the data block from this buffer rather than freeing it. 1878 * This flag is used by callers that are trying to make space for a 1879 * new buffer in a full arc cache. 1880 * 1881 * This function makes a "best effort". It skips over any buffers 1882 * it can't get a hash_lock on, and so may not catch all candidates. 
1883 * It may also return without evicting as much space as requested. 1884 / 1885static void 1886arc_evict(arc_state_t state, uint64_t spa, int64_t bytes, boolean_t recycle, 1887* arc_buf_contents_t type) 1888{ 1889 arc_state_t evicted_state; 1890* uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1891 int64_t bytes_remaining; 1892 arc_buf_hdr_t ab, ab_prev = NULL; 1893 list_t evicted_list, list, evicted_list_start, list_start; 1894 kmutex_t lock, evicted_lock; 1895 kmutex_t hash_lock; 1896* boolean_t have_lock; 1897 void stolen = NULL; 1898* static int evict_metadata_offset, evict_data_offset; 1899 int i, idx, offset, list_count, count; 1900 1901 ASSERT(state == arc_mru \|\| state == arc_mfu); 1902 1903 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1904 1905 if (type == ARC_BUFC_METADATA) { 1906 offset = 0; 1907 list_count = ARC_BUFC_NUMMETADATALISTS; 1908 list_start = &state->arcs_lists[0]; 1909 evicted_list_start = &evicted_state->arcs_lists[0]; 1910 idx = evict_metadata_offset; 1911 } else { 1912 offset = ARC_BUFC_NUMMETADATALISTS; 1913 list_start = &state->arcs_lists[offset]; 1914 evicted_list_start = &evicted_state->arcs_lists[offset]; 1915 list_count = ARC_BUFC_NUMDATALISTS; 1916 idx = evict_data_offset; 1917 } 1918 bytes_remaining = evicted_state->arcs_lsize[type]; 1919 count = 0; 1920 1921evict_start: 1922 list = &list_start[idx]; 1923 evicted_list = &evicted_list_start[idx]; 1924 lock = ARCS_LOCK(state, (offset + idx)); 1925 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 1926 1927 mutex_enter(lock); 1928 mutex_enter(evicted_lock); 1929 1930 for (ab = list_tail(list); ab; ab = ab_prev) { 1931 ab_prev = list_prev(list, ab); 1932 bytes_remaining -= (ab->b_size * ab->b_datacnt); 1933 /* prefetch buffers have a minimum lifespan / 1934* if (HDR_IO_IN_PROGRESS(ab) \|\| 1935 (spa && ab->b_spa != spa) \|\| 1936 (ab->b_flags & (ARC_PREFETCH\|ARC_INDIRECT) && 1937 ddi_get_lbolt() - ab->b_arc_access < 1938 
arc_min_prefetch_lifespan)) { 1939 skipped++; 1940 continue; 1941 } 1942 /* "lookahead" for better eviction candidate / 1943* if (recycle && ab->b_size != bytes && 1944 ab_prev && ab_prev->b_size == bytes) 1945 continue; 1946 hash_lock = HDR_LOCK(ab); 1947 have_lock = MUTEX_HELD(hash_lock); 1948 if (have_lock \|\| mutex_tryenter(hash_lock)) { 1949 ASSERT0(refcount_count(&ab->b_refcnt)); 1950 ASSERT(ab->b_datacnt > 0); 1951 while (ab->b_buf) { 1952 arc_buf_t buf = ab->b_buf; 1953* if (!mutex_tryenter(&buf->b_evict_lock)) { 1954 missed += 1; 1955 break; 1956 } 1957 if (buf->b_data) { 1958 bytes_evicted += ab->b_size; 1959 if (recycle && ab->b_type == type && 1960 ab->b_size == bytes && 1961 !HDR_L2_WRITING(ab)) { 1962 stolen = buf->b_data; 1963 recycle = FALSE; 1964 } 1965 } 1966 if (buf->b_efunc) { 1967 mutex_enter(&arc_eviction_mtx); 1968 arc_buf_destroy(buf, 1969 buf->b_data == stolen, FALSE); 1970 ab->b_buf = buf->b_next; 1971 buf->b_hdr = &arc_eviction_hdr; 1972 buf->b_next = arc_eviction_list; 1973 arc_eviction_list = buf; 1974 mutex_exit(&arc_eviction_mtx); 1975 mutex_exit(&buf->b_evict_lock); 1976 } else { 1977 mutex_exit(&buf->b_evict_lock); 1978 arc_buf_destroy(buf, 1979 buf->b_data == stolen, TRUE); 1980 } 1981 } 1982 1983 if (ab->b_l2hdr) { 1984 ARCSTAT_INCR(arcstat_evict_l2_cached, 1985 ab->b_size); 1986 } else { 1987 if (l2arc_write_eligible(ab->b_spa, ab)) { 1988 ARCSTAT_INCR(arcstat_evict_l2_eligible, 1989 ab->b_size); 1990 } else { 1991 ARCSTAT_INCR( 1992 arcstat_evict_l2_ineligible, 1993 ab->b_size); 1994 } 1995 } 1996 1997 if (ab->b_datacnt == 0) { 1998 arc_change_state(evicted_state, ab, hash_lock); 1999 ASSERT(HDR_IN_HASH_TABLE(ab)); 2000 ab->b_flags \|= ARC_IN_HASH_TABLE; 2001 ab->b_flags &= ~ARC_BUF_AVAILABLE; 2002 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t , ab); 2003* } 2004 if (!have_lock) 2005 mutex_exit(hash_lock); 2006 if (bytes >= 0 && bytes_evicted >= bytes) 2007 break; 2008 if (bytes_remaining > 0) { 2009 mutex_exit(evicted_lock); 2010 
mutex_exit(lock); 2011 idx = ((idx + 1) & (list_count - 1)); 2012 count++; 2013 goto evict_start; 2014 } 2015 } else { 2016 missed += 1; 2017 } 2018 } 2019 2020 mutex_exit(evicted_lock); 2021 mutex_exit(lock); 2022 2023 idx = ((idx + 1) & (list_count - 1)); 2024 count++; 2025 2026 if (bytes_evicted < bytes) { 2027 if (count < list_count) 2028 goto evict_start; 2029 else 2030 dprintf("only evicted %lld bytes from %x", 2031 (longlong_t)bytes_evicted, state); 2032 } 2033 if (type == ARC_BUFC_METADATA) 2034 evict_metadata_offset = idx; 2035 else 2036 evict_data_offset = idx; 2037 2038 if (skipped) 2039 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2040 2041 if (missed) 2042 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2043 2044 /* 2045 * We have just evicted some date into the ghost state, make 2046 * sure we also adjust the ghost state size if necessary. 2047 / 2048* if (arc_no_grow && 2049 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 2050 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 2051 arc_mru_ghost->arcs_size - arc_c; 2052 2053 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 2054 int64_t todelete = 2055 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 2056 arc_evict_ghost(arc_mru_ghost, 0, todelete); 2057 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 2058 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 2059 arc_mru_ghost->arcs_size + 2060 arc_mfu_ghost->arcs_size - arc_c); 2061 arc_evict_ghost(arc_mfu_ghost, 0, todelete); 2062 } 2063 } 2064 if (stolen) 2065 ARCSTAT_BUMP(arcstat_stolen); 2066 2067 return (stolen); 2068} 2069 2070/* 2071 * Remove buffers from list until we've removed the specified number of 2072 * bytes. Destroy the buffers that are removed. 
2073 / 2074static void 2075arc_evict_ghost(arc_state_t state, uint64_t spa, int64_t bytes) 2076{ 2077 arc_buf_hdr_t ab, ab_prev; 2078 arc_buf_hdr_t marker = { 0 }; 2079 list_t list, list_start; 2080 kmutex_t hash_lock, lock; 2081 uint64_t bytes_deleted = 0; 2082 uint64_t bufs_skipped = 0; 2083 static int evict_offset; 2084 int list_count, idx = evict_offset; 2085 int offset, count = 0; 2086 2087 ASSERT(GHOST_STATE(state)); 2088 2089 /* 2090 * data lists come after metadata lists 2091 / 2092* list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2093 list_count = ARC_BUFC_NUMDATALISTS; 2094 offset = ARC_BUFC_NUMMETADATALISTS; 2095 2096evict_start: 2097 list = &list_start[idx]; 2098 lock = ARCS_LOCK(state, idx + offset); 2099 2100 mutex_enter(lock); 2101 for (ab = list_tail(list); ab; ab = ab_prev) { 2102 ab_prev = list_prev(list, ab); 2103 if (spa && ab->b_spa != spa) 2104 continue; 2105 2106 /* ignore markers / 2107* if (ab->b_spa == 0) 2108 continue; 2109 2110 hash_lock = HDR_LOCK(ab); 2111 /* caller may be trying to modify this buffer, skip it / 2112* if (MUTEX_HELD(hash_lock)) 2113 continue; 2114 if (mutex_tryenter(hash_lock)) { 2115 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 2116 ASSERT(ab->b_buf == NULL); 2117 ARCSTAT_BUMP(arcstat_deleted); 2118 bytes_deleted += ab->b_size; 2119 2120 if (ab->b_l2hdr != NULL) { 2121 /* 2122 * This buffer is cached on the 2nd Level ARC; 2123 * don't destroy the header. 2124 / 2125* arc_change_state(arc_l2c_only, ab, hash_lock); 2126 mutex_exit(hash_lock); 2127 } else { 2128 arc_change_state(arc_anon, ab, hash_lock); 2129 mutex_exit(hash_lock); 2130 arc_hdr_destroy(ab); 2131 } 2132 2133 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t , ab); 2134* if (bytes >= 0 && bytes_deleted >= bytes) 2135 break; 2136 } else if (bytes < 0) { 2137 /* 2138 * Insert a list marker and then wait for the 2139 * hash lock to become available. Once its 2140 * available, restart from where we left off. 
2141 / 2142* list_insert_after(list, ab, &marker); 2143 mutex_exit(lock); 2144 mutex_enter(hash_lock); 2145 mutex_exit(hash_lock); 2146 mutex_enter(lock); 2147 ab_prev = list_prev(list, &marker); 2148 list_remove(list, &marker); 2149 } else 2150 bufs_skipped += 1; 2151 } 2152 mutex_exit(lock); 2153 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2154 count++; 2155 2156 if (count < list_count) 2157 goto evict_start; 2158 2159 evict_offset = idx; 2160 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2161 (bytes < 0 \|\| bytes_deleted < bytes)) { 2162 list_start = &state->arcs_lists[0]; 2163 list_count = ARC_BUFC_NUMMETADATALISTS; 2164 offset = count = 0; 2165 goto evict_start; 2166 } 2167 2168 if (bufs_skipped) { 2169 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2170 ASSERT(bytes >= 0); 2171 } 2172 2173 if (bytes_deleted < bytes) 2174 dprintf("only deleted %lld bytes from %p", 2175 (longlong_t)bytes_deleted, state); 2176} 2177 2178static void 2179arc_adjust(void) 2180{ 2181 int64_t adjustment, delta; 2182 2183 /* 2184 * Adjust MRU size 2185 / 2186* 2187 adjustment = MIN((int64_t)(arc_size - arc_c), 2188 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2189 arc_p)); 2190 2191 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2192 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2193 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2194 adjustment -= delta; 2195 } 2196 2197 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2198 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2199 (void) arc_evict(arc_mru, 0, delta, FALSE, 2200 ARC_BUFC_METADATA); 2201 } 2202 2203 /* 2204 * Adjust MFU size 2205 / 2206* 2207 adjustment = arc_size - arc_c; 2208 2209 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2210 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2211 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2212 
adjustment -= delta; 2213 } 2214 2215 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2216 int64_t delta = MIN(adjustment, 2217 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2218 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2219 ARC_BUFC_METADATA); 2220 } 2221 2222 /* 2223 * Adjust ghost lists 2224 / 2225* 2226 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2227 2228 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2229 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2230 arc_evict_ghost(arc_mru_ghost, 0, delta); 2231 } 2232 2233 adjustment = 2234 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2235 2236 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2237 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2238 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2239 } 2240} 2241 2242static void 2243arc_do_user_evicts(void) 2244{ 2245 static arc_buf_t tmp_arc_eviction_list; 2246* 2247 /* 2248 * Move list over to avoid LOR 2249 / 2250restart: 2251* mutex_enter(&arc_eviction_mtx); 2252 tmp_arc_eviction_list = arc_eviction_list; 2253 arc_eviction_list = NULL; 2254 mutex_exit(&arc_eviction_mtx); 2255 2256 while (tmp_arc_eviction_list != NULL) { 2257 arc_buf_t buf = tmp_arc_eviction_list; 2258* tmp_arc_eviction_list = buf->b_next; 2259 mutex_enter(&buf->b_evict_lock); 2260 buf->b_hdr = NULL; 2261 mutex_exit(&buf->b_evict_lock); 2262 2263 if (buf->b_efunc != NULL) 2264 VERIFY(buf->b_efunc(buf) == 0); 2265 2266 buf->b_efunc = NULL; 2267 buf->b_private = NULL; 2268 kmem_cache_free(buf_cache, buf); 2269 } 2270 2271 if (arc_eviction_list != NULL) 2272 goto restart; 2273} 2274 2275/* 2276 * Flush all evictable data from the cache for the given spa. 2277 * NOTE: this will not touch "active" (i.e. referenced) data. 
2278 / 2279void 2280arc_flush(spa_t spa) 2281{ 2282 uint64_t guid = 0; 2283 2284 if (spa) 2285 guid = spa_load_guid(spa); 2286 2287 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2288 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2289 if (spa) 2290 break; 2291 } 2292 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2293 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2294 if (spa) 2295 break; 2296 } 2297 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2298 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2299 if (spa) 2300 break; 2301 } 2302 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2303 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2304 if (spa) 2305 break; 2306 } 2307 2308 arc_evict_ghost(arc_mru_ghost, guid, -1); 2309 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2310 2311 mutex_enter(&arc_reclaim_thr_lock); 2312 arc_do_user_evicts(); 2313 mutex_exit(&arc_reclaim_thr_lock); 2314 ASSERT(spa \|\| arc_eviction_list == NULL); 2315} 2316 2317void 2318arc_shrink(void) 2319{ 2320 if (arc_c > arc_c_min) { 2321 uint64_t to_free; 2322 2323#ifdef _KERNEL 2324 to_free = arc_c >> arc_shrink_shift; 2325#else 2326 to_free = arc_c >> arc_shrink_shift; 2327#endif 2328 if (arc_c > arc_c_min + to_free) 2329 atomic_add_64(&arc_c, -to_free); 2330 else 2331 arc_c = arc_c_min; 2332 2333 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2334 if (arc_c > arc_size) 2335 arc_c = MAX(arc_size, arc_c_min); 2336 if (arc_p > arc_c) 2337 arc_p = (arc_c >> 1); 2338 ASSERT(arc_c >= arc_c_min); 2339 ASSERT((int64_t)arc_p >= 0); 2340 } 2341 2342 if (arc_size > arc_c) 2343 arc_adjust(); 2344} 2345 2346static int needfree = 0; 2347 2348static int 2349arc_reclaim_needed(void) 2350{ 2351 2352#ifdef _KERNEL 2353 2354 if (needfree) 2355 return (1); 2356 2357 /* 2358 * Cooperate with pagedaemon when it's time for it to scan 2359 * and reclaim some pages. 
2360 / 2361* if (vm_paging_needed()) 2362 return (1); 2363 2364#ifdef sun 2365 /* 2366 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2367 / 2368* extra = desfree; 2369 2370 /* 2371 * check that we're out of range of the pageout scanner. It starts to 2372 * schedule paging if freemem is less than lotsfree and needfree. 2373 * lotsfree is the high-water mark for pageout, and needfree is the 2374 * number of needed free pages. We add extra pages here to make sure 2375 * the scanner doesn't start up while we're freeing memory. 2376 / 2377* if (freemem < lotsfree + needfree + extra) 2378 return (1); 2379 2380 /* 2381 * check to make sure that swapfs has enough space so that anon 2382 * reservations can still succeed. anon_resvmem() checks that the 2383 * availrmem is greater than swapfs_minfree, and the number of reserved 2384 * swap pages. We also add a bit of extra here just to prevent 2385 * circumstances from getting really dire. 2386 / 2387* if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2388 return (1); 2389 2390#if defined(__i386) 2391 /* 2392 * If we're on an i386 platform, it's possible that we'll exhaust the 2393 * kernel heap space before we ever run out of available physical 2394 * memory. Most checks of the size of the heap_area compare against 2395 * tune.t_minarmem, which is the minimum available real memory that we 2396 * can have in the system. However, this is generally fixed at 25 pages 2397 * which is so low that it's useless. In this comparison, we seek to 2398 * calculate the total heap-size, and reclaim if more than 3/4ths of the 2399 * heap is allocated. 
(Or, in the calculation, if less than 1/4th is 2400 * free) 2401 / 2402* if (btop(vmem_size(heap_arena, VMEM_FREE)) < 2403 (btop(vmem_size(heap_arena, VMEM_FREE \| VMEM_ALLOC)) >> 2)) 2404 return (1); 2405#endif 2406#else /* !sun / 2407* if (kmem_used() > (kmem_size() * 3) / 4) 2408 return (1); 2409#endif /* sun / 2410* 2411#else 2412 if (spa_get_random(100) == 0) 2413 return (1); 2414#endif 2415 return (0); 2416} 2417 2418extern kmem_cache_t zio_buf_cache[]; 2419extern kmem_cache_t zio_data_buf_cache[]; 2420 2421static void 2422arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2423{ 2424 size_t i; 2425 kmem_cache_t prev_cache = NULL; 2426* kmem_cache_t prev_data_cache = NULL; 2427* 2428#ifdef _KERNEL 2429 if (arc_meta_used >= arc_meta_limit) { 2430 /* 2431 * We are exceeding our meta-data cache limit. 2432 * Purge some DNLC entries to release holds on meta-data. 2433 / 2434* dnlc_reduce_cache((void )(uintptr_t)arc_reduce_dnlc_percent); 2435* } 2436#if defined(__i386) 2437 /* 2438 * Reclaim unused memory from all kmem caches. 2439 / 2440* kmem_reap(); 2441#endif 2442#endif 2443 2444 /* 2445 * An aggressive reclamation will shrink the cache size as well as 2446 * reap free buffers from the arc kmem caches. 
2447 / 2448* if (strat == ARC_RECLAIM_AGGR) 2449 arc_shrink(); 2450 2451 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2452 if (zio_buf_cache[i] != prev_cache) { 2453 prev_cache = zio_buf_cache[i]; 2454 kmem_cache_reap_now(zio_buf_cache[i]); 2455 } 2456 if (zio_data_buf_cache[i] != prev_data_cache) { 2457 prev_data_cache = zio_data_buf_cache[i]; 2458 kmem_cache_reap_now(zio_data_buf_cache[i]); 2459 } 2460 } 2461 kmem_cache_reap_now(buf_cache); 2462 kmem_cache_reap_now(hdr_cache); 2463} 2464 2465static void 2466arc_reclaim_thread(void dummy __unused) 2467{ 2468* clock_t growtime = 0; 2469 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2470 callb_cpr_t cpr; 2471 2472 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2473 2474 mutex_enter(&arc_reclaim_thr_lock); 2475 while (arc_thread_exit == 0) { 2476 if (arc_reclaim_needed()) { 2477 2478 if (arc_no_grow) { 2479 if (last_reclaim == ARC_RECLAIM_CONS) { 2480 last_reclaim = ARC_RECLAIM_AGGR; 2481 } else { 2482 last_reclaim = ARC_RECLAIM_CONS; 2483 } 2484 } else { 2485 arc_no_grow = TRUE; 2486 last_reclaim = ARC_RECLAIM_AGGR; 2487 membar_producer(); 2488 } 2489 2490 /* reset the growth delay for every reclaim / 2491* growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2492 2493 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 2494 /* 2495 * If needfree is TRUE our vm_lowmem hook 2496 * was called and in that case we must free some 2497 * memory, so switch to aggressive mode. 
2498 / 2499* arc_no_grow = TRUE; 2500 last_reclaim = ARC_RECLAIM_AGGR; 2501 } 2502 arc_kmem_reap_now(last_reclaim); 2503 arc_warm = B_TRUE; 2504 2505 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2506 arc_no_grow = FALSE; 2507 } 2508 2509 arc_adjust(); 2510 2511 if (arc_eviction_list != NULL) 2512 arc_do_user_evicts(); 2513 2514#ifdef _KERNEL 2515 if (needfree) { 2516 needfree = 0; 2517 wakeup(&needfree); 2518 } 2519#endif 2520 2521 /* block until needed, or one second, whichever is shorter / 2522* CALLB_CPR_SAFE_BEGIN(&cpr); 2523 (void) cv_timedwait(&arc_reclaim_thr_cv, 2524 &arc_reclaim_thr_lock, hz); 2525 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2526 } 2527 2528 arc_thread_exit = 0; 2529 cv_broadcast(&arc_reclaim_thr_cv); 2530 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock / 2531* thread_exit(); 2532} 2533 2534/* 2535 * Adapt arc info given the number of bytes we are trying to add and 2536 * the state that we are comming from. This function is only called 2537 * when we are adding new content to the cache. 2538 / 2539static void 2540arc_adapt(int bytes, arc_state_t state) 2541{ 2542 int mult; 2543 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2544 2545 if (state == arc_l2c_only) 2546 return; 2547 2548 ASSERT(bytes > 0); 2549 /* 2550 * Adapt the target size of the MRU list: 2551 * - if we just hit in the MRU ghost list, then increase 2552 * the target size of the MRU list. 2553 * - if we just hit in the MFU ghost list, then increase 2554 * the target size of the MFU list by decreasing the 2555 * target size of the MRU list. 2556 / 2557* if (state == arc_mru_ghost) { 2558 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 
2559 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2560 mult = MIN(mult, 10); /* avoid wild arc_p adjustment / 2561* 2562 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2563 } else if (state == arc_mfu_ghost) { 2564 uint64_t delta; 2565 2566 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 2567 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2568 mult = MIN(mult, 10); 2569 2570 delta = MIN(bytes * mult, arc_p); 2571 arc_p = MAX(arc_p_min, arc_p - delta); 2572 } 2573 ASSERT((int64_t)arc_p >= 0); 2574 2575 if (arc_reclaim_needed()) { 2576 cv_signal(&arc_reclaim_thr_cv); 2577 return; 2578 } 2579 2580 if (arc_no_grow) 2581 return; 2582 2583 if (arc_c >= arc_c_max) 2584 return; 2585 2586 /* 2587 * If we're within (2 * maxblocksize) bytes of the target 2588 * cache size, increment the target cache size 2589 / 2590* if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2591 atomic_add_64(&arc_c, (int64_t)bytes); 2592 if (arc_c > arc_c_max) 2593 arc_c = arc_c_max; 2594 else if (state == arc_anon) 2595 atomic_add_64(&arc_p, (int64_t)bytes); 2596 if (arc_p > arc_c) 2597 arc_p = arc_c; 2598 } 2599 ASSERT((int64_t)arc_p >= 0); 2600} 2601 2602/* 2603 * Check if the cache has reached its limits and eviction is required 2604 * prior to insert. 2605 / 2606static int 2607arc_evict_needed(arc_buf_contents_t type) 2608{ 2609* if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2610 return (1); 2611 2612#ifdef sun 2613#ifdef _KERNEL 2614 /* 2615 * If zio data pages are being allocated out of a separate heap segment, 2616 * then enforce that the size of available vmem for this area remains 2617 * above about 1/32nd free. 
2618 / 2619* if (type == ARC_BUFC_DATA && zio_arena != NULL && 2620 vmem_size(zio_arena, VMEM_FREE) < 2621 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 2622 return (1); 2623#endif 2624#endif /* sun / 2625* 2626 if (arc_reclaim_needed()) 2627 return (1); 2628 2629 return (arc_size > arc_c); 2630} 2631 2632/* 2633 * The buffer, supplied as the first argument, needs a data block. 2634 * So, if we are at cache max, determine which cache should be victimized. 2635 * We have the following cases: 2636 * 2637 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2638 * In this situation if we're out of space, but the resident size of the MFU is 2639 * under the limit, victimize the MFU cache to satisfy this insertion request. 2640 * 2641 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2642 * Here, we've used up all of the available space for the MRU, so we need to 2643 * evict from our own cache instead. Evict from the set of resident MRU 2644 * entries. 2645 * 2646 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2647 * c minus p represents the MFU space in the cache, since p is the size of the 2648 * cache that is dedicated to the MRU. In this situation there's still space on 2649 * the MFU side, so the MRU side needs to be victimized. 2650 * 2651 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2652 * MFU's resident set is consuming more space than it has been allotted. In 2653 * this situation, we must victimize our own cache, the MFU, for this insertion. 2654 / 2655static void 2656arc_get_data_buf(arc_buf_t buf) 2657{ 2658 arc_state_t state = buf->b_hdr->b_state; 2659* uint64_t size = buf->b_hdr->b_size; 2660 arc_buf_contents_t type = buf->b_hdr->b_type; 2661 2662 arc_adapt(size, state); 2663 2664 /* 2665 * We have not yet reached cache maximum size, 2666 * just allocate a new buffer. 
2667 / 2668* if (!arc_evict_needed(type)) { 2669 if (type == ARC_BUFC_METADATA) { 2670 buf->b_data = zio_buf_alloc(size); 2671 arc_space_consume(size, ARC_SPACE_DATA); 2672 } else { 2673 ASSERT(type == ARC_BUFC_DATA); 2674 buf->b_data = zio_data_buf_alloc(size); 2675 ARCSTAT_INCR(arcstat_data_size, size); 2676 atomic_add_64(&arc_size, size); 2677 } 2678 goto out; 2679 } 2680 2681 /* 2682 * If we are prefetching from the mfu ghost list, this buffer 2683 * will end up on the mru list; so steal space from there. 2684 / 2685* if (state == arc_mfu_ghost) 2686 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2687 else if (state == arc_mru_ghost) 2688 state = arc_mru; 2689 2690 if (state == arc_mru \|\| state == arc_anon) { 2691 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2692 state = (arc_mfu->arcs_lsize[type] >= size && 2693 arc_p > mru_used) ? arc_mfu : arc_mru; 2694 } else { 2695 /* MFU cases / 2696* uint64_t mfu_space = arc_c - arc_p; 2697 state = (arc_mru->arcs_lsize[type] >= size && 2698 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2699 } 2700 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 2701 if (type == ARC_BUFC_METADATA) { 2702 buf->b_data = zio_buf_alloc(size); 2703 arc_space_consume(size, ARC_SPACE_DATA); 2704 } else { 2705 ASSERT(type == ARC_BUFC_DATA); 2706 buf->b_data = zio_data_buf_alloc(size); 2707 ARCSTAT_INCR(arcstat_data_size, size); 2708 atomic_add_64(&arc_size, size); 2709 } 2710 ARCSTAT_BUMP(arcstat_recycle_miss); 2711 } 2712 ASSERT(buf->b_data != NULL); 2713out: 2714 /* 2715 * Update the state size. Note that ghost states have a 2716 * "ghost size" and so don't need to be updated. 
2717 / 2718* if (!GHOST_STATE(buf->b_hdr->b_state)) { 2719 arc_buf_hdr_t hdr = buf->b_hdr; 2720* 2721 atomic_add_64(&hdr->b_state->arcs_size, size); 2722 if (list_link_active(&hdr->b_arc_node)) { 2723 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2724 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2725 } 2726 /* 2727 * If we are growing the cache, and we are adding anonymous 2728 * data, and we have outgrown arc_p, update arc_p 2729 / 2730* if (arc_size < arc_c && hdr->b_state == arc_anon && 2731 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2732 arc_p = MIN(arc_c, arc_p + size); 2733 } 2734 ARCSTAT_BUMP(arcstat_allocated); 2735} 2736 2737/* 2738 * This routine is called whenever a buffer is accessed. 2739 * NOTE: the hash lock is dropped in this function. 2740 / 2741static void 2742arc_access(arc_buf_hdr_t buf, kmutex_t hash_lock) 2743{ 2744* clock_t now; 2745 2746 ASSERT(MUTEX_HELD(hash_lock)); 2747 2748 if (buf->b_state == arc_anon) { 2749 /* 2750 * This buffer is not in the cache, and does not 2751 * appear in our "ghost" list. Add the new buffer 2752 * to the MRU state. 2753 / 2754* 2755 ASSERT(buf->b_arc_access == 0); 2756 buf->b_arc_access = ddi_get_lbolt(); 2757 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t , buf); 2758* arc_change_state(arc_mru, buf, hash_lock); 2759 2760 } else if (buf->b_state == arc_mru) { 2761 now = ddi_get_lbolt(); 2762 2763 /* 2764 * If this buffer is here because of a prefetch, then either: 2765 * - clear the flag if this is a "referencing" read 2766 * (any subsequent access will bump this into the MFU state). 2767 * or 2768 * - move the buffer to the head of the list if this is 2769 * another prefetch (to make it less likely to be evicted). 
2770 / 2771* if ((buf->b_flags & ARC_PREFETCH) != 0) { 2772 if (refcount_count(&buf->b_refcnt) == 0) { 2773 ASSERT(list_link_active(&buf->b_arc_node)); 2774 } else { 2775 buf->b_flags &= ~ARC_PREFETCH; 2776 ARCSTAT_BUMP(arcstat_mru_hits); 2777 } 2778 buf->b_arc_access = now; 2779 return; 2780 } 2781 2782 /* 2783 * This buffer has been "accessed" only once so far, 2784 * but it is still in the cache. Move it to the MFU 2785 * state. 2786 / 2787* if (now > buf->b_arc_access + ARC_MINTIME) { 2788 /* 2789 * More than 125ms have passed since we 2790 * instantiated this buffer. Move it to the 2791 * most frequently used state. 2792 / 2793* buf->b_arc_access = now; 2794 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t , buf); 2795* arc_change_state(arc_mfu, buf, hash_lock); 2796 } 2797 ARCSTAT_BUMP(arcstat_mru_hits); 2798 } else if (buf->b_state == arc_mru_ghost) { 2799 arc_state_t new_state; 2800* /* 2801 * This buffer has been "accessed" recently, but 2802 * was evicted from the cache. Move it to the 2803 * MFU state. 2804 / 2805* 2806 if (buf->b_flags & ARC_PREFETCH) { 2807 new_state = arc_mru; 2808 if (refcount_count(&buf->b_refcnt) > 0) 2809 buf->b_flags &= ~ARC_PREFETCH; 2810 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t , buf); 2811* } else { 2812 new_state = arc_mfu; 2813 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t , buf); 2814* } 2815 2816 buf->b_arc_access = ddi_get_lbolt(); 2817 arc_change_state(new_state, buf, hash_lock); 2818 2819 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2820 } else if (buf->b_state == arc_mfu) { 2821 /* 2822 * This buffer has been accessed more than once and is 2823 * still in the cache. Keep it in the MFU state. 2824 * 2825 * NOTE: an add_reference() that occurred when we did 2826 * the arc_read() will have kicked this off the list. 2827 * If it was a prefetch, we will explicitly move it to 2828 * the head of the list now. 
2829 / 2830* if ((buf->b_flags & ARC_PREFETCH) != 0) { 2831 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2832 ASSERT(list_link_active(&buf->b_arc_node)); 2833 } 2834 ARCSTAT_BUMP(arcstat_mfu_hits); 2835 buf->b_arc_access = ddi_get_lbolt(); 2836 } else if (buf->b_state == arc_mfu_ghost) { 2837 arc_state_t new_state = arc_mfu; 2838* /* 2839 * This buffer has been accessed more than once but has 2840 * been evicted from the cache. Move it back to the 2841 * MFU state. 2842 / 2843* 2844 if (buf->b_flags & ARC_PREFETCH) { 2845 /* 2846 * This is a prefetch access... 2847 * move this block back to the MRU state. 2848 / 2849* ASSERT0(refcount_count(&buf->b_refcnt)); 2850 new_state = arc_mru; 2851 } 2852 2853 buf->b_arc_access = ddi_get_lbolt(); 2854 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t , buf); 2855* arc_change_state(new_state, buf, hash_lock); 2856 2857 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2858 } else if (buf->b_state == arc_l2c_only) { 2859 /* 2860 * This buffer is on the 2nd Level ARC. 
2861 / 2862* 2863 buf->b_arc_access = ddi_get_lbolt(); 2864 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t , buf); 2865* arc_change_state(arc_mfu, buf, hash_lock); 2866 } else { 2867 ASSERT(!"invalid arc state"); 2868 } 2869} 2870 2871/* a generic arc_done_func_t which you can use / 2872/ ARGSUSED / 2873void 2874arc_bcopy_func(zio_t zio, arc_buf_t buf, void arg) 2875{ 2876 if (zio == NULL \|\| zio->io_error == 0) 2877 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2878 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2879} 2880 2881/* a generic arc_done_func_t / 2882void 2883arc_getbuf_func(zio_t zio, arc_buf_t buf, void arg) 2884{ 2885 arc_buf_t *bufp = arg; 2886* if (zio && zio->io_error) { 2887 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2888 bufp = NULL; 2889* } else { 2890 bufp = buf; 2891* ASSERT(buf->b_data); 2892 } 2893} 2894 2895static void 2896arc_read_done(zio_t zio) 2897{ 2898* arc_buf_hdr_t hdr, found; 2899 arc_buf_t buf; 2900* arc_buf_t abuf; / buffer we're assigning to callback / 2901* kmutex_t hash_lock; 2902* arc_callback_t callback_list, acb; 2903 int freeable = FALSE; 2904 2905 buf = zio->io_private; 2906 hdr = buf->b_hdr; 2907 2908 /* 2909 * The hdr was inserted into hash-table and removed from lists 2910 * prior to starting I/O. We should find this header, since 2911 * it's in the hash table, and it should be legit since it's 2912 * not possible to evict it during the I/O. The only possible 2913 * reason for it not to be found is if we were freed during the 2914 * read. 
2915 / 2916* found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 2917 &hash_lock); 2918 2919 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) \|\| 2920 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) \|\| 2921 (found == hdr && HDR_L2_READING(hdr))); 2922 2923 hdr->b_flags &= ~ARC_L2_EVICTED; 2924 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2925 hdr->b_flags &= ~ARC_L2CACHE; 2926 2927 /* byteswap if necessary / 2928* callback_list = hdr->b_acb; 2929 ASSERT(callback_list != NULL); 2930 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2931 dmu_object_byteswap_t bswap = 2932 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 2933 arc_byteswap_func_t func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2934* byteswap_uint64_array : 2935 dmu_ot_byteswap[bswap].ob_func; 2936 func(buf->b_data, hdr->b_size); 2937 } 2938 2939 arc_cksum_compute(buf, B_FALSE); 2940#ifdef illumos 2941 arc_buf_watch(buf); 2942#endif /* illumos / 2943* 2944 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2945 /* 2946 * Only call arc_access on anonymous buffers. This is because 2947 * if we've issued an I/O for an evicted buffer, we've already 2948 * called arc_access (to prevent any simultaneous readers from 2949 * getting confused). 
2950 / 2951* arc_access(hdr, hash_lock); 2952 } 2953 2954 /* create copies of the data buffer for the callers / 2955* abuf = buf; 2956 for (acb = callback_list; acb; acb = acb->acb_next) { 2957 if (acb->acb_done) { 2958 if (abuf == NULL) { 2959 ARCSTAT_BUMP(arcstat_duplicate_reads); 2960 abuf = arc_buf_clone(buf); 2961 } 2962 acb->acb_buf = abuf; 2963 abuf = NULL; 2964 } 2965 } 2966 hdr->b_acb = NULL; 2967 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2968 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2969 if (abuf == buf) { 2970 ASSERT(buf->b_efunc == NULL); 2971 ASSERT(hdr->b_datacnt == 1); 2972 hdr->b_flags \|= ARC_BUF_AVAILABLE; 2973 } 2974 2975 ASSERT(refcount_is_zero(&hdr->b_refcnt) \|\| callback_list != NULL); 2976 2977 if (zio->io_error != 0) { 2978 hdr->b_flags \|= ARC_IO_ERROR; 2979 if (hdr->b_state != arc_anon) 2980 arc_change_state(arc_anon, hdr, hash_lock); 2981 if (HDR_IN_HASH_TABLE(hdr)) 2982 buf_hash_remove(hdr); 2983 freeable = refcount_is_zero(&hdr->b_refcnt); 2984 } 2985 2986 /* 2987 * Broadcast before we drop the hash_lock to avoid the possibility 2988 * that the hdr (and hence the cv) might be freed before we get to 2989 * the cv_broadcast(). 2990 / 2991* cv_broadcast(&hdr->b_cv); 2992 2993 if (hash_lock) { 2994 mutex_exit(hash_lock); 2995 } else { 2996 /* 2997 * This block was freed while we waited for the read to 2998 * complete. It has been removed from the hash table and 2999 * moved to the anonymous state (so that it won't show up 3000 * in the cache). 
3001 / 3002* ASSERT3P(hdr->b_state, ==, arc_anon); 3003 freeable = refcount_is_zero(&hdr->b_refcnt); 3004 } 3005 3006 /* execute each callback and free its structure / 3007* while ((acb = callback_list) != NULL) { 3008 if (acb->acb_done) 3009 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3010 3011 if (acb->acb_zio_dummy != NULL) { 3012 acb->acb_zio_dummy->io_error = zio->io_error; 3013 zio_nowait(acb->acb_zio_dummy); 3014 } 3015 3016 callback_list = acb->acb_next; 3017 kmem_free(acb, sizeof (arc_callback_t)); 3018 } 3019 3020 if (freeable) 3021 arc_hdr_destroy(hdr); 3022} 3023 3024/* 3025 * "Read" the block block at the specified DVA (in bp) via the 3026 * cache. If the block is found in the cache, invoke the provided 3027 * callback immediately and return. Note that the `zio' parameter 3028 * in the callback will be NULL in this case, since no IO was 3029 * required. If the block is not in the cache pass the read request 3030 * on to the spa with a substitute callback function, so that the 3031 * requested block will be added to the cache. 3032 * 3033 * If a read request arrives for a block that has a read in-progress, 3034 * either wait for the in-progress read to complete (and return the 3035 * results); or, if this is a read with a "done" func, add a record 3036 * to the read to invoke the "done" func when the read completes, 3037 * and return; or just return. 3038 * 3039 * arc_read_done() will invoke all the requested "done" functions 3040 * for readers of this block. 3041 / 3042int 3043arc_read(zio_t pio, spa_t spa, const blkptr_t bp, arc_done_func_t done, 3044* void private, int priority, int zio_flags, uint32_t arc_flags, 3045 const zbookmark_t zb) 3046{ 3047* arc_buf_hdr_t *hdr;	1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 
7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2011 by Delphix. All rights reserved. 25 */ 26 27/* 28 * DVA-based Adjustable Replacement Cache 29 * 30 * While much of the theory of operation used here is 31 * based on the self-tuning, low overhead replacement cache 32 * presented by Megiddo and Modha at FAST 2003, there are some 33 * significant differences: 34 * 35 * 1. The Megiddo and Modha model assumes any page is evictable. 36 * Pages in its cache cannot be "locked" into memory. This makes 37 * the eviction algorithm simple: evict the last page in the list. 38 * This also makes the performance characteristics easy to reason 39 * about. Our cache is not so simple. At any given moment, some 40 * subset of the blocks in the cache are un-evictable because we 41 * have handed out a reference to them. Blocks are only evictable 42 * when there are no external references active. This makes 43 * eviction far more problematic: we choose to evict the evictable 44 * blocks that are the "lowest" in the list. 45 * 46 * There are times when it is not possible to evict the requested 47 * space. In these circumstances we are unable to adjust the cache 48 * size. 
To prevent the cache growing unbounded at these times we 49 * implement a "cache throttle" that slows the flow of new data 50 * into the cache until we can make space available. 51 * 52 * 2. The Megiddo and Modha model assumes a fixed cache size. 53 * Pages are evicted when the cache is full and there is a cache 54 * miss. Our model has a variable sized cache. It grows with 55 * high use, but also tries to react to memory pressure from the 56 * operating system: decreasing its size when system memory is 57 * tight. 58 * 59 * 3. The Megiddo and Modha model assumes a fixed page size. All 60 * elements of the cache are therefore exactly the same size. So 61 * when adjusting the cache size following a cache miss, it's simply 62 * a matter of choosing a single page to evict. In our model, we 63 * have variable sized cache blocks (ranging from 512 bytes to 64 * 128K bytes). We therefore choose a set of blocks to evict to make 65 * space for a cache miss that approximates as closely as possible 66 * the space used by the new block. 67 * 68 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 69 * by N. Megiddo & D. Modha, FAST 2003 70 */ 71 72/* 73 * The locking model: 74 * 75 * A new reference to a cache buffer can be obtained in two 76 * ways: 1) via a hash table lookup using the DVA as a key, 77 * or 2) via one of the ARC lists. The arc_read() interface 78 * uses method 1, while the internal arc algorithms for 79 * adjusting the cache use method 2. We therefore provide two 80 * types of locks: 1) the hash table lock array, and 2) the 81 * arc list locks. 82 * 83 * Buffers do not have their own mutexes, rather they rely on the 84 * hash table mutexes for the bulk of their protection (i.e. most 85 * fields in the arc_buf_hdr_t are protected by these mutexes). 86 * 87 * buf_hash_find() returns the appropriate mutex (held) when it 88 * locates the requested buffer in the hash table. It returns 89 * NULL for the mutex if the buffer was not in the table. 
90 * 91 * buf_hash_remove() expects the appropriate hash mutex to be 92 * already held before it is invoked. 93 * 94 * Each arc state also has a mutex which is used to protect the 95 * buffer list associated with the state. When attempting to 96 * obtain a hash table lock while holding an arc list lock you 97 * must use: mutex_tryenter() to avoid deadlock. Also note that 98 * the active state mutex must be held before the ghost state mutex. 99 * 100 * Arc buffers may have an associated eviction callback function. 101 * This function will be invoked prior to removing the buffer (e.g. 102 * in arc_do_user_evicts()). Note however that the data associated 103 * with the buffer may be evicted prior to the callback. The callback 104 * must be made with no locks held (to prevent deadlock). Additionally, 105 * the users of callbacks must ensure that their private data is 106 * protected from simultaneous callbacks from arc_buf_evict() 107 * and arc_do_user_evicts(). 108 * 109 * Note that the majority of the performance stats are manipulated 110 * with atomic operations. 
111 * 112 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: 113 * 114 * - L2ARC buflist creation 115 * - L2ARC buflist eviction 116 * - L2ARC write completion, which walks L2ARC buflists 117 * - ARC header destruction, as it removes from L2ARC buflists 118 * - ARC header release, as it removes from L2ARC buflists 119 / 120* 121#include <sys/spa.h> 122#include <sys/zio.h> 123#include <sys/zfs_context.h> 124#include <sys/arc.h> 125#include <sys/refcount.h> 126#include <sys/vdev.h> 127#include <sys/vdev_impl.h> 128#ifdef _KERNEL 129#include <sys/dnlc.h> 130#endif 131#include <sys/callb.h> 132#include <sys/kstat.h> 133#include <zfs_fletcher.h> 134#include <sys/sdt.h> 135 136#include <vm/vm_pageout.h> 137 138#ifdef illumos 139#ifndef _KERNEL 140/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers / 141boolean_t arc_watch = B_FALSE; 142int arc_procfd; 143#endif 144#endif / illumos / 145* 146static kmutex_t arc_reclaim_thr_lock; 147static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr / 148static uint8_t arc_thread_exit; 149* 150extern int zfs_write_limit_shift; 151extern uint64_t zfs_write_limit_max; 152extern kmutex_t zfs_write_limit_lock; 153 154#define ARC_REDUCE_DNLC_PERCENT 3 155uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 156 157typedef enum arc_reclaim_strategy { 158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy / 159* ARC_RECLAIM_CONS /* Conservative reclaim strategy / 160} arc_reclaim_strategy_t; 161* 162/* number of seconds before growing cache again / 163static int arc_grow_retry = 60; 164* 165/* shift of arc_c for calculating both min and max arc_p / 166static int arc_p_min_shift = 4; 167* 168/* log2(fraction of arc to reclaim) / 169static int arc_shrink_shift = 5; 170* 171/* 172 * minimum lifespan of a prefetch block in clock ticks 173 * (initialized in arc_init()) 174 / 175static int arc_min_prefetch_lifespan; 176* 177static int arc_dead; 178extern int zfs_prefetch_disable; 179 180/* 181 * 
The arc has filled available memory and has now warmed up. 182 / 183static boolean_t arc_warm; 184* 185/* 186 * These tunables are for performance analysis. 187 / 188uint64_t zfs_arc_max; 189uint64_t zfs_arc_min; 190uint64_t zfs_arc_meta_limit = 0; 191int zfs_arc_grow_retry = 0; 192int zfs_arc_shrink_shift = 0; 193int zfs_arc_p_min_shift = 0; 194int zfs_disable_dup_eviction = 0; 195* 196TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 197TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); 198TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 199SYSCTL_DECL(_vfs_zfs); 200SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 201 "Maximum ARC size"); 202SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 203 "Minimum ARC size"); 204 205/* 206 * Note that buffers can be in one of 6 states: 207 * ARC_anon - anonymous (discussed below) 208 * ARC_mru - recently used, currently cached 209 * ARC_mru_ghost - recentely used, no longer in cache 210 * ARC_mfu - frequently used, currently cached 211 * ARC_mfu_ghost - frequently used, no longer in cache 212 * ARC_l2c_only - exists in L2ARC but not other states 213 * When there are no active references to the buffer, they are 214 * are linked onto a list in one of these arc states. These are 215 * the only buffers that can be evicted or deleted. Within each 216 * state there are multiple lists, one for meta-data and one for 217 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 218 * etc.) is tracked separately so that it can be managed more 219 * explicitly: favored over data, limited explicitly. 220 * 221 * Anonymous buffers are buffers that are not associated with 222 * a DVA. These are buffers that hold dirty block copies 223 * before they are written to stable storage. By definition, 224 * they are "ref'd" and are considered part of arc_mru 225 * that cannot be freed. Generally, they will aquire a DVA 226 * as they are written and migrate onto the arc_mru list. 
227 * 228 * The ARC_l2c_only state is for buffers that are in the second 229 * level ARC but no longer in any of the ARC_m* lists. The second 230 * level ARC itself may also contain buffers that are in any of 231 * the ARC_m* states - meaning that a buffer can exist in two 232 * places. The reason for the ARC_l2c_only state is to keep the 233 * buffer header in the hash table, so that reads that hit the 234 * second level ARC benefit from these fast lookups. 235 / 236* 237#define ARCS_LOCK_PAD CACHE_LINE_SIZE 238struct arcs_lock { 239 kmutex_t arcs_lock; 240#ifdef _KERNEL 241 unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; 242#endif 243}; 244 245/* 246 * must be power of two for mask use to work 247 * 248 / 249#define ARC_BUFC_NUMDATALISTS 16 250#define ARC_BUFC_NUMMETADATALISTS 16 251#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) 252* 253typedef struct arc_state { 254 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data / 255* uint64_t arcs_size; /* total amount of data in this state / 256* list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers / 257* struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); 258} arc_state_t; 259 260#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) 261 262/* The 6 states: / 263static arc_state_t ARC_anon; 264static arc_state_t ARC_mru; 265static arc_state_t ARC_mru_ghost; 266static arc_state_t ARC_mfu; 267static arc_state_t ARC_mfu_ghost; 268static arc_state_t ARC_l2c_only; 269* 270typedef struct arc_stats { 271 kstat_named_t arcstat_hits; 272 kstat_named_t arcstat_misses; 273 kstat_named_t arcstat_demand_data_hits; 274 kstat_named_t arcstat_demand_data_misses; 275 kstat_named_t arcstat_demand_metadata_hits; 276 kstat_named_t arcstat_demand_metadata_misses; 277 kstat_named_t arcstat_prefetch_data_hits; 278 kstat_named_t arcstat_prefetch_data_misses; 279 kstat_named_t arcstat_prefetch_metadata_hits; 280 kstat_named_t 
arcstat_prefetch_metadata_misses; 281 kstat_named_t arcstat_mru_hits; 282 kstat_named_t arcstat_mru_ghost_hits; 283 kstat_named_t arcstat_mfu_hits; 284 kstat_named_t arcstat_mfu_ghost_hits; 285 kstat_named_t arcstat_allocated; 286 kstat_named_t arcstat_deleted; 287 kstat_named_t arcstat_stolen; 288 kstat_named_t arcstat_recycle_miss; 289 kstat_named_t arcstat_mutex_miss; 290 kstat_named_t arcstat_evict_skip; 291 kstat_named_t arcstat_evict_l2_cached; 292 kstat_named_t arcstat_evict_l2_eligible; 293 kstat_named_t arcstat_evict_l2_ineligible; 294 kstat_named_t arcstat_hash_elements; 295 kstat_named_t arcstat_hash_elements_max; 296 kstat_named_t arcstat_hash_collisions; 297 kstat_named_t arcstat_hash_chains; 298 kstat_named_t arcstat_hash_chain_max; 299 kstat_named_t arcstat_p; 300 kstat_named_t arcstat_c; 301 kstat_named_t arcstat_c_min; 302 kstat_named_t arcstat_c_max; 303 kstat_named_t arcstat_size; 304 kstat_named_t arcstat_hdr_size; 305 kstat_named_t arcstat_data_size; 306 kstat_named_t arcstat_other_size; 307 kstat_named_t arcstat_l2_hits; 308 kstat_named_t arcstat_l2_misses; 309 kstat_named_t arcstat_l2_feeds; 310 kstat_named_t arcstat_l2_rw_clash; 311 kstat_named_t arcstat_l2_read_bytes; 312 kstat_named_t arcstat_l2_write_bytes; 313 kstat_named_t arcstat_l2_writes_sent; 314 kstat_named_t arcstat_l2_writes_done; 315 kstat_named_t arcstat_l2_writes_error; 316 kstat_named_t arcstat_l2_writes_hdr_miss; 317 kstat_named_t arcstat_l2_evict_lock_retry; 318 kstat_named_t arcstat_l2_evict_reading; 319 kstat_named_t arcstat_l2_free_on_write; 320 kstat_named_t arcstat_l2_abort_lowmem; 321 kstat_named_t arcstat_l2_cksum_bad; 322 kstat_named_t arcstat_l2_io_error; 323 kstat_named_t arcstat_l2_size; 324 kstat_named_t arcstat_l2_hdr_size; 325 kstat_named_t arcstat_l2_write_trylock_fail; 326 kstat_named_t arcstat_l2_write_passed_headroom; 327 kstat_named_t arcstat_l2_write_spa_mismatch; 328 kstat_named_t arcstat_l2_write_in_l2; 329 kstat_named_t 
arcstat_l2_write_hdr_io_in_progress; 330 kstat_named_t arcstat_l2_write_not_cacheable; 331 kstat_named_t arcstat_l2_write_full; 332 kstat_named_t arcstat_l2_write_buffer_iter; 333 kstat_named_t arcstat_l2_write_pios; 334 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 335 kstat_named_t arcstat_l2_write_buffer_list_iter; 336 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 337 kstat_named_t arcstat_memory_throttle_count; 338 kstat_named_t arcstat_duplicate_buffers; 339 kstat_named_t arcstat_duplicate_buffers_size; 340 kstat_named_t arcstat_duplicate_reads; 341} arc_stats_t; 342 343static arc_stats_t arc_stats = { 344 { "hits", KSTAT_DATA_UINT64 }, 345 { "misses", KSTAT_DATA_UINT64 }, 346 { "demand_data_hits", KSTAT_DATA_UINT64 }, 347 { "demand_data_misses", KSTAT_DATA_UINT64 }, 348 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 349 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 350 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 351 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 352 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 353 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 354 { "mru_hits", KSTAT_DATA_UINT64 }, 355 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 356 { "mfu_hits", KSTAT_DATA_UINT64 }, 357 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 358 { "allocated", KSTAT_DATA_UINT64 }, 359 { "deleted", KSTAT_DATA_UINT64 }, 360 { "stolen", KSTAT_DATA_UINT64 }, 361 { "recycle_miss", KSTAT_DATA_UINT64 }, 362 { "mutex_miss", KSTAT_DATA_UINT64 }, 363 { "evict_skip", KSTAT_DATA_UINT64 }, 364 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 365 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 366 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 367 { "hash_elements", KSTAT_DATA_UINT64 }, 368 { "hash_elements_max", KSTAT_DATA_UINT64 }, 369 { "hash_collisions", KSTAT_DATA_UINT64 }, 370 { "hash_chains", KSTAT_DATA_UINT64 }, 371 { "hash_chain_max", KSTAT_DATA_UINT64 }, 372 { "p", KSTAT_DATA_UINT64 }, 373 { "c", KSTAT_DATA_UINT64 }, 374 { "c_min", KSTAT_DATA_UINT64 }, 375 { 
"c_max", KSTAT_DATA_UINT64 }, 376 { "size", KSTAT_DATA_UINT64 }, 377 { "hdr_size", KSTAT_DATA_UINT64 }, 378 { "data_size", KSTAT_DATA_UINT64 }, 379 { "other_size", KSTAT_DATA_UINT64 }, 380 { "l2_hits", KSTAT_DATA_UINT64 }, 381 { "l2_misses", KSTAT_DATA_UINT64 }, 382 { "l2_feeds", KSTAT_DATA_UINT64 }, 383 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 384 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 385 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 386 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 387 { "l2_writes_done", KSTAT_DATA_UINT64 }, 388 { "l2_writes_error", KSTAT_DATA_UINT64 }, 389 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 390 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 391 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 392 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 393 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 394 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 395 { "l2_io_error", KSTAT_DATA_UINT64 }, 396 { "l2_size", KSTAT_DATA_UINT64 }, 397 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 398 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 399 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 400 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 401 { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 402 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 403 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 404 { "l2_write_full", KSTAT_DATA_UINT64 }, 405 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 406 { "l2_write_pios", KSTAT_DATA_UINT64 }, 407 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 408 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 409 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 410 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 411 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 412 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 413 { "duplicate_reads", KSTAT_DATA_UINT64 } 414}; 415 416#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 417 418#define ARCSTAT_INCR(stat, val) \ 419 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 420 421#define ARCSTAT_BUMP(stat) 
ARCSTAT_INCR(stat, 1) 422#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 423 424#define ARCSTAT_MAX(stat, val) { \ 425 uint64_t m; \ 426 while ((val) > (m = arc_stats.stat.value.ui64) && \ 427 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 428 continue; \ 429} 430 431#define ARCSTAT_MAXSTAT(stat) \ 432 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 433 434/* 435 * We define a macro to allow ARC hits/misses to be easily broken down by 436 * two separate conditions, giving a total of four different subtypes for 437 * each of hits and misses (so eight statistics total). 438 / 439#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 440* if (cond1) { \ 441 if (cond2) { \ 442 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 443 } else { \ 444 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 445 } \ 446 } else { \ 447 if (cond2) { \ 448 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 449 } else { \ 450 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 451 } \ 452 } 453 454kstat_t arc_ksp; 455static arc_state_t arc_anon; 456static arc_state_t arc_mru; 457static arc_state_t arc_mru_ghost; 458static arc_state_t arc_mfu; 459static arc_state_t arc_mfu_ghost; 460static arc_state_t arc_l2c_only; 461* 462/* 463 * There are several ARC variables that are critical to export as kstats -- 464 * but we don't want to have to grovel around in the kstat whenever we wish to 465 * manipulate them. For these variables, we therefore define them to be in 466 * terms of the statistic variable. This assures that we are not introducing 467 * the possibility of inconsistency by having shadow copies of the variables, 468 * while still allowing the code to be readable. 
469 / 470#define arc_size ARCSTAT(arcstat_size) / actual total arc size / 471#define arc_p ARCSTAT(arcstat_p) / target size of MRU / 472#define arc_c ARCSTAT(arcstat_c) / target size of cache / 473#define arc_c_min ARCSTAT(arcstat_c_min) / min target cache size / 474#define arc_c_max ARCSTAT(arcstat_c_max) / max target cache size / 475* 476static int arc_no_grow; /* Don't try to grow cache size / 477static uint64_t arc_tempreserve; 478static uint64_t arc_loaned_bytes; 479static uint64_t arc_meta_used; 480static uint64_t arc_meta_limit; 481static uint64_t arc_meta_max = 0; 482SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0, 483* "ARC metadata used"); 484SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0, 485 "ARC metadata limit"); 486 487typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 488 489typedef struct arc_callback arc_callback_t; 490 491struct arc_callback { 492 void acb_private; 493* arc_done_func_t acb_done; 494* arc_buf_t acb_buf; 495* zio_t acb_zio_dummy; 496* arc_callback_t acb_next; 497}; 498* 499typedef struct arc_write_callback arc_write_callback_t; 500 501struct arc_write_callback { 502 void awcb_private; 503* arc_done_func_t awcb_ready; 504* arc_done_func_t awcb_done; 505* arc_buf_t awcb_buf; 506}; 507* 508struct arc_buf_hdr { 509 /* protected by hash lock / 510* dva_t b_dva; 511 uint64_t b_birth; 512 uint64_t b_cksum0; 513 514 kmutex_t b_freeze_lock; 515 zio_cksum_t b_freeze_cksum; 516* void b_thawed; 517* 518 arc_buf_hdr_t b_hash_next; 519* arc_buf_t b_buf; 520* uint32_t b_flags; 521 uint32_t b_datacnt; 522 523 arc_callback_t b_acb; 524* kcondvar_t b_cv; 525 526 /* immutable / 527* arc_buf_contents_t b_type; 528 uint64_t b_size; 529 uint64_t b_spa; 530 531 /* protected by arc state mutex / 532* arc_state_t b_state; 533* list_node_t b_arc_node; 534 535 /* updated atomically / 536* clock_t b_arc_access; 537 538 /* self protecting / 539* refcount_t b_refcnt; 540 541 l2arc_buf_hdr_t b_l2hdr; 542* 
list_node_t b_l2node; 543}; 544 545static arc_buf_t arc_eviction_list; 546static kmutex_t arc_eviction_mtx; 547static arc_buf_hdr_t arc_eviction_hdr; 548static void arc_get_data_buf(arc_buf_t buf); 549static void arc_access(arc_buf_hdr_t buf, kmutex_t hash_lock); 550static int arc_evict_needed(arc_buf_contents_t type); 551static void arc_evict_ghost(arc_state_t state, uint64_t spa, int64_t bytes); 552#ifdef illumos 553static void arc_buf_watch(arc_buf_t buf); 554#endif /* illumos / 555* 556static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t ab); 557* 558#define GHOST_STATE(state) \ 559 ((state) == arc_mru_ghost \|\| (state) == arc_mfu_ghost \|\| \ 560 (state) == arc_l2c_only) 561 562/* 563 * Private ARC flags. These flags are private ARC only flags that will show up 564 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 565 * be passed in as arc_flags in things like arc_read. However, these flags 566 * should never be passed and should only be set by ARC code. When adding new 567 * public flags, make sure not to smash the private ones. 
568 / 569* 570#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed / 571#define ARC_IO_IN_PROGRESS (1 << 10) / I/O in progress for buf / 572#define ARC_IO_ERROR (1 << 11) / I/O failed for buf / 573#define ARC_FREED_IN_READ (1 << 12) / buf freed while in read / 574#define ARC_BUF_AVAILABLE (1 << 13) / block not in active use / 575#define ARC_INDIRECT (1 << 14) / this is an indirect block / 576#define ARC_FREE_IN_PROGRESS (1 << 15) / hdr about to be freed / 577#define ARC_L2_WRITING (1 << 16) / L2ARC write in progress / 578#define ARC_L2_EVICTED (1 << 17) / evicted during I/O / 579#define ARC_L2_WRITE_HEAD (1 << 18) / head of write list / 580* 581#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 582#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 583#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 584#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 585#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 586#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 587#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 588#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 589#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 590 (hdr)->b_l2hdr != NULL) 591#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 592#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 593#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 594 595/* 596 * Other sizes 597 / 598* 599#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 600#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 601 602/* 603 * Hash table routines 604 / 605* 606#define HT_LOCK_PAD CACHE_LINE_SIZE 607 608struct ht_lock { 609 kmutex_t ht_lock; 610#ifdef _KERNEL 611 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 612#endif 613}; 614 615#define BUF_LOCKS 256 616typedef struct buf_hash_table { 617 uint64_t ht_mask; 618 arc_buf_hdr_t *ht_table; 619* 
struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 620} buf_hash_table_t; 621 622static buf_hash_table_t buf_hash_table; 623 624#define BUF_HASH_INDEX(spa, dva, birth) \ 625 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 626#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 627#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 628#define HDR_LOCK(hdr) \ 629 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 630 631uint64_t zfs_crc64_table[256]; 632 633/* 634 * Level 2 ARC 635 / 636* 637#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max / 638#define L2ARC_HEADROOM 2 / num of writes / 639#define L2ARC_FEED_SECS 1 / caching interval secs / 640#define L2ARC_FEED_MIN_MS 200 / min caching interval ms / 641* 642#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 643#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 644 645/* 646 * L2ARC Performance Tunables 647 / 648uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; / default max write size / 649uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; / extra write during warmup / 650uint64_t l2arc_headroom = L2ARC_HEADROOM; / number of dev writes / 651uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; / interval seconds / 652uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; / min interval milliseconds / 653boolean_t l2arc_noprefetch = B_TRUE; / don't cache prefetch bufs / 654boolean_t l2arc_feed_again = B_TRUE; / turbo warmup / 655boolean_t l2arc_norw = B_TRUE; / no reads during writes / 656* 657SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 658 &l2arc_write_max, 0, "max write size"); 659SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 660 &l2arc_write_boost, 0, "extra write during warmup"); 661SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 662 &l2arc_headroom, 0, "number of dev writes"); 663SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 664 &l2arc_feed_secs, 0, "interval seconds"); 
665SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 666 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 667 668SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 669 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 670SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 671 &l2arc_feed_again, 0, "turbo warmup"); 672SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 673 &l2arc_norw, 0, "no reads during writes"); 674 675SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 676 &ARC_anon.arcs_size, 0, "size of anonymous state"); 677SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 678 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 679SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 680 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 681 682SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 683 &ARC_mru.arcs_size, 0, "size of mru state"); 684SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 685 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 686SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 687 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 688 689SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 690 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 691SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 692 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 693 "size of metadata in mru ghost state"); 694SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 695 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 696 "size of data in mru ghost state"); 697 698SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 699 &ARC_mfu.arcs_size, 0, "size of mfu state"); 700SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 701 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 
702SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 703 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 704 705SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 706 &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 707SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 708 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 709 "size of metadata in mfu ghost state"); 710SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 711 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 712 "size of data in mfu ghost state"); 713 714SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 715 &ARC_l2c_only.arcs_size, 0, "size of mru state"); 716 717/* 718 * L2ARC Internals 719 / 720typedef struct l2arc_dev { 721* vdev_t l2ad_vdev; / vdev / 722* spa_t l2ad_spa; / spa / 723* uint64_t l2ad_hand; /* next write location / 724* uint64_t l2ad_write; /* desired write size, bytes / 725* uint64_t l2ad_boost; /* warmup write boost, bytes / 726* uint64_t l2ad_start; /* first addr on device / 727* uint64_t l2ad_end; /* last addr on device / 728* uint64_t l2ad_evict; /* last addr eviction reached / 729* boolean_t l2ad_first; /* first sweep through / 730* boolean_t l2ad_writing; /* currently writing / 731* list_t l2ad_buflist; / buffer list / 732* list_node_t l2ad_node; /* device list node / 733} l2arc_dev_t; 734* 735static list_t L2ARC_dev_list; /* device list / 736static list_t l2arc_dev_list; /* device list pointer / 737static kmutex_t l2arc_dev_mtx; / device list mutex / 738static l2arc_dev_t l2arc_dev_last; /* last device used / 739static kmutex_t l2arc_buflist_mtx; / mutex for all buflists / 740static list_t L2ARC_free_on_write; / free after write buf list / 741static list_t l2arc_free_on_write; /* free after write list ptr / 742static kmutex_t l2arc_free_on_write_mtx; / mutex for list / 743static uint64_t l2arc_ndev; / number of devices / 744* 745typedef struct l2arc_read_callback { 746 arc_buf_t l2rcb_buf; 
/ read buffer / 747* spa_t l2rcb_spa; / spa / 748* blkptr_t l2rcb_bp; /* original blkptr / 749* zbookmark_t l2rcb_zb; /* original bookmark / 750* int l2rcb_flags; /* original flags / 751} l2arc_read_callback_t; 752* 753typedef struct l2arc_write_callback { 754 l2arc_dev_t l2wcb_dev; / device info / 755* arc_buf_hdr_t l2wcb_head; / head of write buflist / 756} l2arc_write_callback_t; 757* 758struct l2arc_buf_hdr { 759 /* protected by arc_buf_hdr mutex / 760* l2arc_dev_t b_dev; / L2ARC device / 761* uint64_t b_daddr; /* disk address, offset byte / 762}; 763* 764typedef struct l2arc_data_free { 765 /* protected by l2arc_free_on_write_mtx / 766* void l2df_data; 767* size_t l2df_size; 768 void (l2df_func)(void , size_t); 769 list_node_t l2df_list_node; 770} l2arc_data_free_t; 771 772static kmutex_t l2arc_feed_thr_lock; 773static kcondvar_t l2arc_feed_thr_cv; 774static uint8_t l2arc_thread_exit; 775 776static void l2arc_read_done(zio_t zio); 777static void l2arc_hdr_stat_add(void); 778static void l2arc_hdr_stat_remove(void); 779* 780static uint64_t 781buf_hash(uint64_t spa, const dva_t dva, uint64_t birth) 782{ 783* uint8_t vdva = (uint8_t )dva; 784 uint64_t crc = -1ULL; 785 int i; 786 787 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 788 789 for (i = 0; i < sizeof (dva_t); i++) 790 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 791 792 crc ^= (spa>>8) ^ birth; 793 794 return (crc); 795} 796 797#define BUF_EMPTY(buf) \ 798 ((buf)->b_dva.dva_word[0] == 0 && \ 799 (buf)->b_dva.dva_word[1] == 0 && \ 800 (buf)->b_birth == 0) 801 802#define BUF_EQUAL(spa, dva, birth, buf) \ 803 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 804 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 805 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 806 807static void 808buf_discard_identity(arc_buf_hdr_t hdr) 809{ 810* hdr->b_dva.dva_word[0] = 0; 811 hdr->b_dva.dva_word[1] = 0; 812 hdr->b_birth = 0; 813 hdr->b_cksum0 = 0; 814} 815 816static arc_buf_hdr_t * 
817buf_hash_find(uint64_t spa, const dva_t dva, uint64_t birth, kmutex_t lockp) 818{ 819* uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 820 kmutex_t hash_lock = BUF_HASH_LOCK(idx); 821* arc_buf_hdr_t buf; 822* 823 mutex_enter(hash_lock); 824 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 825 buf = buf->b_hash_next) { 826 if (BUF_EQUAL(spa, dva, birth, buf)) { 827 lockp = hash_lock; 828* return (buf); 829 } 830 } 831 mutex_exit(hash_lock); 832 lockp = NULL; 833* return (NULL); 834} 835 836/* 837 * Insert an entry into the hash table. If there is already an element 838 * equal to elem in the hash table, then the already existing element 839 * will be returned and the new element will not be inserted. 840 * Otherwise returns NULL. 841 / 842static arc_buf_hdr_t 843buf_hash_insert(arc_buf_hdr_t buf, kmutex_t lockp) 844{ 845* uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 846 kmutex_t hash_lock = BUF_HASH_LOCK(idx); 847* arc_buf_hdr_t fbuf; 848* uint32_t i; 849 850 ASSERT(!HDR_IN_HASH_TABLE(buf)); 851 lockp = hash_lock; 852* mutex_enter(hash_lock); 853 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 854 fbuf = fbuf->b_hash_next, i++) { 855 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 856 return (fbuf); 857 } 858 859 buf->b_hash_next = buf_hash_table.ht_table[idx]; 860 buf_hash_table.ht_table[idx] = buf; 861 buf->b_flags \|= ARC_IN_HASH_TABLE; 862 863 /* collect some hash table performance data / 864* if (i > 0) { 865 ARCSTAT_BUMP(arcstat_hash_collisions); 866 if (i == 1) 867 ARCSTAT_BUMP(arcstat_hash_chains); 868 869 ARCSTAT_MAX(arcstat_hash_chain_max, i); 870 } 871 872 ARCSTAT_BUMP(arcstat_hash_elements); 873 ARCSTAT_MAXSTAT(arcstat_hash_elements); 874 875 return (NULL); 876} 877 878static void 879buf_hash_remove(arc_buf_hdr_t buf) 880{ 881* arc_buf_hdr_t fbuf, bufp; 882* uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 883 884 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 885 
ASSERT(HDR_IN_HASH_TABLE(buf)); 886 887 bufp = &buf_hash_table.ht_table[idx]; 888 while ((fbuf = bufp) != buf) { 889* ASSERT(fbuf != NULL); 890 bufp = &fbuf->b_hash_next; 891 } 892 bufp = buf->b_hash_next; 893* buf->b_hash_next = NULL; 894 buf->b_flags &= ~ARC_IN_HASH_TABLE; 895 896 /* collect some hash table performance data / 897* ARCSTAT_BUMPDOWN(arcstat_hash_elements); 898 899 if (buf_hash_table.ht_table[idx] && 900 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 901 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 902} 903 904/* 905 * Global data structures and functions for the buf kmem cache. 906 / 907static kmem_cache_t hdr_cache; 908static kmem_cache_t buf_cache; 909* 910static void 911buf_fini(void) 912{ 913 int i; 914 915 kmem_free(buf_hash_table.ht_table, 916 (buf_hash_table.ht_mask + 1) * sizeof (void )); 917* for (i = 0; i < BUF_LOCKS; i++) 918 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 919 kmem_cache_destroy(hdr_cache); 920 kmem_cache_destroy(buf_cache); 921} 922 923/* 924 * Constructor callback - called when the cache is empty 925 * and a new buf is requested. 926 / 927/ ARGSUSED / 928static int 929hdr_cons(void vbuf, void unused, int kmflag) 930{ 931* arc_buf_hdr_t buf = vbuf; 932* 933 bzero(buf, sizeof (arc_buf_hdr_t)); 934 refcount_create(&buf->b_refcnt); 935 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 936 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 937 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 938 939 return (0); 940} 941 942/* ARGSUSED / 943static int 944buf_cons(void vbuf, void unused, int kmflag) 945{ 946* arc_buf_t buf = vbuf; 947* 948 bzero(buf, sizeof (arc_buf_t)); 949 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 950 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 951 952 return (0); 953} 954 955/* 956 * Destructor callback - called when a cached buf is 957 * no longer required. 
958 / 959/ ARGSUSED / 960static void 961hdr_dest(void vbuf, void unused) 962{ 963* arc_buf_hdr_t buf = vbuf; 964* 965 ASSERT(BUF_EMPTY(buf)); 966 refcount_destroy(&buf->b_refcnt); 967 cv_destroy(&buf->b_cv); 968 mutex_destroy(&buf->b_freeze_lock); 969 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 970} 971 972/* ARGSUSED / 973static void 974buf_dest(void vbuf, void unused) 975{ 976* arc_buf_t buf = vbuf; 977* 978 mutex_destroy(&buf->b_evict_lock); 979 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 980} 981 982/* 983 * Reclaim callback -- invoked when memory is low. 984 / 985/ ARGSUSED / 986static void 987hdr_recl(void unused) 988{ 989 dprintf("hdr_recl called\n"); 990 /* 991 * umem calls the reclaim func when we destroy the buf cache, 992 * which is after we do arc_fini(). 993 / 994* if (!arc_dead) 995 cv_signal(&arc_reclaim_thr_cv); 996} 997 998static void 999buf_init(void) 1000{ 1001 uint64_t ct; 1002* uint64_t hsize = 1ULL << 12; 1003 int i, j; 1004 1005 /* 1006 * The hash table is big enough to fill all of physical memory 1007 * with an average 64K block size. The table will take up 1008 * totalmemsizeof(void)/64K (eg. 128KB/GB with 8-byte pointers). 
1009 / 1010* while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) 1011 hsize <<= 1; 1012retry: 1013 buf_hash_table.ht_mask = hsize - 1; 1014 buf_hash_table.ht_table = 1015 kmem_zalloc(hsize * sizeof (void), KM_NOSLEEP); 1016* if (buf_hash_table.ht_table == NULL) { 1017 ASSERT(hsize > (1ULL << 8)); 1018 hsize >>= 1; 1019 goto retry; 1020 } 1021 1022 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 1023 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 1024 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1025 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1026 1027 for (i = 0; i < 256; i++) 1028 for (ct = zfs_crc64_table + i, ct = i, j = 8; j > 0; j--) 1029* ct = (ct >> 1) ^ (-(ct & 1) & ZFS_CRC64_POLY); 1030* 1031 for (i = 0; i < BUF_LOCKS; i++) { 1032 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1033 NULL, MUTEX_DEFAULT, NULL); 1034 } 1035} 1036 1037#define ARC_MINTIME (hz>>4) /* 62 ms / 1038* 1039static void 1040arc_cksum_verify(arc_buf_t buf) 1041{ 1042* zio_cksum_t zc; 1043 1044 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1045 return; 1046 1047 mutex_enter(&buf->b_hdr->b_freeze_lock); 1048 if (buf->b_hdr->b_freeze_cksum == NULL \|\| 1049 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 1050 mutex_exit(&buf->b_hdr->b_freeze_lock); 1051 return; 1052 } 1053 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1054 if (!ZIO_CHECKSUM_EQUAL(buf->b_hdr->b_freeze_cksum, zc)) 1055* panic("buffer modified while frozen!"); 1056 mutex_exit(&buf->b_hdr->b_freeze_lock); 1057} 1058 1059static int 1060arc_cksum_equal(arc_buf_t buf) 1061{ 1062* zio_cksum_t zc; 1063 int equal; 1064 1065 mutex_enter(&buf->b_hdr->b_freeze_lock); 1066 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1067 equal = ZIO_CHECKSUM_EQUAL(buf->b_hdr->b_freeze_cksum, zc); 1068* mutex_exit(&buf->b_hdr->b_freeze_lock); 1069 1070 return (equal); 1071} 1072 1073static void 1074arc_cksum_compute(arc_buf_t buf, boolean_t force) 1075{ 1076* if (!force && !(zfs_flags & 
ZFS_DEBUG_MODIFY)) 1077 return; 1078 1079 mutex_enter(&buf->b_hdr->b_freeze_lock); 1080 if (buf->b_hdr->b_freeze_cksum != NULL) { 1081 mutex_exit(&buf->b_hdr->b_freeze_lock); 1082 return; 1083 } 1084 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1085 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1086 buf->b_hdr->b_freeze_cksum); 1087 mutex_exit(&buf->b_hdr->b_freeze_lock); 1088#ifdef illumos 1089 arc_buf_watch(buf); 1090#endif /* illumos / 1091} 1092* 1093#ifdef illumos 1094#ifndef _KERNEL 1095typedef struct procctl { 1096 long cmd; 1097 prwatch_t prwatch; 1098} procctl_t; 1099#endif 1100 1101/* ARGSUSED / 1102static void 1103arc_buf_unwatch(arc_buf_t buf) 1104{ 1105#ifndef _KERNEL 1106 if (arc_watch) { 1107 int result; 1108 procctl_t ctl; 1109 ctl.cmd = PCWATCH; 1110 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1111 ctl.prwatch.pr_size = 0; 1112 ctl.prwatch.pr_wflags = 0; 1113 result = write(arc_procfd, &ctl, sizeof (ctl)); 1114 ASSERT3U(result, ==, sizeof (ctl)); 1115 } 1116#endif 1117} 1118 1119/* ARGSUSED / 1120static void 1121arc_buf_watch(arc_buf_t buf) 1122{ 1123#ifndef _KERNEL 1124 if (arc_watch) { 1125 int result; 1126 procctl_t ctl; 1127 ctl.cmd = PCWATCH; 1128 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1129 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1130 ctl.prwatch.pr_wflags = WA_WRITE; 1131 result = write(arc_procfd, &ctl, sizeof (ctl)); 1132 ASSERT3U(result, ==, sizeof (ctl)); 1133 } 1134#endif 1135} 1136#endif /* illumos / 1137* 1138void 1139arc_buf_thaw(arc_buf_t buf) 1140{ 1141* if (zfs_flags & ZFS_DEBUG_MODIFY) { 1142 if (buf->b_hdr->b_state != arc_anon) 1143 panic("modifying non-anon buffer!"); 1144 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1145 panic("modifying buffer while i/o in progress!"); 1146 arc_cksum_verify(buf); 1147 } 1148 1149 mutex_enter(&buf->b_hdr->b_freeze_lock); 1150 if (buf->b_hdr->b_freeze_cksum != NULL) { 1151 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1152 
buf->b_hdr->b_freeze_cksum = NULL; 1153 } 1154 1155 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1156 if (buf->b_hdr->b_thawed) 1157 kmem_free(buf->b_hdr->b_thawed, 1); 1158 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 1159 } 1160 1161 mutex_exit(&buf->b_hdr->b_freeze_lock); 1162 1163#ifdef illumos 1164 arc_buf_unwatch(buf); 1165#endif /* illumos / 1166} 1167* 1168void 1169arc_buf_freeze(arc_buf_t buf) 1170{ 1171* kmutex_t hash_lock; 1172* 1173 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1174 return; 1175 1176 hash_lock = HDR_LOCK(buf->b_hdr); 1177 mutex_enter(hash_lock); 1178 1179 ASSERT(buf->b_hdr->b_freeze_cksum != NULL \|\| 1180 buf->b_hdr->b_state == arc_anon); 1181 arc_cksum_compute(buf, B_FALSE); 1182 mutex_exit(hash_lock); 1183 1184} 1185 1186static void 1187get_buf_info(arc_buf_hdr_t ab, arc_state_t state, list_t list, kmutex_t lock) 1188{ 1189 uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); 1190 1191 if (ab->b_type == ARC_BUFC_METADATA) 1192 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1193 else { 1194 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1195 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1196 } 1197 1198 list = &state->arcs_lists[buf_hashid]; 1199* lock = ARCS_LOCK(state, buf_hashid); 1200} 1201* 1202 1203static void 1204add_reference(arc_buf_hdr_t ab, kmutex_t hash_lock, void tag) 1205{ 1206* ASSERT(MUTEX_HELD(hash_lock)); 1207 1208 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1209 (ab->b_state != arc_anon)) { 1210 uint64_t delta = ab->b_size * ab->b_datacnt; 1211 uint64_t size = &ab->b_state->arcs_lsize[ab->b_type]; 1212* list_t list; 1213* kmutex_t lock; 1214* 1215 get_buf_info(ab, ab->b_state, &list, &lock); 1216 ASSERT(!MUTEX_HELD(lock)); 1217 mutex_enter(lock); 1218 ASSERT(list_link_active(&ab->b_arc_node)); 1219 list_remove(list, ab); 1220 if (GHOST_STATE(ab->b_state)) { 1221 ASSERT0(ab->b_datacnt); 1222 ASSERT3P(ab->b_buf, ==, NULL); 1223 delta = ab->b_size; 1224 } 1225 ASSERT(delta > 0); 1226 ASSERT3U(size, >=, delta); 1227* 
atomic_add_64(size, -delta); 1228 mutex_exit(lock); 1229 /* remove the prefetch flag if we get a reference / 1230* if (ab->b_flags & ARC_PREFETCH) 1231 ab->b_flags &= ~ARC_PREFETCH; 1232 } 1233} 1234 1235static int 1236remove_reference(arc_buf_hdr_t ab, kmutex_t hash_lock, void tag) 1237{ 1238* int cnt; 1239 arc_state_t state = ab->b_state; 1240* 1241 ASSERT(state == arc_anon \|\| MUTEX_HELD(hash_lock)); 1242 ASSERT(!GHOST_STATE(state)); 1243 1244 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1245 (state != arc_anon)) { 1246 uint64_t size = &state->arcs_lsize[ab->b_type]; 1247* list_t list; 1248* kmutex_t lock; 1249* 1250 get_buf_info(ab, state, &list, &lock); 1251 ASSERT(!MUTEX_HELD(lock)); 1252 mutex_enter(lock); 1253 ASSERT(!list_link_active(&ab->b_arc_node)); 1254 list_insert_head(list, ab); 1255 ASSERT(ab->b_datacnt > 0); 1256 atomic_add_64(size, ab->b_size * ab->b_datacnt); 1257 mutex_exit(lock); 1258 } 1259 return (cnt); 1260} 1261 1262/* 1263 * Move the supplied buffer to the indicated state. The mutex 1264 * for the buffer must be held by the caller. 1265 / 1266static void 1267arc_change_state(arc_state_t new_state, arc_buf_hdr_t ab, kmutex_t hash_lock) 1268{ 1269 arc_state_t old_state = ab->b_state; 1270* int64_t refcnt = refcount_count(&ab->b_refcnt); 1271 uint64_t from_delta, to_delta; 1272 list_t list; 1273* kmutex_t lock; 1274* 1275 ASSERT(MUTEX_HELD(hash_lock)); 1276 ASSERT(new_state != old_state); 1277 ASSERT(refcnt == 0 \|\| ab->b_datacnt > 0); 1278 ASSERT(ab->b_datacnt == 0 \|\| !GHOST_STATE(new_state)); 1279 ASSERT(ab->b_datacnt <= 1 \|\| old_state != arc_anon); 1280 1281 from_delta = to_delta = ab->b_datacnt * ab->b_size; 1282 1283 /* 1284 * If this buffer is evictable, transfer it from the 1285 * old state list to the new state list. 
1286 / 1287* if (refcnt == 0) { 1288 if (old_state != arc_anon) { 1289 int use_mutex; 1290 uint64_t size = &old_state->arcs_lsize[ab->b_type]; 1291* 1292 get_buf_info(ab, old_state, &list, &lock); 1293 use_mutex = !MUTEX_HELD(lock); 1294 if (use_mutex) 1295 mutex_enter(lock); 1296 1297 ASSERT(list_link_active(&ab->b_arc_node)); 1298 list_remove(list, ab); 1299 1300 /* 1301 * If prefetching out of the ghost cache, 1302 * we will have a non-zero datacnt. 1303 / 1304* if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1305 /* ghost elements have a ghost size / 1306* ASSERT(ab->b_buf == NULL); 1307 from_delta = ab->b_size; 1308 } 1309 ASSERT3U(size, >=, from_delta); 1310* atomic_add_64(size, -from_delta); 1311 1312 if (use_mutex) 1313 mutex_exit(lock); 1314 } 1315 if (new_state != arc_anon) { 1316 int use_mutex; 1317 uint64_t size = &new_state->arcs_lsize[ab->b_type]; 1318* 1319 get_buf_info(ab, new_state, &list, &lock); 1320 use_mutex = !MUTEX_HELD(lock); 1321 if (use_mutex) 1322 mutex_enter(lock); 1323 1324 list_insert_head(list, ab); 1325 1326 /* ghost elements have a ghost size / 1327* if (GHOST_STATE(new_state)) { 1328 ASSERT(ab->b_datacnt == 0); 1329 ASSERT(ab->b_buf == NULL); 1330 to_delta = ab->b_size; 1331 } 1332 atomic_add_64(size, to_delta); 1333 1334 if (use_mutex) 1335 mutex_exit(lock); 1336 } 1337 } 1338 1339 ASSERT(!BUF_EMPTY(ab)); 1340 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 1341 buf_hash_remove(ab); 1342 1343 /* adjust state sizes / 1344* if (to_delta) 1345 atomic_add_64(&new_state->arcs_size, to_delta); 1346 if (from_delta) { 1347 ASSERT3U(old_state->arcs_size, >=, from_delta); 1348 atomic_add_64(&old_state->arcs_size, -from_delta); 1349 } 1350 ab->b_state = new_state; 1351 1352 /* adjust l2arc hdr stats / 1353* if (new_state == arc_l2c_only) 1354 l2arc_hdr_stat_add(); 1355 else if (old_state == arc_l2c_only) 1356 l2arc_hdr_stat_remove(); 1357} 1358 1359void 1360arc_space_consume(uint64_t space, arc_space_type_t type) 1361{ 1362 
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1363 1364 switch (type) { 1365 case ARC_SPACE_DATA: 1366 ARCSTAT_INCR(arcstat_data_size, space); 1367 break; 1368 case ARC_SPACE_OTHER: 1369 ARCSTAT_INCR(arcstat_other_size, space); 1370 break; 1371 case ARC_SPACE_HDRS: 1372 ARCSTAT_INCR(arcstat_hdr_size, space); 1373 break; 1374 case ARC_SPACE_L2HDRS: 1375 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1376 break; 1377 } 1378 1379 atomic_add_64(&arc_meta_used, space); 1380 atomic_add_64(&arc_size, space); 1381} 1382 1383void 1384arc_space_return(uint64_t space, arc_space_type_t type) 1385{ 1386 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1387 1388 switch (type) { 1389 case ARC_SPACE_DATA: 1390 ARCSTAT_INCR(arcstat_data_size, -space); 1391 break; 1392 case ARC_SPACE_OTHER: 1393 ARCSTAT_INCR(arcstat_other_size, -space); 1394 break; 1395 case ARC_SPACE_HDRS: 1396 ARCSTAT_INCR(arcstat_hdr_size, -space); 1397 break; 1398 case ARC_SPACE_L2HDRS: 1399 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1400 break; 1401 } 1402 1403 ASSERT(arc_meta_used >= space); 1404 if (arc_meta_max < arc_meta_used) 1405 arc_meta_max = arc_meta_used; 1406 atomic_add_64(&arc_meta_used, -space); 1407 ASSERT(arc_size >= space); 1408 atomic_add_64(&arc_size, -space); 1409} 1410 1411void * 1412arc_data_buf_alloc(uint64_t size) 1413{ 1414 if (arc_evict_needed(ARC_BUFC_DATA)) 1415 cv_signal(&arc_reclaim_thr_cv); 1416 atomic_add_64(&arc_size, size); 1417 return (zio_data_buf_alloc(size)); 1418} 1419 1420void 1421arc_data_buf_free(void buf, uint64_t size) 1422{ 1423* zio_data_buf_free(buf, size); 1424 ASSERT(arc_size >= size); 1425 atomic_add_64(&arc_size, -size); 1426} 1427 1428arc_buf_t * 1429arc_buf_alloc(spa_t spa, int size, void tag, arc_buf_contents_t type) 1430{ 1431 arc_buf_hdr_t hdr; 1432* arc_buf_t buf; 1433* 1434 ASSERT3U(size, >, 0); 1435 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1436 ASSERT(BUF_EMPTY(hdr)); 1437 hdr->b_size = size; 1438 hdr->b_type = type; 1439 hdr->b_spa = 
spa_load_guid(spa); 1440 hdr->b_state = arc_anon; 1441 hdr->b_arc_access = 0; 1442 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1443 buf->b_hdr = hdr; 1444 buf->b_data = NULL; 1445 buf->b_efunc = NULL; 1446 buf->b_private = NULL; 1447 buf->b_next = NULL; 1448 hdr->b_buf = buf; 1449 arc_get_data_buf(buf); 1450 hdr->b_datacnt = 1; 1451 hdr->b_flags = 0; 1452 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1453 (void) refcount_add(&hdr->b_refcnt, tag); 1454 1455 return (buf); 1456} 1457 1458static char arc_onloan_tag = "onloan"; 1459* 1460/* 1461 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1462 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1463 * buffers must be returned to the arc before they can be used by the DMU or 1464 * freed. 1465 / 1466arc_buf_t 1467arc_loan_buf(spa_t spa, int size) 1468{ 1469* arc_buf_t buf; 1470* 1471 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1472 1473 atomic_add_64(&arc_loaned_bytes, size); 1474 return (buf); 1475} 1476 1477/* 1478 * Return a loaned arc buffer to the arc. 
1479 / 1480void 1481arc_return_buf(arc_buf_t buf, void tag) 1482{ 1483* arc_buf_hdr_t hdr = buf->b_hdr; 1484* 1485 ASSERT(buf->b_data != NULL); 1486 (void) refcount_add(&hdr->b_refcnt, tag); 1487 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 1488 1489 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1490} 1491 1492/* Detach an arc_buf from a dbuf (tag) / 1493void 1494arc_loan_inuse_buf(arc_buf_t buf, void tag) 1495{ 1496* arc_buf_hdr_t hdr; 1497* 1498 ASSERT(buf->b_data != NULL); 1499 hdr = buf->b_hdr; 1500 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1501 (void) refcount_remove(&hdr->b_refcnt, tag); 1502 buf->b_efunc = NULL; 1503 buf->b_private = NULL; 1504 1505 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1506} 1507 1508static arc_buf_t * 1509arc_buf_clone(arc_buf_t from) 1510{ 1511* arc_buf_t buf; 1512* arc_buf_hdr_t hdr = from->b_hdr; 1513* uint64_t size = hdr->b_size; 1514 1515 ASSERT(hdr->b_state != arc_anon); 1516 1517 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1518 buf->b_hdr = hdr; 1519 buf->b_data = NULL; 1520 buf->b_efunc = NULL; 1521 buf->b_private = NULL; 1522 buf->b_next = hdr->b_buf; 1523 hdr->b_buf = buf; 1524 arc_get_data_buf(buf); 1525 bcopy(from->b_data, buf->b_data, size); 1526 1527 /* 1528 * This buffer already exists in the arc so create a duplicate 1529 * copy for the caller. If the buffer is associated with user data 1530 * then track the size and number of duplicates. These stats will be 1531 * updated as duplicate buffers are created and destroyed. 1532 / 1533* if (hdr->b_type == ARC_BUFC_DATA) { 1534 ARCSTAT_BUMP(arcstat_duplicate_buffers); 1535 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 1536 } 1537 hdr->b_datacnt += 1; 1538 return (buf); 1539} 1540 1541void 1542arc_buf_add_ref(arc_buf_t buf, void tag) 1543{ 1544 arc_buf_hdr_t hdr; 1545* kmutex_t hash_lock; 1546* 1547 /* 1548 * Check to see if this buffer is evicted. Callers 1549 * must verify b_data != NULL to know if the add_ref 1550 * was successful. 
1551 / 1552* mutex_enter(&buf->b_evict_lock); 1553 if (buf->b_data == NULL) { 1554 mutex_exit(&buf->b_evict_lock); 1555 return; 1556 } 1557 hash_lock = HDR_LOCK(buf->b_hdr); 1558 mutex_enter(hash_lock); 1559 hdr = buf->b_hdr; 1560 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1561 mutex_exit(&buf->b_evict_lock); 1562 1563 ASSERT(hdr->b_state == arc_mru \|\| hdr->b_state == arc_mfu); 1564 add_reference(hdr, hash_lock, tag); 1565 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t , hdr); 1566* arc_access(hdr, hash_lock); 1567 mutex_exit(hash_lock); 1568 ARCSTAT_BUMP(arcstat_hits); 1569 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1570 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1571 data, metadata, hits); 1572} 1573 1574/* 1575 * Free the arc data buffer. If it is an l2arc write in progress, 1576 * the buffer is placed on l2arc_free_on_write to be freed later. 1577 / 1578static void 1579arc_buf_data_free(arc_buf_t buf, void (free_func)(void , size_t)) 1580{ 1581 arc_buf_hdr_t hdr = buf->b_hdr; 1582* 1583 if (HDR_L2_WRITING(hdr)) { 1584 l2arc_data_free_t df; 1585* df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1586 df->l2df_data = buf->b_data; 1587 df->l2df_size = hdr->b_size; 1588 df->l2df_func = free_func; 1589 mutex_enter(&l2arc_free_on_write_mtx); 1590 list_insert_head(l2arc_free_on_write, df); 1591 mutex_exit(&l2arc_free_on_write_mtx); 1592 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1593 } else { 1594 free_func(buf->b_data, hdr->b_size); 1595 } 1596} 1597 1598static void 1599arc_buf_destroy(arc_buf_t buf, boolean_t recycle, boolean_t all) 1600{ 1601* arc_buf_t *bufp; 1602* 1603 /* free up data associated with the buf / 1604* if (buf->b_data) { 1605 arc_state_t state = buf->b_hdr->b_state; 1606* uint64_t size = buf->b_hdr->b_size; 1607 arc_buf_contents_t type = buf->b_hdr->b_type; 1608 1609 arc_cksum_verify(buf); 1610#ifdef illumos 1611 arc_buf_unwatch(buf); 1612#endif /* illumos / 1613* 1614 if (!recycle) { 1615 if (type == ARC_BUFC_METADATA) { 1616 
arc_buf_data_free(buf, zio_buf_free); 1617 arc_space_return(size, ARC_SPACE_DATA); 1618 } else { 1619 ASSERT(type == ARC_BUFC_DATA); 1620 arc_buf_data_free(buf, zio_data_buf_free); 1621 ARCSTAT_INCR(arcstat_data_size, -size); 1622 atomic_add_64(&arc_size, -size); 1623 } 1624 } 1625 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1626 uint64_t cnt = &state->arcs_lsize[type]; 1627* 1628 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1629 ASSERT(state != arc_anon); 1630 1631 ASSERT3U(cnt, >=, size); 1632* atomic_add_64(cnt, -size); 1633 } 1634 ASSERT3U(state->arcs_size, >=, size); 1635 atomic_add_64(&state->arcs_size, -size); 1636 buf->b_data = NULL; 1637 1638 /* 1639 * If we're destroying a duplicate buffer make sure 1640 * that the appropriate statistics are updated. 1641 / 1642* if (buf->b_hdr->b_datacnt > 1 && 1643 buf->b_hdr->b_type == ARC_BUFC_DATA) { 1644 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 1645 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 1646 } 1647 ASSERT(buf->b_hdr->b_datacnt > 0); 1648 buf->b_hdr->b_datacnt -= 1; 1649 } 1650 1651 /* only remove the buf if requested / 1652* if (!all) 1653 return; 1654 1655 /* remove the buf from the hdr list / 1656* for (bufp = &buf->b_hdr->b_buf; bufp != buf; bufp = &(bufp)->b_next) 1657 continue; 1658 bufp = buf->b_next; 1659* buf->b_next = NULL; 1660 1661 ASSERT(buf->b_efunc == NULL); 1662 1663 /* clean up the buf / 1664* buf->b_hdr = NULL; 1665 kmem_cache_free(buf_cache, buf); 1666} 1667 1668static void 1669arc_hdr_destroy(arc_buf_hdr_t hdr) 1670{ 1671* ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1672 ASSERT3P(hdr->b_state, ==, arc_anon); 1673 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1674 l2arc_buf_hdr_t l2hdr = hdr->b_l2hdr; 1675* 1676 if (l2hdr != NULL) { 1677 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1678 /* 1679 * To prevent arc_free() and l2arc_evict() from 1680 * attempting to free the same buffer at the same time, 1681 * a FREE_IN_PROGRESS flag is given to arc_free() to 1682 * give it 
priority. l2arc_evict() can't destroy this 1683 * header while we are waiting on l2arc_buflist_mtx. 1684 * 1685 * The hdr may be removed from l2ad_buflist before we 1686 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 1687 / 1688* if (!buflist_held) { 1689 mutex_enter(&l2arc_buflist_mtx); 1690 l2hdr = hdr->b_l2hdr; 1691 } 1692 1693 if (l2hdr != NULL) { 1694 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1695 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1696 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1697 if (hdr->b_state == arc_l2c_only) 1698 l2arc_hdr_stat_remove(); 1699 hdr->b_l2hdr = NULL; 1700 } 1701 1702 if (!buflist_held) 1703 mutex_exit(&l2arc_buflist_mtx); 1704 } 1705 1706 if (!BUF_EMPTY(hdr)) { 1707 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1708 buf_discard_identity(hdr); 1709 } 1710 while (hdr->b_buf) { 1711 arc_buf_t buf = hdr->b_buf; 1712* 1713 if (buf->b_efunc) { 1714 mutex_enter(&arc_eviction_mtx); 1715 mutex_enter(&buf->b_evict_lock); 1716 ASSERT(buf->b_hdr != NULL); 1717 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1718 hdr->b_buf = buf->b_next; 1719 buf->b_hdr = &arc_eviction_hdr; 1720 buf->b_next = arc_eviction_list; 1721 arc_eviction_list = buf; 1722 mutex_exit(&buf->b_evict_lock); 1723 mutex_exit(&arc_eviction_mtx); 1724 } else { 1725 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1726 } 1727 } 1728 if (hdr->b_freeze_cksum != NULL) { 1729 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1730 hdr->b_freeze_cksum = NULL; 1731 } 1732 if (hdr->b_thawed) { 1733 kmem_free(hdr->b_thawed, 1); 1734 hdr->b_thawed = NULL; 1735 } 1736 1737 ASSERT(!list_link_active(&hdr->b_arc_node)); 1738 ASSERT3P(hdr->b_hash_next, ==, NULL); 1739 ASSERT3P(hdr->b_acb, ==, NULL); 1740 kmem_cache_free(hdr_cache, hdr); 1741} 1742 1743void 1744arc_buf_free(arc_buf_t buf, void tag) 1745{ 1746 arc_buf_hdr_t hdr = buf->b_hdr; 1747* int hashed = hdr->b_state != arc_anon; 1748 1749 ASSERT(buf->b_efunc == NULL); 1750 ASSERT(buf->b_data != NULL); 1751 1752 if (hashed) { 1753 kmutex_t 
hash_lock = HDR_LOCK(hdr); 1754* 1755 mutex_enter(hash_lock); 1756 hdr = buf->b_hdr; 1757 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1758 1759 (void) remove_reference(hdr, hash_lock, tag); 1760 if (hdr->b_datacnt > 1) { 1761 arc_buf_destroy(buf, FALSE, TRUE); 1762 } else { 1763 ASSERT(buf == hdr->b_buf); 1764 ASSERT(buf->b_efunc == NULL); 1765 hdr->b_flags \|= ARC_BUF_AVAILABLE; 1766 } 1767 mutex_exit(hash_lock); 1768 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1769 int destroy_hdr; 1770 /* 1771 * We are in the middle of an async write. Don't destroy 1772 * this buffer unless the write completes before we finish 1773 * decrementing the reference count. 1774 / 1775* mutex_enter(&arc_eviction_mtx); 1776 (void) remove_reference(hdr, NULL, tag); 1777 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1778 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1779 mutex_exit(&arc_eviction_mtx); 1780 if (destroy_hdr) 1781 arc_hdr_destroy(hdr); 1782 } else { 1783 if (remove_reference(hdr, NULL, tag) > 0) 1784 arc_buf_destroy(buf, FALSE, TRUE); 1785 else 1786 arc_hdr_destroy(hdr); 1787 } 1788} 1789 1790int 1791arc_buf_remove_ref(arc_buf_t buf, void tag) 1792{ 1793 arc_buf_hdr_t hdr = buf->b_hdr; 1794* kmutex_t hash_lock = HDR_LOCK(hdr); 1795* int no_callback = (buf->b_efunc == NULL); 1796 1797 if (hdr->b_state == arc_anon) { 1798 ASSERT(hdr->b_datacnt == 1); 1799 arc_buf_free(buf, tag); 1800 return (no_callback); 1801 } 1802 1803 mutex_enter(hash_lock); 1804 hdr = buf->b_hdr; 1805 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1806 ASSERT(hdr->b_state != arc_anon); 1807 ASSERT(buf->b_data != NULL); 1808 1809 (void) remove_reference(hdr, hash_lock, tag); 1810 if (hdr->b_datacnt > 1) { 1811 if (no_callback) 1812 arc_buf_destroy(buf, FALSE, TRUE); 1813 } else if (no_callback) { 1814 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1815 ASSERT(buf->b_efunc == NULL); 1816 hdr->b_flags \|= ARC_BUF_AVAILABLE; 1817 } 1818 ASSERT(no_callback \|\| hdr->b_datacnt > 1 \|\| 1819 refcount_is_zero(&hdr->b_refcnt)); 1820 
mutex_exit(hash_lock); 1821 return (no_callback); 1822} 1823 1824int 1825arc_buf_size(arc_buf_t buf) 1826{ 1827* return (buf->b_hdr->b_size); 1828} 1829 1830/* 1831 * Called from the DMU to determine if the current buffer should be 1832 * evicted. In order to ensure proper locking, the eviction must be initiated 1833 * from the DMU. Return true if the buffer is associated with user data and 1834 * duplicate buffers still exist. 1835 / 1836boolean_t 1837arc_buf_eviction_needed(arc_buf_t buf) 1838{ 1839 arc_buf_hdr_t hdr; 1840* boolean_t evict_needed = B_FALSE; 1841 1842 if (zfs_disable_dup_eviction) 1843 return (B_FALSE); 1844 1845 mutex_enter(&buf->b_evict_lock); 1846 hdr = buf->b_hdr; 1847 if (hdr == NULL) { 1848 /* 1849 * We are in arc_do_user_evicts(); let that function 1850 * perform the eviction. 1851 / 1852* ASSERT(buf->b_data == NULL); 1853 mutex_exit(&buf->b_evict_lock); 1854 return (B_FALSE); 1855 } else if (buf->b_data == NULL) { 1856 /* 1857 * We have already been added to the arc eviction list; 1858 * recommend eviction. 1859 / 1860* ASSERT3P(hdr, ==, &arc_eviction_hdr); 1861 mutex_exit(&buf->b_evict_lock); 1862 return (B_TRUE); 1863 } 1864 1865 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 1866 evict_needed = B_TRUE; 1867 1868 mutex_exit(&buf->b_evict_lock); 1869 return (evict_needed); 1870} 1871 1872/* 1873 * Evict buffers from list until we've removed the specified number of 1874 * bytes. Move the removed buffers to the appropriate evict state. 1875 * If the recycle flag is set, then attempt to "recycle" a buffer: 1876 * - look for a buffer to evict that is `bytes' long. 1877 * - return the data block from this buffer rather than freeing it. 1878 * This flag is used by callers that are trying to make space for a 1879 * new buffer in a full arc cache. 1880 * 1881 * This function makes a "best effort". It skips over any buffers 1882 * it can't get a hash_lock on, and so may not catch all candidates. 
1883 * It may also return without evicting as much space as requested. 1884 / 1885static void 1886arc_evict(arc_state_t state, uint64_t spa, int64_t bytes, boolean_t recycle, 1887* arc_buf_contents_t type) 1888{ 1889 arc_state_t evicted_state; 1890* uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1891 int64_t bytes_remaining; 1892 arc_buf_hdr_t ab, ab_prev = NULL; 1893 list_t evicted_list, list, evicted_list_start, list_start; 1894 kmutex_t lock, evicted_lock; 1895 kmutex_t hash_lock; 1896* boolean_t have_lock; 1897 void stolen = NULL; 1898* static int evict_metadata_offset, evict_data_offset; 1899 int i, idx, offset, list_count, count; 1900 1901 ASSERT(state == arc_mru \|\| state == arc_mfu); 1902 1903 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1904 1905 if (type == ARC_BUFC_METADATA) { 1906 offset = 0; 1907 list_count = ARC_BUFC_NUMMETADATALISTS; 1908 list_start = &state->arcs_lists[0]; 1909 evicted_list_start = &evicted_state->arcs_lists[0]; 1910 idx = evict_metadata_offset; 1911 } else { 1912 offset = ARC_BUFC_NUMMETADATALISTS; 1913 list_start = &state->arcs_lists[offset]; 1914 evicted_list_start = &evicted_state->arcs_lists[offset]; 1915 list_count = ARC_BUFC_NUMDATALISTS; 1916 idx = evict_data_offset; 1917 } 1918 bytes_remaining = evicted_state->arcs_lsize[type]; 1919 count = 0; 1920 1921evict_start: 1922 list = &list_start[idx]; 1923 evicted_list = &evicted_list_start[idx]; 1924 lock = ARCS_LOCK(state, (offset + idx)); 1925 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 1926 1927 mutex_enter(lock); 1928 mutex_enter(evicted_lock); 1929 1930 for (ab = list_tail(list); ab; ab = ab_prev) { 1931 ab_prev = list_prev(list, ab); 1932 bytes_remaining -= (ab->b_size * ab->b_datacnt); 1933 /* prefetch buffers have a minimum lifespan / 1934* if (HDR_IO_IN_PROGRESS(ab) \|\| 1935 (spa && ab->b_spa != spa) \|\| 1936 (ab->b_flags & (ARC_PREFETCH\|ARC_INDIRECT) && 1937 ddi_get_lbolt() - ab->b_arc_access < 1938 
arc_min_prefetch_lifespan)) { 1939 skipped++; 1940 continue; 1941 } 1942 /* "lookahead" for better eviction candidate / 1943* if (recycle && ab->b_size != bytes && 1944 ab_prev && ab_prev->b_size == bytes) 1945 continue; 1946 hash_lock = HDR_LOCK(ab); 1947 have_lock = MUTEX_HELD(hash_lock); 1948 if (have_lock \|\| mutex_tryenter(hash_lock)) { 1949 ASSERT0(refcount_count(&ab->b_refcnt)); 1950 ASSERT(ab->b_datacnt > 0); 1951 while (ab->b_buf) { 1952 arc_buf_t buf = ab->b_buf; 1953* if (!mutex_tryenter(&buf->b_evict_lock)) { 1954 missed += 1; 1955 break; 1956 } 1957 if (buf->b_data) { 1958 bytes_evicted += ab->b_size; 1959 if (recycle && ab->b_type == type && 1960 ab->b_size == bytes && 1961 !HDR_L2_WRITING(ab)) { 1962 stolen = buf->b_data; 1963 recycle = FALSE; 1964 } 1965 } 1966 if (buf->b_efunc) { 1967 mutex_enter(&arc_eviction_mtx); 1968 arc_buf_destroy(buf, 1969 buf->b_data == stolen, FALSE); 1970 ab->b_buf = buf->b_next; 1971 buf->b_hdr = &arc_eviction_hdr; 1972 buf->b_next = arc_eviction_list; 1973 arc_eviction_list = buf; 1974 mutex_exit(&arc_eviction_mtx); 1975 mutex_exit(&buf->b_evict_lock); 1976 } else { 1977 mutex_exit(&buf->b_evict_lock); 1978 arc_buf_destroy(buf, 1979 buf->b_data == stolen, TRUE); 1980 } 1981 } 1982 1983 if (ab->b_l2hdr) { 1984 ARCSTAT_INCR(arcstat_evict_l2_cached, 1985 ab->b_size); 1986 } else { 1987 if (l2arc_write_eligible(ab->b_spa, ab)) { 1988 ARCSTAT_INCR(arcstat_evict_l2_eligible, 1989 ab->b_size); 1990 } else { 1991 ARCSTAT_INCR( 1992 arcstat_evict_l2_ineligible, 1993 ab->b_size); 1994 } 1995 } 1996 1997 if (ab->b_datacnt == 0) { 1998 arc_change_state(evicted_state, ab, hash_lock); 1999 ASSERT(HDR_IN_HASH_TABLE(ab)); 2000 ab->b_flags \|= ARC_IN_HASH_TABLE; 2001 ab->b_flags &= ~ARC_BUF_AVAILABLE; 2002 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t , ab); 2003* } 2004 if (!have_lock) 2005 mutex_exit(hash_lock); 2006 if (bytes >= 0 && bytes_evicted >= bytes) 2007 break; 2008 if (bytes_remaining > 0) { 2009 mutex_exit(evicted_lock); 2010 
mutex_exit(lock); 2011 idx = ((idx + 1) & (list_count - 1)); 2012 count++; 2013 goto evict_start; 2014 } 2015 } else { 2016 missed += 1; 2017 } 2018 } 2019 2020 mutex_exit(evicted_lock); 2021 mutex_exit(lock); 2022 2023 idx = ((idx + 1) & (list_count - 1)); 2024 count++; 2025 2026 if (bytes_evicted < bytes) { 2027 if (count < list_count) 2028 goto evict_start; 2029 else 2030 dprintf("only evicted %lld bytes from %x", 2031 (longlong_t)bytes_evicted, state); 2032 } 2033 if (type == ARC_BUFC_METADATA) 2034 evict_metadata_offset = idx; 2035 else 2036 evict_data_offset = idx; 2037 2038 if (skipped) 2039 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2040 2041 if (missed) 2042 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2043 2044 /* 2045 * We have just evicted some date into the ghost state, make 2046 * sure we also adjust the ghost state size if necessary. 2047 / 2048* if (arc_no_grow && 2049 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 2050 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 2051 arc_mru_ghost->arcs_size - arc_c; 2052 2053 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 2054 int64_t todelete = 2055 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 2056 arc_evict_ghost(arc_mru_ghost, 0, todelete); 2057 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 2058 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 2059 arc_mru_ghost->arcs_size + 2060 arc_mfu_ghost->arcs_size - arc_c); 2061 arc_evict_ghost(arc_mfu_ghost, 0, todelete); 2062 } 2063 } 2064 if (stolen) 2065 ARCSTAT_BUMP(arcstat_stolen); 2066 2067 return (stolen); 2068} 2069 2070/* 2071 * Remove buffers from list until we've removed the specified number of 2072 * bytes. Destroy the buffers that are removed. 
2073 / 2074static void 2075arc_evict_ghost(arc_state_t state, uint64_t spa, int64_t bytes) 2076{ 2077 arc_buf_hdr_t ab, ab_prev; 2078 arc_buf_hdr_t marker = { 0 }; 2079 list_t list, list_start; 2080 kmutex_t hash_lock, lock; 2081 uint64_t bytes_deleted = 0; 2082 uint64_t bufs_skipped = 0; 2083 static int evict_offset; 2084 int list_count, idx = evict_offset; 2085 int offset, count = 0; 2086 2087 ASSERT(GHOST_STATE(state)); 2088 2089 /* 2090 * data lists come after metadata lists 2091 / 2092* list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2093 list_count = ARC_BUFC_NUMDATALISTS; 2094 offset = ARC_BUFC_NUMMETADATALISTS; 2095 2096evict_start: 2097 list = &list_start[idx]; 2098 lock = ARCS_LOCK(state, idx + offset); 2099 2100 mutex_enter(lock); 2101 for (ab = list_tail(list); ab; ab = ab_prev) { 2102 ab_prev = list_prev(list, ab); 2103 if (spa && ab->b_spa != spa) 2104 continue; 2105 2106 /* ignore markers / 2107* if (ab->b_spa == 0) 2108 continue; 2109 2110 hash_lock = HDR_LOCK(ab); 2111 /* caller may be trying to modify this buffer, skip it / 2112* if (MUTEX_HELD(hash_lock)) 2113 continue; 2114 if (mutex_tryenter(hash_lock)) { 2115 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 2116 ASSERT(ab->b_buf == NULL); 2117 ARCSTAT_BUMP(arcstat_deleted); 2118 bytes_deleted += ab->b_size; 2119 2120 if (ab->b_l2hdr != NULL) { 2121 /* 2122 * This buffer is cached on the 2nd Level ARC; 2123 * don't destroy the header. 2124 / 2125* arc_change_state(arc_l2c_only, ab, hash_lock); 2126 mutex_exit(hash_lock); 2127 } else { 2128 arc_change_state(arc_anon, ab, hash_lock); 2129 mutex_exit(hash_lock); 2130 arc_hdr_destroy(ab); 2131 } 2132 2133 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t , ab); 2134* if (bytes >= 0 && bytes_deleted >= bytes) 2135 break; 2136 } else if (bytes < 0) { 2137 /* 2138 * Insert a list marker and then wait for the 2139 * hash lock to become available. Once its 2140 * available, restart from where we left off. 
2141 / 2142* list_insert_after(list, ab, &marker); 2143 mutex_exit(lock); 2144 mutex_enter(hash_lock); 2145 mutex_exit(hash_lock); 2146 mutex_enter(lock); 2147 ab_prev = list_prev(list, &marker); 2148 list_remove(list, &marker); 2149 } else 2150 bufs_skipped += 1; 2151 } 2152 mutex_exit(lock); 2153 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2154 count++; 2155 2156 if (count < list_count) 2157 goto evict_start; 2158 2159 evict_offset = idx; 2160 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2161 (bytes < 0 \|\| bytes_deleted < bytes)) { 2162 list_start = &state->arcs_lists[0]; 2163 list_count = ARC_BUFC_NUMMETADATALISTS; 2164 offset = count = 0; 2165 goto evict_start; 2166 } 2167 2168 if (bufs_skipped) { 2169 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2170 ASSERT(bytes >= 0); 2171 } 2172 2173 if (bytes_deleted < bytes) 2174 dprintf("only deleted %lld bytes from %p", 2175 (longlong_t)bytes_deleted, state); 2176} 2177 2178static void 2179arc_adjust(void) 2180{ 2181 int64_t adjustment, delta; 2182 2183 /* 2184 * Adjust MRU size 2185 / 2186* 2187 adjustment = MIN((int64_t)(arc_size - arc_c), 2188 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2189 arc_p)); 2190 2191 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2192 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2193 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2194 adjustment -= delta; 2195 } 2196 2197 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2198 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2199 (void) arc_evict(arc_mru, 0, delta, FALSE, 2200 ARC_BUFC_METADATA); 2201 } 2202 2203 /* 2204 * Adjust MFU size 2205 / 2206* 2207 adjustment = arc_size - arc_c; 2208 2209 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2210 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2211 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2212 
adjustment -= delta; 2213 } 2214 2215 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2216 int64_t delta = MIN(adjustment, 2217 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2218 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2219 ARC_BUFC_METADATA); 2220 } 2221 2222 /* 2223 * Adjust ghost lists 2224 / 2225* 2226 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2227 2228 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2229 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2230 arc_evict_ghost(arc_mru_ghost, 0, delta); 2231 } 2232 2233 adjustment = 2234 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2235 2236 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2237 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2238 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2239 } 2240} 2241 2242static void 2243arc_do_user_evicts(void) 2244{ 2245 static arc_buf_t tmp_arc_eviction_list; 2246* 2247 /* 2248 * Move list over to avoid LOR 2249 / 2250restart: 2251* mutex_enter(&arc_eviction_mtx); 2252 tmp_arc_eviction_list = arc_eviction_list; 2253 arc_eviction_list = NULL; 2254 mutex_exit(&arc_eviction_mtx); 2255 2256 while (tmp_arc_eviction_list != NULL) { 2257 arc_buf_t buf = tmp_arc_eviction_list; 2258* tmp_arc_eviction_list = buf->b_next; 2259 mutex_enter(&buf->b_evict_lock); 2260 buf->b_hdr = NULL; 2261 mutex_exit(&buf->b_evict_lock); 2262 2263 if (buf->b_efunc != NULL) 2264 VERIFY(buf->b_efunc(buf) == 0); 2265 2266 buf->b_efunc = NULL; 2267 buf->b_private = NULL; 2268 kmem_cache_free(buf_cache, buf); 2269 } 2270 2271 if (arc_eviction_list != NULL) 2272 goto restart; 2273} 2274 2275/* 2276 * Flush all evictable data from the cache for the given spa. 2277 * NOTE: this will not touch "active" (i.e. referenced) data. 
2278 / 2279void 2280arc_flush(spa_t spa) 2281{ 2282 uint64_t guid = 0; 2283 2284 if (spa) 2285 guid = spa_load_guid(spa); 2286 2287 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2288 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2289 if (spa) 2290 break; 2291 } 2292 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2293 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2294 if (spa) 2295 break; 2296 } 2297 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2298 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2299 if (spa) 2300 break; 2301 } 2302 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2303 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2304 if (spa) 2305 break; 2306 } 2307 2308 arc_evict_ghost(arc_mru_ghost, guid, -1); 2309 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2310 2311 mutex_enter(&arc_reclaim_thr_lock); 2312 arc_do_user_evicts(); 2313 mutex_exit(&arc_reclaim_thr_lock); 2314 ASSERT(spa \|\| arc_eviction_list == NULL); 2315} 2316 2317void 2318arc_shrink(void) 2319{ 2320 if (arc_c > arc_c_min) { 2321 uint64_t to_free; 2322 2323#ifdef _KERNEL 2324 to_free = arc_c >> arc_shrink_shift; 2325#else 2326 to_free = arc_c >> arc_shrink_shift; 2327#endif 2328 if (arc_c > arc_c_min + to_free) 2329 atomic_add_64(&arc_c, -to_free); 2330 else 2331 arc_c = arc_c_min; 2332 2333 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2334 if (arc_c > arc_size) 2335 arc_c = MAX(arc_size, arc_c_min); 2336 if (arc_p > arc_c) 2337 arc_p = (arc_c >> 1); 2338 ASSERT(arc_c >= arc_c_min); 2339 ASSERT((int64_t)arc_p >= 0); 2340 } 2341 2342 if (arc_size > arc_c) 2343 arc_adjust(); 2344} 2345 2346static int needfree = 0; 2347 2348static int 2349arc_reclaim_needed(void) 2350{ 2351 2352#ifdef _KERNEL 2353 2354 if (needfree) 2355 return (1); 2356 2357 /* 2358 * Cooperate with pagedaemon when it's time for it to scan 2359 * and reclaim some pages. 
2360 / 2361* if (vm_paging_needed()) 2362 return (1); 2363 2364#ifdef sun 2365 /* 2366 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2367 / 2368* extra = desfree; 2369 2370 /* 2371 * check that we're out of range of the pageout scanner. It starts to 2372 * schedule paging if freemem is less than lotsfree and needfree. 2373 * lotsfree is the high-water mark for pageout, and needfree is the 2374 * number of needed free pages. We add extra pages here to make sure 2375 * the scanner doesn't start up while we're freeing memory. 2376 / 2377* if (freemem < lotsfree + needfree + extra) 2378 return (1); 2379 2380 /* 2381 * check to make sure that swapfs has enough space so that anon 2382 * reservations can still succeed. anon_resvmem() checks that the 2383 * availrmem is greater than swapfs_minfree, and the number of reserved 2384 * swap pages. We also add a bit of extra here just to prevent 2385 * circumstances from getting really dire. 2386 / 2387* if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2388 return (1); 2389 2390#if defined(__i386) 2391 /* 2392 * If we're on an i386 platform, it's possible that we'll exhaust the 2393 * kernel heap space before we ever run out of available physical 2394 * memory. Most checks of the size of the heap_area compare against 2395 * tune.t_minarmem, which is the minimum available real memory that we 2396 * can have in the system. However, this is generally fixed at 25 pages 2397 * which is so low that it's useless. In this comparison, we seek to 2398 * calculate the total heap-size, and reclaim if more than 3/4ths of the 2399 * heap is allocated. 
(Or, in the calculation, if less than 1/4th is 2400 * free) 2401 / 2402* if (btop(vmem_size(heap_arena, VMEM_FREE)) < 2403 (btop(vmem_size(heap_arena, VMEM_FREE \| VMEM_ALLOC)) >> 2)) 2404 return (1); 2405#endif 2406#else /* !sun / 2407* if (kmem_used() > (kmem_size() * 3) / 4) 2408 return (1); 2409#endif /* sun / 2410* 2411#else 2412 if (spa_get_random(100) == 0) 2413 return (1); 2414#endif 2415 return (0); 2416} 2417 2418extern kmem_cache_t zio_buf_cache[]; 2419extern kmem_cache_t zio_data_buf_cache[]; 2420 2421static void 2422arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2423{ 2424 size_t i; 2425 kmem_cache_t prev_cache = NULL; 2426* kmem_cache_t prev_data_cache = NULL; 2427* 2428#ifdef _KERNEL 2429 if (arc_meta_used >= arc_meta_limit) { 2430 /* 2431 * We are exceeding our meta-data cache limit. 2432 * Purge some DNLC entries to release holds on meta-data. 2433 / 2434* dnlc_reduce_cache((void )(uintptr_t)arc_reduce_dnlc_percent); 2435* } 2436#if defined(__i386) 2437 /* 2438 * Reclaim unused memory from all kmem caches. 2439 / 2440* kmem_reap(); 2441#endif 2442#endif 2443 2444 /* 2445 * An aggressive reclamation will shrink the cache size as well as 2446 * reap free buffers from the arc kmem caches. 
2447 / 2448* if (strat == ARC_RECLAIM_AGGR) 2449 arc_shrink(); 2450 2451 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2452 if (zio_buf_cache[i] != prev_cache) { 2453 prev_cache = zio_buf_cache[i]; 2454 kmem_cache_reap_now(zio_buf_cache[i]); 2455 } 2456 if (zio_data_buf_cache[i] != prev_data_cache) { 2457 prev_data_cache = zio_data_buf_cache[i]; 2458 kmem_cache_reap_now(zio_data_buf_cache[i]); 2459 } 2460 } 2461 kmem_cache_reap_now(buf_cache); 2462 kmem_cache_reap_now(hdr_cache); 2463} 2464 2465static void 2466arc_reclaim_thread(void dummy __unused) 2467{ 2468* clock_t growtime = 0; 2469 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2470 callb_cpr_t cpr; 2471 2472 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2473 2474 mutex_enter(&arc_reclaim_thr_lock); 2475 while (arc_thread_exit == 0) { 2476 if (arc_reclaim_needed()) { 2477 2478 if (arc_no_grow) { 2479 if (last_reclaim == ARC_RECLAIM_CONS) { 2480 last_reclaim = ARC_RECLAIM_AGGR; 2481 } else { 2482 last_reclaim = ARC_RECLAIM_CONS; 2483 } 2484 } else { 2485 arc_no_grow = TRUE; 2486 last_reclaim = ARC_RECLAIM_AGGR; 2487 membar_producer(); 2488 } 2489 2490 /* reset the growth delay for every reclaim / 2491* growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2492 2493 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 2494 /* 2495 * If needfree is TRUE our vm_lowmem hook 2496 * was called and in that case we must free some 2497 * memory, so switch to aggressive mode. 
2498 / 2499* arc_no_grow = TRUE; 2500 last_reclaim = ARC_RECLAIM_AGGR; 2501 } 2502 arc_kmem_reap_now(last_reclaim); 2503 arc_warm = B_TRUE; 2504 2505 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2506 arc_no_grow = FALSE; 2507 } 2508 2509 arc_adjust(); 2510 2511 if (arc_eviction_list != NULL) 2512 arc_do_user_evicts(); 2513 2514#ifdef _KERNEL 2515 if (needfree) { 2516 needfree = 0; 2517 wakeup(&needfree); 2518 } 2519#endif 2520 2521 /* block until needed, or one second, whichever is shorter / 2522* CALLB_CPR_SAFE_BEGIN(&cpr); 2523 (void) cv_timedwait(&arc_reclaim_thr_cv, 2524 &arc_reclaim_thr_lock, hz); 2525 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2526 } 2527 2528 arc_thread_exit = 0; 2529 cv_broadcast(&arc_reclaim_thr_cv); 2530 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock / 2531* thread_exit(); 2532} 2533 2534/* 2535 * Adapt arc info given the number of bytes we are trying to add and 2536 * the state that we are comming from. This function is only called 2537 * when we are adding new content to the cache. 2538 / 2539static void 2540arc_adapt(int bytes, arc_state_t state) 2541{ 2542 int mult; 2543 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2544 2545 if (state == arc_l2c_only) 2546 return; 2547 2548 ASSERT(bytes > 0); 2549 /* 2550 * Adapt the target size of the MRU list: 2551 * - if we just hit in the MRU ghost list, then increase 2552 * the target size of the MRU list. 2553 * - if we just hit in the MFU ghost list, then increase 2554 * the target size of the MFU list by decreasing the 2555 * target size of the MRU list. 2556 / 2557* if (state == arc_mru_ghost) { 2558 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 
2559 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2560 mult = MIN(mult, 10); /* avoid wild arc_p adjustment / 2561* 2562 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2563 } else if (state == arc_mfu_ghost) { 2564 uint64_t delta; 2565 2566 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 2567 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2568 mult = MIN(mult, 10); 2569 2570 delta = MIN(bytes * mult, arc_p); 2571 arc_p = MAX(arc_p_min, arc_p - delta); 2572 } 2573 ASSERT((int64_t)arc_p >= 0); 2574 2575 if (arc_reclaim_needed()) { 2576 cv_signal(&arc_reclaim_thr_cv); 2577 return; 2578 } 2579 2580 if (arc_no_grow) 2581 return; 2582 2583 if (arc_c >= arc_c_max) 2584 return; 2585 2586 /* 2587 * If we're within (2 * maxblocksize) bytes of the target 2588 * cache size, increment the target cache size 2589 / 2590* if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2591 atomic_add_64(&arc_c, (int64_t)bytes); 2592 if (arc_c > arc_c_max) 2593 arc_c = arc_c_max; 2594 else if (state == arc_anon) 2595 atomic_add_64(&arc_p, (int64_t)bytes); 2596 if (arc_p > arc_c) 2597 arc_p = arc_c; 2598 } 2599 ASSERT((int64_t)arc_p >= 0); 2600} 2601 2602/* 2603 * Check if the cache has reached its limits and eviction is required 2604 * prior to insert. 2605 / 2606static int 2607arc_evict_needed(arc_buf_contents_t type) 2608{ 2609* if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2610 return (1); 2611 2612#ifdef sun 2613#ifdef _KERNEL 2614 /* 2615 * If zio data pages are being allocated out of a separate heap segment, 2616 * then enforce that the size of available vmem for this area remains 2617 * above about 1/32nd free. 
2618 / 2619* if (type == ARC_BUFC_DATA && zio_arena != NULL && 2620 vmem_size(zio_arena, VMEM_FREE) < 2621 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 2622 return (1); 2623#endif 2624#endif /* sun / 2625* 2626 if (arc_reclaim_needed()) 2627 return (1); 2628 2629 return (arc_size > arc_c); 2630} 2631 2632/* 2633 * The buffer, supplied as the first argument, needs a data block. 2634 * So, if we are at cache max, determine which cache should be victimized. 2635 * We have the following cases: 2636 * 2637 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2638 * In this situation if we're out of space, but the resident size of the MFU is 2639 * under the limit, victimize the MFU cache to satisfy this insertion request. 2640 * 2641 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2642 * Here, we've used up all of the available space for the MRU, so we need to 2643 * evict from our own cache instead. Evict from the set of resident MRU 2644 * entries. 2645 * 2646 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2647 * c minus p represents the MFU space in the cache, since p is the size of the 2648 * cache that is dedicated to the MRU. In this situation there's still space on 2649 * the MFU side, so the MRU side needs to be victimized. 2650 * 2651 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2652 * MFU's resident set is consuming more space than it has been allotted. In 2653 * this situation, we must victimize our own cache, the MFU, for this insertion. 2654 / 2655static void 2656arc_get_data_buf(arc_buf_t buf) 2657{ 2658 arc_state_t state = buf->b_hdr->b_state; 2659* uint64_t size = buf->b_hdr->b_size; 2660 arc_buf_contents_t type = buf->b_hdr->b_type; 2661 2662 arc_adapt(size, state); 2663 2664 /* 2665 * We have not yet reached cache maximum size, 2666 * just allocate a new buffer. 
2667 / 2668* if (!arc_evict_needed(type)) { 2669 if (type == ARC_BUFC_METADATA) { 2670 buf->b_data = zio_buf_alloc(size); 2671 arc_space_consume(size, ARC_SPACE_DATA); 2672 } else { 2673 ASSERT(type == ARC_BUFC_DATA); 2674 buf->b_data = zio_data_buf_alloc(size); 2675 ARCSTAT_INCR(arcstat_data_size, size); 2676 atomic_add_64(&arc_size, size); 2677 } 2678 goto out; 2679 } 2680 2681 /* 2682 * If we are prefetching from the mfu ghost list, this buffer 2683 * will end up on the mru list; so steal space from there. 2684 / 2685* if (state == arc_mfu_ghost) 2686 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2687 else if (state == arc_mru_ghost) 2688 state = arc_mru; 2689 2690 if (state == arc_mru \|\| state == arc_anon) { 2691 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2692 state = (arc_mfu->arcs_lsize[type] >= size && 2693 arc_p > mru_used) ? arc_mfu : arc_mru; 2694 } else { 2695 /* MFU cases / 2696* uint64_t mfu_space = arc_c - arc_p; 2697 state = (arc_mru->arcs_lsize[type] >= size && 2698 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2699 } 2700 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 2701 if (type == ARC_BUFC_METADATA) { 2702 buf->b_data = zio_buf_alloc(size); 2703 arc_space_consume(size, ARC_SPACE_DATA); 2704 } else { 2705 ASSERT(type == ARC_BUFC_DATA); 2706 buf->b_data = zio_data_buf_alloc(size); 2707 ARCSTAT_INCR(arcstat_data_size, size); 2708 atomic_add_64(&arc_size, size); 2709 } 2710 ARCSTAT_BUMP(arcstat_recycle_miss); 2711 } 2712 ASSERT(buf->b_data != NULL); 2713out: 2714 /* 2715 * Update the state size. Note that ghost states have a 2716 * "ghost size" and so don't need to be updated. 
2717 / 2718* if (!GHOST_STATE(buf->b_hdr->b_state)) { 2719 arc_buf_hdr_t hdr = buf->b_hdr; 2720* 2721 atomic_add_64(&hdr->b_state->arcs_size, size); 2722 if (list_link_active(&hdr->b_arc_node)) { 2723 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2724 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2725 } 2726 /* 2727 * If we are growing the cache, and we are adding anonymous 2728 * data, and we have outgrown arc_p, update arc_p 2729 / 2730* if (arc_size < arc_c && hdr->b_state == arc_anon && 2731 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2732 arc_p = MIN(arc_c, arc_p + size); 2733 } 2734 ARCSTAT_BUMP(arcstat_allocated); 2735} 2736 2737/* 2738 * This routine is called whenever a buffer is accessed. 2739 * NOTE: the hash lock is dropped in this function. 2740 / 2741static void 2742arc_access(arc_buf_hdr_t buf, kmutex_t hash_lock) 2743{ 2744* clock_t now; 2745 2746 ASSERT(MUTEX_HELD(hash_lock)); 2747 2748 if (buf->b_state == arc_anon) { 2749 /* 2750 * This buffer is not in the cache, and does not 2751 * appear in our "ghost" list. Add the new buffer 2752 * to the MRU state. 2753 / 2754* 2755 ASSERT(buf->b_arc_access == 0); 2756 buf->b_arc_access = ddi_get_lbolt(); 2757 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t , buf); 2758* arc_change_state(arc_mru, buf, hash_lock); 2759 2760 } else if (buf->b_state == arc_mru) { 2761 now = ddi_get_lbolt(); 2762 2763 /* 2764 * If this buffer is here because of a prefetch, then either: 2765 * - clear the flag if this is a "referencing" read 2766 * (any subsequent access will bump this into the MFU state). 2767 * or 2768 * - move the buffer to the head of the list if this is 2769 * another prefetch (to make it less likely to be evicted). 
2770 / 2771* if ((buf->b_flags & ARC_PREFETCH) != 0) { 2772 if (refcount_count(&buf->b_refcnt) == 0) { 2773 ASSERT(list_link_active(&buf->b_arc_node)); 2774 } else { 2775 buf->b_flags &= ~ARC_PREFETCH; 2776 ARCSTAT_BUMP(arcstat_mru_hits); 2777 } 2778 buf->b_arc_access = now; 2779 return; 2780 } 2781 2782 /* 2783 * This buffer has been "accessed" only once so far, 2784 * but it is still in the cache. Move it to the MFU 2785 * state. 2786 / 2787* if (now > buf->b_arc_access + ARC_MINTIME) { 2788 /* 2789 * More than 125ms have passed since we 2790 * instantiated this buffer. Move it to the 2791 * most frequently used state. 2792 / 2793* buf->b_arc_access = now; 2794 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t , buf); 2795* arc_change_state(arc_mfu, buf, hash_lock); 2796 } 2797 ARCSTAT_BUMP(arcstat_mru_hits); 2798 } else if (buf->b_state == arc_mru_ghost) { 2799 arc_state_t new_state; 2800* /* 2801 * This buffer has been "accessed" recently, but 2802 * was evicted from the cache. Move it to the 2803 * MFU state. 2804 / 2805* 2806 if (buf->b_flags & ARC_PREFETCH) { 2807 new_state = arc_mru; 2808 if (refcount_count(&buf->b_refcnt) > 0) 2809 buf->b_flags &= ~ARC_PREFETCH; 2810 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t , buf); 2811* } else { 2812 new_state = arc_mfu; 2813 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t , buf); 2814* } 2815 2816 buf->b_arc_access = ddi_get_lbolt(); 2817 arc_change_state(new_state, buf, hash_lock); 2818 2819 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2820 } else if (buf->b_state == arc_mfu) { 2821 /* 2822 * This buffer has been accessed more than once and is 2823 * still in the cache. Keep it in the MFU state. 2824 * 2825 * NOTE: an add_reference() that occurred when we did 2826 * the arc_read() will have kicked this off the list. 2827 * If it was a prefetch, we will explicitly move it to 2828 * the head of the list now. 
2829 / 2830* if ((buf->b_flags & ARC_PREFETCH) != 0) { 2831 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2832 ASSERT(list_link_active(&buf->b_arc_node)); 2833 } 2834 ARCSTAT_BUMP(arcstat_mfu_hits); 2835 buf->b_arc_access = ddi_get_lbolt(); 2836 } else if (buf->b_state == arc_mfu_ghost) { 2837 arc_state_t new_state = arc_mfu; 2838* /* 2839 * This buffer has been accessed more than once but has 2840 * been evicted from the cache. Move it back to the 2841 * MFU state. 2842 / 2843* 2844 if (buf->b_flags & ARC_PREFETCH) { 2845 /* 2846 * This is a prefetch access... 2847 * move this block back to the MRU state. 2848 / 2849* ASSERT0(refcount_count(&buf->b_refcnt)); 2850 new_state = arc_mru; 2851 } 2852 2853 buf->b_arc_access = ddi_get_lbolt(); 2854 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t , buf); 2855* arc_change_state(new_state, buf, hash_lock); 2856 2857 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2858 } else if (buf->b_state == arc_l2c_only) { 2859 /* 2860 * This buffer is on the 2nd Level ARC. 
2861 / 2862* 2863 buf->b_arc_access = ddi_get_lbolt(); 2864 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t , buf); 2865* arc_change_state(arc_mfu, buf, hash_lock); 2866 } else { 2867 ASSERT(!"invalid arc state"); 2868 } 2869} 2870 2871/* a generic arc_done_func_t which you can use / 2872/ ARGSUSED / 2873void 2874arc_bcopy_func(zio_t zio, arc_buf_t buf, void arg) 2875{ 2876 if (zio == NULL \|\| zio->io_error == 0) 2877 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2878 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2879} 2880 2881/* a generic arc_done_func_t / 2882void 2883arc_getbuf_func(zio_t zio, arc_buf_t buf, void arg) 2884{ 2885 arc_buf_t *bufp = arg; 2886* if (zio && zio->io_error) { 2887 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2888 bufp = NULL; 2889* } else { 2890 bufp = buf; 2891* ASSERT(buf->b_data); 2892 } 2893} 2894 2895static void 2896arc_read_done(zio_t zio) 2897{ 2898* arc_buf_hdr_t hdr, found; 2899 arc_buf_t buf; 2900* arc_buf_t abuf; / buffer we're assigning to callback / 2901* kmutex_t hash_lock; 2902* arc_callback_t callback_list, acb; 2903 int freeable = FALSE; 2904 2905 buf = zio->io_private; 2906 hdr = buf->b_hdr; 2907 2908 /* 2909 * The hdr was inserted into hash-table and removed from lists 2910 * prior to starting I/O. We should find this header, since 2911 * it's in the hash table, and it should be legit since it's 2912 * not possible to evict it during the I/O. The only possible 2913 * reason for it not to be found is if we were freed during the 2914 * read. 
2915 / 2916* found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 2917 &hash_lock); 2918 2919 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) \|\| 2920 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) \|\| 2921 (found == hdr && HDR_L2_READING(hdr))); 2922 2923 hdr->b_flags &= ~ARC_L2_EVICTED; 2924 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2925 hdr->b_flags &= ~ARC_L2CACHE; 2926 2927 /* byteswap if necessary / 2928* callback_list = hdr->b_acb; 2929 ASSERT(callback_list != NULL); 2930 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2931 dmu_object_byteswap_t bswap = 2932 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 2933 arc_byteswap_func_t func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2934* byteswap_uint64_array : 2935 dmu_ot_byteswap[bswap].ob_func; 2936 func(buf->b_data, hdr->b_size); 2937 } 2938 2939 arc_cksum_compute(buf, B_FALSE); 2940#ifdef illumos 2941 arc_buf_watch(buf); 2942#endif /* illumos / 2943* 2944 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2945 /* 2946 * Only call arc_access on anonymous buffers. This is because 2947 * if we've issued an I/O for an evicted buffer, we've already 2948 * called arc_access (to prevent any simultaneous readers from 2949 * getting confused). 
2950 / 2951* arc_access(hdr, hash_lock); 2952 } 2953 2954 /* create copies of the data buffer for the callers / 2955* abuf = buf; 2956 for (acb = callback_list; acb; acb = acb->acb_next) { 2957 if (acb->acb_done) { 2958 if (abuf == NULL) { 2959 ARCSTAT_BUMP(arcstat_duplicate_reads); 2960 abuf = arc_buf_clone(buf); 2961 } 2962 acb->acb_buf = abuf; 2963 abuf = NULL; 2964 } 2965 } 2966 hdr->b_acb = NULL; 2967 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2968 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2969 if (abuf == buf) { 2970 ASSERT(buf->b_efunc == NULL); 2971 ASSERT(hdr->b_datacnt == 1); 2972 hdr->b_flags \|= ARC_BUF_AVAILABLE; 2973 } 2974 2975 ASSERT(refcount_is_zero(&hdr->b_refcnt) \|\| callback_list != NULL); 2976 2977 if (zio->io_error != 0) { 2978 hdr->b_flags \|= ARC_IO_ERROR; 2979 if (hdr->b_state != arc_anon) 2980 arc_change_state(arc_anon, hdr, hash_lock); 2981 if (HDR_IN_HASH_TABLE(hdr)) 2982 buf_hash_remove(hdr); 2983 freeable = refcount_is_zero(&hdr->b_refcnt); 2984 } 2985 2986 /* 2987 * Broadcast before we drop the hash_lock to avoid the possibility 2988 * that the hdr (and hence the cv) might be freed before we get to 2989 * the cv_broadcast(). 2990 / 2991* cv_broadcast(&hdr->b_cv); 2992 2993 if (hash_lock) { 2994 mutex_exit(hash_lock); 2995 } else { 2996 /* 2997 * This block was freed while we waited for the read to 2998 * complete. It has been removed from the hash table and 2999 * moved to the anonymous state (so that it won't show up 3000 * in the cache). 
3001 / 3002* ASSERT3P(hdr->b_state, ==, arc_anon); 3003 freeable = refcount_is_zero(&hdr->b_refcnt); 3004 } 3005 3006 /* execute each callback and free its structure / 3007* while ((acb = callback_list) != NULL) { 3008 if (acb->acb_done) 3009 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3010 3011 if (acb->acb_zio_dummy != NULL) { 3012 acb->acb_zio_dummy->io_error = zio->io_error; 3013 zio_nowait(acb->acb_zio_dummy); 3014 } 3015 3016 callback_list = acb->acb_next; 3017 kmem_free(acb, sizeof (arc_callback_t)); 3018 } 3019 3020 if (freeable) 3021 arc_hdr_destroy(hdr); 3022} 3023 3024/* 3025 * "Read" the block block at the specified DVA (in bp) via the 3026 * cache. If the block is found in the cache, invoke the provided 3027 * callback immediately and return. Note that the `zio' parameter 3028 * in the callback will be NULL in this case, since no IO was 3029 * required. If the block is not in the cache pass the read request 3030 * on to the spa with a substitute callback function, so that the 3031 * requested block will be added to the cache. 3032 * 3033 * If a read request arrives for a block that has a read in-progress, 3034 * either wait for the in-progress read to complete (and return the 3035 * results); or, if this is a read with a "done" func, add a record 3036 * to the read to invoke the "done" func when the read completes, 3037 * and return; or just return. 3038 * 3039 * arc_read_done() will invoke all the requested "done" functions 3040 * for readers of this block. 3041 / 3042int 3043arc_read(zio_t pio, spa_t spa, const blkptr_t bp, arc_done_func_t done, 3044* void private, int priority, int zio_flags, uint32_t arc_flags, 3045 const zbookmark_t zb) 3046{ 3047* arc_buf_hdr_t *hdr;
3048 arc_buf_t *buf;	3048 arc_buf_t *buf = NULL;
3049 kmutex_t hash_lock; 3050* zio_t rzio; 3051* uint64_t guid = spa_load_guid(spa); 3052 3053top: 3054 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 3055 &hash_lock); 3056 if (hdr && hdr->b_datacnt > 0) { 3057 3058 arc_flags \|= ARC_CACHED; 3059* 3060 if (HDR_IO_IN_PROGRESS(hdr)) { 3061 3062 if (arc_flags & ARC_WAIT) { 3063* cv_wait(&hdr->b_cv, hash_lock); 3064 mutex_exit(hash_lock); 3065 goto top; 3066 } 3067 ASSERT(arc_flags & ARC_NOWAIT); 3068* 3069 if (done) { 3070 arc_callback_t acb = NULL; 3071* 3072 acb = kmem_zalloc(sizeof (arc_callback_t), 3073 KM_SLEEP); 3074 acb->acb_done = done; 3075 acb->acb_private = private; 3076 if (pio != NULL) 3077 acb->acb_zio_dummy = zio_null(pio, 3078 spa, NULL, NULL, NULL, zio_flags); 3079 3080 ASSERT(acb->acb_done != NULL); 3081 acb->acb_next = hdr->b_acb; 3082 hdr->b_acb = acb; 3083 add_reference(hdr, hash_lock, private); 3084 mutex_exit(hash_lock); 3085 return (0); 3086 } 3087 mutex_exit(hash_lock); 3088 return (0); 3089 } 3090 3091 ASSERT(hdr->b_state == arc_mru \|\| hdr->b_state == arc_mfu); 3092 3093 if (done) { 3094 add_reference(hdr, hash_lock, private); 3095 /* 3096 * If this block is already in use, create a new 3097 * copy of the data so that we will be guaranteed 3098 * that arc_release() will always succeed. 
3099 / 3100* buf = hdr->b_buf; 3101 ASSERT(buf); 3102 ASSERT(buf->b_data); 3103 if (HDR_BUF_AVAILABLE(hdr)) { 3104 ASSERT(buf->b_efunc == NULL); 3105 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3106 } else { 3107 buf = arc_buf_clone(buf); 3108 } 3109 3110 } else if (arc_flags & ARC_PREFETCH && 3111* refcount_count(&hdr->b_refcnt) == 0) { 3112 hdr->b_flags \|= ARC_PREFETCH; 3113 } 3114 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t , hdr); 3115* arc_access(hdr, hash_lock); 3116 if (arc_flags & ARC_L2CACHE) 3117* hdr->b_flags \|= ARC_L2CACHE; 3118 mutex_exit(hash_lock); 3119 ARCSTAT_BUMP(arcstat_hits); 3120 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3121 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3122 data, metadata, hits); 3123 3124 if (done) 3125 done(NULL, buf, private); 3126 } else { 3127 uint64_t size = BP_GET_LSIZE(bp); 3128 arc_callback_t acb; 3129* vdev_t *vd = NULL;	3049 kmutex_t hash_lock; 3050* zio_t rzio; 3051* uint64_t guid = spa_load_guid(spa); 3052 3053top: 3054 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 3055 &hash_lock); 3056 if (hdr && hdr->b_datacnt > 0) { 3057 3058 arc_flags \|= ARC_CACHED; 3059* 3060 if (HDR_IO_IN_PROGRESS(hdr)) { 3061 3062 if (arc_flags & ARC_WAIT) { 3063* cv_wait(&hdr->b_cv, hash_lock); 3064 mutex_exit(hash_lock); 3065 goto top; 3066 } 3067 ASSERT(arc_flags & ARC_NOWAIT); 3068* 3069 if (done) { 3070 arc_callback_t acb = NULL; 3071* 3072 acb = kmem_zalloc(sizeof (arc_callback_t), 3073 KM_SLEEP); 3074 acb->acb_done = done; 3075 acb->acb_private = private; 3076 if (pio != NULL) 3077 acb->acb_zio_dummy = zio_null(pio, 3078 spa, NULL, NULL, NULL, zio_flags); 3079 3080 ASSERT(acb->acb_done != NULL); 3081 acb->acb_next = hdr->b_acb; 3082 hdr->b_acb = acb; 3083 add_reference(hdr, hash_lock, private); 3084 mutex_exit(hash_lock); 3085 return (0); 3086 } 3087 mutex_exit(hash_lock); 3088 return (0); 3089 } 3090 3091 ASSERT(hdr->b_state == arc_mru \|\| hdr->b_state == arc_mfu); 3092 3093 if (done) { 3094 
add_reference(hdr, hash_lock, private); 3095 /* 3096 * If this block is already in use, create a new 3097 * copy of the data so that we will be guaranteed 3098 * that arc_release() will always succeed. 3099 / 3100* buf = hdr->b_buf; 3101 ASSERT(buf); 3102 ASSERT(buf->b_data); 3103 if (HDR_BUF_AVAILABLE(hdr)) { 3104 ASSERT(buf->b_efunc == NULL); 3105 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3106 } else { 3107 buf = arc_buf_clone(buf); 3108 } 3109 3110 } else if (arc_flags & ARC_PREFETCH && 3111* refcount_count(&hdr->b_refcnt) == 0) { 3112 hdr->b_flags \|= ARC_PREFETCH; 3113 } 3114 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t , hdr); 3115* arc_access(hdr, hash_lock); 3116 if (arc_flags & ARC_L2CACHE) 3117* hdr->b_flags \|= ARC_L2CACHE; 3118 mutex_exit(hash_lock); 3119 ARCSTAT_BUMP(arcstat_hits); 3120 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3121 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3122 data, metadata, hits); 3123 3124 if (done) 3125 done(NULL, buf, private); 3126 } else { 3127 uint64_t size = BP_GET_LSIZE(bp); 3128 arc_callback_t acb; 3129* vdev_t *vd = NULL;
3130 uint64_t addr;	3130 uint64_t addr = 0;
3131 boolean_t devw = B_FALSE; 3132 3133 if (hdr == NULL) { 3134 /* this block is not in the cache / 3135* arc_buf_hdr_t exists; 3136* arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3137 buf = arc_buf_alloc(spa, size, private, type); 3138 hdr = buf->b_hdr; 3139 hdr->b_dva = BP_IDENTITY(bp); 3140* hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3141 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 3142 exists = buf_hash_insert(hdr, &hash_lock); 3143 if (exists) { 3144 /* somebody beat us to the hash insert / 3145* mutex_exit(hash_lock); 3146 buf_discard_identity(hdr); 3147 (void) arc_buf_remove_ref(buf, private); 3148 goto top; /* restart the IO request / 3149* } 3150 /* if this is a prefetch, we don't have a reference / 3151* if (arc_flags & ARC_PREFETCH) { 3152* (void) remove_reference(hdr, hash_lock, 3153 private); 3154 hdr->b_flags \|= ARC_PREFETCH; 3155 } 3156 if (arc_flags & ARC_L2CACHE) 3157* hdr->b_flags \|= ARC_L2CACHE; 3158 if (BP_GET_LEVEL(bp) > 0) 3159 hdr->b_flags \|= ARC_INDIRECT; 3160 } else { 3161 /* this block is in the ghost cache / 3162* ASSERT(GHOST_STATE(hdr->b_state)); 3163 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3164 ASSERT0(refcount_count(&hdr->b_refcnt)); 3165 ASSERT(hdr->b_buf == NULL); 3166 3167 /* if this is a prefetch, we don't have a reference / 3168* if (arc_flags & ARC_PREFETCH) 3169* hdr->b_flags \|= ARC_PREFETCH; 3170 else 3171 add_reference(hdr, hash_lock, private); 3172 if (arc_flags & ARC_L2CACHE) 3173* hdr->b_flags \|= ARC_L2CACHE; 3174 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3175 buf->b_hdr = hdr; 3176 buf->b_data = NULL; 3177 buf->b_efunc = NULL; 3178 buf->b_private = NULL; 3179 buf->b_next = NULL; 3180 hdr->b_buf = buf; 3181 ASSERT(hdr->b_datacnt == 0); 3182 hdr->b_datacnt = 1; 3183 arc_get_data_buf(buf); 3184 arc_access(hdr, hash_lock); 3185 } 3186 3187 ASSERT(!GHOST_STATE(hdr->b_state)); 3188 3189 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3190 acb->acb_done = done; 3191 acb->acb_private = private; 3192 3193 ASSERT(hdr->b_acb 
== NULL); 3194 hdr->b_acb = acb; 3195 hdr->b_flags \|= ARC_IO_IN_PROGRESS; 3196 3197 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 3198 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 3199 devw = hdr->b_l2hdr->b_dev->l2ad_writing; 3200 addr = hdr->b_l2hdr->b_daddr; 3201 /* 3202 * Lock out device removal. 3203 / 3204* if (vdev_is_dead(vd) \|\| 3205 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3206 vd = NULL; 3207 } 3208 3209 mutex_exit(hash_lock); 3210 3211 ASSERT3U(hdr->b_size, ==, size); 3212 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t , hdr, blkptr_t , bp, 3213 uint64_t, size, zbookmark_t , zb); 3214* ARCSTAT_BUMP(arcstat_misses); 3215 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3216 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3217 data, metadata, misses); 3218#ifdef _KERNEL 3219 curthread->td_ru.ru_inblock++; 3220#endif 3221 3222 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 3223 /* 3224 * Read from the L2ARC if the following are true: 3225 * 1. The L2ARC vdev was previously cached. 3226 * 2. This buffer still has L2ARC metadata. 3227 * 3. This buffer isn't currently writing to the L2ARC. 3228 * 4. The L2ARC entry wasn't evicted, which may 3229 * also have invalidated the vdev. 3230 * 5. This isn't prefetch and l2arc_noprefetch is set. 
3231 / 3232* if (hdr->b_l2hdr != NULL && 3233 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 3234 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 3235 l2arc_read_callback_t cb; 3236* 3237 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t , hdr); 3238* ARCSTAT_BUMP(arcstat_l2_hits); 3239 3240 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 3241 KM_SLEEP); 3242 cb->l2rcb_buf = buf; 3243 cb->l2rcb_spa = spa; 3244 cb->l2rcb_bp = bp; 3245* cb->l2rcb_zb = zb; 3246* cb->l2rcb_flags = zio_flags; 3247	3131 boolean_t devw = B_FALSE; 3132 3133 if (hdr == NULL) { 3134 /* this block is not in the cache / 3135* arc_buf_hdr_t exists; 3136* arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3137 buf = arc_buf_alloc(spa, size, private, type); 3138 hdr = buf->b_hdr; 3139 hdr->b_dva = BP_IDENTITY(bp); 3140* hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3141 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 3142 exists = buf_hash_insert(hdr, &hash_lock); 3143 if (exists) { 3144 /* somebody beat us to the hash insert / 3145* mutex_exit(hash_lock); 3146 buf_discard_identity(hdr); 3147 (void) arc_buf_remove_ref(buf, private); 3148 goto top; /* restart the IO request / 3149* } 3150 /* if this is a prefetch, we don't have a reference / 3151* if (arc_flags & ARC_PREFETCH) { 3152* (void) remove_reference(hdr, hash_lock, 3153 private); 3154 hdr->b_flags \|= ARC_PREFETCH; 3155 } 3156 if (arc_flags & ARC_L2CACHE) 3157* hdr->b_flags \|= ARC_L2CACHE; 3158 if (BP_GET_LEVEL(bp) > 0) 3159 hdr->b_flags \|= ARC_INDIRECT; 3160 } else { 3161 /* this block is in the ghost cache / 3162* ASSERT(GHOST_STATE(hdr->b_state)); 3163 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3164 ASSERT0(refcount_count(&hdr->b_refcnt)); 3165 ASSERT(hdr->b_buf == NULL); 3166 3167 /* if this is a prefetch, we don't have a reference / 3168* if (arc_flags & ARC_PREFETCH) 3169* hdr->b_flags \|= ARC_PREFETCH; 3170 else 3171 add_reference(hdr, hash_lock, private); 3172 if (arc_flags & ARC_L2CACHE) 3173* hdr->b_flags \|= ARC_L2CACHE; 3174 buf = kmem_cache_alloc(buf_cache, 
KM_PUSHPAGE); 3175 buf->b_hdr = hdr; 3176 buf->b_data = NULL; 3177 buf->b_efunc = NULL; 3178 buf->b_private = NULL; 3179 buf->b_next = NULL; 3180 hdr->b_buf = buf; 3181 ASSERT(hdr->b_datacnt == 0); 3182 hdr->b_datacnt = 1; 3183 arc_get_data_buf(buf); 3184 arc_access(hdr, hash_lock); 3185 } 3186 3187 ASSERT(!GHOST_STATE(hdr->b_state)); 3188 3189 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3190 acb->acb_done = done; 3191 acb->acb_private = private; 3192 3193 ASSERT(hdr->b_acb == NULL); 3194 hdr->b_acb = acb; 3195 hdr->b_flags \|= ARC_IO_IN_PROGRESS; 3196 3197 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 3198 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 3199 devw = hdr->b_l2hdr->b_dev->l2ad_writing; 3200 addr = hdr->b_l2hdr->b_daddr; 3201 /* 3202 * Lock out device removal. 3203 / 3204* if (vdev_is_dead(vd) \|\| 3205 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3206 vd = NULL; 3207 } 3208 3209 mutex_exit(hash_lock); 3210 3211 ASSERT3U(hdr->b_size, ==, size); 3212 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t , hdr, blkptr_t , bp, 3213 uint64_t, size, zbookmark_t , zb); 3214* ARCSTAT_BUMP(arcstat_misses); 3215 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3216 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3217 data, metadata, misses); 3218#ifdef _KERNEL 3219 curthread->td_ru.ru_inblock++; 3220#endif 3221 3222 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 3223 /* 3224 * Read from the L2ARC if the following are true: 3225 * 1. The L2ARC vdev was previously cached. 3226 * 2. This buffer still has L2ARC metadata. 3227 * 3. This buffer isn't currently writing to the L2ARC. 3228 * 4. The L2ARC entry wasn't evicted, which may 3229 * also have invalidated the vdev. 3230 * 5. This isn't prefetch and l2arc_noprefetch is set. 
3231 / 3232* if (hdr->b_l2hdr != NULL && 3233 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 3234 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 3235 l2arc_read_callback_t cb; 3236* 3237 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t , hdr); 3238* ARCSTAT_BUMP(arcstat_l2_hits); 3239 3240 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 3241 KM_SLEEP); 3242 cb->l2rcb_buf = buf; 3243 cb->l2rcb_spa = spa; 3244 cb->l2rcb_bp = bp; 3245* cb->l2rcb_zb = zb; 3246* cb->l2rcb_flags = zio_flags; 3247
	3248 ASSERT(addr >= VDEV_LABEL_START_SIZE && 3249 addr + size < vd->vdev_psize - 3250 VDEV_LABEL_END_SIZE); 3251
3248 /* 3249 * l2arc read. The SCL_L2ARC lock will be 3250 * released by l2arc_read_done(). 3251 / 3252* rzio = zio_read_phys(pio, vd, addr, size, 3253 buf->b_data, ZIO_CHECKSUM_OFF, 3254 l2arc_read_done, cb, priority, zio_flags \| 3255 ZIO_FLAG_DONT_CACHE \| ZIO_FLAG_CANFAIL \| 3256 ZIO_FLAG_DONT_PROPAGATE \| 3257 ZIO_FLAG_DONT_RETRY, B_FALSE); 3258 DTRACE_PROBE2(l2arc__read, vdev_t , vd, 3259* zio_t , rzio); 3260* ARCSTAT_INCR(arcstat_l2_read_bytes, size); 3261 3262 if (arc_flags & ARC_NOWAIT) { 3263* zio_nowait(rzio); 3264 return (0); 3265 } 3266 3267 ASSERT(arc_flags & ARC_WAIT); 3268* if (zio_wait(rzio) == 0) 3269 return (0); 3270 3271 /* l2arc read error; goto zio_read() / 3272* } else { 3273 DTRACE_PROBE1(l2arc__miss, 3274 arc_buf_hdr_t , hdr); 3275* ARCSTAT_BUMP(arcstat_l2_misses); 3276 if (HDR_L2_WRITING(hdr)) 3277 ARCSTAT_BUMP(arcstat_l2_rw_clash); 3278 spa_config_exit(spa, SCL_L2ARC, vd); 3279 } 3280 } else { 3281 if (vd != NULL) 3282 spa_config_exit(spa, SCL_L2ARC, vd); 3283 if (l2arc_ndev != 0) { 3284 DTRACE_PROBE1(l2arc__miss, 3285 arc_buf_hdr_t , hdr); 3286* ARCSTAT_BUMP(arcstat_l2_misses); 3287 } 3288 } 3289 3290 rzio = zio_read(pio, spa, bp, buf->b_data, size, 3291 arc_read_done, buf, priority, zio_flags, zb); 3292 3293 if (arc_flags & ARC_WAIT) 3294* return (zio_wait(rzio)); 3295 3296 ASSERT(arc_flags & ARC_NOWAIT); 3297* zio_nowait(rzio); 3298 } 3299 return (0); 3300} 3301 3302void 3303arc_set_callback(arc_buf_t buf, arc_evict_func_t func, void private) 3304{ 3305* ASSERT(buf->b_hdr != NULL); 3306 ASSERT(buf->b_hdr->b_state != arc_anon); 3307 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) \|\| func == NULL); 3308 ASSERT(buf->b_efunc == NULL); 3309 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 3310 3311 buf->b_efunc = func; 3312 buf->b_private = private; 3313} 3314 3315/* 3316 * This is used by the DMU to let the ARC know that a buffer is 3317 * being evicted, so the ARC should clean up. 
If this arc buf 3318 * is not yet in the evicted state, it will be put there. 3319 / 3320int 3321arc_buf_evict(arc_buf_t buf) 3322{ 3323 arc_buf_hdr_t hdr; 3324* kmutex_t hash_lock; 3325* arc_buf_t *bufp; 3326* list_t list, evicted_list; 3327 kmutex_t lock, evicted_lock; 3328 3329 mutex_enter(&buf->b_evict_lock); 3330 hdr = buf->b_hdr; 3331 if (hdr == NULL) { 3332 /* 3333 * We are in arc_do_user_evicts(). 3334 / 3335* ASSERT(buf->b_data == NULL); 3336 mutex_exit(&buf->b_evict_lock); 3337 return (0); 3338 } else if (buf->b_data == NULL) { 3339 arc_buf_t copy = buf; / structure assignment / 3340* /* 3341 * We are on the eviction list; process this buffer now 3342 * but let arc_do_user_evicts() do the reaping. 3343 / 3344* buf->b_efunc = NULL; 3345 mutex_exit(&buf->b_evict_lock); 3346 VERIFY(copy.b_efunc(&copy) == 0); 3347 return (1); 3348 } 3349 hash_lock = HDR_LOCK(hdr); 3350 mutex_enter(hash_lock); 3351 hdr = buf->b_hdr; 3352 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3353 3354 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 3355 ASSERT(hdr->b_state == arc_mru \|\| hdr->b_state == arc_mfu); 3356 3357 /* 3358 * Pull this buffer off of the hdr 3359 / 3360* bufp = &hdr->b_buf; 3361 while (bufp != buf) 3362* bufp = &(bufp)->b_next; 3363* bufp = buf->b_next; 3364* 3365 ASSERT(buf->b_data != NULL); 3366 arc_buf_destroy(buf, FALSE, FALSE); 3367 3368 if (hdr->b_datacnt == 0) { 3369 arc_state_t old_state = hdr->b_state; 3370* arc_state_t evicted_state; 3371* 3372 ASSERT(hdr->b_buf == NULL); 3373 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 3374 3375 evicted_state = 3376 (old_state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 3377 3378 get_buf_info(hdr, old_state, &list, &lock); 3379 get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock); 3380 mutex_enter(lock); 3381 mutex_enter(evicted_lock); 3382 3383 arc_change_state(evicted_state, hdr, hash_lock); 3384 ASSERT(HDR_IN_HASH_TABLE(hdr)); 3385 hdr->b_flags \|= ARC_IN_HASH_TABLE; 3386 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3387 3388 mutex_exit(evicted_lock); 3389 mutex_exit(lock); 3390 } 3391 mutex_exit(hash_lock); 3392 mutex_exit(&buf->b_evict_lock); 3393 3394 VERIFY(buf->b_efunc(buf) == 0); 3395 buf->b_efunc = NULL; 3396 buf->b_private = NULL; 3397 buf->b_hdr = NULL; 3398 buf->b_next = NULL; 3399 kmem_cache_free(buf_cache, buf); 3400 return (1); 3401} 3402 3403/* 3404 * Release this buffer from the cache. This must be done 3405 * after a read and prior to modifying the buffer contents. 3406 * If the buffer has more than one reference, we must make 3407 * a new hdr for the buffer. 3408 / 3409void 3410arc_release(arc_buf_t buf, void tag) 3411{ 3412* arc_buf_hdr_t hdr; 3413* kmutex_t hash_lock = NULL; 3414* l2arc_buf_hdr_t l2hdr; 3415* uint64_t buf_size; 3416 3417 /* 3418 * It would be nice to assert that if it's DMU metadata (level > 3419 * 0 \|\| it's the dnode file), then it must be syncing context. 3420 * But we don't know that information at this level. 3421 / 3422* 3423 mutex_enter(&buf->b_evict_lock); 3424 hdr = buf->b_hdr; 3425 3426 /* this buffer is not on any list / 3427* ASSERT(refcount_count(&hdr->b_refcnt) > 0); 3428 3429 if (hdr->b_state == arc_anon) { 3430 /* this buffer is already released / 3431* ASSERT(buf->b_efunc == NULL); 3432 } else { 3433 hash_lock = HDR_LOCK(hdr); 3434 mutex_enter(hash_lock); 3435 hdr = buf->b_hdr; 3436 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3437 } 3438 3439 l2hdr = hdr->b_l2hdr; 3440 if (l2hdr) { 3441 mutex_enter(&l2arc_buflist_mtx); 3442 hdr->b_l2hdr = NULL;	3252 /* 3253 * l2arc read. The SCL_L2ARC lock will be 3254 * released by l2arc_read_done(). 
3255 / 3256* rzio = zio_read_phys(pio, vd, addr, size, 3257 buf->b_data, ZIO_CHECKSUM_OFF, 3258 l2arc_read_done, cb, priority, zio_flags \| 3259 ZIO_FLAG_DONT_CACHE \| ZIO_FLAG_CANFAIL \| 3260 ZIO_FLAG_DONT_PROPAGATE \| 3261 ZIO_FLAG_DONT_RETRY, B_FALSE); 3262 DTRACE_PROBE2(l2arc__read, vdev_t , vd, 3263* zio_t , rzio); 3264* ARCSTAT_INCR(arcstat_l2_read_bytes, size); 3265 3266 if (arc_flags & ARC_NOWAIT) { 3267* zio_nowait(rzio); 3268 return (0); 3269 } 3270 3271 ASSERT(arc_flags & ARC_WAIT); 3272* if (zio_wait(rzio) == 0) 3273 return (0); 3274 3275 /* l2arc read error; goto zio_read() / 3276* } else { 3277 DTRACE_PROBE1(l2arc__miss, 3278 arc_buf_hdr_t , hdr); 3279* ARCSTAT_BUMP(arcstat_l2_misses); 3280 if (HDR_L2_WRITING(hdr)) 3281 ARCSTAT_BUMP(arcstat_l2_rw_clash); 3282 spa_config_exit(spa, SCL_L2ARC, vd); 3283 } 3284 } else { 3285 if (vd != NULL) 3286 spa_config_exit(spa, SCL_L2ARC, vd); 3287 if (l2arc_ndev != 0) { 3288 DTRACE_PROBE1(l2arc__miss, 3289 arc_buf_hdr_t , hdr); 3290* ARCSTAT_BUMP(arcstat_l2_misses); 3291 } 3292 } 3293 3294 rzio = zio_read(pio, spa, bp, buf->b_data, size, 3295 arc_read_done, buf, priority, zio_flags, zb); 3296 3297 if (arc_flags & ARC_WAIT) 3298* return (zio_wait(rzio)); 3299 3300 ASSERT(arc_flags & ARC_NOWAIT); 3301* zio_nowait(rzio); 3302 } 3303 return (0); 3304} 3305 3306void 3307arc_set_callback(arc_buf_t buf, arc_evict_func_t func, void private) 3308{ 3309* ASSERT(buf->b_hdr != NULL); 3310 ASSERT(buf->b_hdr->b_state != arc_anon); 3311 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) \|\| func == NULL); 3312 ASSERT(buf->b_efunc == NULL); 3313 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 3314 3315 buf->b_efunc = func; 3316 buf->b_private = private; 3317} 3318 3319/* 3320 * This is used by the DMU to let the ARC know that a buffer is 3321 * being evicted, so the ARC should clean up. If this arc buf 3322 * is not yet in the evicted state, it will be put there. 
3323 / 3324int 3325arc_buf_evict(arc_buf_t buf) 3326{ 3327 arc_buf_hdr_t hdr; 3328* kmutex_t hash_lock; 3329* arc_buf_t *bufp; 3330* list_t list, evicted_list; 3331 kmutex_t lock, evicted_lock; 3332 3333 mutex_enter(&buf->b_evict_lock); 3334 hdr = buf->b_hdr; 3335 if (hdr == NULL) { 3336 /* 3337 * We are in arc_do_user_evicts(). 3338 / 3339* ASSERT(buf->b_data == NULL); 3340 mutex_exit(&buf->b_evict_lock); 3341 return (0); 3342 } else if (buf->b_data == NULL) { 3343 arc_buf_t copy = buf; / structure assignment / 3344* /* 3345 * We are on the eviction list; process this buffer now 3346 * but let arc_do_user_evicts() do the reaping. 3347 / 3348* buf->b_efunc = NULL; 3349 mutex_exit(&buf->b_evict_lock); 3350 VERIFY(copy.b_efunc(&copy) == 0); 3351 return (1); 3352 } 3353 hash_lock = HDR_LOCK(hdr); 3354 mutex_enter(hash_lock); 3355 hdr = buf->b_hdr; 3356 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3357 3358 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 3359 ASSERT(hdr->b_state == arc_mru \|\| hdr->b_state == arc_mfu); 3360 3361 /* 3362 * Pull this buffer off of the hdr 3363 / 3364* bufp = &hdr->b_buf; 3365 while (bufp != buf) 3366* bufp = &(bufp)->b_next; 3367* bufp = buf->b_next; 3368* 3369 ASSERT(buf->b_data != NULL); 3370 arc_buf_destroy(buf, FALSE, FALSE); 3371 3372 if (hdr->b_datacnt == 0) { 3373 arc_state_t old_state = hdr->b_state; 3374* arc_state_t evicted_state; 3375* 3376 ASSERT(hdr->b_buf == NULL); 3377 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 3378 3379 evicted_state = 3380 (old_state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 3381 3382 get_buf_info(hdr, old_state, &list, &lock); 3383 get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock); 3384 mutex_enter(lock); 3385 mutex_enter(evicted_lock); 3386 3387 arc_change_state(evicted_state, hdr, hash_lock); 3388 ASSERT(HDR_IN_HASH_TABLE(hdr)); 3389 hdr->b_flags \|= ARC_IN_HASH_TABLE; 3390 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3391 3392 mutex_exit(evicted_lock); 3393 mutex_exit(lock); 3394 } 3395 mutex_exit(hash_lock); 3396 mutex_exit(&buf->b_evict_lock); 3397 3398 VERIFY(buf->b_efunc(buf) == 0); 3399 buf->b_efunc = NULL; 3400 buf->b_private = NULL; 3401 buf->b_hdr = NULL; 3402 buf->b_next = NULL; 3403 kmem_cache_free(buf_cache, buf); 3404 return (1); 3405} 3406 3407/* 3408 * Release this buffer from the cache. This must be done 3409 * after a read and prior to modifying the buffer contents. 3410 * If the buffer has more than one reference, we must make 3411 * a new hdr for the buffer. 3412 / 3413void 3414arc_release(arc_buf_t buf, void tag) 3415{ 3416* arc_buf_hdr_t hdr; 3417* kmutex_t hash_lock = NULL; 3418* l2arc_buf_hdr_t l2hdr; 3419* uint64_t buf_size; 3420 3421 /* 3422 * It would be nice to assert that if it's DMU metadata (level > 3423 * 0 \|\| it's the dnode file), then it must be syncing context. 3424 * But we don't know that information at this level. 3425 / 3426* 3427 mutex_enter(&buf->b_evict_lock); 3428 hdr = buf->b_hdr; 3429 3430 /* this buffer is not on any list / 3431* ASSERT(refcount_count(&hdr->b_refcnt) > 0); 3432 3433 if (hdr->b_state == arc_anon) { 3434 /* this buffer is already released / 3435* ASSERT(buf->b_efunc == NULL); 3436 } else { 3437 hash_lock = HDR_LOCK(hdr); 3438 mutex_enter(hash_lock); 3439 hdr = buf->b_hdr; 3440 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3441 } 3442 3443 l2hdr = hdr->b_l2hdr; 3444 if (l2hdr) { 3445 mutex_enter(&l2arc_buflist_mtx); 3446 hdr->b_l2hdr = NULL;
3443 buf_size = hdr->b_size;
3444 }	3447 }
	3448 buf_size = hdr->b_size;
3445 3446 /* 3447 * Do we have more than one buf? 3448 / 3449* if (hdr->b_datacnt > 1) { 3450 arc_buf_hdr_t nhdr; 3451* arc_buf_t *bufp; 3452* uint64_t blksz = hdr->b_size; 3453 uint64_t spa = hdr->b_spa; 3454 arc_buf_contents_t type = hdr->b_type; 3455 uint32_t flags = hdr->b_flags; 3456 3457 ASSERT(hdr->b_buf != buf \|\| buf->b_next != NULL); 3458 /* 3459 * Pull the data off of this hdr and attach it to 3460 * a new anonymous hdr. 3461 / 3462* (void) remove_reference(hdr, hash_lock, tag); 3463 bufp = &hdr->b_buf; 3464 while (bufp != buf) 3465* bufp = &(bufp)->b_next; 3466* bufp = buf->b_next; 3467* buf->b_next = NULL; 3468 3469 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3470 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3471 if (refcount_is_zero(&hdr->b_refcnt)) { 3472 uint64_t size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3473* ASSERT3U(size, >=, hdr->b_size); 3474* atomic_add_64(size, -hdr->b_size); 3475 } 3476 3477 /* 3478 * We're releasing a duplicate user data buffer, update 3479 * our statistics accordingly. 
3480 / 3481* if (hdr->b_type == ARC_BUFC_DATA) { 3482 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 3483 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 3484 -hdr->b_size); 3485 } 3486 hdr->b_datacnt -= 1; 3487 arc_cksum_verify(buf); 3488#ifdef illumos 3489 arc_buf_unwatch(buf); 3490#endif /* illumos / 3491* 3492 mutex_exit(hash_lock); 3493 3494 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3495 nhdr->b_size = blksz; 3496 nhdr->b_spa = spa; 3497 nhdr->b_type = type; 3498 nhdr->b_buf = buf; 3499 nhdr->b_state = arc_anon; 3500 nhdr->b_arc_access = 0; 3501 nhdr->b_flags = flags & ARC_L2_WRITING; 3502 nhdr->b_l2hdr = NULL; 3503 nhdr->b_datacnt = 1; 3504 nhdr->b_freeze_cksum = NULL; 3505 (void) refcount_add(&nhdr->b_refcnt, tag); 3506 buf->b_hdr = nhdr; 3507 mutex_exit(&buf->b_evict_lock); 3508 atomic_add_64(&arc_anon->arcs_size, blksz); 3509 } else { 3510 mutex_exit(&buf->b_evict_lock); 3511 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3512 ASSERT(!list_link_active(&hdr->b_arc_node)); 3513 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3514 if (hdr->b_state != arc_anon) 3515 arc_change_state(arc_anon, hdr, hash_lock); 3516 hdr->b_arc_access = 0; 3517 if (hash_lock) 3518 mutex_exit(hash_lock); 3519 3520 buf_discard_identity(hdr); 3521 arc_buf_thaw(buf); 3522 } 3523 buf->b_efunc = NULL; 3524 buf->b_private = NULL; 3525 3526 if (l2hdr) { 3527 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3528 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3529 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3530 mutex_exit(&l2arc_buflist_mtx); 3531 } 3532} 3533 3534int 3535arc_released(arc_buf_t buf) 3536{ 3537* int released; 3538 3539 mutex_enter(&buf->b_evict_lock); 3540 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3541 mutex_exit(&buf->b_evict_lock); 3542 return (released); 3543} 3544 3545int 3546arc_has_callback(arc_buf_t buf) 3547{ 3548* int callback; 3549 3550 mutex_enter(&buf->b_evict_lock); 3551 callback = (buf->b_efunc != NULL); 3552 mutex_exit(&buf->b_evict_lock); 3553 return 
(callback); 3554} 3555 3556#ifdef ZFS_DEBUG 3557int 3558arc_referenced(arc_buf_t buf) 3559{ 3560* int referenced; 3561 3562 mutex_enter(&buf->b_evict_lock); 3563 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3564 mutex_exit(&buf->b_evict_lock); 3565 return (referenced); 3566} 3567#endif 3568 3569static void 3570arc_write_ready(zio_t zio) 3571{ 3572* arc_write_callback_t callback = zio->io_private; 3573* arc_buf_t buf = callback->awcb_buf; 3574* arc_buf_hdr_t hdr = buf->b_hdr; 3575* 3576 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3577 callback->awcb_ready(zio, buf, callback->awcb_private); 3578 3579 /* 3580 * If the IO is already in progress, then this is a re-write 3581 * attempt, so we need to thaw and re-compute the cksum. 3582 * It is the responsibility of the callback to handle the 3583 * accounting for any re-write attempt. 3584 / 3585* if (HDR_IO_IN_PROGRESS(hdr)) { 3586 mutex_enter(&hdr->b_freeze_lock); 3587 if (hdr->b_freeze_cksum != NULL) { 3588 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3589 hdr->b_freeze_cksum = NULL; 3590 } 3591 mutex_exit(&hdr->b_freeze_lock); 3592 } 3593 arc_cksum_compute(buf, B_FALSE); 3594 hdr->b_flags \|= ARC_IO_IN_PROGRESS; 3595} 3596 3597static void 3598arc_write_done(zio_t zio) 3599{ 3600* arc_write_callback_t callback = zio->io_private; 3601* arc_buf_t buf = callback->awcb_buf; 3602* arc_buf_hdr_t hdr = buf->b_hdr; 3603* 3604 ASSERT(hdr->b_acb == NULL); 3605 3606 if (zio->io_error == 0) { 3607 hdr->b_dva = BP_IDENTITY(zio->io_bp); 3608* hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3609 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3610 } else { 3611 ASSERT(BUF_EMPTY(hdr)); 3612 } 3613 3614 /* 3615 * If the block to be written was all-zero, we may have 3616 * compressed it away. In this case no write was performed 3617 * so there will be no dva/birth/checksum. The buffer must 3618 * therefore remain anonymous (and uncached). 
3619 / 3620* if (!BUF_EMPTY(hdr)) { 3621 arc_buf_hdr_t exists; 3622* kmutex_t hash_lock; 3623* 3624 ASSERT(zio->io_error == 0); 3625 3626 arc_cksum_verify(buf); 3627 3628 exists = buf_hash_insert(hdr, &hash_lock); 3629 if (exists) { 3630 /* 3631 * This can only happen if we overwrite for 3632 * sync-to-convergence, because we remove 3633 * buffers from the hash table when we arc_free(). 3634 / 3635* if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3636 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3637 panic("bad overwrite, hdr=%p exists=%p", 3638 (void )hdr, (void )exists); 3639 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3640 arc_change_state(arc_anon, exists, hash_lock); 3641 mutex_exit(hash_lock); 3642 arc_hdr_destroy(exists); 3643 exists = buf_hash_insert(hdr, &hash_lock); 3644 ASSERT3P(exists, ==, NULL); 3645 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 3646 /* nopwrite / 3647* ASSERT(zio->io_prop.zp_nopwrite); 3648 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3649 panic("bad nopwrite, hdr=%p exists=%p", 3650 (void )hdr, (void )exists); 3651 } else { 3652 /* Dedup / 3653* ASSERT(hdr->b_datacnt == 1); 3654 ASSERT(hdr->b_state == arc_anon); 3655 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3656 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3657 } 3658 } 3659 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3660 /* if it's not anon, we are doing a scrub / 3661* if (!exists && hdr->b_state == arc_anon) 3662 arc_access(hdr, hash_lock); 3663 mutex_exit(hash_lock); 3664 } else { 3665 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3666 } 3667 3668 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3669 callback->awcb_done(zio, buf, callback->awcb_private); 3670 3671 kmem_free(callback, sizeof (arc_write_callback_t)); 3672} 3673 3674zio_t * 3675arc_write(zio_t pio, spa_t spa, uint64_t txg, 3676 blkptr_t bp, arc_buf_t buf, boolean_t l2arc, const zio_prop_t zp, 3677* arc_done_func_t ready, arc_done_func_t done, void private, 3678* int priority, int zio_flags, const zbookmark_t zb) 3679{ 3680* arc_buf_hdr_t hdr = 
buf->b_hdr; 3681* arc_write_callback_t callback; 3682* zio_t zio; 3683* 3684 ASSERT(ready != NULL); 3685 ASSERT(done != NULL); 3686 ASSERT(!HDR_IO_ERROR(hdr)); 3687 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3688 ASSERT(hdr->b_acb == NULL); 3689 if (l2arc) 3690 hdr->b_flags \|= ARC_L2CACHE; 3691 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3692 callback->awcb_ready = ready; 3693 callback->awcb_done = done; 3694 callback->awcb_private = private; 3695 callback->awcb_buf = buf; 3696 3697 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3698 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 3699 3700 return (zio); 3701} 3702 3703static int 3704arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) 3705{ 3706#ifdef _KERNEL 3707 uint64_t available_memory = 3708 ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count); 3709 static uint64_t page_load = 0; 3710 static uint64_t last_txg = 0; 3711 3712#ifdef sun 3713#if defined(__i386) 3714 available_memory = 3715 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3716#endif 3717#endif /* sun / 3718* if (available_memory >= zfs_write_limit_max) 3719 return (0); 3720 3721 if (txg > last_txg) { 3722 last_txg = txg; 3723 page_load = 0; 3724 } 3725 /* 3726 * If we are in pageout, we know that memory is already tight, 3727 * the arc is already going to be evicting, so we just want to 3728 * continue to let page writes occur as quickly as possible. 
3729 / 3730* if (curproc == pageproc) { 3731 if (page_load > available_memory / 4) 3732 return (ERESTART); 3733 /* Note: reserve is inflated, so we deflate / 3734* page_load += reserve / 8; 3735 return (0); 3736 } else if (page_load > 0 && arc_reclaim_needed()) { 3737 /* memory is low, delay before restarting / 3738* ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3739 return (EAGAIN); 3740 } 3741 page_load = 0; 3742 3743 if (arc_size > arc_c_min) { 3744 uint64_t evictable_memory = 3745 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3746 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3747 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3748 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3749 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3750 } 3751 3752 if (inflight_data > available_memory / 4) { 3753 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3754 return (ERESTART); 3755 } 3756#endif 3757 return (0); 3758} 3759 3760void 3761arc_tempreserve_clear(uint64_t reserve) 3762{ 3763 atomic_add_64(&arc_tempreserve, -reserve); 3764 ASSERT((int64_t)arc_tempreserve >= 0); 3765} 3766 3767int 3768arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3769{ 3770 int error; 3771 uint64_t anon_size; 3772 3773#ifdef ZFS_DEBUG 3774 /* 3775 * Once in a while, fail for no reason. Everything should cope. 3776 / 3777* if (spa_get_random(10000) == 0) { 3778 dprintf("forcing random failure\n"); 3779 return (ERESTART); 3780 } 3781#endif 3782 if (reserve > arc_c/4 && !arc_no_grow) 3783 arc_c = MIN(arc_c_max, reserve * 4); 3784 if (reserve > arc_c) 3785 return (ENOMEM); 3786 3787 /* 3788 * Don't count loaned bufs as in flight dirty data to prevent long 3789 * network delays from blocking transactions that are ready to be 3790 * assigned to a txg. 3791 / 3792* anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3793 3794 /* 3795 * Writes will, almost always, require additional memory allocations 3796 * in order to compress/encrypt/etc the data. 
We therefor need to 3797 * make sure that there is sufficient available memory for this. 3798 / 3799* if (error = arc_memory_throttle(reserve, anon_size, txg)) 3800 return (error); 3801 3802 /* 3803 * Throttle writes when the amount of dirty data in the cache 3804 * gets too large. We try to keep the cache less than half full 3805 * of dirty blocks so that our sync times don't grow too large. 3806 * Note: if two requests come in concurrently, we might let them 3807 * both succeed, when one of them should fail. Not a huge deal. 3808 / 3809* 3810 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3811 anon_size > arc_c / 4) { 3812 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3813 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3814 arc_tempreserve>>10, 3815 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3816 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3817 reserve>>10, arc_c>>10); 3818 return (ERESTART); 3819 } 3820 atomic_add_64(&arc_tempreserve, reserve); 3821 return (0); 3822} 3823 3824static kmutex_t arc_lowmem_lock; 3825#ifdef _KERNEL 3826static eventhandler_tag arc_event_lowmem = NULL; 3827 3828static void 3829arc_lowmem(void arg __unused, int howto __unused) 3830{ 3831* 3832 /* Serialize access via arc_lowmem_lock. / 3833* mutex_enter(&arc_lowmem_lock); 3834 mutex_enter(&arc_reclaim_thr_lock); 3835 needfree = 1; 3836 cv_signal(&arc_reclaim_thr_cv); 3837 3838 /* 3839 * It is unsafe to block here in arbitrary threads, because we can come 3840 * here from ARC itself and may hold ARC locks and thus risk a deadlock 3841 * with ARC reclaim thread. 
3842 / 3843* if (curproc == pageproc) { 3844 while (needfree) 3845 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 3846 } 3847 mutex_exit(&arc_reclaim_thr_lock); 3848 mutex_exit(&arc_lowmem_lock); 3849} 3850#endif 3851 3852void 3853arc_init(void) 3854{ 3855 int i, prefetch_tunable_set = 0; 3856 3857 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3858 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3859 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 3860 3861 /* Convert seconds to clock ticks / 3862* arc_min_prefetch_lifespan = 1 * hz; 3863 3864 /* Start out with 1/8 of all memory / 3865* arc_c = kmem_size() / 8; 3866 3867#ifdef sun 3868#ifdef _KERNEL 3869 /* 3870 * On architectures where the physical memory can be larger 3871 * than the addressable space (intel in 32-bit mode), we may 3872 * need to limit the cache to 1/8 of VM size. 3873 / 3874* arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC \| VMEM_FREE) / 8); 3875#endif 3876#endif /* sun / 3877* /* set min cache to 1/32 of all memory, or 16MB, whichever is more / 3878* arc_c_min = MAX(arc_c / 4, 64<<18); 3879 /* set max to 1/2 of all memory, or all but 1GB, whichever is more / 3880* if (arc_c * 8 >= 1<<30) 3881 arc_c_max = (arc_c * 8) - (1<<30); 3882 else 3883 arc_c_max = arc_c_min; 3884 arc_c_max = MAX(arc_c * 5, arc_c_max); 3885 3886#ifdef _KERNEL 3887 /* 3888 * Allow the tunables to override our calculations if they are 3889 * reasonable (ie. 
over 16MB) 3890 / 3891* if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size()) 3892 arc_c_max = zfs_arc_max; 3893 if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max) 3894 arc_c_min = zfs_arc_min; 3895#endif 3896 3897 arc_c = arc_c_max; 3898 arc_p = (arc_c >> 1); 3899 3900 /* limit meta-data to 1/4 of the arc capacity / 3901* arc_meta_limit = arc_c_max / 4; 3902 3903 /* Allow the tunable to override if it is reasonable / 3904* if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3905 arc_meta_limit = zfs_arc_meta_limit; 3906 3907 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3908 arc_c_min = arc_meta_limit / 2; 3909 3910 if (zfs_arc_grow_retry > 0) 3911 arc_grow_retry = zfs_arc_grow_retry; 3912 3913 if (zfs_arc_shrink_shift > 0) 3914 arc_shrink_shift = zfs_arc_shrink_shift; 3915 3916 if (zfs_arc_p_min_shift > 0) 3917 arc_p_min_shift = zfs_arc_p_min_shift; 3918 3919 /* if kmem_flags are set, lets try to use less memory / 3920* if (kmem_debugging()) 3921 arc_c = arc_c / 2; 3922 if (arc_c < arc_c_min) 3923 arc_c = arc_c_min; 3924 3925 zfs_arc_min = arc_c_min; 3926 zfs_arc_max = arc_c_max; 3927 3928 arc_anon = &ARC_anon; 3929 arc_mru = &ARC_mru; 3930 arc_mru_ghost = &ARC_mru_ghost; 3931 arc_mfu = &ARC_mfu; 3932 arc_mfu_ghost = &ARC_mfu_ghost; 3933 arc_l2c_only = &ARC_l2c_only; 3934 arc_size = 0; 3935 3936 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 3937 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 3938 NULL, MUTEX_DEFAULT, NULL); 3939 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 3940 NULL, MUTEX_DEFAULT, NULL); 3941 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 3942 NULL, MUTEX_DEFAULT, NULL); 3943 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 3944 NULL, MUTEX_DEFAULT, NULL); 3945 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 3946 NULL, MUTEX_DEFAULT, NULL); 3947 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 3948 NULL, MUTEX_DEFAULT, NULL); 3949 3950 list_create(&arc_mru->arcs_lists[i], 3951 sizeof (arc_buf_hdr_t), 
offsetof(arc_buf_hdr_t, b_arc_node)); 3952 list_create(&arc_mru_ghost->arcs_lists[i], 3953 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3954 list_create(&arc_mfu->arcs_lists[i], 3955 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3956 list_create(&arc_mfu_ghost->arcs_lists[i], 3957 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3958 list_create(&arc_mfu_ghost->arcs_lists[i], 3959 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3960 list_create(&arc_l2c_only->arcs_lists[i], 3961 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3962 } 3963 3964 buf_init(); 3965 3966 arc_thread_exit = 0; 3967 arc_eviction_list = NULL; 3968 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3969 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3970 3971 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3972 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3973 3974 if (arc_ksp != NULL) { 3975 arc_ksp->ks_data = &arc_stats; 3976 kstat_install(arc_ksp); 3977 } 3978 3979 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3980 TS_RUN, minclsyspri); 3981 3982#ifdef _KERNEL 3983 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 3984 EVENTHANDLER_PRI_FIRST); 3985#endif 3986 3987 arc_dead = FALSE; 3988 arc_warm = B_FALSE; 3989 3990 if (zfs_write_limit_max == 0) 3991 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 3992 else 3993 zfs_write_limit_shift = 0; 3994 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 3995 3996#ifdef _KERNEL 3997 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 3998 prefetch_tunable_set = 1; 3999 4000#ifdef __i386__ 4001 if (prefetch_tunable_set == 0) { 4002 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4003 "-- to enable,\n"); 4004 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4005 "to /boot/loader.conf.\n"); 4006 zfs_prefetch_disable = 1; 4007 } 
4008#else 4009 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4010 prefetch_tunable_set == 0) { 4011 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4012 "than 4GB of RAM is present;\n" 4013 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4014 "to /boot/loader.conf.\n"); 4015 zfs_prefetch_disable = 1; 4016 } 4017#endif 4018 /* Warn about ZFS memory and address space requirements. / 4019* if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4020 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4021 "expect unstable behavior.\n"); 4022 } 4023 if (kmem_size() < 512 * (1 << 20)) { 4024 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4025 "expect unstable behavior.\n"); 4026 printf(" Consider tuning vm.kmem_size and " 4027 "vm.kmem_size_max\n"); 4028 printf(" in /boot/loader.conf.\n"); 4029 } 4030#endif 4031} 4032 4033void 4034arc_fini(void) 4035{ 4036 int i; 4037 4038 mutex_enter(&arc_reclaim_thr_lock); 4039 arc_thread_exit = 1; 4040 cv_signal(&arc_reclaim_thr_cv); 4041 while (arc_thread_exit != 0) 4042 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4043 mutex_exit(&arc_reclaim_thr_lock); 4044 4045 arc_flush(NULL); 4046 4047 arc_dead = TRUE; 4048 4049 if (arc_ksp != NULL) { 4050 kstat_delete(arc_ksp); 4051 arc_ksp = NULL; 4052 } 4053 4054 mutex_destroy(&arc_eviction_mtx); 4055 mutex_destroy(&arc_reclaim_thr_lock); 4056 cv_destroy(&arc_reclaim_thr_cv); 4057 4058 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4059 list_destroy(&arc_mru->arcs_lists[i]); 4060 list_destroy(&arc_mru_ghost->arcs_lists[i]); 4061 list_destroy(&arc_mfu->arcs_lists[i]); 4062 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 4063 list_destroy(&arc_l2c_only->arcs_lists[i]); 4064 4065 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 4066 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 4067 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 4068 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 4069 
mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 4070 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 4071 } 4072 4073 mutex_destroy(&zfs_write_limit_lock); 4074 4075 buf_fini(); 4076 4077 ASSERT(arc_loaned_bytes == 0); 4078 4079 mutex_destroy(&arc_lowmem_lock); 4080#ifdef _KERNEL 4081 if (arc_event_lowmem != NULL) 4082 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4083#endif 4084} 4085 4086/* 4087 * Level 2 ARC 4088 * 4089 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4090 * It uses dedicated storage devices to hold cached data, which are populated 4091 * using large infrequent writes. The main role of this cache is to boost 4092 * the performance of random read workloads. The intended L2ARC devices 4093 * include short-stroked disks, solid state disks, and other media with 4094 * substantially faster read latency than disk. 4095 * 4096 * +-----------------------+ 4097 * \| ARC \| 4098 * +-----------------------+ 4099 * \| ^ ^ 4100 * \| \| \| 4101 * l2arc_feed_thread() arc_read() 4102 * \| \| \| 4103 * \| l2arc read \| 4104 * V \| \| 4105 * +---------------+ \| 4106 * \| L2ARC \| \| 4107 * +---------------+ \| 4108 * \| ^ \| 4109 * l2arc_write() \| \| 4110 * \| \| \| 4111 * V \| \| 4112 * +-------+ +-------+ 4113 * \| vdev \| \| vdev \| 4114 * \| cache \| \| cache \| 4115 * +-------+ +-------+ 4116 * +=========+ .-----. 4117 * : L2ARC : \|-_____-\| 4118 * : devices : \| Disks \| 4119 * +=========+ `-_____-' 4120 * 4121 * Read requests are satisfied from the following sources, in order: 4122 * 4123 * 1) ARC 4124 * 2) vdev cache of L2ARC devices 4125 * 3) L2ARC devices 4126 * 4) vdev cache of disks 4127 * 5) disks 4128 * 4129 * Some L2ARC device types exhibit extremely slow write performance. 4130 * To accommodate for this there are some significant differences between 4131 * the L2ARC and traditional cache design: 4132 * 4133 * 1. There is no eviction path from the ARC to the L2ARC. 
 * Evictions from the ARC behave as usual, freeing buffers and placing
 * headers on ghost lists.  The ARC does not send buffers to the L2ARC
 * during eviction as this would add inflated write latencies for all ARC
 * memory pressure.
 *
 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
 * It does this by periodically scanning buffers from the eviction-end of
 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
 * not already there.  It scans until a headroom of buffers is satisfied,
 * which itself is a buffer for ARC eviction.  The thread that does this is
 * l2arc_feed_thread(), illustrated below; example sizes are included to
 * provide a better sense of ratio than this diagram:
 *
 *          head -->                        tail
 *           +---------------------+----------+
 *   ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
 *           +---------------------+----------+   |   o L2ARC eligible
 *   ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
 *           +---------------------+----------+   |
 *                 15.9 Gbytes     ^ 32 Mbytes    |
 *                             headroom           |
 *                                       l2arc_feed_thread()
 *                                                |
 *                    l2arc write hand <--[oooo]--'
 *                            |           8 Mbyte
 *                            |          write max
 *                            V
 *           +==============================+
 * L2ARC dev |####|#|###|###|    |####| ... |
 *           +==============================+
 *                      32 Gbytes
 *
 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
 * evicted, then the L2ARC has cached a buffer much sooner than it probably
 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
 * safe to say that this is an uncommon case, since buffers at the end of
 * the ARC lists have moved there due to inactivity.
 *
 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
 * then the L2ARC simply misses copying some buffers.
 * This serves as a pressure valve to prevent heavy read workloads from
 * both stalling the ARC with waits and clogging the L2ARC with writes.
 * This also helps prevent the potential for the L2ARC to churn if it
 * attempts to cache content too quickly, such as during backups of the
 * entire pool.
 *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from tail of these
 * lists as pictured, the l2arc_feed_thread() will search from the list heads
 * for eligible buffers, greatly increasing its chance of finding them.
 *
 * The L2ARC device write speed is also boosted during this time so that
 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
 * there are no L2ARC reads, and no fear of degrading read performance
 * through increased writes.
 *
 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
 * the vdev queue can aggregate them into larger and fewer writes.  Each
 * device is written to in a rotor fashion, sweeping writes through
 * available space then repeating.
 *
 * 7. The L2ARC does not store dirty content.  It never needs to flush
 * write buffers back to disk based storage.
 *
 * 8. If an ARC buffer is written (and dirtied) which also exists in the
 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4199 * 4200 * The performance of the L2ARC can be tweaked by a number of tunables, which 4201 * may be necessary for different workloads: 4202 * 4203 * l2arc_write_max max write bytes per interval 4204 * l2arc_write_boost extra write bytes during device warmup 4205 * l2arc_noprefetch skip caching prefetched buffers 4206 * l2arc_headroom number of max device writes to precache 4207 * l2arc_feed_secs seconds between L2ARC writing 4208 * 4209 * Tunables may be removed or added as future performance improvements are 4210 * integrated, and also may become zpool properties. 4211 * 4212 * There are three key functions that control how the L2ARC warms up: 4213 * 4214 * l2arc_write_eligible() check if a buffer is eligible to cache 4215 * l2arc_write_size() calculate how much to write 4216 * l2arc_write_interval() calculate sleep delay between writes 4217 * 4218 * These three functions determine what to write, how much, and how quickly 4219 * to send writes. 4220 / 4221* 4222static boolean_t 4223l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t ab) 4224{ 4225* /* 4226 * A buffer is not eligible for the L2ARC if it: 4227 * 1. belongs to a different spa. 4228 * 2. is already cached on the L2ARC. 4229 * 3. has an I/O in progress (it may be an incomplete read). 4230 * 4. is flagged not eligible (zfs property). 
4231 / 4232* if (ab->b_spa != spa_guid) { 4233 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 4234 return (B_FALSE); 4235 } 4236 if (ab->b_l2hdr != NULL) { 4237 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 4238 return (B_FALSE); 4239 } 4240 if (HDR_IO_IN_PROGRESS(ab)) { 4241 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 4242 return (B_FALSE); 4243 } 4244 if (!HDR_L2CACHE(ab)) { 4245 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 4246 return (B_FALSE); 4247 } 4248 4249 return (B_TRUE); 4250} 4251 4252static uint64_t 4253l2arc_write_size(l2arc_dev_t dev) 4254{ 4255* uint64_t size; 4256 4257 size = dev->l2ad_write; 4258 4259 if (arc_warm == B_FALSE) 4260 size += dev->l2ad_boost; 4261 4262 return (size); 4263 4264} 4265 4266static clock_t 4267l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4268{ 4269 clock_t interval, next, now; 4270 4271 /* 4272 * If the ARC lists are busy, increase our write rate; if the 4273 * lists are stale, idle back. This is achieved by checking 4274 * how much we previously wrote - if it was more than half of 4275 * what we wanted, schedule the next write much sooner. 4276 / 4277* if (l2arc_feed_again && wrote > (wanted / 2)) 4278 interval = (hz * l2arc_feed_min_ms) / 1000; 4279 else 4280 interval = hz * l2arc_feed_secs; 4281 4282 now = ddi_get_lbolt(); 4283 next = MAX(now, MIN(now + interval, began + interval)); 4284 4285 return (next); 4286} 4287 4288static void 4289l2arc_hdr_stat_add(void) 4290{ 4291 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4292 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4293} 4294 4295static void 4296l2arc_hdr_stat_remove(void) 4297{ 4298 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4299 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4300} 4301 4302/* 4303 * Cycle through L2ARC devices. This is how L2ARC load balances. 4304 * If a device is returned, this also returns holding the spa config lock. 
4305 / 4306static l2arc_dev_t 4307l2arc_dev_get_next(void) 4308{ 4309 l2arc_dev_t first, next = NULL; 4310 4311 /* 4312 * Lock out the removal of spas (spa_namespace_lock), then removal 4313 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 4314 * both locks will be dropped and a spa config lock held instead. 4315 / 4316* mutex_enter(&spa_namespace_lock); 4317 mutex_enter(&l2arc_dev_mtx); 4318 4319 /* if there are no vdevs, there is nothing to do / 4320* if (l2arc_ndev == 0) 4321 goto out; 4322 4323 first = NULL; 4324 next = l2arc_dev_last; 4325 do { 4326 /* loop around the list looking for a non-faulted vdev / 4327* if (next == NULL) { 4328 next = list_head(l2arc_dev_list); 4329 } else { 4330 next = list_next(l2arc_dev_list, next); 4331 if (next == NULL) 4332 next = list_head(l2arc_dev_list); 4333 } 4334 4335 /* if we have come back to the start, bail out / 4336* if (first == NULL) 4337 first = next; 4338 else if (next == first) 4339 break; 4340 4341 } while (vdev_is_dead(next->l2ad_vdev)); 4342 4343 /* if we were unable to find any usable vdevs, return NULL / 4344* if (vdev_is_dead(next->l2ad_vdev)) 4345 next = NULL; 4346 4347 l2arc_dev_last = next; 4348 4349out: 4350 mutex_exit(&l2arc_dev_mtx); 4351 4352 /* 4353 * Grab the config lock to prevent the 'next' device from being 4354 * removed while we are writing to it. 4355 / 4356* if (next != NULL) 4357 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4358 mutex_exit(&spa_namespace_lock); 4359 4360 return (next); 4361} 4362 4363/* 4364 * Free buffers that were tagged for destruction. 
4365 / 4366static void 4367l2arc_do_free_on_write() 4368{ 4369* list_t buflist; 4370* l2arc_data_free_t df, df_prev; 4371 4372 mutex_enter(&l2arc_free_on_write_mtx); 4373 buflist = l2arc_free_on_write; 4374 4375 for (df = list_tail(buflist); df; df = df_prev) { 4376 df_prev = list_prev(buflist, df); 4377 ASSERT(df->l2df_data != NULL); 4378 ASSERT(df->l2df_func != NULL); 4379 df->l2df_func(df->l2df_data, df->l2df_size); 4380 list_remove(buflist, df); 4381 kmem_free(df, sizeof (l2arc_data_free_t)); 4382 } 4383 4384 mutex_exit(&l2arc_free_on_write_mtx); 4385} 4386 4387/* 4388 * A write to a cache device has completed. Update all headers to allow 4389 * reads from these buffers to begin. 4390 / 4391static void 4392l2arc_write_done(zio_t zio) 4393{ 4394 l2arc_write_callback_t cb; 4395* l2arc_dev_t dev; 4396* list_t buflist; 4397* arc_buf_hdr_t head, ab, ab_prev; 4398* l2arc_buf_hdr_t abl2; 4399* kmutex_t hash_lock; 4400* 4401 cb = zio->io_private; 4402 ASSERT(cb != NULL); 4403 dev = cb->l2wcb_dev; 4404 ASSERT(dev != NULL); 4405 head = cb->l2wcb_head; 4406 ASSERT(head != NULL); 4407 buflist = dev->l2ad_buflist; 4408 ASSERT(buflist != NULL); 4409 DTRACE_PROBE2(l2arc__iodone, zio_t , zio, 4410* l2arc_write_callback_t , cb); 4411* 4412 if (zio->io_error != 0) 4413 ARCSTAT_BUMP(arcstat_l2_writes_error); 4414 4415 mutex_enter(&l2arc_buflist_mtx); 4416 4417 /* 4418 * All writes completed, or an error was hit. 4419 / 4420* for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4421 ab_prev = list_prev(buflist, ab); 4422 4423 hash_lock = HDR_LOCK(ab); 4424 if (!mutex_tryenter(hash_lock)) { 4425 /* 4426 * This buffer misses out. It may be in a stage 4427 * of eviction. Its ARC_L2_WRITING flag will be 4428 * left set, denying reads to this buffer. 4429 / 4430* ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4431 continue; 4432 } 4433 4434 if (zio->io_error != 0) { 4435 /* 4436 * Error - drop L2ARC entry. 
4437 / 4438* list_remove(buflist, ab); 4439 abl2 = ab->b_l2hdr; 4440 ab->b_l2hdr = NULL; 4441 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4442 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4443 } 4444 4445 /* 4446 * Allow ARC to begin reads to this L2ARC entry. 4447 / 4448* ab->b_flags &= ~ARC_L2_WRITING; 4449 4450 mutex_exit(hash_lock); 4451 } 4452 4453 atomic_inc_64(&l2arc_writes_done); 4454 list_remove(buflist, head); 4455 kmem_cache_free(hdr_cache, head); 4456 mutex_exit(&l2arc_buflist_mtx); 4457 4458 l2arc_do_free_on_write(); 4459 4460 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4461} 4462 4463/* 4464 * A read to a cache device completed. Validate buffer contents before 4465 * handing over to the regular ARC routines. 4466 / 4467static void 4468l2arc_read_done(zio_t zio) 4469{ 4470 l2arc_read_callback_t cb; 4471* arc_buf_hdr_t hdr; 4472* arc_buf_t buf; 4473* kmutex_t hash_lock; 4474* int equal; 4475 4476 ASSERT(zio->io_vd != NULL); 4477 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4478 4479 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4480 4481 cb = zio->io_private; 4482 ASSERT(cb != NULL); 4483 buf = cb->l2rcb_buf; 4484 ASSERT(buf != NULL); 4485 4486 hash_lock = HDR_LOCK(buf->b_hdr); 4487 mutex_enter(hash_lock); 4488 hdr = buf->b_hdr; 4489 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4490 4491 /* 4492 * Check this survived the L2ARC journey. 4493 / 4494* equal = arc_cksum_equal(buf); 4495 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4496 mutex_exit(hash_lock); 4497 zio->io_private = buf; 4498 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 / 4499* zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 / 4500* arc_read_done(zio); 4501 } else { 4502 mutex_exit(hash_lock); 4503 /* 4504 * Buffer didn't survive caching. Increment stats and 4505 * reissue to the original storage device. 
4506 / 4507* if (zio->io_error != 0) { 4508 ARCSTAT_BUMP(arcstat_l2_io_error); 4509 } else { 4510 zio->io_error = EIO; 4511 } 4512 if (!equal) 4513 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4514 4515 /* 4516 * If there's no waiter, issue an async i/o to the primary 4517 * storage now. If there is a waiter, the caller must 4518 * issue the i/o in a context where it's OK to block. 4519 / 4520* if (zio->io_waiter == NULL) { 4521 zio_t pio = zio_unique_parent(zio); 4522* 4523 ASSERT(!pio \|\| pio->io_child_type == ZIO_CHILD_LOGICAL); 4524 4525 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4526 buf->b_data, zio->io_size, arc_read_done, buf, 4527 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4528 } 4529 } 4530 4531 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4532} 4533 4534/* 4535 * This is the list priority from which the L2ARC will search for pages to 4536 * cache. This is used within loops (0..3) to cycle through lists in the 4537 * desired order. This order can have a significant effect on cache 4538 * performance. 4539 * 4540 * Currently the metadata lists are hit first, MFU then MRU, followed by 4541 * the data lists. This function returns a locked list, and also returns 4542 * the lock pointer. 4543 / 4544static list_t 4545l2arc_list_locked(int list_num, kmutex_t *lock) 4546*{	3449 3450 /* 3451 * Do we have more than one buf? 3452 / 3453* if (hdr->b_datacnt > 1) { 3454 arc_buf_hdr_t nhdr; 3455* arc_buf_t *bufp; 3456* uint64_t blksz = hdr->b_size; 3457 uint64_t spa = hdr->b_spa; 3458 arc_buf_contents_t type = hdr->b_type; 3459 uint32_t flags = hdr->b_flags; 3460 3461 ASSERT(hdr->b_buf != buf \|\| buf->b_next != NULL); 3462 /* 3463 * Pull the data off of this hdr and attach it to 3464 * a new anonymous hdr. 
3465 / 3466* (void) remove_reference(hdr, hash_lock, tag); 3467 bufp = &hdr->b_buf; 3468 while (bufp != buf) 3469* bufp = &(bufp)->b_next; 3470* bufp = buf->b_next; 3471* buf->b_next = NULL; 3472 3473 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3474 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3475 if (refcount_is_zero(&hdr->b_refcnt)) { 3476 uint64_t size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3477* ASSERT3U(size, >=, hdr->b_size); 3478* atomic_add_64(size, -hdr->b_size); 3479 } 3480 3481 /* 3482 * We're releasing a duplicate user data buffer, update 3483 * our statistics accordingly. 3484 / 3485* if (hdr->b_type == ARC_BUFC_DATA) { 3486 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 3487 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 3488 -hdr->b_size); 3489 } 3490 hdr->b_datacnt -= 1; 3491 arc_cksum_verify(buf); 3492#ifdef illumos 3493 arc_buf_unwatch(buf); 3494#endif /* illumos / 3495* 3496 mutex_exit(hash_lock); 3497 3498 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3499 nhdr->b_size = blksz; 3500 nhdr->b_spa = spa; 3501 nhdr->b_type = type; 3502 nhdr->b_buf = buf; 3503 nhdr->b_state = arc_anon; 3504 nhdr->b_arc_access = 0; 3505 nhdr->b_flags = flags & ARC_L2_WRITING; 3506 nhdr->b_l2hdr = NULL; 3507 nhdr->b_datacnt = 1; 3508 nhdr->b_freeze_cksum = NULL; 3509 (void) refcount_add(&nhdr->b_refcnt, tag); 3510 buf->b_hdr = nhdr; 3511 mutex_exit(&buf->b_evict_lock); 3512 atomic_add_64(&arc_anon->arcs_size, blksz); 3513 } else { 3514 mutex_exit(&buf->b_evict_lock); 3515 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3516 ASSERT(!list_link_active(&hdr->b_arc_node)); 3517 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3518 if (hdr->b_state != arc_anon) 3519 arc_change_state(arc_anon, hdr, hash_lock); 3520 hdr->b_arc_access = 0; 3521 if (hash_lock) 3522 mutex_exit(hash_lock); 3523 3524 buf_discard_identity(hdr); 3525 arc_buf_thaw(buf); 3526 } 3527 buf->b_efunc = NULL; 3528 buf->b_private = NULL; 3529 3530 if (l2hdr) { 3531 list_remove(l2hdr->b_dev->l2ad_buflist, 
hdr); 3532 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3533 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3534 mutex_exit(&l2arc_buflist_mtx); 3535 } 3536} 3537 3538int 3539arc_released(arc_buf_t buf) 3540{ 3541* int released; 3542 3543 mutex_enter(&buf->b_evict_lock); 3544 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3545 mutex_exit(&buf->b_evict_lock); 3546 return (released); 3547} 3548 3549int 3550arc_has_callback(arc_buf_t buf) 3551{ 3552* int callback; 3553 3554 mutex_enter(&buf->b_evict_lock); 3555 callback = (buf->b_efunc != NULL); 3556 mutex_exit(&buf->b_evict_lock); 3557 return (callback); 3558} 3559 3560#ifdef ZFS_DEBUG 3561int 3562arc_referenced(arc_buf_t buf) 3563{ 3564* int referenced; 3565 3566 mutex_enter(&buf->b_evict_lock); 3567 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3568 mutex_exit(&buf->b_evict_lock); 3569 return (referenced); 3570} 3571#endif 3572 3573static void 3574arc_write_ready(zio_t zio) 3575{ 3576* arc_write_callback_t callback = zio->io_private; 3577* arc_buf_t buf = callback->awcb_buf; 3578* arc_buf_hdr_t hdr = buf->b_hdr; 3579* 3580 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3581 callback->awcb_ready(zio, buf, callback->awcb_private); 3582 3583 /* 3584 * If the IO is already in progress, then this is a re-write 3585 * attempt, so we need to thaw and re-compute the cksum. 3586 * It is the responsibility of the callback to handle the 3587 * accounting for any re-write attempt. 
3588 / 3589* if (HDR_IO_IN_PROGRESS(hdr)) { 3590 mutex_enter(&hdr->b_freeze_lock); 3591 if (hdr->b_freeze_cksum != NULL) { 3592 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3593 hdr->b_freeze_cksum = NULL; 3594 } 3595 mutex_exit(&hdr->b_freeze_lock); 3596 } 3597 arc_cksum_compute(buf, B_FALSE); 3598 hdr->b_flags \|= ARC_IO_IN_PROGRESS; 3599} 3600 3601static void 3602arc_write_done(zio_t zio) 3603{ 3604* arc_write_callback_t callback = zio->io_private; 3605* arc_buf_t buf = callback->awcb_buf; 3606* arc_buf_hdr_t hdr = buf->b_hdr; 3607* 3608 ASSERT(hdr->b_acb == NULL); 3609 3610 if (zio->io_error == 0) { 3611 hdr->b_dva = BP_IDENTITY(zio->io_bp); 3612* hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3613 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3614 } else { 3615 ASSERT(BUF_EMPTY(hdr)); 3616 } 3617 3618 /* 3619 * If the block to be written was all-zero, we may have 3620 * compressed it away. In this case no write was performed 3621 * so there will be no dva/birth/checksum. The buffer must 3622 * therefore remain anonymous (and uncached). 3623 / 3624* if (!BUF_EMPTY(hdr)) { 3625 arc_buf_hdr_t exists; 3626* kmutex_t hash_lock; 3627* 3628 ASSERT(zio->io_error == 0); 3629 3630 arc_cksum_verify(buf); 3631 3632 exists = buf_hash_insert(hdr, &hash_lock); 3633 if (exists) { 3634 /* 3635 * This can only happen if we overwrite for 3636 * sync-to-convergence, because we remove 3637 * buffers from the hash table when we arc_free(). 
3638 / 3639* if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3640 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3641 panic("bad overwrite, hdr=%p exists=%p", 3642 (void )hdr, (void )exists); 3643 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3644 arc_change_state(arc_anon, exists, hash_lock); 3645 mutex_exit(hash_lock); 3646 arc_hdr_destroy(exists); 3647 exists = buf_hash_insert(hdr, &hash_lock); 3648 ASSERT3P(exists, ==, NULL); 3649 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 3650 /* nopwrite / 3651* ASSERT(zio->io_prop.zp_nopwrite); 3652 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3653 panic("bad nopwrite, hdr=%p exists=%p", 3654 (void )hdr, (void )exists); 3655 } else { 3656 /* Dedup / 3657* ASSERT(hdr->b_datacnt == 1); 3658 ASSERT(hdr->b_state == arc_anon); 3659 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3660 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3661 } 3662 } 3663 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3664 /* if it's not anon, we are doing a scrub / 3665* if (!exists && hdr->b_state == arc_anon) 3666 arc_access(hdr, hash_lock); 3667 mutex_exit(hash_lock); 3668 } else { 3669 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3670 } 3671 3672 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3673 callback->awcb_done(zio, buf, callback->awcb_private); 3674 3675 kmem_free(callback, sizeof (arc_write_callback_t)); 3676} 3677 3678zio_t * 3679arc_write(zio_t pio, spa_t spa, uint64_t txg, 3680 blkptr_t bp, arc_buf_t buf, boolean_t l2arc, const zio_prop_t zp, 3681* arc_done_func_t ready, arc_done_func_t done, void private, 3682* int priority, int zio_flags, const zbookmark_t zb) 3683{ 3684* arc_buf_hdr_t hdr = buf->b_hdr; 3685* arc_write_callback_t callback; 3686* zio_t zio; 3687* 3688 ASSERT(ready != NULL); 3689 ASSERT(done != NULL); 3690 ASSERT(!HDR_IO_ERROR(hdr)); 3691 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3692 ASSERT(hdr->b_acb == NULL); 3693 if (l2arc) 3694 hdr->b_flags \|= ARC_L2CACHE; 3695 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3696 callback->awcb_ready = 
ready; 3697 callback->awcb_done = done; 3698 callback->awcb_private = private; 3699 callback->awcb_buf = buf; 3700 3701 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3702 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 3703 3704 return (zio); 3705} 3706 3707static int 3708arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) 3709{ 3710#ifdef _KERNEL 3711 uint64_t available_memory = 3712 ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count); 3713 static uint64_t page_load = 0; 3714 static uint64_t last_txg = 0; 3715 3716#ifdef sun 3717#if defined(__i386) 3718 available_memory = 3719 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3720#endif 3721#endif /* sun / 3722* if (available_memory >= zfs_write_limit_max) 3723 return (0); 3724 3725 if (txg > last_txg) { 3726 last_txg = txg; 3727 page_load = 0; 3728 } 3729 /* 3730 * If we are in pageout, we know that memory is already tight, 3731 * the arc is already going to be evicting, so we just want to 3732 * continue to let page writes occur as quickly as possible. 
3733 / 3734* if (curproc == pageproc) { 3735 if (page_load > available_memory / 4) 3736 return (ERESTART); 3737 /* Note: reserve is inflated, so we deflate / 3738* page_load += reserve / 8; 3739 return (0); 3740 } else if (page_load > 0 && arc_reclaim_needed()) { 3741 /* memory is low, delay before restarting / 3742* ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3743 return (EAGAIN); 3744 } 3745 page_load = 0; 3746 3747 if (arc_size > arc_c_min) { 3748 uint64_t evictable_memory = 3749 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3750 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3751 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3752 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3753 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3754 } 3755 3756 if (inflight_data > available_memory / 4) { 3757 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3758 return (ERESTART); 3759 } 3760#endif 3761 return (0); 3762} 3763 3764void 3765arc_tempreserve_clear(uint64_t reserve) 3766{ 3767 atomic_add_64(&arc_tempreserve, -reserve); 3768 ASSERT((int64_t)arc_tempreserve >= 0); 3769} 3770 3771int 3772arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3773{ 3774 int error; 3775 uint64_t anon_size; 3776 3777#ifdef ZFS_DEBUG 3778 /* 3779 * Once in a while, fail for no reason. Everything should cope. 3780 / 3781* if (spa_get_random(10000) == 0) { 3782 dprintf("forcing random failure\n"); 3783 return (ERESTART); 3784 } 3785#endif 3786 if (reserve > arc_c/4 && !arc_no_grow) 3787 arc_c = MIN(arc_c_max, reserve * 4); 3788 if (reserve > arc_c) 3789 return (ENOMEM); 3790 3791 /* 3792 * Don't count loaned bufs as in flight dirty data to prevent long 3793 * network delays from blocking transactions that are ready to be 3794 * assigned to a txg. 3795 / 3796* anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3797 3798 /* 3799 * Writes will, almost always, require additional memory allocations 3800 * in order to compress/encrypt/etc the data. 
We therefor need to 3801 * make sure that there is sufficient available memory for this. 3802 / 3803* if (error = arc_memory_throttle(reserve, anon_size, txg)) 3804 return (error); 3805 3806 /* 3807 * Throttle writes when the amount of dirty data in the cache 3808 * gets too large. We try to keep the cache less than half full 3809 * of dirty blocks so that our sync times don't grow too large. 3810 * Note: if two requests come in concurrently, we might let them 3811 * both succeed, when one of them should fail. Not a huge deal. 3812 / 3813* 3814 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3815 anon_size > arc_c / 4) { 3816 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3817 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3818 arc_tempreserve>>10, 3819 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3820 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3821 reserve>>10, arc_c>>10); 3822 return (ERESTART); 3823 } 3824 atomic_add_64(&arc_tempreserve, reserve); 3825 return (0); 3826} 3827 3828static kmutex_t arc_lowmem_lock; 3829#ifdef _KERNEL 3830static eventhandler_tag arc_event_lowmem = NULL; 3831 3832static void 3833arc_lowmem(void arg __unused, int howto __unused) 3834{ 3835* 3836 /* Serialize access via arc_lowmem_lock. / 3837* mutex_enter(&arc_lowmem_lock); 3838 mutex_enter(&arc_reclaim_thr_lock); 3839 needfree = 1; 3840 cv_signal(&arc_reclaim_thr_cv); 3841 3842 /* 3843 * It is unsafe to block here in arbitrary threads, because we can come 3844 * here from ARC itself and may hold ARC locks and thus risk a deadlock 3845 * with ARC reclaim thread. 
3846 / 3847* if (curproc == pageproc) { 3848 while (needfree) 3849 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 3850 } 3851 mutex_exit(&arc_reclaim_thr_lock); 3852 mutex_exit(&arc_lowmem_lock); 3853} 3854#endif 3855 3856void 3857arc_init(void) 3858{ 3859 int i, prefetch_tunable_set = 0; 3860 3861 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3862 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3863 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 3864 3865 /* Convert seconds to clock ticks / 3866* arc_min_prefetch_lifespan = 1 * hz; 3867 3868 /* Start out with 1/8 of all memory / 3869* arc_c = kmem_size() / 8; 3870 3871#ifdef sun 3872#ifdef _KERNEL 3873 /* 3874 * On architectures where the physical memory can be larger 3875 * than the addressable space (intel in 32-bit mode), we may 3876 * need to limit the cache to 1/8 of VM size. 3877 / 3878* arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC \| VMEM_FREE) / 8); 3879#endif 3880#endif /* sun / 3881* /* set min cache to 1/32 of all memory, or 16MB, whichever is more / 3882* arc_c_min = MAX(arc_c / 4, 64<<18); 3883 /* set max to 1/2 of all memory, or all but 1GB, whichever is more / 3884* if (arc_c * 8 >= 1<<30) 3885 arc_c_max = (arc_c * 8) - (1<<30); 3886 else 3887 arc_c_max = arc_c_min; 3888 arc_c_max = MAX(arc_c * 5, arc_c_max); 3889 3890#ifdef _KERNEL 3891 /* 3892 * Allow the tunables to override our calculations if they are 3893 * reasonable (ie. 
over 16MB) 3894 / 3895* if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size()) 3896 arc_c_max = zfs_arc_max; 3897 if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max) 3898 arc_c_min = zfs_arc_min; 3899#endif 3900 3901 arc_c = arc_c_max; 3902 arc_p = (arc_c >> 1); 3903 3904 /* limit meta-data to 1/4 of the arc capacity / 3905* arc_meta_limit = arc_c_max / 4; 3906 3907 /* Allow the tunable to override if it is reasonable / 3908* if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3909 arc_meta_limit = zfs_arc_meta_limit; 3910 3911 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3912 arc_c_min = arc_meta_limit / 2; 3913 3914 if (zfs_arc_grow_retry > 0) 3915 arc_grow_retry = zfs_arc_grow_retry; 3916 3917 if (zfs_arc_shrink_shift > 0) 3918 arc_shrink_shift = zfs_arc_shrink_shift; 3919 3920 if (zfs_arc_p_min_shift > 0) 3921 arc_p_min_shift = zfs_arc_p_min_shift; 3922 3923 /* if kmem_flags are set, lets try to use less memory / 3924* if (kmem_debugging()) 3925 arc_c = arc_c / 2; 3926 if (arc_c < arc_c_min) 3927 arc_c = arc_c_min; 3928 3929 zfs_arc_min = arc_c_min; 3930 zfs_arc_max = arc_c_max; 3931 3932 arc_anon = &ARC_anon; 3933 arc_mru = &ARC_mru; 3934 arc_mru_ghost = &ARC_mru_ghost; 3935 arc_mfu = &ARC_mfu; 3936 arc_mfu_ghost = &ARC_mfu_ghost; 3937 arc_l2c_only = &ARC_l2c_only; 3938 arc_size = 0; 3939 3940 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 3941 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 3942 NULL, MUTEX_DEFAULT, NULL); 3943 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 3944 NULL, MUTEX_DEFAULT, NULL); 3945 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 3946 NULL, MUTEX_DEFAULT, NULL); 3947 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 3948 NULL, MUTEX_DEFAULT, NULL); 3949 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 3950 NULL, MUTEX_DEFAULT, NULL); 3951 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 3952 NULL, MUTEX_DEFAULT, NULL); 3953 3954 list_create(&arc_mru->arcs_lists[i], 3955 sizeof (arc_buf_hdr_t), 
offsetof(arc_buf_hdr_t, b_arc_node)); 3956 list_create(&arc_mru_ghost->arcs_lists[i], 3957 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3958 list_create(&arc_mfu->arcs_lists[i], 3959 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3960 list_create(&arc_mfu_ghost->arcs_lists[i], 3961 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3962 list_create(&arc_mfu_ghost->arcs_lists[i], 3963 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3964 list_create(&arc_l2c_only->arcs_lists[i], 3965 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3966 } 3967 3968 buf_init(); 3969 3970 arc_thread_exit = 0; 3971 arc_eviction_list = NULL; 3972 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3973 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3974 3975 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3976 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3977 3978 if (arc_ksp != NULL) { 3979 arc_ksp->ks_data = &arc_stats; 3980 kstat_install(arc_ksp); 3981 } 3982 3983 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3984 TS_RUN, minclsyspri); 3985 3986#ifdef _KERNEL 3987 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 3988 EVENTHANDLER_PRI_FIRST); 3989#endif 3990 3991 arc_dead = FALSE; 3992 arc_warm = B_FALSE; 3993 3994 if (zfs_write_limit_max == 0) 3995 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 3996 else 3997 zfs_write_limit_shift = 0; 3998 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 3999 4000#ifdef _KERNEL 4001 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4002 prefetch_tunable_set = 1; 4003 4004#ifdef __i386__ 4005 if (prefetch_tunable_set == 0) { 4006 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4007 "-- to enable,\n"); 4008 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4009 "to /boot/loader.conf.\n"); 4010 zfs_prefetch_disable = 1; 4011 } 
4012#else 4013 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4014 prefetch_tunable_set == 0) { 4015 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4016 "than 4GB of RAM is present;\n" 4017 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4018 "to /boot/loader.conf.\n"); 4019 zfs_prefetch_disable = 1; 4020 } 4021#endif 4022 /* Warn about ZFS memory and address space requirements. / 4023* if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4024 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4025 "expect unstable behavior.\n"); 4026 } 4027 if (kmem_size() < 512 * (1 << 20)) { 4028 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4029 "expect unstable behavior.\n"); 4030 printf(" Consider tuning vm.kmem_size and " 4031 "vm.kmem_size_max\n"); 4032 printf(" in /boot/loader.conf.\n"); 4033 } 4034#endif 4035} 4036 4037void 4038arc_fini(void) 4039{ 4040 int i; 4041 4042 mutex_enter(&arc_reclaim_thr_lock); 4043 arc_thread_exit = 1; 4044 cv_signal(&arc_reclaim_thr_cv); 4045 while (arc_thread_exit != 0) 4046 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4047 mutex_exit(&arc_reclaim_thr_lock); 4048 4049 arc_flush(NULL); 4050 4051 arc_dead = TRUE; 4052 4053 if (arc_ksp != NULL) { 4054 kstat_delete(arc_ksp); 4055 arc_ksp = NULL; 4056 } 4057 4058 mutex_destroy(&arc_eviction_mtx); 4059 mutex_destroy(&arc_reclaim_thr_lock); 4060 cv_destroy(&arc_reclaim_thr_cv); 4061 4062 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4063 list_destroy(&arc_mru->arcs_lists[i]); 4064 list_destroy(&arc_mru_ghost->arcs_lists[i]); 4065 list_destroy(&arc_mfu->arcs_lists[i]); 4066 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 4067 list_destroy(&arc_l2c_only->arcs_lists[i]); 4068 4069 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 4070 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 4071 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 4072 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 4073 
mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 4074 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 4075 } 4076 4077 mutex_destroy(&zfs_write_limit_lock); 4078 4079 buf_fini(); 4080 4081 ASSERT(arc_loaned_bytes == 0); 4082 4083 mutex_destroy(&arc_lowmem_lock); 4084#ifdef _KERNEL 4085 if (arc_event_lowmem != NULL) 4086 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4087#endif 4088} 4089 4090/* 4091 * Level 2 ARC 4092 * 4093 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4094 * It uses dedicated storage devices to hold cached data, which are populated 4095 * using large infrequent writes. The main role of this cache is to boost 4096 * the performance of random read workloads. The intended L2ARC devices 4097 * include short-stroked disks, solid state disks, and other media with 4098 * substantially faster read latency than disk. 4099 * 4100 * +-----------------------+ 4101 * \| ARC \| 4102 * +-----------------------+ 4103 * \| ^ ^ 4104 * \| \| \| 4105 * l2arc_feed_thread() arc_read() 4106 * \| \| \| 4107 * \| l2arc read \| 4108 * V \| \| 4109 * +---------------+ \| 4110 * \| L2ARC \| \| 4111 * +---------------+ \| 4112 * \| ^ \| 4113 * l2arc_write() \| \| 4114 * \| \| \| 4115 * V \| \| 4116 * +-------+ +-------+ 4117 * \| vdev \| \| vdev \| 4118 * \| cache \| \| cache \| 4119 * +-------+ +-------+ 4120 * +=========+ .-----. 4121 * : L2ARC : \|-_____-\| 4122 * : devices : \| Disks \| 4123 * +=========+ `-_____-' 4124 * 4125 * Read requests are satisfied from the following sources, in order: 4126 * 4127 * 1) ARC 4128 * 2) vdev cache of L2ARC devices 4129 * 3) L2ARC devices 4130 * 4) vdev cache of disks 4131 * 5) disks 4132 * 4133 * Some L2ARC device types exhibit extremely slow write performance. 4134 * To accommodate for this there are some significant differences between 4135 * the L2ARC and traditional cache design: 4136 * 4137 * 1. There is no eviction path from the ARC to the L2ARC. 
Evictions from
 * the ARC behave as usual, freeing buffers and placing headers on ghost
 * lists.  The ARC does not send buffers to the L2ARC during eviction as
 * this would add inflated write latencies for all ARC memory pressure.
 *
 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
 * It does this by periodically scanning buffers from the eviction-end of
 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
 * not already there.  It scans until a headroom of buffers is satisfied,
 * which itself is a buffer for ARC eviction.  The thread that does this is
 * l2arc_feed_thread(), illustrated below; example sizes are included to
 * provide a better sense of ratio than this diagram:
 *
 *	       head -->                        tail
 *	        +---------------------+----------+
 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
 *	        +---------------------+----------+   |   o L2ARC eligible
 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
 *	        +---------------------+----------+   |
 *	             15.9 Gbytes      ^ 32 Mbytes    |
 *	                           headroom          |
 *	                                      l2arc_feed_thread()
 *	                                             |
 *	                 l2arc write hand <--[oooo]--'
 *	                         |           8 Mbyte
 *	                         |          write max
 *	                         V
 *	          +==============================+
 *	L2ARC dev |####|#|###|###|    |####| ... |
 *	          +==============================+
 *	                     32 Gbytes
 *
 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
 * evicted, then the L2ARC has cached a buffer much sooner than it probably
 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
 * safe to say that this is an uncommon case, since buffers at the end of
 * the ARC lists have moved there due to inactivity.
 *
 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
 * then the L2ARC simply misses copying some buffers.  This serves as a
 * pressure valve to prevent heavy read workloads from both stalling the ARC
 * with waits and clogging the L2ARC with writes.  This also helps prevent
 * the potential for the L2ARC to churn if it attempts to cache content too
 * quickly, such as during backups of the entire pool.
 *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from the tail of
 * these lists as pictured, the l2arc_feed_thread() will search from the list
 * heads for eligible buffers, greatly increasing its chance of finding them.
 *
 * The L2ARC device write speed is also boosted during this time so that
 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
 * there are no L2ARC reads, and no fear of degrading read performance
 * through increased writes.
 *
 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
 * the vdev queue can aggregate them into larger and fewer writes.  Each
 * device is written to in a rotor fashion, sweeping writes through
 * available space then repeating.
 *
 * 7. The L2ARC does not store dirty content.  It never needs to flush
 * write buffers back to disk based storage.
 *
 * 8. If an ARC buffer is written (and dirtied) which also exists in the
 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4203 * 4204 * The performance of the L2ARC can be tweaked by a number of tunables, which 4205 * may be necessary for different workloads: 4206 * 4207 * l2arc_write_max max write bytes per interval 4208 * l2arc_write_boost extra write bytes during device warmup 4209 * l2arc_noprefetch skip caching prefetched buffers 4210 * l2arc_headroom number of max device writes to precache 4211 * l2arc_feed_secs seconds between L2ARC writing 4212 * 4213 * Tunables may be removed or added as future performance improvements are 4214 * integrated, and also may become zpool properties. 4215 * 4216 * There are three key functions that control how the L2ARC warms up: 4217 * 4218 * l2arc_write_eligible() check if a buffer is eligible to cache 4219 * l2arc_write_size() calculate how much to write 4220 * l2arc_write_interval() calculate sleep delay between writes 4221 * 4222 * These three functions determine what to write, how much, and how quickly 4223 * to send writes. 4224 / 4225* 4226static boolean_t 4227l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t ab) 4228{ 4229* /* 4230 * A buffer is not eligible for the L2ARC if it: 4231 * 1. belongs to a different spa. 4232 * 2. is already cached on the L2ARC. 4233 * 3. has an I/O in progress (it may be an incomplete read). 4234 * 4. is flagged not eligible (zfs property). 
4235 / 4236* if (ab->b_spa != spa_guid) { 4237 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 4238 return (B_FALSE); 4239 } 4240 if (ab->b_l2hdr != NULL) { 4241 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 4242 return (B_FALSE); 4243 } 4244 if (HDR_IO_IN_PROGRESS(ab)) { 4245 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 4246 return (B_FALSE); 4247 } 4248 if (!HDR_L2CACHE(ab)) { 4249 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 4250 return (B_FALSE); 4251 } 4252 4253 return (B_TRUE); 4254} 4255 4256static uint64_t 4257l2arc_write_size(l2arc_dev_t dev) 4258{ 4259* uint64_t size; 4260 4261 size = dev->l2ad_write; 4262 4263 if (arc_warm == B_FALSE) 4264 size += dev->l2ad_boost; 4265 4266 return (size); 4267 4268} 4269 4270static clock_t 4271l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4272{ 4273 clock_t interval, next, now; 4274 4275 /* 4276 * If the ARC lists are busy, increase our write rate; if the 4277 * lists are stale, idle back. This is achieved by checking 4278 * how much we previously wrote - if it was more than half of 4279 * what we wanted, schedule the next write much sooner. 4280 / 4281* if (l2arc_feed_again && wrote > (wanted / 2)) 4282 interval = (hz * l2arc_feed_min_ms) / 1000; 4283 else 4284 interval = hz * l2arc_feed_secs; 4285 4286 now = ddi_get_lbolt(); 4287 next = MAX(now, MIN(now + interval, began + interval)); 4288 4289 return (next); 4290} 4291 4292static void 4293l2arc_hdr_stat_add(void) 4294{ 4295 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4296 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4297} 4298 4299static void 4300l2arc_hdr_stat_remove(void) 4301{ 4302 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4303 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4304} 4305 4306/* 4307 * Cycle through L2ARC devices. This is how L2ARC load balances. 4308 * If a device is returned, this also returns holding the spa config lock. 
4309 / 4310static l2arc_dev_t 4311l2arc_dev_get_next(void) 4312{ 4313 l2arc_dev_t first, next = NULL; 4314 4315 /* 4316 * Lock out the removal of spas (spa_namespace_lock), then removal 4317 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 4318 * both locks will be dropped and a spa config lock held instead. 4319 / 4320* mutex_enter(&spa_namespace_lock); 4321 mutex_enter(&l2arc_dev_mtx); 4322 4323 /* if there are no vdevs, there is nothing to do / 4324* if (l2arc_ndev == 0) 4325 goto out; 4326 4327 first = NULL; 4328 next = l2arc_dev_last; 4329 do { 4330 /* loop around the list looking for a non-faulted vdev / 4331* if (next == NULL) { 4332 next = list_head(l2arc_dev_list); 4333 } else { 4334 next = list_next(l2arc_dev_list, next); 4335 if (next == NULL) 4336 next = list_head(l2arc_dev_list); 4337 } 4338 4339 /* if we have come back to the start, bail out / 4340* if (first == NULL) 4341 first = next; 4342 else if (next == first) 4343 break; 4344 4345 } while (vdev_is_dead(next->l2ad_vdev)); 4346 4347 /* if we were unable to find any usable vdevs, return NULL / 4348* if (vdev_is_dead(next->l2ad_vdev)) 4349 next = NULL; 4350 4351 l2arc_dev_last = next; 4352 4353out: 4354 mutex_exit(&l2arc_dev_mtx); 4355 4356 /* 4357 * Grab the config lock to prevent the 'next' device from being 4358 * removed while we are writing to it. 4359 / 4360* if (next != NULL) 4361 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4362 mutex_exit(&spa_namespace_lock); 4363 4364 return (next); 4365} 4366 4367/* 4368 * Free buffers that were tagged for destruction. 
4369 / 4370static void 4371l2arc_do_free_on_write() 4372{ 4373* list_t buflist; 4374* l2arc_data_free_t df, df_prev; 4375 4376 mutex_enter(&l2arc_free_on_write_mtx); 4377 buflist = l2arc_free_on_write; 4378 4379 for (df = list_tail(buflist); df; df = df_prev) { 4380 df_prev = list_prev(buflist, df); 4381 ASSERT(df->l2df_data != NULL); 4382 ASSERT(df->l2df_func != NULL); 4383 df->l2df_func(df->l2df_data, df->l2df_size); 4384 list_remove(buflist, df); 4385 kmem_free(df, sizeof (l2arc_data_free_t)); 4386 } 4387 4388 mutex_exit(&l2arc_free_on_write_mtx); 4389} 4390 4391/* 4392 * A write to a cache device has completed. Update all headers to allow 4393 * reads from these buffers to begin. 4394 / 4395static void 4396l2arc_write_done(zio_t zio) 4397{ 4398 l2arc_write_callback_t cb; 4399* l2arc_dev_t dev; 4400* list_t buflist; 4401* arc_buf_hdr_t head, ab, ab_prev; 4402* l2arc_buf_hdr_t abl2; 4403* kmutex_t hash_lock; 4404* 4405 cb = zio->io_private; 4406 ASSERT(cb != NULL); 4407 dev = cb->l2wcb_dev; 4408 ASSERT(dev != NULL); 4409 head = cb->l2wcb_head; 4410 ASSERT(head != NULL); 4411 buflist = dev->l2ad_buflist; 4412 ASSERT(buflist != NULL); 4413 DTRACE_PROBE2(l2arc__iodone, zio_t , zio, 4414* l2arc_write_callback_t , cb); 4415* 4416 if (zio->io_error != 0) 4417 ARCSTAT_BUMP(arcstat_l2_writes_error); 4418 4419 mutex_enter(&l2arc_buflist_mtx); 4420 4421 /* 4422 * All writes completed, or an error was hit. 4423 / 4424* for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4425 ab_prev = list_prev(buflist, ab); 4426 4427 hash_lock = HDR_LOCK(ab); 4428 if (!mutex_tryenter(hash_lock)) { 4429 /* 4430 * This buffer misses out. It may be in a stage 4431 * of eviction. Its ARC_L2_WRITING flag will be 4432 * left set, denying reads to this buffer. 4433 / 4434* ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4435 continue; 4436 } 4437 4438 if (zio->io_error != 0) { 4439 /* 4440 * Error - drop L2ARC entry. 
4441 / 4442* list_remove(buflist, ab); 4443 abl2 = ab->b_l2hdr; 4444 ab->b_l2hdr = NULL; 4445 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4446 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4447 } 4448 4449 /* 4450 * Allow ARC to begin reads to this L2ARC entry. 4451 / 4452* ab->b_flags &= ~ARC_L2_WRITING; 4453 4454 mutex_exit(hash_lock); 4455 } 4456 4457 atomic_inc_64(&l2arc_writes_done); 4458 list_remove(buflist, head); 4459 kmem_cache_free(hdr_cache, head); 4460 mutex_exit(&l2arc_buflist_mtx); 4461 4462 l2arc_do_free_on_write(); 4463 4464 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4465} 4466 4467/* 4468 * A read to a cache device completed. Validate buffer contents before 4469 * handing over to the regular ARC routines. 4470 / 4471static void 4472l2arc_read_done(zio_t zio) 4473{ 4474 l2arc_read_callback_t cb; 4475* arc_buf_hdr_t hdr; 4476* arc_buf_t buf; 4477* kmutex_t hash_lock; 4478* int equal; 4479 4480 ASSERT(zio->io_vd != NULL); 4481 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4482 4483 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4484 4485 cb = zio->io_private; 4486 ASSERT(cb != NULL); 4487 buf = cb->l2rcb_buf; 4488 ASSERT(buf != NULL); 4489 4490 hash_lock = HDR_LOCK(buf->b_hdr); 4491 mutex_enter(hash_lock); 4492 hdr = buf->b_hdr; 4493 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4494 4495 /* 4496 * Check this survived the L2ARC journey. 4497 / 4498* equal = arc_cksum_equal(buf); 4499 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4500 mutex_exit(hash_lock); 4501 zio->io_private = buf; 4502 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 / 4503* zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 / 4504* arc_read_done(zio); 4505 } else { 4506 mutex_exit(hash_lock); 4507 /* 4508 * Buffer didn't survive caching. Increment stats and 4509 * reissue to the original storage device. 
4510 / 4511* if (zio->io_error != 0) { 4512 ARCSTAT_BUMP(arcstat_l2_io_error); 4513 } else { 4514 zio->io_error = EIO; 4515 } 4516 if (!equal) 4517 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4518 4519 /* 4520 * If there's no waiter, issue an async i/o to the primary 4521 * storage now. If there is a waiter, the caller must 4522 * issue the i/o in a context where it's OK to block. 4523 / 4524* if (zio->io_waiter == NULL) { 4525 zio_t pio = zio_unique_parent(zio); 4526* 4527 ASSERT(!pio \|\| pio->io_child_type == ZIO_CHILD_LOGICAL); 4528 4529 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4530 buf->b_data, zio->io_size, arc_read_done, buf, 4531 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4532 } 4533 } 4534 4535 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4536} 4537 4538/* 4539 * This is the list priority from which the L2ARC will search for pages to 4540 * cache. This is used within loops (0..3) to cycle through lists in the 4541 * desired order. This order can have a significant effect on cache 4542 * performance. 4543 * 4544 * Currently the metadata lists are hit first, MFU then MRU, followed by 4545 * the data lists. This function returns a locked list, and also returns 4546 * the lock pointer. 4547 / 4548static list_t 4549l2arc_list_locked(int list_num, kmutex_t *lock) 4550*{
4547 list_t *list;	4551 list_t *list = NULL;
4548 int idx; 4549 4550 ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); 4551 4552 if (list_num < ARC_BUFC_NUMMETADATALISTS) { 4553 idx = list_num; 4554 list = &arc_mfu->arcs_lists[idx]; 4555 lock = ARCS_LOCK(arc_mfu, idx); 4556* } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { 4557 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 4558 list = &arc_mru->arcs_lists[idx]; 4559 lock = ARCS_LOCK(arc_mru, idx); 4560* } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + 4561 ARC_BUFC_NUMDATALISTS)) { 4562 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 4563 list = &arc_mfu->arcs_lists[idx]; 4564 lock = ARCS_LOCK(arc_mfu, idx); 4565* } else { 4566 idx = list_num - ARC_BUFC_NUMLISTS; 4567 list = &arc_mru->arcs_lists[idx]; 4568 lock = ARCS_LOCK(arc_mru, idx); 4569* } 4570 4571 ASSERT(!(MUTEX_HELD(lock))); 4572* mutex_enter(lock); 4573* return (list); 4574} 4575 4576/* 4577 * Evict buffers from the device write hand to the distance specified in 4578 * bytes. This distance may span populated buffers, it may span nothing. 4579 * This is clearing a region on the L2ARC device ready for writing. 4580 * If the 'all' boolean is set, every buffer is evicted. 4581 / 4582static void 4583l2arc_evict(l2arc_dev_t dev, uint64_t distance, boolean_t all) 4584{ 4585 list_t buflist; 4586* l2arc_buf_hdr_t abl2; 4587* arc_buf_hdr_t ab, ab_prev; 4588 kmutex_t hash_lock; 4589* uint64_t taddr; 4590 4591 buflist = dev->l2ad_buflist; 4592 4593 if (buflist == NULL) 4594 return; 4595 4596 if (!all && dev->l2ad_first) { 4597 /* 4598 * This is the first sweep through the device. There is 4599 * nothing to evict. 4600 / 4601* return; 4602 } 4603 4604 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 4605 /* 4606 * When nearing the end of the device, evict to the end 4607 * before the device write hand jumps to the start. 
4608 / 4609* taddr = dev->l2ad_end; 4610 } else { 4611 taddr = dev->l2ad_hand + distance; 4612 } 4613 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t , dev, list_t , buflist, 4614 uint64_t, taddr, boolean_t, all); 4615 4616top: 4617 mutex_enter(&l2arc_buflist_mtx); 4618 for (ab = list_tail(buflist); ab; ab = ab_prev) { 4619 ab_prev = list_prev(buflist, ab); 4620 4621 hash_lock = HDR_LOCK(ab); 4622 if (!mutex_tryenter(hash_lock)) { 4623 /* 4624 * Missed the hash lock. Retry. 4625 / 4626* ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 4627 mutex_exit(&l2arc_buflist_mtx); 4628 mutex_enter(hash_lock); 4629 mutex_exit(hash_lock); 4630 goto top; 4631 } 4632 4633 if (HDR_L2_WRITE_HEAD(ab)) { 4634 /* 4635 * We hit a write head node. Leave it for 4636 * l2arc_write_done(). 4637 / 4638* list_remove(buflist, ab); 4639 mutex_exit(hash_lock); 4640 continue; 4641 } 4642 4643 if (!all && ab->b_l2hdr != NULL && 4644 (ab->b_l2hdr->b_daddr > taddr \|\| 4645 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 4646 /* 4647 * We've evicted to the target address, 4648 * or the end of the device. 4649 / 4650* mutex_exit(hash_lock); 4651 break; 4652 } 4653 4654 if (HDR_FREE_IN_PROGRESS(ab)) { 4655 /* 4656 * Already on the path to destruction. 4657 / 4658* mutex_exit(hash_lock); 4659 continue; 4660 } 4661 4662 if (ab->b_state == arc_l2c_only) { 4663 ASSERT(!HDR_L2_READING(ab)); 4664 /* 4665 * This doesn't exist in the ARC. Destroy. 4666 * arc_hdr_destroy() will call list_remove() 4667 * and decrement arcstat_l2_size. 4668 / 4669* arc_change_state(arc_anon, ab, hash_lock); 4670 arc_hdr_destroy(ab); 4671 } else { 4672 /* 4673 * Invalidate issued or about to be issued 4674 * reads, since we may be about to write 4675 * over this location. 4676 / 4677* if (HDR_L2_READING(ab)) { 4678 ARCSTAT_BUMP(arcstat_l2_evict_reading); 4679 ab->b_flags \|= ARC_L2_EVICTED; 4680 } 4681 4682 /* 4683 * Tell ARC this no longer exists in L2ARC. 
4684 / 4685* if (ab->b_l2hdr != NULL) { 4686 abl2 = ab->b_l2hdr; 4687 ab->b_l2hdr = NULL; 4688 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4689 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4690 } 4691 list_remove(buflist, ab); 4692 4693 /* 4694 * This may have been leftover after a 4695 * failed write. 4696 / 4697* ab->b_flags &= ~ARC_L2_WRITING; 4698 } 4699 mutex_exit(hash_lock); 4700 } 4701 mutex_exit(&l2arc_buflist_mtx); 4702 4703 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); 4704 dev->l2ad_evict = taddr; 4705} 4706 4707/* 4708 * Find and write ARC buffers to the L2ARC device. 4709 * 4710 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4711 * for reading until they have completed writing. 4712 / 4713static uint64_t 4714l2arc_write_buffers(spa_t spa, l2arc_dev_t dev, uint64_t target_sz) 4715{ 4716* arc_buf_hdr_t ab, ab_prev, head; 4717* l2arc_buf_hdr_t hdrl2; 4718* list_t list; 4719* uint64_t passed_sz, write_sz, buf_sz, headroom; 4720 void buf_data; 4721* kmutex_t hash_lock, list_lock; 4722 boolean_t have_lock, full; 4723 l2arc_write_callback_t cb; 4724* zio_t pio, wzio; 4725 uint64_t guid = spa_load_guid(spa); 4726 int try; 4727 4728 ASSERT(dev->l2ad_vdev != NULL); 4729 4730 pio = NULL; 4731 write_sz = 0; 4732 full = B_FALSE; 4733 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4734 head->b_flags \|= ARC_L2_WRITE_HEAD; 4735 4736 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 4737 /* 4738 * Copy buffers for L2ARC writing. 4739 / 4740* mutex_enter(&l2arc_buflist_mtx); 4741 for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { 4742 list = l2arc_list_locked(try, &list_lock); 4743 passed_sz = 0; 4744 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 4745 4746 /* 4747 * L2ARC fast warmup. 4748 * 4749 * Until the ARC is warm and starts to evict, read from the 4750 * head of the ARC lists rather than the tail. 
4751 / 4752* headroom = target_sz * l2arc_headroom; 4753 if (arc_warm == B_FALSE) 4754 ab = list_head(list); 4755 else 4756 ab = list_tail(list); 4757 if (ab == NULL) 4758 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 4759 4760 for (; ab; ab = ab_prev) { 4761 if (arc_warm == B_FALSE) 4762 ab_prev = list_next(list, ab); 4763 else 4764 ab_prev = list_prev(list, ab); 4765 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size); 4766 4767 hash_lock = HDR_LOCK(ab); 4768 have_lock = MUTEX_HELD(hash_lock); 4769 if (!have_lock && !mutex_tryenter(hash_lock)) { 4770 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 4771 /* 4772 * Skip this buffer rather than waiting. 4773 / 4774* continue; 4775 } 4776 4777 passed_sz += ab->b_size; 4778 if (passed_sz > headroom) { 4779 /* 4780 * Searched too far. 4781 / 4782* mutex_exit(hash_lock); 4783 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 4784 break; 4785 } 4786 4787 if (!l2arc_write_eligible(guid, ab)) { 4788 mutex_exit(hash_lock); 4789 continue; 4790 } 4791 4792 if ((write_sz + ab->b_size) > target_sz) { 4793 full = B_TRUE; 4794 mutex_exit(hash_lock); 4795 ARCSTAT_BUMP(arcstat_l2_write_full); 4796 break; 4797 } 4798 4799 if (pio == NULL) { 4800 /* 4801 * Insert a dummy header on the buflist so 4802 * l2arc_write_done() can find where the 4803 * write buffers begin without searching. 4804 / 4805* list_insert_head(dev->l2ad_buflist, head); 4806 4807 cb = kmem_alloc( 4808 sizeof (l2arc_write_callback_t), KM_SLEEP); 4809 cb->l2wcb_dev = dev; 4810 cb->l2wcb_head = head; 4811 pio = zio_root(spa, l2arc_write_done, cb, 4812 ZIO_FLAG_CANFAIL); 4813 ARCSTAT_BUMP(arcstat_l2_write_pios); 4814 } 4815 4816 /* 4817 * Create and add a new L2ARC header. 
4818 / 4819* hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4820 hdrl2->b_dev = dev; 4821 hdrl2->b_daddr = dev->l2ad_hand; 4822 4823 ab->b_flags \|= ARC_L2_WRITING; 4824 ab->b_l2hdr = hdrl2; 4825 list_insert_head(dev->l2ad_buflist, ab); 4826 buf_data = ab->b_buf->b_data; 4827 buf_sz = ab->b_size; 4828 4829 /* 4830 * Compute and store the buffer cksum before 4831 * writing. On debug the cksum is verified first. 4832 / 4833* arc_cksum_verify(ab->b_buf); 4834 arc_cksum_compute(ab->b_buf, B_TRUE); 4835 4836 mutex_exit(hash_lock); 4837 4838 wzio = zio_write_phys(pio, dev->l2ad_vdev, 4839 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4840 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4841 ZIO_FLAG_CANFAIL, B_FALSE); 4842 4843 DTRACE_PROBE2(l2arc__write, vdev_t , dev->l2ad_vdev, 4844* zio_t , wzio); 4845* (void) zio_nowait(wzio); 4846 4847 /* 4848 * Keep the clock hand suitably device-aligned. 4849 / 4850* buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 4851 4852 write_sz += buf_sz; 4853 dev->l2ad_hand += buf_sz; 4854 } 4855 4856 mutex_exit(list_lock); 4857 4858 if (full == B_TRUE) 4859 break; 4860 } 4861 mutex_exit(&l2arc_buflist_mtx); 4862 4863 if (pio == NULL) { 4864 ASSERT0(write_sz); 4865 kmem_cache_free(hdr_cache, head); 4866 return (0); 4867 } 4868 4869 ASSERT3U(write_sz, <=, target_sz); 4870 ARCSTAT_BUMP(arcstat_l2_writes_sent); 4871 ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); 4872 ARCSTAT_INCR(arcstat_l2_size, write_sz); 4873 vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); 4874 4875 /* 4876 * Bump device hand to the device start if it is approaching the end. 4877 * l2arc_evict() will already have evicted ahead for this case. 
4878 / 4879* if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4880 vdev_space_update(dev->l2ad_vdev, 4881 dev->l2ad_end - dev->l2ad_hand, 0, 0); 4882 dev->l2ad_hand = dev->l2ad_start; 4883 dev->l2ad_evict = dev->l2ad_start; 4884 dev->l2ad_first = B_FALSE; 4885 } 4886 4887 dev->l2ad_writing = B_TRUE; 4888 (void) zio_wait(pio); 4889 dev->l2ad_writing = B_FALSE; 4890 4891 return (write_sz); 4892} 4893 4894/* 4895 * This thread feeds the L2ARC at regular intervals. This is the beating 4896 * heart of the L2ARC. 4897 / 4898static void 4899l2arc_feed_thread(void dummy __unused) 4900{ 4901 callb_cpr_t cpr; 4902 l2arc_dev_t dev; 4903* spa_t spa; 4904* uint64_t size, wrote; 4905 clock_t begin, next = ddi_get_lbolt(); 4906 4907 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 4908 4909 mutex_enter(&l2arc_feed_thr_lock); 4910 4911 while (l2arc_thread_exit == 0) { 4912 CALLB_CPR_SAFE_BEGIN(&cpr); 4913 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 4914 next - ddi_get_lbolt()); 4915 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 4916 next = ddi_get_lbolt() + hz; 4917 4918 /* 4919 * Quick check for L2ARC devices. 4920 / 4921* mutex_enter(&l2arc_dev_mtx); 4922 if (l2arc_ndev == 0) { 4923 mutex_exit(&l2arc_dev_mtx); 4924 continue; 4925 } 4926 mutex_exit(&l2arc_dev_mtx); 4927 begin = ddi_get_lbolt(); 4928 4929 /* 4930 * This selects the next l2arc device to write to, and in 4931 * doing so the next spa to feed from: dev->l2ad_spa. This 4932 * will return NULL if there are now no l2arc devices or if 4933 * they are all faulted. 4934 * 4935 * If a device is returned, its spa's config lock is also 4936 * held to prevent device removal. l2arc_dev_get_next() 4937 * will grab and release l2arc_dev_mtx. 4938 / 4939* if ((dev = l2arc_dev_get_next()) == NULL) 4940 continue; 4941 4942 spa = dev->l2ad_spa; 4943 ASSERT(spa != NULL); 4944 4945 /* 4946 * If the pool is read-only then force the feed thread to 4947 * sleep a little longer. 
4948 / 4949* if (!spa_writeable(spa)) { 4950 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 4951 spa_config_exit(spa, SCL_L2ARC, dev); 4952 continue; 4953 } 4954 4955 /* 4956 * Avoid contributing to memory pressure. 4957 / 4958* if (arc_reclaim_needed()) { 4959 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 4960 spa_config_exit(spa, SCL_L2ARC, dev); 4961 continue; 4962 } 4963 4964 ARCSTAT_BUMP(arcstat_l2_feeds); 4965 4966 size = l2arc_write_size(dev); 4967 4968 /* 4969 * Evict L2ARC buffers that will be overwritten. 4970 / 4971* l2arc_evict(dev, size, B_FALSE); 4972 4973 /* 4974 * Write ARC buffers. 4975 / 4976* wrote = l2arc_write_buffers(spa, dev, size); 4977 4978 /* 4979 * Calculate interval between writes. 4980 / 4981* next = l2arc_write_interval(begin, size, wrote); 4982 spa_config_exit(spa, SCL_L2ARC, dev); 4983 } 4984 4985 l2arc_thread_exit = 0; 4986 cv_broadcast(&l2arc_feed_thr_cv); 4987 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock / 4988* thread_exit(); 4989} 4990 4991boolean_t 4992l2arc_vdev_present(vdev_t vd) 4993{ 4994* l2arc_dev_t dev; 4995* 4996 mutex_enter(&l2arc_dev_mtx); 4997 for (dev = list_head(l2arc_dev_list); dev != NULL; 4998 dev = list_next(l2arc_dev_list, dev)) { 4999 if (dev->l2ad_vdev == vd) 5000 break; 5001 } 5002 mutex_exit(&l2arc_dev_mtx); 5003 5004 return (dev != NULL); 5005} 5006 5007/* 5008 * Add a vdev for use by the L2ARC. By this point the spa has already 5009 * validated the vdev and opened it. 5010 / 5011void 5012l2arc_add_vdev(spa_t spa, vdev_t vd) 5013{ 5014* l2arc_dev_t adddev; 5015* 5016 ASSERT(!l2arc_vdev_present(vd)); 5017 5018 /* 5019 * Create a new l2arc device entry. 
5020 / 5021* adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 5022 adddev->l2ad_spa = spa; 5023 adddev->l2ad_vdev = vd; 5024 adddev->l2ad_write = l2arc_write_max; 5025 adddev->l2ad_boost = l2arc_write_boost; 5026 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 5027 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 5028 adddev->l2ad_hand = adddev->l2ad_start; 5029 adddev->l2ad_evict = adddev->l2ad_start; 5030 adddev->l2ad_first = B_TRUE; 5031 adddev->l2ad_writing = B_FALSE; 5032 ASSERT3U(adddev->l2ad_write, >, 0); 5033 5034 /* 5035 * This is a list of all ARC buffers that are still valid on the 5036 * device. 5037 / 5038* adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 5039 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 5040 offsetof(arc_buf_hdr_t, b_l2node)); 5041 5042 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 5043 5044 /* 5045 * Add device to global list 5046 / 5047* mutex_enter(&l2arc_dev_mtx); 5048 list_insert_head(l2arc_dev_list, adddev); 5049 atomic_inc_64(&l2arc_ndev); 5050 mutex_exit(&l2arc_dev_mtx); 5051} 5052 5053/* 5054 * Remove a vdev from the L2ARC. 5055 / 5056void 5057l2arc_remove_vdev(vdev_t vd) 5058{ 5059 l2arc_dev_t dev, nextdev, remdev = NULL; 5060* 5061 /* 5062 * Find the device by vdev 5063 / 5064* mutex_enter(&l2arc_dev_mtx); 5065 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 5066 nextdev = list_next(l2arc_dev_list, dev); 5067 if (vd == dev->l2ad_vdev) { 5068 remdev = dev; 5069 break; 5070 } 5071 } 5072 ASSERT(remdev != NULL); 5073 5074 /* 5075 * Remove device from global list 5076 / 5077* list_remove(l2arc_dev_list, remdev); 5078 l2arc_dev_last = NULL; /* may have been invalidated / 5079* atomic_dec_64(&l2arc_ndev); 5080 mutex_exit(&l2arc_dev_mtx); 5081 5082 /* 5083 * Clear all buflists and ARC references. L2ARC device flush. 
5084 / 5085* l2arc_evict(remdev, 0, B_TRUE); 5086 list_destroy(remdev->l2ad_buflist); 5087 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 5088 kmem_free(remdev, sizeof (l2arc_dev_t)); 5089} 5090 5091void 5092l2arc_init(void) 5093{ 5094 l2arc_thread_exit = 0; 5095 l2arc_ndev = 0; 5096 l2arc_writes_sent = 0; 5097 l2arc_writes_done = 0; 5098 5099 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 5100 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 5101 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 5102 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 5103 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 5104 5105 l2arc_dev_list = &L2ARC_dev_list; 5106 l2arc_free_on_write = &L2ARC_free_on_write; 5107 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 5108 offsetof(l2arc_dev_t, l2ad_node)); 5109 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 5110 offsetof(l2arc_data_free_t, l2df_list_node)); 5111} 5112 5113void 5114l2arc_fini(void) 5115{ 5116 /* 5117 * This is called from dmu_fini(), which is called from spa_fini(); 5118 * Because of this, we can assume that all l2arc devices have 5119 * already been removed when the pools themselves were removed. 
5120 / 5121* 5122 l2arc_do_free_on_write(); 5123 5124 mutex_destroy(&l2arc_feed_thr_lock); 5125 cv_destroy(&l2arc_feed_thr_cv); 5126 mutex_destroy(&l2arc_dev_mtx); 5127 mutex_destroy(&l2arc_buflist_mtx); 5128 mutex_destroy(&l2arc_free_on_write_mtx); 5129 5130 list_destroy(l2arc_dev_list); 5131 list_destroy(l2arc_free_on_write); 5132} 5133 5134void 5135l2arc_start(void) 5136{ 5137 if (!(spa_mode_global & FWRITE)) 5138 return; 5139 5140 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 5141 TS_RUN, minclsyspri); 5142} 5143 5144void 5145l2arc_stop(void) 5146{ 5147 if (!(spa_mode_global & FWRITE)) 5148 return; 5149 5150 mutex_enter(&l2arc_feed_thr_lock); 5151 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup / 5152* l2arc_thread_exit = 1; 5153 while (l2arc_thread_exit != 0) 5154 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 5155 mutex_exit(&l2arc_feed_thr_lock); 5156}	4552 int idx; 4553 4554 ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); 4555 4556 if (list_num < ARC_BUFC_NUMMETADATALISTS) { 4557 idx = list_num; 4558 list = &arc_mfu->arcs_lists[idx]; 4559 lock = ARCS_LOCK(arc_mfu, idx); 4560* } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { 4561 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 4562 list = &arc_mru->arcs_lists[idx]; 4563 lock = ARCS_LOCK(arc_mru, idx); 4564* } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + 4565 ARC_BUFC_NUMDATALISTS)) { 4566 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 4567 list = &arc_mfu->arcs_lists[idx]; 4568 lock = ARCS_LOCK(arc_mfu, idx); 4569* } else { 4570 idx = list_num - ARC_BUFC_NUMLISTS; 4571 list = &arc_mru->arcs_lists[idx]; 4572 lock = ARCS_LOCK(arc_mru, idx); 4573* } 4574 4575 ASSERT(!(MUTEX_HELD(lock))); 4576* mutex_enter(lock); 4577* return (list); 4578} 4579 4580/* 4581 * Evict buffers from the device write hand to the distance specified in 4582 * bytes. This distance may span populated buffers, it may span nothing. 
4583 * This is clearing a region on the L2ARC device ready for writing. 4584 * If the 'all' boolean is set, every buffer is evicted. 4585 / 4586static void 4587l2arc_evict(l2arc_dev_t dev, uint64_t distance, boolean_t all) 4588{ 4589 list_t buflist; 4590* l2arc_buf_hdr_t abl2; 4591* arc_buf_hdr_t ab, ab_prev; 4592 kmutex_t hash_lock; 4593* uint64_t taddr; 4594 4595 buflist = dev->l2ad_buflist; 4596 4597 if (buflist == NULL) 4598 return; 4599 4600 if (!all && dev->l2ad_first) { 4601 /* 4602 * This is the first sweep through the device. There is 4603 * nothing to evict. 4604 / 4605* return; 4606 } 4607 4608 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 4609 /* 4610 * When nearing the end of the device, evict to the end 4611 * before the device write hand jumps to the start. 4612 / 4613* taddr = dev->l2ad_end; 4614 } else { 4615 taddr = dev->l2ad_hand + distance; 4616 } 4617 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t , dev, list_t , buflist, 4618 uint64_t, taddr, boolean_t, all); 4619 4620top: 4621 mutex_enter(&l2arc_buflist_mtx); 4622 for (ab = list_tail(buflist); ab; ab = ab_prev) { 4623 ab_prev = list_prev(buflist, ab); 4624 4625 hash_lock = HDR_LOCK(ab); 4626 if (!mutex_tryenter(hash_lock)) { 4627 /* 4628 * Missed the hash lock. Retry. 4629 / 4630* ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 4631 mutex_exit(&l2arc_buflist_mtx); 4632 mutex_enter(hash_lock); 4633 mutex_exit(hash_lock); 4634 goto top; 4635 } 4636 4637 if (HDR_L2_WRITE_HEAD(ab)) { 4638 /* 4639 * We hit a write head node. Leave it for 4640 * l2arc_write_done(). 4641 / 4642* list_remove(buflist, ab); 4643 mutex_exit(hash_lock); 4644 continue; 4645 } 4646 4647 if (!all && ab->b_l2hdr != NULL && 4648 (ab->b_l2hdr->b_daddr > taddr \|\| 4649 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 4650 /* 4651 * We've evicted to the target address, 4652 * or the end of the device. 
4653 / 4654* mutex_exit(hash_lock); 4655 break; 4656 } 4657 4658 if (HDR_FREE_IN_PROGRESS(ab)) { 4659 /* 4660 * Already on the path to destruction. 4661 / 4662* mutex_exit(hash_lock); 4663 continue; 4664 } 4665 4666 if (ab->b_state == arc_l2c_only) { 4667 ASSERT(!HDR_L2_READING(ab)); 4668 /* 4669 * This doesn't exist in the ARC. Destroy. 4670 * arc_hdr_destroy() will call list_remove() 4671 * and decrement arcstat_l2_size. 4672 / 4673* arc_change_state(arc_anon, ab, hash_lock); 4674 arc_hdr_destroy(ab); 4675 } else { 4676 /* 4677 * Invalidate issued or about to be issued 4678 * reads, since we may be about to write 4679 * over this location. 4680 / 4681* if (HDR_L2_READING(ab)) { 4682 ARCSTAT_BUMP(arcstat_l2_evict_reading); 4683 ab->b_flags \|= ARC_L2_EVICTED; 4684 } 4685 4686 /* 4687 * Tell ARC this no longer exists in L2ARC. 4688 / 4689* if (ab->b_l2hdr != NULL) { 4690 abl2 = ab->b_l2hdr; 4691 ab->b_l2hdr = NULL; 4692 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4693 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4694 } 4695 list_remove(buflist, ab); 4696 4697 /* 4698 * This may have been leftover after a 4699 * failed write. 4700 / 4701* ab->b_flags &= ~ARC_L2_WRITING; 4702 } 4703 mutex_exit(hash_lock); 4704 } 4705 mutex_exit(&l2arc_buflist_mtx); 4706 4707 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); 4708 dev->l2ad_evict = taddr; 4709} 4710 4711/* 4712 * Find and write ARC buffers to the L2ARC device. 4713 * 4714 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4715 * for reading until they have completed writing. 
4716 / 4717static uint64_t 4718l2arc_write_buffers(spa_t spa, l2arc_dev_t dev, uint64_t target_sz) 4719{ 4720* arc_buf_hdr_t ab, ab_prev, head; 4721* l2arc_buf_hdr_t hdrl2; 4722* list_t list; 4723* uint64_t passed_sz, write_sz, buf_sz, headroom; 4724 void buf_data; 4725* kmutex_t hash_lock, list_lock; 4726 boolean_t have_lock, full; 4727 l2arc_write_callback_t cb; 4728* zio_t pio, wzio; 4729 uint64_t guid = spa_load_guid(spa); 4730 int try; 4731 4732 ASSERT(dev->l2ad_vdev != NULL); 4733 4734 pio = NULL; 4735 write_sz = 0; 4736 full = B_FALSE; 4737 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4738 head->b_flags \|= ARC_L2_WRITE_HEAD; 4739 4740 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 4741 /* 4742 * Copy buffers for L2ARC writing. 4743 / 4744* mutex_enter(&l2arc_buflist_mtx); 4745 for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { 4746 list = l2arc_list_locked(try, &list_lock); 4747 passed_sz = 0; 4748 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 4749 4750 /* 4751 * L2ARC fast warmup. 4752 * 4753 * Until the ARC is warm and starts to evict, read from the 4754 * head of the ARC lists rather than the tail. 4755 / 4756* headroom = target_sz * l2arc_headroom; 4757 if (arc_warm == B_FALSE) 4758 ab = list_head(list); 4759 else 4760 ab = list_tail(list); 4761 if (ab == NULL) 4762 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 4763 4764 for (; ab; ab = ab_prev) { 4765 if (arc_warm == B_FALSE) 4766 ab_prev = list_next(list, ab); 4767 else 4768 ab_prev = list_prev(list, ab); 4769 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size); 4770 4771 hash_lock = HDR_LOCK(ab); 4772 have_lock = MUTEX_HELD(hash_lock); 4773 if (!have_lock && !mutex_tryenter(hash_lock)) { 4774 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 4775 /* 4776 * Skip this buffer rather than waiting. 4777 / 4778* continue; 4779 } 4780 4781 passed_sz += ab->b_size; 4782 if (passed_sz > headroom) { 4783 /* 4784 * Searched too far. 
4785 / 4786* mutex_exit(hash_lock); 4787 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 4788 break; 4789 } 4790 4791 if (!l2arc_write_eligible(guid, ab)) { 4792 mutex_exit(hash_lock); 4793 continue; 4794 } 4795 4796 if ((write_sz + ab->b_size) > target_sz) { 4797 full = B_TRUE; 4798 mutex_exit(hash_lock); 4799 ARCSTAT_BUMP(arcstat_l2_write_full); 4800 break; 4801 } 4802 4803 if (pio == NULL) { 4804 /* 4805 * Insert a dummy header on the buflist so 4806 * l2arc_write_done() can find where the 4807 * write buffers begin without searching. 4808 / 4809* list_insert_head(dev->l2ad_buflist, head); 4810 4811 cb = kmem_alloc( 4812 sizeof (l2arc_write_callback_t), KM_SLEEP); 4813 cb->l2wcb_dev = dev; 4814 cb->l2wcb_head = head; 4815 pio = zio_root(spa, l2arc_write_done, cb, 4816 ZIO_FLAG_CANFAIL); 4817 ARCSTAT_BUMP(arcstat_l2_write_pios); 4818 } 4819 4820 /* 4821 * Create and add a new L2ARC header. 4822 / 4823* hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4824 hdrl2->b_dev = dev; 4825 hdrl2->b_daddr = dev->l2ad_hand; 4826 4827 ab->b_flags \|= ARC_L2_WRITING; 4828 ab->b_l2hdr = hdrl2; 4829 list_insert_head(dev->l2ad_buflist, ab); 4830 buf_data = ab->b_buf->b_data; 4831 buf_sz = ab->b_size; 4832 4833 /* 4834 * Compute and store the buffer cksum before 4835 * writing. On debug the cksum is verified first. 4836 / 4837* arc_cksum_verify(ab->b_buf); 4838 arc_cksum_compute(ab->b_buf, B_TRUE); 4839 4840 mutex_exit(hash_lock); 4841 4842 wzio = zio_write_phys(pio, dev->l2ad_vdev, 4843 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4844 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4845 ZIO_FLAG_CANFAIL, B_FALSE); 4846 4847 DTRACE_PROBE2(l2arc__write, vdev_t , dev->l2ad_vdev, 4848* zio_t , wzio); 4849* (void) zio_nowait(wzio); 4850 4851 /* 4852 * Keep the clock hand suitably device-aligned. 
4853 / 4854* buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 4855 4856 write_sz += buf_sz; 4857 dev->l2ad_hand += buf_sz; 4858 } 4859 4860 mutex_exit(list_lock); 4861 4862 if (full == B_TRUE) 4863 break; 4864 } 4865 mutex_exit(&l2arc_buflist_mtx); 4866 4867 if (pio == NULL) { 4868 ASSERT0(write_sz); 4869 kmem_cache_free(hdr_cache, head); 4870 return (0); 4871 } 4872 4873 ASSERT3U(write_sz, <=, target_sz); 4874 ARCSTAT_BUMP(arcstat_l2_writes_sent); 4875 ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); 4876 ARCSTAT_INCR(arcstat_l2_size, write_sz); 4877 vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); 4878 4879 /* 4880 * Bump device hand to the device start if it is approaching the end. 4881 * l2arc_evict() will already have evicted ahead for this case. 4882 / 4883* if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4884 vdev_space_update(dev->l2ad_vdev, 4885 dev->l2ad_end - dev->l2ad_hand, 0, 0); 4886 dev->l2ad_hand = dev->l2ad_start; 4887 dev->l2ad_evict = dev->l2ad_start; 4888 dev->l2ad_first = B_FALSE; 4889 } 4890 4891 dev->l2ad_writing = B_TRUE; 4892 (void) zio_wait(pio); 4893 dev->l2ad_writing = B_FALSE; 4894 4895 return (write_sz); 4896} 4897 4898/* 4899 * This thread feeds the L2ARC at regular intervals. This is the beating 4900 * heart of the L2ARC. 4901 / 4902static void 4903l2arc_feed_thread(void dummy __unused) 4904{ 4905 callb_cpr_t cpr; 4906 l2arc_dev_t dev; 4907* spa_t spa; 4908* uint64_t size, wrote; 4909 clock_t begin, next = ddi_get_lbolt(); 4910 4911 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 4912 4913 mutex_enter(&l2arc_feed_thr_lock); 4914 4915 while (l2arc_thread_exit == 0) { 4916 CALLB_CPR_SAFE_BEGIN(&cpr); 4917 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 4918 next - ddi_get_lbolt()); 4919 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 4920 next = ddi_get_lbolt() + hz; 4921 4922 /* 4923 * Quick check for L2ARC devices. 
4924 / 4925* mutex_enter(&l2arc_dev_mtx); 4926 if (l2arc_ndev == 0) { 4927 mutex_exit(&l2arc_dev_mtx); 4928 continue; 4929 } 4930 mutex_exit(&l2arc_dev_mtx); 4931 begin = ddi_get_lbolt(); 4932 4933 /* 4934 * This selects the next l2arc device to write to, and in 4935 * doing so the next spa to feed from: dev->l2ad_spa. This 4936 * will return NULL if there are now no l2arc devices or if 4937 * they are all faulted. 4938 * 4939 * If a device is returned, its spa's config lock is also 4940 * held to prevent device removal. l2arc_dev_get_next() 4941 * will grab and release l2arc_dev_mtx. 4942 / 4943* if ((dev = l2arc_dev_get_next()) == NULL) 4944 continue; 4945 4946 spa = dev->l2ad_spa; 4947 ASSERT(spa != NULL); 4948 4949 /* 4950 * If the pool is read-only then force the feed thread to 4951 * sleep a little longer. 4952 / 4953* if (!spa_writeable(spa)) { 4954 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 4955 spa_config_exit(spa, SCL_L2ARC, dev); 4956 continue; 4957 } 4958 4959 /* 4960 * Avoid contributing to memory pressure. 4961 / 4962* if (arc_reclaim_needed()) { 4963 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 4964 spa_config_exit(spa, SCL_L2ARC, dev); 4965 continue; 4966 } 4967 4968 ARCSTAT_BUMP(arcstat_l2_feeds); 4969 4970 size = l2arc_write_size(dev); 4971 4972 /* 4973 * Evict L2ARC buffers that will be overwritten. 4974 / 4975* l2arc_evict(dev, size, B_FALSE); 4976 4977 /* 4978 * Write ARC buffers. 4979 / 4980* wrote = l2arc_write_buffers(spa, dev, size); 4981 4982 /* 4983 * Calculate interval between writes. 
4984 / 4985* next = l2arc_write_interval(begin, size, wrote); 4986 spa_config_exit(spa, SCL_L2ARC, dev); 4987 } 4988 4989 l2arc_thread_exit = 0; 4990 cv_broadcast(&l2arc_feed_thr_cv); 4991 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock / 4992* thread_exit(); 4993} 4994 4995boolean_t 4996l2arc_vdev_present(vdev_t vd) 4997{ 4998* l2arc_dev_t dev; 4999* 5000 mutex_enter(&l2arc_dev_mtx); 5001 for (dev = list_head(l2arc_dev_list); dev != NULL; 5002 dev = list_next(l2arc_dev_list, dev)) { 5003 if (dev->l2ad_vdev == vd) 5004 break; 5005 } 5006 mutex_exit(&l2arc_dev_mtx); 5007 5008 return (dev != NULL); 5009} 5010 5011/* 5012 * Add a vdev for use by the L2ARC. By this point the spa has already 5013 * validated the vdev and opened it. 5014 / 5015void 5016l2arc_add_vdev(spa_t spa, vdev_t vd) 5017{ 5018* l2arc_dev_t adddev; 5019* 5020 ASSERT(!l2arc_vdev_present(vd)); 5021 5022 /* 5023 * Create a new l2arc device entry. 5024 / 5025* adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 5026 adddev->l2ad_spa = spa; 5027 adddev->l2ad_vdev = vd; 5028 adddev->l2ad_write = l2arc_write_max; 5029 adddev->l2ad_boost = l2arc_write_boost; 5030 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 5031 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 5032 adddev->l2ad_hand = adddev->l2ad_start; 5033 adddev->l2ad_evict = adddev->l2ad_start; 5034 adddev->l2ad_first = B_TRUE; 5035 adddev->l2ad_writing = B_FALSE; 5036 ASSERT3U(adddev->l2ad_write, >, 0); 5037 5038 /* 5039 * This is a list of all ARC buffers that are still valid on the 5040 * device. 
5041 / 5042* adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 5043 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 5044 offsetof(arc_buf_hdr_t, b_l2node)); 5045 5046 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 5047 5048 /* 5049 * Add device to global list 5050 / 5051* mutex_enter(&l2arc_dev_mtx); 5052 list_insert_head(l2arc_dev_list, adddev); 5053 atomic_inc_64(&l2arc_ndev); 5054 mutex_exit(&l2arc_dev_mtx); 5055} 5056 5057/* 5058 * Remove a vdev from the L2ARC. 5059 / 5060void 5061l2arc_remove_vdev(vdev_t vd) 5062{ 5063 l2arc_dev_t dev, nextdev, remdev = NULL; 5064* 5065 /* 5066 * Find the device by vdev 5067 / 5068* mutex_enter(&l2arc_dev_mtx); 5069 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 5070 nextdev = list_next(l2arc_dev_list, dev); 5071 if (vd == dev->l2ad_vdev) { 5072 remdev = dev; 5073 break; 5074 } 5075 } 5076 ASSERT(remdev != NULL); 5077 5078 /* 5079 * Remove device from global list 5080 / 5081* list_remove(l2arc_dev_list, remdev); 5082 l2arc_dev_last = NULL; /* may have been invalidated / 5083* atomic_dec_64(&l2arc_ndev); 5084 mutex_exit(&l2arc_dev_mtx); 5085 5086 /* 5087 * Clear all buflists and ARC references. L2ARC device flush. 
5088 / 5089* l2arc_evict(remdev, 0, B_TRUE); 5090 list_destroy(remdev->l2ad_buflist); 5091 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 5092 kmem_free(remdev, sizeof (l2arc_dev_t)); 5093} 5094 5095void 5096l2arc_init(void) 5097{ 5098 l2arc_thread_exit = 0; 5099 l2arc_ndev = 0; 5100 l2arc_writes_sent = 0; 5101 l2arc_writes_done = 0; 5102 5103 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 5104 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 5105 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 5106 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 5107 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 5108 5109 l2arc_dev_list = &L2ARC_dev_list; 5110 l2arc_free_on_write = &L2ARC_free_on_write; 5111 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 5112 offsetof(l2arc_dev_t, l2ad_node)); 5113 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 5114 offsetof(l2arc_data_free_t, l2df_list_node)); 5115} 5116 5117void 5118l2arc_fini(void) 5119{ 5120 /* 5121 * This is called from dmu_fini(), which is called from spa_fini(); 5122 * Because of this, we can assume that all l2arc devices have 5123 * already been removed when the pools themselves were removed. 
5124 / 5125* 5126 l2arc_do_free_on_write(); 5127 5128 mutex_destroy(&l2arc_feed_thr_lock); 5129 cv_destroy(&l2arc_feed_thr_cv); 5130 mutex_destroy(&l2arc_dev_mtx); 5131 mutex_destroy(&l2arc_buflist_mtx); 5132 mutex_destroy(&l2arc_free_on_write_mtx); 5133 5134 list_destroy(l2arc_dev_list); 5135 list_destroy(l2arc_free_on_write); 5136} 5137 5138void 5139l2arc_start(void) 5140{ 5141 if (!(spa_mode_global & FWRITE)) 5142 return; 5143 5144 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 5145 TS_RUN, minclsyspri); 5146} 5147 5148void 5149l2arc_stop(void) 5150{ 5151 if (!(spa_mode_global & FWRITE)) 5152 return; 5153 5154 mutex_enter(&l2arc_feed_thr_lock); 5155 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup / 5156* l2arc_thread_exit = 1; 5157 while (l2arc_thread_exit != 0) 5158 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 5159 mutex_exit(&l2arc_feed_thr_lock); 5160}