arc.c revision 286598
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory. This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about. Our cache is not so simple. At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them. Blocks are only evictable
 *    when there are no external references active. This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space. In these circumstances we are unable to adjust the cache
 *    size. To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss. Our model has a variable sized cache. It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 *    elements of the cache are therefore exactly the same size. So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict. In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes). We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes; rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()). Note however that the data associated
 * with the buffer may be evicted prior to the callback. The callback
 * must be made with *no locks held* (to prevent deadlock). Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <vm/vm_pageout.h>
#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
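/*
 * Illustrative sketch (editor's note, not part of the original source):
 * because an arc list lock may be held while a hash lock is wanted, the
 * eviction path must probe hash locks non-blockingly, e.g.:
 *
 *	mutex_enter(list_lock);
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;		// skip this buffer, try the next
 *	}
 *
 * Blocking on hash_lock here could deadlock against a thread that holds
 * the hash lock and is waiting for the list lock.
 */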
/*
 * The number of iterations through arc_evict_*() before we
 * drop & reacquire the lock.
 */
int arc_evict_iterations = 100;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern int zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);

#ifdef _KERNEL
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
    "Maximum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
    "Minimum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}
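/*
 * Illustrative usage (editor's note; the value shown is made up):
 * the handler above rejects targets below 'minfree' or above the page
 * count, so a valid tuning session looks like:
 *
 *	# sysctl vfs.zfs.arc_free_target=56000
 *
 * with the new value taking effect at the next reclaim check.
 */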
/*
 * Must be declared here, before the definition of the corresponding
 * kstat macro: reusing the same names would otherwise confuse the
 * compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states. These are the
 * only buffers that can be evicted or deleted. Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA. These are buffers that hold dirty block copies
 * before they are written to stable storage. By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed. Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists. The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places. The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */
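/*
 * Editor's illustrative sketch (not in the original source) of the
 * usual lifecycle implied above: a dirty buffer starts anonymous,
 * gains a DVA when written, and then moves between the lists:
 *
 *	ARC_anon --(write)--> ARC_mru --(second hit)--> ARC_mfu
 *	ARC_mru --(evict)--> ARC_mru_ghost
 *	ARC_mfu --(evict)--> ARC_mfu_ghost
 *	ghost hit --(re-read)--> ARC_mfu
 */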
#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
struct arcs_lock {
	kmutex_t	arcs_lock;
#ifdef _KERNEL
	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

/*
 * must be power of two for mask use to work
 */
#define	ARC_BUFC_NUMDATALISTS		16
#define	ARC_BUFC_NUMMETADATALISTS	16
#define	ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)

typedef struct arc_state {
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	list_t	arcs_lists[ARC_BUFC_NUMLISTS];	/* list of evictable buffers */
	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
} arc_state_t;

#define	ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
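/*
 * Editor's note (illustrative): keeping the list counts a power of two
 * lets a hash value be reduced to a list index with a mask instead of a
 * modulo, e.g. with 16 lists:
 *
 *	idx = buf_hashid & (ARC_BUFC_NUMMETADATALISTS - 1);	// 0..15
 *
 * get_buf_info() below relies on exactly this property.
 */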
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_stolen;
	kstat_named_t arcstat_recycle_miss;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread. The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA. This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by various buffers and structures
	 * not actually backed with ARC buffers. This includes bonus
	 * buffers (allocated directly via zio_buf_* functions),
	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
	 * cache), and dnode_t structures (allocated via dnode_t cache).
	 */
	kstat_named_t arcstat_other_size;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state. This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state. This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state. The key thing to note
	 * here is that this size doesn't actually indicate RAM
	 * consumption. The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state. This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state. See the comment above
	 * arcstat_mru_ghost_size for more details.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_hdr_miss;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_cdata_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_asize;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_compress_successes;
	kstat_named_t arcstat_l2_compress_zeros;
	kstat_named_t arcstat_l2_compress_failures;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_duplicate_buffers;
	kstat_named_t arcstat_duplicate_buffers_size;
	kstat_named_t arcstat_duplicate_reads;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
} arc_stats_t;
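/*
 * Editor's note (illustrative): each field above surfaces as one named
 * kstat; on FreeBSD these are typically read back as sysctls, e.g.:
 *
 *	# sysctl kstat.zfs.misc.arcstats.mru_ghost_hits
 *
 * using the short names given in the arc_stats initializer below.
 */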
static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "allocated",			KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "stolen",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "metadata_size",		KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "anon_size",			KSTAT_DATA_UINT64 },
	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_size",			KSTAT_DATA_UINT64 },
	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size",			KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_asize",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
	{ "l2_write_full",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
	{ "arc_meta_min",		KSTAT_DATA_UINT64 }
};
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
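/*
 * Illustrative use (editor's note; mirrors how the read path accounts
 * for hits):
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * Exactly one of arcstat_{demand,prefetch}_{data,metadata}_hits is
 * bumped, selected by the two conditions.
 */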
kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define	L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_physdone;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *   Buffer in L1                     Buffer only in L2
 *    +------------------------+      +------------------------+
 *    | arc_buf_hdr_t          |      | arc_buf_hdr_t          |
 *    |                        |      |                        |
 *    |                        |      |                        |
 *    |                        |      |                        |
 *    +------------------------+      +------------------------+
 *    | l2arc_buf_hdr_t        |      | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |      |                        |
 *    +------------------------+      +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_datacnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	/* temporary buffer holder for in-flight compressed data */
	void			*b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	/* real alloc'd buffer size depending on b_compress applied */
	int32_t			b_asize;

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	/*
	 * Even though this checksum is only set/verified when a buffer is in
	 * the L1 cache, it needs to be in the set of common fields because it
	 * must be preserved from the time before a buffer is written out to
	 * L2ARC until after it is read back in.
	 */
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/* immutable */
	int32_t			b_size;
	uint64_t		b_spa;

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
};

#ifdef _KERNEL
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val <= 0 || val > arc_c_max)
		return (EINVAL);

	arc_meta_limit = val;
	return (0);
}
#endif

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)

#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define	HDR_L2_READING(hdr)	\
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)

#define	HDR_ISTYPE_METADATA(hdr)	\
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/* For storing compression mode in b_flags */
#define	HDR_COMPRESS_OFFSET	24
#define	HDR_COMPRESS_NBITS	7
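/*
 * Editor's note (illustrative): the 7-bit compression field lives in
 * b_flags bits 24..30, so storing and fetching a mode is just a masked
 * shift, e.g.:
 *
 *	BF32_SET(hdr->b_flags, HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS,
 *	    ZIO_COMPRESS_LZ4);
 *	cmp = BF32_GET(hdr->b_flags, HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS);
 *
 * which is what HDR_SET_COMPRESS()/HDR_GET_COMPRESS() below wrap.
 */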
#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
#define	HDR_SET_COMPRESS(hdr, cmp)	BF32_SET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))

/*
 * Other sizes
 */

#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];
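/*
 * Editor's note (illustrative): the table typically has far more slots
 * than BUF_LOCKS, so each of the 256 lock stripes guards many hash
 * chains; two unrelated buffers contend on a stripe whenever
 * (idx1 & 255) == (idx2 & 255). This is why arcstat_mutex_miss above is
 * described in terms of hash locks shared by multiple buffers.
 */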
/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
 * address. This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out, which happens in the second stage of
 * l2arc_write_buffers().
 */
#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");
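/*
 * Editor's note (illustrative arithmetic): with the defaults above, a
 * warmed-up feed cycle writes at most l2arc_write_max (8 MB) per
 * l2arc_feed_secs (1 s) interval; while the cache is still cold, the
 * budget is roughly l2arc_write_max + l2arc_write_boost (16 MB).
 */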
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
    &ARC_anon.arcs_size, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in anonymous state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
    &ARC_mru.arcs_size, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mru ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
    &ARC_mfu.arcs_size, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mfu ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");

/*
 * L2ARC Internals
 */
struct l2arc_dev {
	vdev_t		*l2ad_vdev;	/* vdev */
	spa_t		*l2ad_spa;	/* spa */
	uint64_t	l2ad_hand;	/* next write location */
	uint64_t	l2ad_start;	/* first addr on device */
	uint64_t	l2ad_end;	/* last addr on device */
	boolean_t	l2ad_first;	/* first sweep through */
	boolean_t	l2ad_writing;	/* currently writing */
	kmutex_t	l2ad_mtx;	/* lock for buffer list */
	list_t		l2ad_buflist;	/* buffer list */
	list_node_t	l2ad_node;	/* device list node */
	refcount_t	l2ad_alloc;	/* allocated bytes */
};

static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_t		*l2rcb_buf;		/* read buffer */
	spa_t			*l2rcb_spa;		/* spa */
	blkptr_t		l2rcb_bp;		/* original blkptr */
	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
	int			l2rcb_flags;		/* original flags */
	enum zio_compress	l2rcb_compress;		/* applied compress */
} l2arc_read_callback_t;

typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;	/* device info */
	arc_buf_hdr_t	*l2wcb_head;	/* head of write buflist */
} l2arc_write_callback_t;

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	void		*l2df_data;
	size_t		l2df_size;
	void		(*l2df_func)(void *, size_t);
	list_node_t	l2df_list_node;
} l2arc_data_free_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static int arc_evict_needed(arc_buf_contents_t);
static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
static void arc_buf_watch(arc_buf_t *);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);

static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);

static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spa>>8) ^ birth;

	return (crc);
}

#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)				\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
	hdr->b_dva.dva_word[0] = 0;
	hdr->b_dva.dva_word[1] = 0;
	hdr->b_birth = 0;
}

static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *hdr;

	mutex_enter(hash_lock);
	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
	    hdr = hdr->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, hdr)) {
			*lockp = hash_lock;
			return (hdr);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table. If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 * If lockp == NULL, the caller is assumed to already hold the hash lock.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fhdr;
	uint32_t i;

	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
	ASSERT(hdr->b_birth != 0);
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (lockp != NULL) {
		*lockp = hash_lock;
		mutex_enter(hash_lock);
	} else {
		ASSERT(MUTEX_HELD(hash_lock));
	}

	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
	    fhdr = fhdr->b_hash_next, i++) {
		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
			return (fhdr);
	}

	hdr->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = hdr;
	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *hdr)
{
	arc_buf_hdr_t *fhdr, **hdrp;
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(hdr));

	hdrp = &buf_hash_table.ht_table[idx];
	while ((fhdr = *hdrp) != hdr) {
		ASSERT(fhdr != NULL);
		hdrp = &fhdr->b_hash_next;
	}
	*hdrp = hdr->b_hash_next;
	hdr->b_hash_next = NULL;
	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}
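/*
 * Editor's sketch of the usual find-or-insert pattern built on the two
 * helpers above (illustrative only):
 *
 *	hdr = buf_hash_find(spa, bp, &hash_lock);	// lock held on hit
 *	if (hdr == NULL) {
 *		... allocate a new header, fill in its identity ...
 *		fhdr = buf_hash_insert(nhdr, &hash_lock); // takes the lock
 *		if (fhdr != NULL)
 *			... lost an insert race; use fhdr instead ...
 *	}
 *	...
 *	mutex_exit(hash_lock);
 */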
/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_full_cache);
	kmem_cache_destroy(hdr_l2only_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_FULL_SIZE);
	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
	refcount_create(&hdr->b_l1hdr.b_refcnt);
	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_L2ONLY_SIZE);
	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);

	return (0);
}

/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_full_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	cv_destroy(&hdr->b_l1hdr.b_cv);
	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}

/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
	 */
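	/*
	 * Editor's worked example (illustrative): on a machine with 4 GB
	 * of physical memory and the default 8 KB average block size, the
	 * loop below doubles hsize until hsize * 8192 >= 2^32, giving
	 * hsize = 2^19 slots, so the table costs 2^19 * 8 bytes = 4 MB.
	 */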
	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
	    NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}
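/*
 * Editor's note (illustrative): the nested loop above builds the
 * right-shifting CRC-64 table used by buf_hash(). Tracing entry 128
 * (binary 10000000) through the eight shift/XOR steps yields exactly
 * ZFS_CRC64_POLY, which is what buf_hash() ASSERTs as a sanity check
 * that the table has been initialized.
 */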
/*
 * Transition between the two allocation states for the arc_buf_hdr struct.
 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 * version is used when a cache buffer is only in the L2ARC in order to reduce
 * memory usage.
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;
	} else {
		ASSERT(hdr->b_l1hdr.b_buf == NULL);
		ASSERT0(hdr->b_l1hdr.b_datacnt);
		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
		/*
		 * We might be removing the L1hdr of a buffer which was just
		 * written out to L2ARC. If such a buffer is compressed then we
		 * need to free its b_tmp_cdata before destroying the header.
		 */
		if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
			l2arc_release_cdata_buf(hdr);
		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) refcount_remove_many(&dev->l2ad_alloc,
	    hdr->b_l2hdr.b_asize, hdr);

	(void) refcount_add_many(&dev->l2ad_alloc,
	    nhdr->b_l2hdr.b_asize, nhdr);

	buf_discard_identity(hdr);
	hdr->b_freeze_cksum = NULL;
	kmem_cache_free(old, hdr);

	return (nhdr);
}


#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
}

static int
arc_cksum_equal(arc_buf_t *buf)
{
	zio_cksum_t zc;
	int equal;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);

	return (equal);
}

static void
arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
	arc_buf_watch(buf);
#endif
}

#ifdef illumos
#ifndef _KERNEL
typedef struct procctl {
	long cmd;
	prwatch_t prwatch;
} procctl_t;
#endif

/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int result;
		procctl_t ctl;
		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = 0;
		ctl.prwatch.pr_wflags = 0;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	}
#endif
}

/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int result;
		procctl_t ctl;
		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = buf->b_hdr->b_size;
		ctl.prwatch.pr_wflags = WA_WRITE;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	}
#endif
}
#endif /* illumos */
{ 1605 return (ARC_BUFC_METADATA); 1606 } else { 1607 return (ARC_BUFC_DATA); 1608 } 1609} 1610 1611static uint32_t 1612arc_bufc_to_flags(arc_buf_contents_t type) 1613{ 1614 switch (type) { 1615 case ARC_BUFC_DATA: 1616 /* metadata field is 0 if buffer contains normal data */ 1617 return (0); 1618 case ARC_BUFC_METADATA: 1619 return (ARC_FLAG_BUFC_METADATA); 1620 default: 1621 break; 1622 } 1623 panic("undefined ARC buffer type!"); 1624 return ((uint32_t)-1); 1625} 1626 1627void 1628arc_buf_thaw(arc_buf_t *buf) 1629{ 1630 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1631 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1632 panic("modifying non-anon buffer!"); 1633 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1634 panic("modifying buffer while i/o in progress!"); 1635 arc_cksum_verify(buf); 1636 } 1637 1638 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1639 if (buf->b_hdr->b_freeze_cksum != NULL) { 1640 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1641 buf->b_hdr->b_freeze_cksum = NULL; 1642 } 1643 1644#ifdef ZFS_DEBUG 1645 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1646 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1647 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1648 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1649 } 1650#endif 1651 1652 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1653 1654#ifdef illumos 1655 arc_buf_unwatch(buf); 1656#endif 1657} 1658 1659void 1660arc_buf_freeze(arc_buf_t *buf) 1661{ 1662 kmutex_t *hash_lock; 1663 1664 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1665 return; 1666 1667 hash_lock = HDR_LOCK(buf->b_hdr); 1668 mutex_enter(hash_lock); 1669 1670 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1671 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1672 arc_cksum_compute(buf, B_FALSE); 1673 mutex_exit(hash_lock); 1674 1675} 1676 1677static void 1678get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock) 1679{ 1680 uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1681 1682 if (arc_buf_type(hdr) == ARC_BUFC_METADATA) 1683 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1684 else { 1685 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1686 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1687 } 1688 1689 *list = &state->arcs_lists[buf_hashid]; 1690 *lock = ARCS_LOCK(state, buf_hashid); 1691} 1692 1693 1694static void 1695add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1696{ 1697 ASSERT(HDR_HAS_L1HDR(hdr)); 1698 ASSERT(MUTEX_HELD(hash_lock)); 1699 arc_state_t *state = hdr->b_l1hdr.b_state; 1700 1701 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1702 (state != arc_anon)) { 1703 /* We don't use the L2-only state list. 
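 * Headers in that state have no data charged to the state lists,
 * so the list removal and arcs_lsize accounting below are skipped
 * for them.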
*/ 1704 if (state != arc_l2c_only) { 1705 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1706 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1707 list_t *list; 1708 kmutex_t *lock; 1709 1710 get_buf_info(hdr, state, &list, &lock); 1711 ASSERT(!MUTEX_HELD(lock)); 1712 mutex_enter(lock); 1713 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1714 list_remove(list, hdr); 1715 if (GHOST_STATE(state)) { 1716 ASSERT0(hdr->b_l1hdr.b_datacnt); 1717 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1718 delta = hdr->b_size; 1719 } 1720 ASSERT(delta > 0); 1721 ASSERT3U(*size, >=, delta); 1722 atomic_add_64(size, -delta); 1723 mutex_exit(lock); 1724 } 1725 /* remove the prefetch flag if we get a reference */ 1726 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1727 } 1728} 1729 1730static int 1731remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1732{ 1733 int cnt; 1734 arc_state_t *state = hdr->b_l1hdr.b_state; 1735 1736 ASSERT(HDR_HAS_L1HDR(hdr)); 1737 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1738 ASSERT(!GHOST_STATE(state)); 1739 1740 /* 1741 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1742 * check to prevent usage of the arc_l2c_only list. 1743 */ 1744 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1745 (state != arc_anon)) { 1746 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1747 list_t *list; 1748 kmutex_t *lock; 1749 1750 get_buf_info(hdr, state, &list, &lock); 1751 ASSERT(!MUTEX_HELD(lock)); 1752 mutex_enter(lock); 1753 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1754 list_insert_head(list, hdr); 1755 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1756 atomic_add_64(size, hdr->b_size * 1757 hdr->b_l1hdr.b_datacnt); 1758 mutex_exit(lock); 1759 } 1760 return (cnt); 1761} 1762 1763/* 1764 * Move the supplied buffer to the indicated state. The mutex 1765 * for the buffer must be held by the caller. 1766 */ 1767static void 1768arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1769 kmutex_t *hash_lock) 1770{ 1771 arc_state_t *old_state; 1772 int64_t refcnt; 1773 uint32_t datacnt; 1774 uint64_t from_delta, to_delta; 1775 arc_buf_contents_t buftype = arc_buf_type(hdr); 1776 list_t *list; 1777 kmutex_t *lock; 1778 1779 /* 1780 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1781 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1782 * L1 hdr doesn't always exist when we change state to arc_anon before 1783 * destroying a header, in which case reallocating to add the L1 hdr is 1784 * pointless. 1785 */ 1786 if (HDR_HAS_L1HDR(hdr)) { 1787 old_state = hdr->b_l1hdr.b_state; 1788 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1789 datacnt = hdr->b_l1hdr.b_datacnt; 1790 } else { 1791 old_state = arc_l2c_only; 1792 refcnt = 0; 1793 datacnt = 0; 1794 } 1795 1796 ASSERT(MUTEX_HELD(hash_lock)); 1797 ASSERT3P(new_state, !=, old_state); 1798 ASSERT(refcnt == 0 || datacnt > 0); 1799 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1800 ASSERT(old_state != arc_anon || datacnt <= 1); 1801 1802 from_delta = to_delta = datacnt * hdr->b_size; 1803 1804 /* 1805 * If this buffer is evictable, transfer it from the 1806 * old state list to the new state list. 
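 * A buffer counts as evictable when it has no active references
 * (refcnt == 0). The anonymous and l2c-only states keep no lists,
 * so both legs below skip them.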
1807 */ 1808 if (refcnt == 0) { 1809 if (old_state != arc_anon && old_state != arc_l2c_only) { 1810 int use_mutex; 1811 uint64_t *size = &old_state->arcs_lsize[buftype]; 1812 1813 get_buf_info(hdr, old_state, &list, &lock); 1814 use_mutex = !MUTEX_HELD(lock); 1815 if (use_mutex) 1816 mutex_enter(lock); 1817 1818 ASSERT(HDR_HAS_L1HDR(hdr)); 1819 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1820 list_remove(list, hdr); 1821 1822 /* 1823 * If prefetching out of the ghost cache, 1824 * we will have a non-zero datacnt. 1825 */ 1826 if (GHOST_STATE(old_state) && datacnt == 0) { 1827 /* ghost elements have a ghost size */ 1828 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1829 from_delta = hdr->b_size; 1830 } 1831 ASSERT3U(*size, >=, from_delta); 1832 atomic_add_64(size, -from_delta); 1833 1834 if (use_mutex) 1835 mutex_exit(lock); 1836 } 1837 if (new_state != arc_anon && new_state != arc_l2c_only) { 1838 int use_mutex; 1839 uint64_t *size = &new_state->arcs_lsize[buftype]; 1840 1841 /* 1842 * An L1 header always exists here, since if we're 1843 * moving to some L1-cached state (i.e. not l2c_only or 1844 * anonymous), we realloc the header to add an L1hdr 1845 * beforehand. 1846 */ 1847 ASSERT(HDR_HAS_L1HDR(hdr)); 1848 get_buf_info(hdr, new_state, &list, &lock); 1849 use_mutex = !MUTEX_HELD(lock); 1850 if (use_mutex) 1851 mutex_enter(lock); 1852 1853 list_insert_head(list, hdr); 1854 1855 /* ghost elements have a ghost size */ 1856 if (GHOST_STATE(new_state)) { 1857 ASSERT(datacnt == 0); 1858 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1859 to_delta = hdr->b_size; 1860 } 1861 atomic_add_64(size, to_delta); 1862 1863 if (use_mutex) 1864 mutex_exit(lock); 1865 } 1866 } 1867 1868 ASSERT(!BUF_EMPTY(hdr)); 1869 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1870 buf_hash_remove(hdr); 1871 1872 /* adjust state sizes (ignore arc_l2c_only) */ 1873 if (to_delta && new_state != arc_l2c_only) 1874 atomic_add_64(&new_state->arcs_size, to_delta); 1875 if (from_delta && old_state != arc_l2c_only) { 1876 ASSERT3U(old_state->arcs_size, >=, from_delta); 1877 atomic_add_64(&old_state->arcs_size, -from_delta); 1878 } 1879 if (HDR_HAS_L1HDR(hdr)) 1880 hdr->b_l1hdr.b_state = new_state; 1881 1882 /* 1883 * L2 headers should never be on the L2 state list since they don't 1884 * have L1 headers allocated. 
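 * The (illumos-only) assertion below verifies that both the data
 * and metadata lists of arc_l2c_only remain empty.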
1885 */ 1886#ifdef illumos 1887 ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1888 list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1889#endif 1890} 1891 1892void 1893arc_space_consume(uint64_t space, arc_space_type_t type) 1894{ 1895 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1896 1897 switch (type) { 1898 case ARC_SPACE_DATA: 1899 ARCSTAT_INCR(arcstat_data_size, space); 1900 break; 1901 case ARC_SPACE_META: 1902 ARCSTAT_INCR(arcstat_metadata_size, space); 1903 break; 1904 case ARC_SPACE_OTHER: 1905 ARCSTAT_INCR(arcstat_other_size, space); 1906 break; 1907 case ARC_SPACE_HDRS: 1908 ARCSTAT_INCR(arcstat_hdr_size, space); 1909 break; 1910 case ARC_SPACE_L2HDRS: 1911 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1912 break; 1913 } 1914 1915 if (type != ARC_SPACE_DATA) 1916 ARCSTAT_INCR(arcstat_meta_used, space); 1917 1918 atomic_add_64(&arc_size, space); 1919} 1920 1921void 1922arc_space_return(uint64_t space, arc_space_type_t type) 1923{ 1924 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1925 1926 switch (type) { 1927 case ARC_SPACE_DATA: 1928 ARCSTAT_INCR(arcstat_data_size, -space); 1929 break; 1930 case ARC_SPACE_META: 1931 ARCSTAT_INCR(arcstat_metadata_size, -space); 1932 break; 1933 case ARC_SPACE_OTHER: 1934 ARCSTAT_INCR(arcstat_other_size, -space); 1935 break; 1936 case ARC_SPACE_HDRS: 1937 ARCSTAT_INCR(arcstat_hdr_size, -space); 1938 break; 1939 case ARC_SPACE_L2HDRS: 1940 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1941 break; 1942 } 1943 1944 if (type != ARC_SPACE_DATA) { 1945 ASSERT(arc_meta_used >= space); 1946 if (arc_meta_max < arc_meta_used) 1947 arc_meta_max = arc_meta_used; 1948 ARCSTAT_INCR(arcstat_meta_used, -space); 1949 } 1950 1951 ASSERT(arc_size >= space); 1952 atomic_add_64(&arc_size, -space); 1953} 1954 1955arc_buf_t * 1956arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1957{ 1958 arc_buf_hdr_t *hdr; 1959 arc_buf_t *buf; 1960 1961 ASSERT3U(size, >, 0); 1962 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1963 ASSERT(BUF_EMPTY(hdr)); 1964 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1965 hdr->b_size = size; 1966 hdr->b_spa = spa_load_guid(spa); 1967 1968 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1969 buf->b_hdr = hdr; 1970 buf->b_data = NULL; 1971 buf->b_efunc = NULL; 1972 buf->b_private = NULL; 1973 buf->b_next = NULL; 1974 1975 hdr->b_flags = arc_bufc_to_flags(type); 1976 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1977 1978 hdr->b_l1hdr.b_buf = buf; 1979 hdr->b_l1hdr.b_state = arc_anon; 1980 hdr->b_l1hdr.b_arc_access = 0; 1981 hdr->b_l1hdr.b_datacnt = 1; 1982 1983 arc_get_data_buf(buf); 1984 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1985 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1986 1987 return (buf); 1988} 1989 1990static char *arc_onloan_tag = "onloan"; 1991 1992/* 1993 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1994 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1995 * buffers must be returned to the arc before they can be used by the DMU or 1996 * freed. 1997 */ 1998arc_buf_t * 1999arc_loan_buf(spa_t *spa, int size) 2000{ 2001 arc_buf_t *buf; 2002 2003 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 2004 2005 atomic_add_64(&arc_loaned_bytes, size); 2006 return (buf); 2007} 2008 2009/* 2010 * Return a loaned arc buffer to the arc. 
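 *
 * A minimal usage sketch (illustrative only; "src" is a hypothetical
 * source buffer and FTAG is the caller's tag, per convention).
 * arc_return_buf() swaps the arc_onloan_tag reference for the
 * caller's own:
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, size);
 *	bcopy(src, abuf->b_data, size);
 *	arc_return_buf(abuf, FTAG);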
2011 */ 2012void 2013arc_return_buf(arc_buf_t *buf, void *tag) 2014{ 2015 arc_buf_hdr_t *hdr = buf->b_hdr; 2016 2017 ASSERT(buf->b_data != NULL); 2018 ASSERT(HDR_HAS_L1HDR(hdr)); 2019 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2020 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2021 2022 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 2023} 2024 2025/* Detach an arc_buf from a dbuf (tag) */ 2026void 2027arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2028{ 2029 arc_buf_hdr_t *hdr = buf->b_hdr; 2030 2031 ASSERT(buf->b_data != NULL); 2032 ASSERT(HDR_HAS_L1HDR(hdr)); 2033 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2034 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2035 buf->b_efunc = NULL; 2036 buf->b_private = NULL; 2037 2038 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2039} 2040 2041static arc_buf_t * 2042arc_buf_clone(arc_buf_t *from) 2043{ 2044 arc_buf_t *buf; 2045 arc_buf_hdr_t *hdr = from->b_hdr; 2046 uint64_t size = hdr->b_size; 2047 2048 ASSERT(HDR_HAS_L1HDR(hdr)); 2049 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2050 2051 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2052 buf->b_hdr = hdr; 2053 buf->b_data = NULL; 2054 buf->b_efunc = NULL; 2055 buf->b_private = NULL; 2056 buf->b_next = hdr->b_l1hdr.b_buf; 2057 hdr->b_l1hdr.b_buf = buf; 2058 arc_get_data_buf(buf); 2059 bcopy(from->b_data, buf->b_data, size); 2060 2061 /* 2062 * This buffer already exists in the arc so create a duplicate 2063 * copy for the caller. If the buffer is associated with user data 2064 * then track the size and number of duplicates. These stats will be 2065 * updated as duplicate buffers are created and destroyed. 2066 */ 2067 if (HDR_ISTYPE_DATA(hdr)) { 2068 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2069 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2070 } 2071 hdr->b_l1hdr.b_datacnt += 1; 2072 return (buf); 2073} 2074 2075void 2076arc_buf_add_ref(arc_buf_t *buf, void* tag) 2077{ 2078 arc_buf_hdr_t *hdr; 2079 kmutex_t *hash_lock; 2080 2081 /* 2082 * Check to see if this buffer is evicted. Callers 2083 * must verify b_data != NULL to know if the add_ref 2084 * was successful. 2085 */ 2086 mutex_enter(&buf->b_evict_lock); 2087 if (buf->b_data == NULL) { 2088 mutex_exit(&buf->b_evict_lock); 2089 return; 2090 } 2091 hash_lock = HDR_LOCK(buf->b_hdr); 2092 mutex_enter(hash_lock); 2093 hdr = buf->b_hdr; 2094 ASSERT(HDR_HAS_L1HDR(hdr)); 2095 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2096 mutex_exit(&buf->b_evict_lock); 2097 2098 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2099 hdr->b_l1hdr.b_state == arc_mfu); 2100 2101 add_reference(hdr, hash_lock, tag); 2102 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2103 arc_access(hdr, hash_lock); 2104 mutex_exit(hash_lock); 2105 ARCSTAT_BUMP(arcstat_hits); 2106 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2107 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2108 data, metadata, hits); 2109} 2110 2111static void 2112arc_buf_free_on_write(void *data, size_t size, 2113 void (*free_func)(void *, size_t)) 2114{ 2115 l2arc_data_free_t *df; 2116 2117 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 2118 df->l2df_data = data; 2119 df->l2df_size = size; 2120 df->l2df_func = free_func; 2121 mutex_enter(&l2arc_free_on_write_mtx); 2122 list_insert_head(l2arc_free_on_write, df); 2123 mutex_exit(&l2arc_free_on_write_mtx); 2124} 2125 2126/* 2127 * Free the arc data buffer. If it is an l2arc write in progress, 2128 * the buffer is placed on l2arc_free_on_write to be freed later. 
2129 */ 2130static void 2131arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2132{ 2133 arc_buf_hdr_t *hdr = buf->b_hdr; 2134 2135 if (HDR_L2_WRITING(hdr)) { 2136 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2137 ARCSTAT_BUMP(arcstat_l2_free_on_write); 2138 } else { 2139 free_func(buf->b_data, hdr->b_size); 2140 } 2141} 2142 2143/* 2144 * Free up buf->b_data and if 'remove' is set, then pull the 2145 * arc_buf_t off of the arc_buf_hdr_t's list and free it. 2146 */ 2147static void 2148arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2149{ 2150 ASSERT(HDR_HAS_L2HDR(hdr)); 2151 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2152 2153 /* 2154 * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2155 * that doesn't exist, the header is in the arc_l2c_only state, 2156 * and there isn't anything to free (it's already been freed). 2157 */ 2158 if (!HDR_HAS_L1HDR(hdr)) 2159 return; 2160 2161 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2162 return; 2163 2164 ASSERT(HDR_L2_WRITING(hdr)); 2165 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, 2166 zio_data_buf_free); 2167 2168 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2169 hdr->b_l1hdr.b_tmp_cdata = NULL; 2170} 2171 2172static void 2173arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 2174{ 2175 arc_buf_t **bufp; 2176 2177 /* free up data associated with the buf */ 2178 if (buf->b_data != NULL) { 2179 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2180 uint64_t size = buf->b_hdr->b_size; 2181 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2182 2183 arc_cksum_verify(buf); 2184#ifdef illumos 2185 arc_buf_unwatch(buf); 2186#endif 2187 2188 if (!recycle) { 2189 if (type == ARC_BUFC_METADATA) { 2190 arc_buf_data_free(buf, zio_buf_free); 2191 arc_space_return(size, ARC_SPACE_META); 2192 } else { 2193 ASSERT(type == ARC_BUFC_DATA); 2194 arc_buf_data_free(buf, zio_data_buf_free); 2195 arc_space_return(size, ARC_SPACE_DATA); 2196 } 2197 } 2198 if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2199 uint64_t *cnt = &state->arcs_lsize[type]; 2200 2201 ASSERT(refcount_is_zero( 2202 &buf->b_hdr->b_l1hdr.b_refcnt)); 2203 ASSERT(state != arc_anon && state != arc_l2c_only); 2204 2205 ASSERT3U(*cnt, >=, size); 2206 atomic_add_64(cnt, -size); 2207 } 2208 ASSERT3U(state->arcs_size, >=, size); 2209 atomic_add_64(&state->arcs_size, -size); 2210 buf->b_data = NULL; 2211 2212 /* 2213 * If we're destroying a duplicate buffer make sure 2214 * that the appropriate statistics are updated.
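 * (These are the arcstat_duplicate_buffers counters that
 * arc_buf_clone() bumped when the duplicate was created.)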
2215 */ 2216 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2217 HDR_ISTYPE_DATA(buf->b_hdr)) { 2218 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2219 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2220 } 2221 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2222 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2223 } 2224 2225 /* only remove the buf if requested */ 2226 if (!remove) 2227 return; 2228 2229 /* remove the buf from the hdr list */ 2230 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2231 bufp = &(*bufp)->b_next) 2232 continue; 2233 *bufp = buf->b_next; 2234 buf->b_next = NULL; 2235 2236 ASSERT(buf->b_efunc == NULL); 2237 2238 /* clean up the buf */ 2239 buf->b_hdr = NULL; 2240 kmem_cache_free(buf_cache, buf); 2241} 2242 2243static void 2244arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 2245{ 2246 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2247 l2arc_dev_t *dev = l2hdr->b_dev; 2248 2249 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 2250 ASSERT(HDR_HAS_L2HDR(hdr)); 2251 2252 list_remove(&dev->l2ad_buflist, hdr); 2253 2254 /* 2255 * We don't want to leak the b_tmp_cdata buffer that was 2256 * allocated in l2arc_write_buffers() 2257 */ 2258 arc_buf_l2_cdata_free(hdr); 2259 2260 /* 2261 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then 2262 * this header is being processed by l2arc_write_buffers() (i.e. 2263 * it's in the first stage of l2arc_write_buffers()). 2264 * Re-affirming that truth here, just to serve as a reminder. If 2265 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or 2266 * may not have its HDR_L2_WRITING flag set. (the write may have 2267 * completed, in which case HDR_L2_WRITING will be false and the 2268 * b_daddr field will point to the address of the buffer on disk). 2269 */ 2270 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); 2271 2272 /* 2273 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with 2274 * l2arc_write_buffers(). Since we've just removed this header 2275 * from the l2arc buffer list, this header will never reach the 2276 * second stage of l2arc_write_buffers(), which increments the 2277 * accounting stats for this header. Thus, we must be careful 2278 * not to decrement them for this header either. 2279 */ 2280 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { 2281 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2282 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2283 2284 vdev_space_update(dev->l2ad_vdev, 2285 -l2hdr->b_asize, 0, 0); 2286 2287 (void) refcount_remove_many(&dev->l2ad_alloc, 2288 l2hdr->b_asize, hdr); 2289 } 2290 2291 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2292} 2293 2294static void 2295arc_hdr_destroy(arc_buf_hdr_t *hdr) 2296{ 2297 if (HDR_HAS_L1HDR(hdr)) { 2298 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2299 hdr->b_l1hdr.b_datacnt > 0); 2300 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2301 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2302 } 2303 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2304 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2305 2306 if (HDR_HAS_L2HDR(hdr)) { 2307 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 2308 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 2309 2310 if (!buflist_held) 2311 mutex_enter(&dev->l2ad_mtx); 2312 2313 /* 2314 * Even though we checked this conditional above, we 2315 * need to check this again now that we have the 2316 * l2ad_mtx. This is because we could be racing with 2317 * another thread calling l2arc_evict() which might have 2318 * destroyed this header's L2 portion as we were waiting 2319 * to acquire the l2ad_mtx. If that happens, we don't 2320 * want to re-destroy the header's L2 portion. 
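 * Hence the second HDR_HAS_L2HDR() check below.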
2321 */ 2322 if (HDR_HAS_L2HDR(hdr)) { 2323 trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 2324 hdr->b_l2hdr.b_asize, 0); 2325 arc_hdr_l2hdr_destroy(hdr); 2326 } 2327 2328 if (!buflist_held) 2329 mutex_exit(&dev->l2ad_mtx); 2330 } 2331 2332 if (!BUF_EMPTY(hdr)) 2333 buf_discard_identity(hdr); 2334 if (hdr->b_freeze_cksum != NULL) { 2335 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2336 hdr->b_freeze_cksum = NULL; 2337 } 2338 2339 if (HDR_HAS_L1HDR(hdr)) { 2340 while (hdr->b_l1hdr.b_buf) { 2341 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2342 2343 if (buf->b_efunc != NULL) { 2344 mutex_enter(&arc_eviction_mtx); 2345 mutex_enter(&buf->b_evict_lock); 2346 ASSERT(buf->b_hdr != NULL); 2347 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2348 FALSE); 2349 hdr->b_l1hdr.b_buf = buf->b_next; 2350 buf->b_hdr = &arc_eviction_hdr; 2351 buf->b_next = arc_eviction_list; 2352 arc_eviction_list = buf; 2353 mutex_exit(&buf->b_evict_lock); 2354 mutex_exit(&arc_eviction_mtx); 2355 } else { 2356 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2357 TRUE); 2358 } 2359 } 2360#ifdef ZFS_DEBUG 2361 if (hdr->b_l1hdr.b_thawed != NULL) { 2362 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2363 hdr->b_l1hdr.b_thawed = NULL; 2364 } 2365#endif 2366 } 2367 2368 ASSERT3P(hdr->b_hash_next, ==, NULL); 2369 if (HDR_HAS_L1HDR(hdr)) { 2370 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 2371 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2372 kmem_cache_free(hdr_full_cache, hdr); 2373 } else { 2374 kmem_cache_free(hdr_l2only_cache, hdr); 2375 } 2376} 2377 2378void 2379arc_buf_free(arc_buf_t *buf, void *tag) 2380{ 2381 arc_buf_hdr_t *hdr = buf->b_hdr; 2382 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2383 2384 ASSERT(buf->b_efunc == NULL); 2385 ASSERT(buf->b_data != NULL); 2386 2387 if (hashed) { 2388 kmutex_t *hash_lock = HDR_LOCK(hdr); 2389 2390 mutex_enter(hash_lock); 2391 hdr = buf->b_hdr; 2392 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2393 2394 (void) remove_reference(hdr, hash_lock, tag); 2395 if (hdr->b_l1hdr.b_datacnt > 1) { 2396 arc_buf_destroy(buf, FALSE, TRUE); 2397 } else { 2398 ASSERT(buf == hdr->b_l1hdr.b_buf); 2399 ASSERT(buf->b_efunc == NULL); 2400 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2401 } 2402 mutex_exit(hash_lock); 2403 } else if (HDR_IO_IN_PROGRESS(hdr)) { 2404 int destroy_hdr; 2405 /* 2406 * We are in the middle of an async write. Don't destroy 2407 * this buffer unless the write completes before we finish 2408 * decrementing the reference count. 
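 * destroy_hdr records, while arc_eviction_mtx is held, whether
 * the write had already completed by the time our reference was
 * dropped.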
2409 */ 2410 mutex_enter(&arc_eviction_mtx); 2411 (void) remove_reference(hdr, NULL, tag); 2412 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2413 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2414 mutex_exit(&arc_eviction_mtx); 2415 if (destroy_hdr) 2416 arc_hdr_destroy(hdr); 2417 } else { 2418 if (remove_reference(hdr, NULL, tag) > 0) 2419 arc_buf_destroy(buf, FALSE, TRUE); 2420 else 2421 arc_hdr_destroy(hdr); 2422 } 2423} 2424 2425boolean_t 2426arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2427{ 2428 arc_buf_hdr_t *hdr = buf->b_hdr; 2429 kmutex_t *hash_lock = HDR_LOCK(hdr); 2430 boolean_t no_callback = (buf->b_efunc == NULL); 2431 2432 if (hdr->b_l1hdr.b_state == arc_anon) { 2433 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2434 arc_buf_free(buf, tag); 2435 return (no_callback); 2436 } 2437 2438 mutex_enter(hash_lock); 2439 hdr = buf->b_hdr; 2440 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2441 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2442 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2443 ASSERT(buf->b_data != NULL); 2444 2445 (void) remove_reference(hdr, hash_lock, tag); 2446 if (hdr->b_l1hdr.b_datacnt > 1) { 2447 if (no_callback) 2448 arc_buf_destroy(buf, FALSE, TRUE); 2449 } else if (no_callback) { 2450 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2451 ASSERT(buf->b_efunc == NULL); 2452 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2453 } 2454 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2455 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2456 mutex_exit(hash_lock); 2457 return (no_callback); 2458} 2459 2460int32_t 2461arc_buf_size(arc_buf_t *buf) 2462{ 2463 return (buf->b_hdr->b_size); 2464} 2465 2466/* 2467 * Called from the DMU to determine if the current buffer should be 2468 * evicted. In order to ensure proper locking, the eviction must be initiated 2469 * from the DMU. Return true if the buffer is associated with user data and 2470 * duplicate buffers still exist. 2471 */ 2472boolean_t 2473arc_buf_eviction_needed(arc_buf_t *buf) 2474{ 2475 arc_buf_hdr_t *hdr; 2476 boolean_t evict_needed = B_FALSE; 2477 2478 if (zfs_disable_dup_eviction) 2479 return (B_FALSE); 2480 2481 mutex_enter(&buf->b_evict_lock); 2482 hdr = buf->b_hdr; 2483 if (hdr == NULL) { 2484 /* 2485 * We are in arc_do_user_evicts(); let that function 2486 * perform the eviction. 2487 */ 2488 ASSERT(buf->b_data == NULL); 2489 mutex_exit(&buf->b_evict_lock); 2490 return (B_FALSE); 2491 } else if (buf->b_data == NULL) { 2492 /* 2493 * We have already been added to the arc eviction list; 2494 * recommend eviction. 2495 */ 2496 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2497 mutex_exit(&buf->b_evict_lock); 2498 return (B_TRUE); 2499 } 2500 2501 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2502 evict_needed = B_TRUE; 2503 2504 mutex_exit(&buf->b_evict_lock); 2505 return (evict_needed); 2506} 2507 2508/* 2509 * Evict buffers from list until we've removed the specified number of 2510 * bytes. Move the removed buffers to the appropriate evict state. 2511 * If the recycle flag is set, then attempt to "recycle" a buffer: 2512 * - look for a buffer to evict that is `bytes' long. 2513 * - return the data block from this buffer rather than freeing it. 2514 * This flag is used by callers that are trying to make space for a 2515 * new buffer in a full arc cache. 2516 * 2517 * This function makes a "best effort". It skips over any buffers 2518 * it can't get a hash_lock on, and so may not catch all candidates. 2519 * It may also return without evicting as much space as requested. 
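 *
 * Returns the recycled data block when one was stolen, otherwise
 * NULL.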
2520 */ 2521static void * 2522arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 2523 arc_buf_contents_t type) 2524{ 2525 arc_state_t *evicted_state; 2526 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 2527 int64_t bytes_remaining; 2528 arc_buf_hdr_t *hdr, *hdr_prev = NULL; 2529 list_t *evicted_list, *list, *evicted_list_start, *list_start; 2530 kmutex_t *lock, *evicted_lock; 2531 kmutex_t *hash_lock; 2532 boolean_t have_lock; 2533 void *stolen = NULL; 2534 arc_buf_hdr_t marker = { 0 }; 2535 int count = 0; 2536 static int evict_metadata_offset, evict_data_offset; 2537 int i, idx, offset, list_count, lists; 2538 2539 ASSERT(state == arc_mru || state == arc_mfu); 2540 2541 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2542 2543 /* 2544 * Decide which "type" (data vs metadata) to recycle from. 2545 * 2546 * If we are over the metadata limit, recycle from metadata. 2547 * If we are under the metadata minimum, recycle from data. 2548 * Otherwise, recycle from whichever type has the oldest (least 2549 * recently accessed) header. This is not yet implemented. 2550 */ 2551 if (recycle) { 2552 arc_buf_contents_t realtype; 2553 if (state->arcs_lsize[ARC_BUFC_DATA] == 0) { 2554 realtype = ARC_BUFC_METADATA; 2555 } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) { 2556 realtype = ARC_BUFC_DATA; 2557 } else if (arc_meta_used >= arc_meta_limit) { 2558 realtype = ARC_BUFC_METADATA; 2559 } else if (arc_meta_used <= arc_meta_min) { 2560 realtype = ARC_BUFC_DATA; 2561#ifdef illumos 2562 } else if (HDR_HAS_L1HDR(data_hdr) && 2563 HDR_HAS_L1HDR(metadata_hdr) && 2564 data_hdr->b_l1hdr.b_arc_access < 2565 metadata_hdr->b_l1hdr.b_arc_access) { 2566 realtype = ARC_BUFC_DATA; 2567 } else { 2568 realtype = ARC_BUFC_METADATA; 2569#else 2570 } else { 2571 /* TODO */ 2572 realtype = type; 2573#endif 2574 } 2575 if (realtype != type) { 2576 /* 2577 * If we want to evict from a different list, 2578 * we cannot recycle, because DATA vs METADATA 2579 * buffers are segregated into different kmem 2580 * caches (and vmem arenas). 2581 */ 2582 type = realtype; 2583 recycle = B_FALSE; 2584 } 2585 } 2586 2587 if (type == ARC_BUFC_METADATA) { 2588 offset = 0; 2589 list_count = ARC_BUFC_NUMMETADATALISTS; 2590 list_start = &state->arcs_lists[0]; 2591 evicted_list_start = &evicted_state->arcs_lists[0]; 2592 idx = evict_metadata_offset; 2593 } else { 2594 offset = ARC_BUFC_NUMMETADATALISTS; 2595 list_start = &state->arcs_lists[offset]; 2596 evicted_list_start = &evicted_state->arcs_lists[offset]; 2597 list_count = ARC_BUFC_NUMDATALISTS; 2598 idx = evict_data_offset; 2599 } 2600 bytes_remaining = evicted_state->arcs_lsize[type]; 2601 lists = 0; 2602 2603evict_start: 2604 list = &list_start[idx]; 2605 evicted_list = &evicted_list_start[idx]; 2606 lock = ARCS_LOCK(state, (offset + idx)); 2607 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 2608 2609 /* 2610 * The ghost list lock must be acquired first in order to prevent 2611 * a three-party deadlock: 2612 * 2613 * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by 2614 * l2ad_mtx in arc_hdr_realloc 2615 * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx 2616 * - arc_evict acquires arc_*->arcs_mtx, followed by 2617 * arc_*_ghost->arcs_mtx and forms a deadlock cycle. 2618 * 2619 * This situation is avoided by acquiring the ghost list lock first.
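 *
 * The order actually taken below is therefore:
 *
 *	evicted_lock (ghost list) -> lock (state list) ->
 *	hash_lock (via mutex_tryenter() only)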
2620 */ 2621 mutex_enter(evicted_lock); 2622 mutex_enter(lock); 2623 2624 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2625 hdr_prev = list_prev(list, hdr); 2626 if (HDR_HAS_L1HDR(hdr)) { 2627 bytes_remaining -= 2628 (hdr->b_size * hdr->b_l1hdr.b_datacnt); 2629 } 2630 /* prefetch buffers have a minimum lifespan */ 2631 if (HDR_IO_IN_PROGRESS(hdr) || 2632 (spa && hdr->b_spa != spa) || 2633 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2634 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2635 arc_min_prefetch_lifespan)) { 2636 skipped++; 2637 continue; 2638 } 2639 /* "lookahead" for better eviction candidate */ 2640 if (recycle && hdr->b_size != bytes && 2641 hdr_prev && hdr_prev->b_size == bytes) 2642 continue; 2643 2644 /* ignore markers */ 2645 if (hdr->b_spa == 0) 2646 continue; 2647 2648 /* 2649 * It may take a long time to evict all the bufs requested. 2650 * To avoid blocking all arc activity, periodically drop 2651 * the arcs_mtx and give other threads a chance to run 2652 * before reacquiring the lock. 2653 * 2654 * If we are looking for a buffer to recycle, we are in 2655 * the hot code path, so don't sleep. 2656 */ 2657 if (!recycle && count++ > arc_evict_iterations) { 2658 list_insert_after(list, hdr, &marker); 2659 mutex_exit(lock); 2660 mutex_exit(evicted_lock); 2661 kpreempt(KPREEMPT_SYNC); 2662 mutex_enter(evicted_lock); 2663 mutex_enter(lock); 2664 hdr_prev = list_prev(list, &marker); 2665 list_remove(list, &marker); 2666 count = 0; 2667 continue; 2668 } 2669 2670 hash_lock = HDR_LOCK(hdr); 2671 have_lock = MUTEX_HELD(hash_lock); 2672 if (have_lock || mutex_tryenter(hash_lock)) { 2673 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2674 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2675 while (hdr->b_l1hdr.b_buf) { 2676 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2677 if (!mutex_tryenter(&buf->b_evict_lock)) { 2678 missed += 1; 2679 break; 2680 } 2681 if (buf->b_data != NULL) { 2682 bytes_evicted += hdr->b_size; 2683 if (recycle && 2684 arc_buf_type(hdr) == type && 2685 hdr->b_size == bytes && 2686 !HDR_L2_WRITING(hdr)) { 2687 stolen = buf->b_data; 2688 recycle = FALSE; 2689 } 2690 } 2691 if (buf->b_efunc != NULL) { 2692 mutex_enter(&arc_eviction_mtx); 2693 arc_buf_destroy(buf, 2694 buf->b_data == stolen, FALSE); 2695 hdr->b_l1hdr.b_buf = buf->b_next; 2696 buf->b_hdr = &arc_eviction_hdr; 2697 buf->b_next = arc_eviction_list; 2698 arc_eviction_list = buf; 2699 mutex_exit(&arc_eviction_mtx); 2700 mutex_exit(&buf->b_evict_lock); 2701 } else { 2702 mutex_exit(&buf->b_evict_lock); 2703 arc_buf_destroy(buf, 2704 buf->b_data == stolen, TRUE); 2705 } 2706 } 2707 2708 if (HDR_HAS_L2HDR(hdr)) { 2709 ARCSTAT_INCR(arcstat_evict_l2_cached, 2710 hdr->b_size); 2711 } else { 2712 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 2713 ARCSTAT_INCR(arcstat_evict_l2_eligible, 2714 hdr->b_size); 2715 } else { 2716 ARCSTAT_INCR( 2717 arcstat_evict_l2_ineligible, 2718 hdr->b_size); 2719 } 2720 } 2721 2722 if (hdr->b_l1hdr.b_datacnt == 0) { 2723 arc_change_state(evicted_state, hdr, hash_lock); 2724 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2725 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2726 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2727 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2728 } 2729 if (!have_lock) 2730 mutex_exit(hash_lock); 2731 if (bytes >= 0 && bytes_evicted >= bytes) 2732 break; 2733 if (bytes_remaining > 0) { 2734 mutex_exit(evicted_lock); 2735 mutex_exit(lock); 2736 idx = ((idx + 1) & (list_count - 1)); 2737 lists++; 2738 goto evict_start; 2739 } 2740 } else { 2741 missed += 1; 2742 } 
2743 } 2744 2745 mutex_exit(lock); 2746 mutex_exit(evicted_lock); 2747 2748 idx = ((idx + 1) & (list_count - 1)); 2749 lists++; 2750 2751 if (bytes_evicted < bytes) { 2752 if (lists < list_count) 2753 goto evict_start; 2754 else 2755 dprintf("only evicted %lld bytes from %x", 2756 (longlong_t)bytes_evicted, state); 2757 } 2758 if (type == ARC_BUFC_METADATA) 2759 evict_metadata_offset = idx; 2760 else 2761 evict_data_offset = idx; 2762 2763 if (skipped) 2764 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2765 2766 if (missed) 2767 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2768 2769 /* 2770 * Note: we have just evicted some data into the ghost state, 2771 * potentially putting the ghost size over the desired size. Rather 2772 * than evicting from the ghost list in this hot code path, leave 2773 * this chore to the arc_reclaim_thread(). 2774 */ 2775 2776 if (stolen) 2777 ARCSTAT_BUMP(arcstat_stolen); 2778 return (stolen); 2779} 2780 2781/* 2782 * Remove buffers from list until we've removed the specified number of 2783 * bytes. Destroy the buffers that are removed. 2784 */ 2785static void 2786arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2787{ 2788 arc_buf_hdr_t *hdr, *hdr_prev; 2789 arc_buf_hdr_t marker = { 0 }; 2790 list_t *list, *list_start; 2791 kmutex_t *hash_lock, *lock; 2792 uint64_t bytes_deleted = 0; 2793 uint64_t bufs_skipped = 0; 2794 int count = 0; 2795 static int evict_offset; 2796 int list_count, idx = evict_offset; 2797 int offset, lists = 0; 2798 2799 ASSERT(GHOST_STATE(state)); 2800 2801 /* 2802 * data lists come after metadata lists 2803 */ 2804 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2805 list_count = ARC_BUFC_NUMDATALISTS; 2806 offset = ARC_BUFC_NUMMETADATALISTS; 2807 2808evict_start: 2809 list = &list_start[idx]; 2810 lock = ARCS_LOCK(state, idx + offset); 2811 2812 mutex_enter(lock); 2813 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2814 hdr_prev = list_prev(list, hdr); 2815 if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) 2816 panic("invalid hdr=%p", (void *)hdr); 2817 if (spa && hdr->b_spa != spa) 2818 continue; 2819 2820 /* ignore markers */ 2821 if (hdr->b_spa == 0) 2822 continue; 2823 2824 hash_lock = HDR_LOCK(hdr); 2825 /* caller may be trying to modify this buffer, skip it */ 2826 if (MUTEX_HELD(hash_lock)) 2827 continue; 2828 2829 /* 2830 * It may take a long time to evict all the bufs requested. 2831 * To avoid blocking all arc activity, periodically drop 2832 * the arcs_mtx and give other threads a chance to run 2833 * before reacquiring the lock. 2834 */ 2835 if (count++ > arc_evict_iterations) { 2836 list_insert_after(list, hdr, &marker); 2837 mutex_exit(lock); 2838 kpreempt(KPREEMPT_SYNC); 2839 mutex_enter(lock); 2840 hdr_prev = list_prev(list, &marker); 2841 list_remove(list, &marker); 2842 count = 0; 2843 continue; 2844 } 2845 if (mutex_tryenter(hash_lock)) { 2846 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2847 ASSERT(!HDR_HAS_L1HDR(hdr) || 2848 hdr->b_l1hdr.b_buf == NULL); 2849 ARCSTAT_BUMP(arcstat_deleted); 2850 bytes_deleted += hdr->b_size; 2851 2852 if (HDR_HAS_L2HDR(hdr)) { 2853 /* 2854 * This buffer is cached on the 2nd Level ARC; 2855 * don't destroy the header. 2856 */ 2857 arc_change_state(arc_l2c_only, hdr, hash_lock); 2858 /* 2859 * dropping from L1+L2 cached to L2-only, 2860 * realloc to remove the L1 header.
2861 */ 2862 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2863 hdr_l2only_cache); 2864 mutex_exit(hash_lock); 2865 } else { 2866 arc_change_state(arc_anon, hdr, hash_lock); 2867 mutex_exit(hash_lock); 2868 arc_hdr_destroy(hdr); 2869 } 2870 2871 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2872 if (bytes >= 0 && bytes_deleted >= bytes) 2873 break; 2874 } else if (bytes < 0) { 2875 /* 2876 * Insert a list marker and then wait for the 2877 * hash lock to become available. Once it's 2878 * available, restart from where we left off. 2879 */ 2880 list_insert_after(list, hdr, &marker); 2881 mutex_exit(lock); 2882 mutex_enter(hash_lock); 2883 mutex_exit(hash_lock); 2884 mutex_enter(lock); 2885 hdr_prev = list_prev(list, &marker); 2886 list_remove(list, &marker); 2887 } else { 2888 bufs_skipped += 1; 2889 } 2890 2891 } 2892 mutex_exit(lock); 2893 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2894 lists++; 2895 2896 if (lists < list_count) 2897 goto evict_start; 2898 2899 evict_offset = idx; 2900 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2901 (bytes < 0 || bytes_deleted < bytes)) { 2902 list_start = &state->arcs_lists[0]; 2903 list_count = ARC_BUFC_NUMMETADATALISTS; 2904 offset = lists = 0; 2905 goto evict_start; 2906 } 2907 2908 if (bufs_skipped) { 2909 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2910 ASSERT(bytes >= 0); 2911 } 2912 2913 if (bytes_deleted < bytes) 2914 dprintf("only deleted %lld bytes from %p", 2915 (longlong_t)bytes_deleted, state); 2916} 2917 2918static void 2919arc_adjust(void) 2920{ 2921 int64_t adjustment, delta; 2922 2923 /* 2924 * Adjust MRU size 2925 */ 2926 2927 adjustment = MIN((int64_t)(arc_size - arc_c), 2928 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2929 arc_p)); 2930 2931 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2932 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2933 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2934 adjustment -= delta; 2935 } 2936 2937 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2938 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2939 (void) arc_evict(arc_mru, 0, delta, FALSE, 2940 ARC_BUFC_METADATA); 2941 } 2942 2943 /* 2944 * Adjust MFU size 2945 */ 2946 2947 adjustment = arc_size - arc_c; 2948 2949 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2950 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2951 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2952 adjustment -= delta; 2953 } 2954 2955 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2956 int64_t delta = MIN(adjustment, 2957 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2958 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2959 ARC_BUFC_METADATA); 2960 } 2961 2962 /* 2963 * Adjust ghost lists 2964 */ 2965 2966 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2967 2968 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2969 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2970 arc_evict_ghost(arc_mru_ghost, 0, delta); 2971 } 2972 2973 adjustment = 2974 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2975 2976 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2977 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2978 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2979 } 2980} 2981 2982static void 2983arc_do_user_evicts(void) 2984{ 2985 static arc_buf_t *tmp_arc_eviction_list; 2986 2987 /* 2988 * Move list over to avoid LOR 2989 */
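 /*
  * (LOR: lock order reversal. Invoking b_efunc while holding
  * arc_eviction_mtx could take other locks in the wrong order,
  * so the list head is detached under the mutex and the callbacks
  * below run with no ARC locks held.)
  */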
2990restart: 2991 mutex_enter(&arc_eviction_mtx); 2992 tmp_arc_eviction_list = arc_eviction_list; 2993 arc_eviction_list = NULL; 2994 mutex_exit(&arc_eviction_mtx); 2995 2996 while (tmp_arc_eviction_list != NULL) { 2997 arc_buf_t *buf = tmp_arc_eviction_list; 2998 tmp_arc_eviction_list = buf->b_next; 2999 mutex_enter(&buf->b_evict_lock); 3000 buf->b_hdr = NULL; 3001 mutex_exit(&buf->b_evict_lock); 3002 3003 if (buf->b_efunc != NULL) 3004 VERIFY0(buf->b_efunc(buf->b_private)); 3005 3006 buf->b_efunc = NULL; 3007 buf->b_private = NULL; 3008 kmem_cache_free(buf_cache, buf); 3009 } 3010 3011 if (arc_eviction_list != NULL) 3012 goto restart; 3013} 3014 3015/* 3016 * Flush all *evictable* data from the cache for the given spa. 3017 * NOTE: this will not touch "active" (i.e. referenced) data. 3018 */ 3019void 3020arc_flush(spa_t *spa) 3021{ 3022 uint64_t guid = 0; 3023 3024 if (spa != NULL) 3025 guid = spa_load_guid(spa); 3026 3027 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 3028 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 3029 if (spa != NULL) 3030 break; 3031 } 3032 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 3033 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 3034 if (spa != NULL) 3035 break; 3036 } 3037 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 3038 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 3039 if (spa != NULL) 3040 break; 3041 } 3042 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 3043 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 3044 if (spa != NULL) 3045 break; 3046 } 3047 3048 arc_evict_ghost(arc_mru_ghost, guid, -1); 3049 arc_evict_ghost(arc_mfu_ghost, guid, -1); 3050 3051 mutex_enter(&arc_reclaim_thr_lock); 3052 arc_do_user_evicts(); 3053 mutex_exit(&arc_reclaim_thr_lock); 3054 ASSERT(spa || arc_eviction_list == NULL); 3055} 3056 3057void 3058arc_shrink(void) 3059{ 3060 3061 if (arc_c > arc_c_min) { 3062 uint64_t to_free; 3063 3064 to_free = arc_c >> arc_shrink_shift; 3065 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 3066 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 3067 if (arc_c > arc_c_min + to_free) 3068 atomic_add_64(&arc_c, -to_free); 3069 else 3070 arc_c = arc_c_min; 3071 3072 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 3073 if (arc_c > arc_size) 3074 arc_c = MAX(arc_size, arc_c_min); 3075 if (arc_p > arc_c) 3076 arc_p = (arc_c >> 1); 3077 3078 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3079 arc_p); 3080 3081 ASSERT(arc_c >= arc_c_min); 3082 ASSERT((int64_t)arc_p >= 0); 3083 } 3084 3085 if (arc_size > arc_c) { 3086 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3087 uint64_t, arc_c); 3088 arc_adjust(); 3089 } 3090} 3091 3092static int needfree = 0; 3093 3094static int 3095arc_reclaim_needed(void) 3096{ 3097 3098#ifdef _KERNEL 3099 3100 if (needfree) { 3101 DTRACE_PROBE(arc__reclaim_needfree); 3102 return (1); 3103 } 3104 3105 /* 3106 * Cooperate with pagedaemon when it's time for it to scan 3107 * and reclaim some pages. 3108 */ 3109 if (freemem < zfs_arc_free_target) { 3110 DTRACE_PROBE2(arc__reclaim_freemem, uint64_t, 3111 freemem, uint64_t, zfs_arc_free_target); 3112 return (1); 3113 } 3114 3115#ifdef illumos 3116 /* 3117 * take 'desfree' extra pages, so we reclaim sooner, rather than later 3118 */ 3119 extra = desfree; 3120 3121 /* 3122 * check that we're out of range of the pageout scanner. It starts to 3123 * schedule paging if freemem is less than lotsfree and needfree. 
3124 * lotsfree is the high-water mark for pageout, and needfree is the 3125 * number of needed free pages. We add extra pages here to make sure 3126 * the scanner doesn't start up while we're freeing memory. 3127 */ 3128 if (freemem < lotsfree + needfree + extra) 3129 return (1); 3130 3131 /* 3132 * check to make sure that swapfs has enough space so that anon 3133 * reservations can still succeed. anon_resvmem() checks that the 3134 * availrmem is greater than swapfs_minfree, and the number of reserved 3135 * swap pages. We also add a bit of extra here just to prevent 3136 * circumstances from getting really dire. 3137 */ 3138 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 3139 return (1); 3140 3141 /* 3142 * Check that we have enough availrmem that memory locking (e.g., via 3143 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3144 * stores the number of pages that cannot be locked; when availrmem 3145 * drops below pages_pp_maximum, page locking mechanisms such as 3146 * page_pp_lock() will fail.) 3147 */ 3148 if (availrmem <= pages_pp_maximum) 3149 return (1); 3150 3151#endif /* illumos */ 3152#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3153 /* 3154 * If we're on an i386 platform, it's possible that we'll exhaust the 3155 * kernel heap space before we ever run out of available physical 3156 * memory. Most checks of the size of the heap_area compare against 3157 * tune.t_minarmem, which is the minimum available real memory that we 3158 * can have in the system. However, this is generally fixed at 25 pages 3159 * which is so low that it's useless. In this comparison, we seek to 3160 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3161 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3162 * free.) 3163 */ 3164 if (vmem_size(heap_arena, VMEM_FREE) < 3165 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) { 3166 DTRACE_PROBE2(arc__reclaim_used, uint64_t, 3167 vmem_size(heap_arena, VMEM_FREE), uint64_t, 3168 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2); 3169 return (1); 3170 } 3171#define zio_arena NULL 3172#else 3173#define zio_arena heap_arena 3174#endif 3175 3176 /* 3177 * If zio data pages are being allocated out of a separate heap segment, 3178 * then enforce that the size of available vmem for this arena remains 3179 * above about 1/16th free. 3180 * 3181 * Note: The 1/16th arena free requirement was put in place 3182 * to aggressively evict memory from the arc in order to avoid 3183 * memory fragmentation issues. 3184 */ 3185 if (zio_arena != NULL && 3186 vmem_size(zio_arena, VMEM_FREE) < 3187 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 3188 return (1); 3189 3190 /* 3191 * The limits above know nothing about the real level of KVA fragmentation. 3192 * Start aggressive reclamation if too little sequential KVA is left.
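 * zfs_max_recordsize serves as the threshold here since it bounds
 * the largest contiguous buffer the ARC may need to allocate.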
3193 */ 3194 if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) { 3195 DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t, 3196 vmem_size(heap_arena, VMEM_MAXFREE), 3197 uint64_t, zfs_max_recordsize); 3198 return (1); 3199 } 3200 3201#else /* _KERNEL */ 3202 if (spa_get_random(100) == 0) 3203 return (1); 3204#endif /* _KERNEL */ 3205 DTRACE_PROBE(arc__reclaim_no); 3206 3207 return (0); 3208} 3209 3210extern kmem_cache_t *zio_buf_cache[]; 3211extern kmem_cache_t *zio_data_buf_cache[]; 3212extern kmem_cache_t *range_seg_cache; 3213 3214static __noinline void 3215arc_kmem_reap_now(arc_reclaim_strategy_t strat) 3216{ 3217 size_t i; 3218 kmem_cache_t *prev_cache = NULL; 3219 kmem_cache_t *prev_data_cache = NULL; 3220 3221 DTRACE_PROBE(arc__kmem_reap_start); 3222#ifdef _KERNEL 3223 if (arc_meta_used >= arc_meta_limit) { 3224 /* 3225 * We are exceeding our meta-data cache limit. 3226 * Purge some DNLC entries to release holds on meta-data. 3227 */ 3228 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3229 } 3230#if defined(__i386) 3231 /* 3232 * Reclaim unused memory from all kmem caches. 3233 */ 3234 kmem_reap(); 3235#endif 3236#endif 3237 3238 /* 3239 * An aggressive reclamation will shrink the cache size as well as 3240 * reap free buffers from the arc kmem caches. 3241 */ 3242 if (strat == ARC_RECLAIM_AGGR) 3243 arc_shrink(); 3244 3245 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3246 if (zio_buf_cache[i] != prev_cache) { 3247 prev_cache = zio_buf_cache[i]; 3248 kmem_cache_reap_now(zio_buf_cache[i]); 3249 } 3250 if (zio_data_buf_cache[i] != prev_data_cache) { 3251 prev_data_cache = zio_data_buf_cache[i]; 3252 kmem_cache_reap_now(zio_data_buf_cache[i]); 3253 } 3254 } 3255 kmem_cache_reap_now(buf_cache); 3256 kmem_cache_reap_now(hdr_full_cache); 3257 kmem_cache_reap_now(hdr_l2only_cache); 3258 kmem_cache_reap_now(range_seg_cache); 3259 3260#ifdef illumos 3261 /* 3262 * Ask the vmem arena to reclaim unused memory from its 3263 * quantum caches. 3264 */ 3265 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 3266 vmem_qcache_reap(zio_arena); 3267#endif 3268 DTRACE_PROBE(arc__kmem_reap_end); 3269} 3270 3271static void 3272arc_reclaim_thread(void *dummy __unused) 3273{ 3274 clock_t growtime = 0; 3275 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 3276 callb_cpr_t cpr; 3277 3278 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 3279 3280 mutex_enter(&arc_reclaim_thr_lock); 3281 while (arc_thread_exit == 0) { 3282 if (arc_reclaim_needed()) { 3283 3284 if (arc_no_grow) { 3285 if (last_reclaim == ARC_RECLAIM_CONS) { 3286 DTRACE_PROBE(arc__reclaim_aggr_no_grow); 3287 last_reclaim = ARC_RECLAIM_AGGR; 3288 } else { 3289 last_reclaim = ARC_RECLAIM_CONS; 3290 } 3291 } else { 3292 arc_no_grow = TRUE; 3293 last_reclaim = ARC_RECLAIM_AGGR; 3294 DTRACE_PROBE(arc__reclaim_aggr); 3295 membar_producer(); 3296 } 3297 3298 /* reset the growth delay for every reclaim */ 3299 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 3300 3301 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 3302 /* 3303 * If needfree is TRUE our vm_lowmem hook 3304 * was called and in that case we must free some 3305 * memory, so switch to aggressive mode. 
3306 */ 3307 arc_no_grow = TRUE; 3308 last_reclaim = ARC_RECLAIM_AGGR; 3309 } 3310 arc_kmem_reap_now(last_reclaim); 3311 arc_warm = B_TRUE; 3312 3313 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 3314 arc_no_grow = FALSE; 3315 } 3316 3317 arc_adjust(); 3318 3319 if (arc_eviction_list != NULL) 3320 arc_do_user_evicts(); 3321 3322#ifdef _KERNEL 3323 if (needfree) { 3324 needfree = 0; 3325 wakeup(&needfree); 3326 } 3327#endif 3328 3329 /* 3330 * This is necessary in order for the mdb ::arc dcmd to 3331 * show up-to-date information. Since the ::arc command 3332 * does not call the kstat's update function, without 3333 * this call, the command may show stale stats for the 3334 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3335 * with this change, the data might be up to 1 second 3336 * out of date, but that should suffice. The arc_state_t 3337 * structures can be queried directly if more accurate 3338 * information is needed. 3339 */ 3340 if (arc_ksp != NULL) 3341 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3342 3343 /* block until needed, or one second, whichever is shorter */ 3344 CALLB_CPR_SAFE_BEGIN(&cpr); 3345 (void) cv_timedwait(&arc_reclaim_thr_cv, 3346 &arc_reclaim_thr_lock, hz); 3347 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 3348 } 3349 3350 arc_thread_exit = 0; 3351 cv_broadcast(&arc_reclaim_thr_cv); 3352 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 3353 thread_exit(); 3354} 3355 3356/* 3357 * Adapt arc info given the number of bytes we are trying to add and 3358 * the state that we are coming from. This function is only called 3359 * when we are adding new content to the cache. 3360 */ 3361static void 3362arc_adapt(int bytes, arc_state_t *state) 3363{ 3364 int mult; 3365 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3366 3367 if (state == arc_l2c_only) 3368 return; 3369 3370 ASSERT(bytes > 0); 3371 /* 3372 * Adapt the target size of the MRU list: 3373 * - if we just hit in the MRU ghost list, then increase 3374 * the target size of the MRU list. 3375 * - if we just hit in the MFU ghost list, then increase 3376 * the target size of the MFU list by decreasing the 3377 * target size of the MRU list. 3378 */ 3379 if (state == arc_mru_ghost) { 3380 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 3381 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 3382 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3383 3384 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3385 } else if (state == arc_mfu_ghost) { 3386 uint64_t delta; 3387 3388 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3389 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 3390 mult = MIN(mult, 10); 3391 3392 delta = MIN(bytes * mult, arc_p); 3393 arc_p = MAX(arc_p_min, arc_p - delta); 3394 } 3395 ASSERT((int64_t)arc_p >= 0); 3396 3397 if (arc_reclaim_needed()) { 3398 cv_signal(&arc_reclaim_thr_cv); 3399 return; 3400 } 3401 3402 if (arc_no_grow) 3403 return; 3404 3405 if (arc_c >= arc_c_max) 3406 return; 3407 3408 /* 3409 * If we're within (2 * maxblocksize) bytes of the target 3410 * cache size, increment the target cache size 3411 */ 3412 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3413 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3414 atomic_add_64(&arc_c, (int64_t)bytes); 3415 if (arc_c > arc_c_max) 3416 arc_c = arc_c_max; 3417 else if (state == arc_anon) 3418 atomic_add_64(&arc_p, (int64_t)bytes); 3419 if (arc_p > arc_c) 3420 arc_p = arc_c; 3421 } 3422 ASSERT((int64_t)arc_p >= 0); 3423} 3424 3425/* 3426 * Check if the cache has reached its limits and eviction is required 3427 * prior to insert. 3428 */ 3429static int 3430arc_evict_needed(arc_buf_contents_t type) 3431{ 3432 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 3433 return (1); 3434 3435 if (arc_reclaim_needed()) 3436 return (1); 3437 3438 return (arc_size > arc_c); 3439} 3440 3441/* 3442 * The buffer, supplied as the first argument, needs a data block. 3443 * So, if we are at cache max, determine which cache should be victimized. 3444 * We have the following cases: 3445 * 3446 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 3447 * In this situation if we're out of space, but the resident size of the MFU is 3448 * under the limit, victimize the MFU cache to satisfy this insertion request. 3449 * 3450 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 3451 * Here, we've used up all of the available space for the MRU, so we need to 3452 * evict from our own cache instead. Evict from the set of resident MRU 3453 * entries. 3454 * 3455 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 3456 * c minus p represents the MFU space in the cache, since p is the size of the 3457 * cache that is dedicated to the MRU. In this situation there's still space on 3458 * the MFU side, so the MRU side needs to be victimized. 3459 * 3460 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 3461 * MFU's resident set is consuming more space than it has been allotted. In 3462 * this situation, we must victimize our own cache, the MFU, for this insertion. 3463 */ 3464static void 3465arc_get_data_buf(arc_buf_t *buf) 3466{ 3467 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3468 uint64_t size = buf->b_hdr->b_size; 3469 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3470 3471 arc_adapt(size, state); 3472 3473 /* 3474 * We have not yet reached cache maximum size, 3475 * just allocate a new buffer. 3476 */ 3477 if (!arc_evict_needed(type)) { 3478 if (type == ARC_BUFC_METADATA) { 3479 buf->b_data = zio_buf_alloc(size); 3480 arc_space_consume(size, ARC_SPACE_META); 3481 } else { 3482 ASSERT(type == ARC_BUFC_DATA); 3483 buf->b_data = zio_data_buf_alloc(size); 3484 arc_space_consume(size, ARC_SPACE_DATA); 3485 } 3486 goto out; 3487 } 3488 3489 /* 3490 * If we are prefetching from the mfu ghost list, this buffer 3491 * will end up on the mru list; so steal space from there. 3492 */ 3493 if (state == arc_mfu_ghost) 3494 state = HDR_PREFETCH(buf->b_hdr) ? 
arc_mru : arc_mfu; 3495 else if (state == arc_mru_ghost) 3496 state = arc_mru; 3497 3498 if (state == arc_mru || state == arc_anon) { 3499 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 3500 state = (arc_mfu->arcs_lsize[type] >= size && 3501 arc_p > mru_used) ? arc_mfu : arc_mru; 3502 } else { 3503 /* MFU cases */ 3504 uint64_t mfu_space = arc_c - arc_p; 3505 state = (arc_mru->arcs_lsize[type] >= size && 3506 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 3507 } 3508 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 3509 if (type == ARC_BUFC_METADATA) { 3510 buf->b_data = zio_buf_alloc(size); 3511 arc_space_consume(size, ARC_SPACE_META); 3512 } else { 3513 ASSERT(type == ARC_BUFC_DATA); 3514 buf->b_data = zio_data_buf_alloc(size); 3515 arc_space_consume(size, ARC_SPACE_DATA); 3516 } 3517 ARCSTAT_BUMP(arcstat_recycle_miss); 3518 } 3519 ASSERT(buf->b_data != NULL); 3520out: 3521 /* 3522 * Update the state size. Note that ghost states have a 3523 * "ghost size" and so don't need to be updated. 3524 */ 3525 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3526 arc_buf_hdr_t *hdr = buf->b_hdr; 3527 3528 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); 3529 if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { 3530 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3531 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3532 size); 3533 } 3534 /* 3535 * If we are growing the cache, and we are adding anonymous 3536 * data, and we have outgrown arc_p, update arc_p 3537 */ 3538 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3539 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 3540 arc_p = MIN(arc_c, arc_p + size); 3541 } 3542 ARCSTAT_BUMP(arcstat_allocated); 3543} 3544 3545/* 3546 * This routine is called whenever a buffer is accessed. 3547 * NOTE: the hash lock is dropped in this function. 3548 */ 3549static void 3550arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3551{ 3552 clock_t now; 3553 3554 ASSERT(MUTEX_HELD(hash_lock)); 3555 ASSERT(HDR_HAS_L1HDR(hdr)); 3556 3557 if (hdr->b_l1hdr.b_state == arc_anon) { 3558 /* 3559 * This buffer is not in the cache, and does not 3560 * appear in our "ghost" list. Add the new buffer 3561 * to the MRU state. 3562 */ 3563 3564 ASSERT0(hdr->b_l1hdr.b_arc_access); 3565 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3566 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3567 arc_change_state(arc_mru, hdr, hash_lock); 3568 3569 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3570 now = ddi_get_lbolt(); 3571 3572 /* 3573 * If this buffer is here because of a prefetch, then either: 3574 * - clear the flag if this is a "referencing" read 3575 * (any subsequent access will bump this into the MFU state). 3576 * or 3577 * - move the buffer to the head of the list if this is 3578 * another prefetch (to make it less likely to be evicted). 3579 */ 3580 if (HDR_PREFETCH(hdr)) { 3581 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3582 ASSERT(list_link_active( 3583 &hdr->b_l1hdr.b_arc_node)); 3584 } else { 3585 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3586 ARCSTAT_BUMP(arcstat_mru_hits); 3587 } 3588 hdr->b_l1hdr.b_arc_access = now; 3589 return; 3590 } 3591 3592 /* 3593 * This buffer has been "accessed" only once so far, 3594 * but it is still in the cache. Move it to the MFU 3595 * state. 3596 */ 3597 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3598 /* 3599 * More than ARC_MINTIME (about 62ms) has passed since we 3600 * instantiated this buffer. Move it to the 3601 * most frequently used state.
3602 */ 3603 hdr->b_l1hdr.b_arc_access = now; 3604 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3605 arc_change_state(arc_mfu, hdr, hash_lock); 3606 } 3607 ARCSTAT_BUMP(arcstat_mru_hits); 3608 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3609 arc_state_t *new_state; 3610 /* 3611 * This buffer has been "accessed" recently, but 3612 * was evicted from the cache. Move it to the 3613 * MFU state. 3614 */ 3615 3616 if (HDR_PREFETCH(hdr)) { 3617 new_state = arc_mru; 3618 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3619 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3620 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3621 } else { 3622 new_state = arc_mfu; 3623 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3624 } 3625 3626 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3627 arc_change_state(new_state, hdr, hash_lock); 3628 3629 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3630 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3631 /* 3632 * This buffer has been accessed more than once and is 3633 * still in the cache. Keep it in the MFU state. 3634 * 3635 * NOTE: an add_reference() that occurred when we did 3636 * the arc_read() will have kicked this off the list. 3637 * If it was a prefetch, we will explicitly move it to 3638 * the head of the list now. 3639 */ 3640 if ((HDR_PREFETCH(hdr)) != 0) { 3641 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3642 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 3643 } 3644 ARCSTAT_BUMP(arcstat_mfu_hits); 3645 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3646 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3647 arc_state_t *new_state = arc_mfu; 3648 /* 3649 * This buffer has been accessed more than once but has 3650 * been evicted from the cache. Move it back to the 3651 * MFU state. 3652 */ 3653 3654 if (HDR_PREFETCH(hdr)) { 3655 /* 3656 * This is a prefetch access... 3657 * move this block back to the MRU state. 3658 */ 3659 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3660 new_state = arc_mru; 3661 } 3662 3663 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3664 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3665 arc_change_state(new_state, hdr, hash_lock); 3666 3667 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3668 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3669 /* 3670 * This buffer is on the 2nd Level ARC. 
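		 * It has no L1 data resident in memory; treat a hit here
		 * like a return from a ghost list and file the header
		 * under MFU, as in the arc_mfu_ghost case above.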
3671 */ 3672 3673 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3674 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3675 arc_change_state(arc_mfu, hdr, hash_lock); 3676 } else { 3677 ASSERT(!"invalid arc state"); 3678 } 3679} 3680 3681/* a generic arc_done_func_t which you can use */ 3682/* ARGSUSED */ 3683void 3684arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3685{ 3686 if (zio == NULL || zio->io_error == 0) 3687 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3688 VERIFY(arc_buf_remove_ref(buf, arg)); 3689} 3690 3691/* a generic arc_done_func_t */ 3692void 3693arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3694{ 3695 arc_buf_t **bufp = arg; 3696 if (zio && zio->io_error) { 3697 VERIFY(arc_buf_remove_ref(buf, arg)); 3698 *bufp = NULL; 3699 } else { 3700 *bufp = buf; 3701 ASSERT(buf->b_data); 3702 } 3703} 3704 3705static void 3706arc_read_done(zio_t *zio) 3707{ 3708 arc_buf_hdr_t *hdr; 3709 arc_buf_t *buf; 3710 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3711 kmutex_t *hash_lock = NULL; 3712 arc_callback_t *callback_list, *acb; 3713 int freeable = FALSE; 3714 3715 buf = zio->io_private; 3716 hdr = buf->b_hdr; 3717 3718 /* 3719 * The hdr was inserted into hash-table and removed from lists 3720 * prior to starting I/O. We should find this header, since 3721 * it's in the hash table, and it should be legit since it's 3722 * not possible to evict it during the I/O. The only possible 3723 * reason for it not to be found is if we were freed during the 3724 * read. 3725 */ 3726 if (HDR_IN_HASH_TABLE(hdr)) { 3727 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3728 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3729 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3730 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3731 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3732 3733 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3734 &hash_lock); 3735 3736 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3737 hash_lock == NULL) || 3738 (found == hdr && 3739 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3740 (found == hdr && HDR_L2_READING(hdr))); 3741 } 3742 3743 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3744 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3745 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3746 3747 /* byteswap if necessary */ 3748 callback_list = hdr->b_l1hdr.b_acb; 3749 ASSERT(callback_list != NULL); 3750 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3751 dmu_object_byteswap_t bswap = 3752 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3753 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3754 byteswap_uint64_array : 3755 dmu_ot_byteswap[bswap].ob_func; 3756 func(buf->b_data, hdr->b_size); 3757 } 3758 3759 arc_cksum_compute(buf, B_FALSE); 3760#ifdef illumos 3761 arc_buf_watch(buf); 3762#endif 3763 3764 if (hash_lock && zio->io_error == 0 && 3765 hdr->b_l1hdr.b_state == arc_anon) { 3766 /* 3767 * Only call arc_access on anonymous buffers. This is because 3768 * if we've issued an I/O for an evicted buffer, we've already 3769 * called arc_access (to prevent any simultaneous readers from 3770 * getting confused). 
3771		 */
3772			arc_access(hdr, hash_lock);
3773	}
3774
3775		/* create copies of the data buffer for the callers */
3776		abuf = buf;
3777		for (acb = callback_list; acb; acb = acb->acb_next) {
3778			if (acb->acb_done) {
3779				if (abuf == NULL) {
3780					ARCSTAT_BUMP(arcstat_duplicate_reads);
3781					abuf = arc_buf_clone(buf);
3782				}
3783				acb->acb_buf = abuf;
3784				abuf = NULL;
3785			}
3786		}
3787		hdr->b_l1hdr.b_acb = NULL;
3788		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3789		ASSERT(!HDR_BUF_AVAILABLE(hdr));
3790		if (abuf == buf) {
3791			ASSERT(buf->b_efunc == NULL);
3792			ASSERT(hdr->b_l1hdr.b_datacnt == 1);
3793			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3794		}
3795
3796		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
3797		    callback_list != NULL);
3798
3799		if (zio->io_error != 0) {
3800			hdr->b_flags |= ARC_FLAG_IO_ERROR;
3801			if (hdr->b_l1hdr.b_state != arc_anon)
3802				arc_change_state(arc_anon, hdr, hash_lock);
3803			if (HDR_IN_HASH_TABLE(hdr))
3804				buf_hash_remove(hdr);
3805			freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3806		}
3807
3808		/*
3809		 * Broadcast before we drop the hash_lock to avoid the possibility
3810		 * that the hdr (and hence the cv) might be freed before we get to
3811		 * the cv_broadcast().
3812		 */
3813		cv_broadcast(&hdr->b_l1hdr.b_cv);
3814
3815		if (hash_lock != NULL) {
3816			mutex_exit(hash_lock);
3817		} else {
3818			/*
3819			 * This block was freed while we waited for the read to
3820			 * complete.  It has been removed from the hash table and
3821			 * moved to the anonymous state (so that it won't show up
3822			 * in the cache).
3823			 */
3824			ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3825			freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3826		}
3827
3828		/* execute each callback and free its structure */
3829		while ((acb = callback_list) != NULL) {
3830			if (acb->acb_done)
3831				acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3832
3833			if (acb->acb_zio_dummy != NULL) {
3834				acb->acb_zio_dummy->io_error = zio->io_error;
3835				zio_nowait(acb->acb_zio_dummy);
3836			}
3837
3838			callback_list = acb->acb_next;
3839			kmem_free(acb, sizeof (arc_callback_t));
3840		}
3841
3842		if (freeable)
3843			arc_hdr_destroy(hdr);
3844	}
3845
3846	/*
3847	 * "Read" the block at the specified DVA (in bp) via the
3848	 * cache.  If the block is found in the cache, invoke the provided
3849	 * callback immediately and return.  Note that the `zio' parameter
3850	 * in the callback will be NULL in this case, since no IO was
3851	 * required.  If the block is not in the cache, pass the read request
3852	 * on to the spa with a substitute callback function, so that the
3853	 * requested block will be added to the cache.
3854	 *
3855	 * If a read request arrives for a block that has a read in progress,
3856	 * either wait for the in-progress read to complete (and return the
3857	 * results); or, if this is a read with a "done" func, add a record
3858	 * to the read to invoke the "done" func when the read completes,
3859	 * and return; or just return.
3860	 *
3861	 * arc_read_done() will invoke all the requested "done" functions
3862	 * for readers of this block.
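 *
 * A minimal sketch of a blocking caller (the abuf/zb locals and the
 * flag choices here are illustrative, not prescriptive):
 *
 *	arc_buf_t *abuf = NULL;
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * With ARC_FLAG_WAIT the call returns only after the "done" func has
 * run; with ARC_FLAG_NOWAIT it returns immediately and the "done"
 * func fires asynchronously when the read completes.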
3863	 */
3864	int
3865	arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3866	    void *private, zio_priority_t priority, int zio_flags,
3867	    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3868	{
3869		arc_buf_hdr_t *hdr = NULL;
3870		arc_buf_t *buf = NULL;
3871		kmutex_t *hash_lock = NULL;
3872		zio_t *rzio;
3873		uint64_t guid = spa_load_guid(spa);
3874
3875		ASSERT(!BP_IS_EMBEDDED(bp) ||
3876		    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3877
3878	top:
3879		if (!BP_IS_EMBEDDED(bp)) {
3880			/*
3881			 * Embedded BPs have no DVA and need no I/O, so skip the hash
3882			 * lookup; an anonymous arc buf is created below to back them.
3883			 */
3884			hdr = buf_hash_find(guid, bp, &hash_lock);
3885		}
3886
3887		if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
3888
3889			*arc_flags |= ARC_FLAG_CACHED;
3890
3891			if (HDR_IO_IN_PROGRESS(hdr)) {
3892
3893				if (*arc_flags & ARC_FLAG_WAIT) {
3894					cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
3895					mutex_exit(hash_lock);
3896					goto top;
3897				}
3898				ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3899
3900				if (done) {
3901					arc_callback_t *acb = NULL;
3902
3903					acb = kmem_zalloc(sizeof (arc_callback_t),
3904					    KM_SLEEP);
3905					acb->acb_done = done;
3906					acb->acb_private = private;
3907					if (pio != NULL)
3908						acb->acb_zio_dummy = zio_null(pio,
3909						    spa, NULL, NULL, NULL, zio_flags);
3910
3911					ASSERT(acb->acb_done != NULL);
3912					acb->acb_next = hdr->b_l1hdr.b_acb;
3913					hdr->b_l1hdr.b_acb = acb;
3914					add_reference(hdr, hash_lock, private);
3915					mutex_exit(hash_lock);
3916					return (0);
3917				}
3918				mutex_exit(hash_lock);
3919				return (0);
3920			}
3921
3922			ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
3923			    hdr->b_l1hdr.b_state == arc_mfu);
3924
3925			if (done) {
3926				add_reference(hdr, hash_lock, private);
3927				/*
3928				 * If this block is already in use, create a new
3929				 * copy of the data so that we will be guaranteed
3930				 * that arc_release() will always succeed.
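				 * The buffer handed out is therefore either the
				 * header's idle buf (with ARC_FLAG_BUF_AVAILABLE
				 * cleared) or a private clone, so every consumer
				 * can later arc_release() its copy independently.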
3931 */ 3932 buf = hdr->b_l1hdr.b_buf; 3933 ASSERT(buf); 3934 ASSERT(buf->b_data); 3935 if (HDR_BUF_AVAILABLE(hdr)) { 3936 ASSERT(buf->b_efunc == NULL); 3937 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3938 } else { 3939 buf = arc_buf_clone(buf); 3940 } 3941 3942 } else if (*arc_flags & ARC_FLAG_PREFETCH && 3943 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3944 hdr->b_flags |= ARC_FLAG_PREFETCH; 3945 } 3946 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3947 arc_access(hdr, hash_lock); 3948 if (*arc_flags & ARC_FLAG_L2CACHE) 3949 hdr->b_flags |= ARC_FLAG_L2CACHE; 3950 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3951 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3952 mutex_exit(hash_lock); 3953 ARCSTAT_BUMP(arcstat_hits); 3954 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3955 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3956 data, metadata, hits); 3957 3958 if (done) 3959 done(NULL, buf, private); 3960 } else { 3961 uint64_t size = BP_GET_LSIZE(bp); 3962 arc_callback_t *acb; 3963 vdev_t *vd = NULL; 3964 uint64_t addr = 0; 3965 boolean_t devw = B_FALSE; 3966 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3967 int32_t b_asize = 0; 3968 3969 if (hdr == NULL) { 3970 /* this block is not in the cache */ 3971 arc_buf_hdr_t *exists = NULL; 3972 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3973 buf = arc_buf_alloc(spa, size, private, type); 3974 hdr = buf->b_hdr; 3975 if (!BP_IS_EMBEDDED(bp)) { 3976 hdr->b_dva = *BP_IDENTITY(bp); 3977 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3978 exists = buf_hash_insert(hdr, &hash_lock); 3979 } 3980 if (exists != NULL) { 3981 /* somebody beat us to the hash insert */ 3982 mutex_exit(hash_lock); 3983 buf_discard_identity(hdr); 3984 (void) arc_buf_remove_ref(buf, private); 3985 goto top; /* restart the IO request */ 3986 } 3987 3988 /* if this is a prefetch, we don't have a reference */ 3989 if (*arc_flags & ARC_FLAG_PREFETCH) { 3990 (void) remove_reference(hdr, hash_lock, 3991 private); 3992 hdr->b_flags |= ARC_FLAG_PREFETCH; 3993 } 3994 if (*arc_flags & ARC_FLAG_L2CACHE) 3995 hdr->b_flags |= ARC_FLAG_L2CACHE; 3996 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3997 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3998 if (BP_GET_LEVEL(bp) > 0) 3999 hdr->b_flags |= ARC_FLAG_INDIRECT; 4000 } else { 4001 /* 4002 * This block is in the ghost cache. If it was L2-only 4003 * (and thus didn't have an L1 hdr), we realloc the 4004 * header to add an L1 hdr. 
4005			 */
4006			if (!HDR_HAS_L1HDR(hdr)) {
4007				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4008				    hdr_full_cache);
4009			}
4010
4011			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
4012			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4013			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4014			ASSERT(hdr->b_l1hdr.b_buf == NULL);
4015
4016			/* if this is a prefetch, we don't have a reference */
4017			if (*arc_flags & ARC_FLAG_PREFETCH)
4018				hdr->b_flags |= ARC_FLAG_PREFETCH;
4019			else
4020				add_reference(hdr, hash_lock, private);
4021			if (*arc_flags & ARC_FLAG_L2CACHE)
4022				hdr->b_flags |= ARC_FLAG_L2CACHE;
4023			if (*arc_flags & ARC_FLAG_L2COMPRESS)
4024				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4025			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4026			buf->b_hdr = hdr;
4027			buf->b_data = NULL;
4028			buf->b_efunc = NULL;
4029			buf->b_private = NULL;
4030			buf->b_next = NULL;
4031			hdr->b_l1hdr.b_buf = buf;
4032			ASSERT0(hdr->b_l1hdr.b_datacnt);
4033			hdr->b_l1hdr.b_datacnt = 1;
4034			arc_get_data_buf(buf);
4035			arc_access(hdr, hash_lock);
4036		}
4037
4038		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4039
4040		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4041		acb->acb_done = done;
4042		acb->acb_private = private;
4043
4044		ASSERT(hdr->b_l1hdr.b_acb == NULL);
4045		hdr->b_l1hdr.b_acb = acb;
4046		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4047
4048		if (HDR_HAS_L2HDR(hdr) &&
4049		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4050			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4051			addr = hdr->b_l2hdr.b_daddr;
4052			b_compress = HDR_GET_COMPRESS(hdr);
4053			b_asize = hdr->b_l2hdr.b_asize;
4054			/*
4055			 * Lock out device removal.
4056			 */
4057			if (vdev_is_dead(vd) ||
4058			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4059				vd = NULL;
4060		}
4061
4062		if (hash_lock != NULL)
4063			mutex_exit(hash_lock);
4064
4065		/*
4066		 * At this point, we have a level 1 cache miss.  Try again in
4067		 * L2ARC if possible.
4068		 */
4069		ASSERT3U(hdr->b_size, ==, size);
4070		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4071		    uint64_t, size, zbookmark_phys_t *, zb);
4072		ARCSTAT_BUMP(arcstat_misses);
4073		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4074		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4075		    data, metadata, misses);
4076	#ifdef _KERNEL
4077		curthread->td_ru.ru_inblock++;
4078	#endif
4079
4080		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4081			/*
4082			 * Read from the L2ARC if the following are true:
4083			 * 1. The L2ARC vdev was previously cached.
4084			 * 2. This buffer still has L2ARC metadata.
4085			 * 3. This buffer isn't currently writing to the L2ARC.
4086			 * 4. The L2ARC entry wasn't evicted, which may
4087			 *    also have invalidated the vdev.
4088			 * 5. This isn't a prefetch with l2arc_noprefetch enabled.
4089			 */
4090			if (HDR_HAS_L2HDR(hdr) &&
4091			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4092			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4093				l2arc_read_callback_t *cb;
4094
4095				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4096				ARCSTAT_BUMP(arcstat_l2_hits);
4097
4098				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4099				    KM_SLEEP);
4100				cb->l2rcb_buf = buf;
4101				cb->l2rcb_spa = spa;
4102				cb->l2rcb_bp = *bp;
4103				cb->l2rcb_zb = *zb;
4104				cb->l2rcb_flags = zio_flags;
4105				cb->l2rcb_compress = b_compress;
4106
4107				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4108				    addr + size < vd->vdev_psize -
4109				    VDEV_LABEL_END_SIZE);
4110
4111				/*
4112				 * l2arc read.  The SCL_L2ARC lock will be
4113				 * released by l2arc_read_done().
4114				 * Issue a null zio if the underlying buffer
4115				 * was squashed to zero size by compression.
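				 * (ZIO_COMPRESS_EMPTY marks a buffer whose
				 * contents compressed away entirely, i.e. all
				 * zeros: nothing was written to the device, so
				 * no device read is needed and the decompress
				 * step recreates the zeroed buffer.)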
4116				 */
4117				if (b_compress == ZIO_COMPRESS_EMPTY) {
4118					rzio = zio_null(pio, spa, vd,
4119					    l2arc_read_done, cb,
4120					    zio_flags | ZIO_FLAG_DONT_CACHE |
4121					    ZIO_FLAG_CANFAIL |
4122					    ZIO_FLAG_DONT_PROPAGATE |
4123					    ZIO_FLAG_DONT_RETRY);
4124				} else {
4125					rzio = zio_read_phys(pio, vd, addr,
4126					    b_asize, buf->b_data,
4127					    ZIO_CHECKSUM_OFF,
4128					    l2arc_read_done, cb, priority,
4129					    zio_flags | ZIO_FLAG_DONT_CACHE |
4130					    ZIO_FLAG_CANFAIL |
4131					    ZIO_FLAG_DONT_PROPAGATE |
4132					    ZIO_FLAG_DONT_RETRY, B_FALSE);
4133				}
4134				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4135				    zio_t *, rzio);
4136				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4137
4138				if (*arc_flags & ARC_FLAG_NOWAIT) {
4139					zio_nowait(rzio);
4140					return (0);
4141				}
4142
4143				ASSERT(*arc_flags & ARC_FLAG_WAIT);
4144				if (zio_wait(rzio) == 0)
4145					return (0);
4146
4147				/* l2arc read error; goto zio_read() */
4148			} else {
4149				DTRACE_PROBE1(l2arc__miss,
4150				    arc_buf_hdr_t *, hdr);
4151				ARCSTAT_BUMP(arcstat_l2_misses);
4152				if (HDR_L2_WRITING(hdr))
4153					ARCSTAT_BUMP(arcstat_l2_rw_clash);
4154				spa_config_exit(spa, SCL_L2ARC, vd);
4155			}
4156		} else {
4157			if (vd != NULL)
4158				spa_config_exit(spa, SCL_L2ARC, vd);
4159			if (l2arc_ndev != 0) {
4160				DTRACE_PROBE1(l2arc__miss,
4161				    arc_buf_hdr_t *, hdr);
4162				ARCSTAT_BUMP(arcstat_l2_misses);
4163			}
4164		}
4165
4166		rzio = zio_read(pio, spa, bp, buf->b_data, size,
4167		    arc_read_done, buf, priority, zio_flags, zb);
4168
4169		if (*arc_flags & ARC_FLAG_WAIT)
4170			return (zio_wait(rzio));
4171
4172		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4173		zio_nowait(rzio);
4174	}
4175	return (0);
4176	}
4177
4178	void
4179	arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4180	{
4181		ASSERT(buf->b_hdr != NULL);
4182		ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4183		ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4184		    func == NULL);
4185		ASSERT(buf->b_efunc == NULL);
4186		ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4187
4188		buf->b_efunc = func;
4189		buf->b_private = private;
4190	}
4191
4192	/*
4193	 * Notify the ARC that a block was freed, and thus will never be used again.
4194	 */
4195	void
4196	arc_freed(spa_t *spa, const blkptr_t *bp)
4197	{
4198		arc_buf_hdr_t *hdr;
4199		kmutex_t *hash_lock;
4200		uint64_t guid = spa_load_guid(spa);
4201
4202		ASSERT(!BP_IS_EMBEDDED(bp));
4203
4204		hdr = buf_hash_find(guid, bp, &hash_lock);
4205		if (hdr == NULL)
4206			return;
4207		if (HDR_BUF_AVAILABLE(hdr)) {
4208			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4209			add_reference(hdr, hash_lock, FTAG);
4210			hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4211			mutex_exit(hash_lock);
4212
4213			arc_release(buf, FTAG);
4214			(void) arc_buf_remove_ref(buf, FTAG);
4215		} else {
4216			mutex_exit(hash_lock);
4217		}
4218
4219	}
4220
4221	/*
4222	 * Clear the user eviction callback set by arc_set_callback(), first calling
4223	 * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
4224	 * clearing the callback may result in the arc_buf being destroyed.  However,
4225	 * it will not result in the *last* arc_buf being destroyed, hence the data
4226	 * will remain cached in the ARC.  We make a copy of the arc buffer here so
4227	 * that we can process the callback without holding any locks.
4228	 *
4229	 * It's possible that the callback is already in the process of being cleared
4230	 * by another thread.  In this case we cannot clear the callback.
4231	 *
4232	 * Returns B_TRUE if the callback was successfully called and cleared.
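 *
 * Three outcomes are possible below: the hdr is already gone, in which
 * case arc_do_user_evicts() owns the reap and we return B_FALSE; the
 * buf sits on the eviction list with no data, in which case we run the
 * callback here and leave the reaping to arc_do_user_evicts(); or the
 * buf is live, in which case we run the callback and either destroy
 * this copy or mark it available again.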
4233	 */
4234	boolean_t
4235	arc_clear_callback(arc_buf_t *buf)
4236	{
4237		arc_buf_hdr_t *hdr;
4238		kmutex_t *hash_lock;
4239		arc_evict_func_t *efunc = buf->b_efunc;
4240		void *private = buf->b_private;
4243
4244		mutex_enter(&buf->b_evict_lock);
4245		hdr = buf->b_hdr;
4246		if (hdr == NULL) {
4247			/*
4248			 * We are in arc_do_user_evicts().
4249			 */
4250			ASSERT(buf->b_data == NULL);
4251			mutex_exit(&buf->b_evict_lock);
4252			return (B_FALSE);
4253		} else if (buf->b_data == NULL) {
4254			/*
4255			 * We are on the eviction list; process this buffer now
4256			 * but let arc_do_user_evicts() do the reaping.
4257			 */
4258			buf->b_efunc = NULL;
4259			mutex_exit(&buf->b_evict_lock);
4260			VERIFY0(efunc(private));
4261			return (B_TRUE);
4262		}
4263		hash_lock = HDR_LOCK(hdr);
4264		mutex_enter(hash_lock);
4265		hdr = buf->b_hdr;
4266		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4267
4268		ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4269		    hdr->b_l1hdr.b_datacnt);
4270		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4271		    hdr->b_l1hdr.b_state == arc_mfu);
4272
4273		buf->b_efunc = NULL;
4274		buf->b_private = NULL;
4275
4276		if (hdr->b_l1hdr.b_datacnt > 1) {
4277			mutex_exit(&buf->b_evict_lock);
4278			arc_buf_destroy(buf, FALSE, TRUE);
4279		} else {
4280			ASSERT(buf == hdr->b_l1hdr.b_buf);
4281			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4282			mutex_exit(&buf->b_evict_lock);
4283		}
4284
4285		mutex_exit(hash_lock);
4286		VERIFY0(efunc(private));
4287		return (B_TRUE);
4288	}
4289
4290	/*
4291	 * Release this buffer from the cache, making it an anonymous buffer.  This
4292	 * must be done after a read and prior to modifying the buffer contents.
4293	 * If the buffer has more than one reference, we must make
4294	 * a new hdr for the buffer.
4295	 */
4296	void
4297	arc_release(arc_buf_t *buf, void *tag)
4298	{
4299		arc_buf_hdr_t *hdr = buf->b_hdr;
4300
4301		/*
4302		 * It would be nice to assert that if it's DMU metadata (level >
4303		 * 0 || it's the dnode file), then it must be syncing context.
4304		 * But we don't know that information at this level.
4305		 */
4306
4307		mutex_enter(&buf->b_evict_lock);
4308		/*
4309		 * We don't grab the hash lock prior to this check, because if
4310		 * the buffer's header is in the arc_anon state, it won't be
4311		 * linked into the hash table.
4312		 */
4313		if (hdr->b_l1hdr.b_state == arc_anon) {
4314			mutex_exit(&buf->b_evict_lock);
4315			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4316			ASSERT(!HDR_IN_HASH_TABLE(hdr));
4317			ASSERT(!HDR_HAS_L2HDR(hdr));
4318			ASSERT(BUF_EMPTY(hdr));
4319			ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4320			ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4321			ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4322
4323			ASSERT3P(buf->b_efunc, ==, NULL);
4324			ASSERT3P(buf->b_private, ==, NULL);
4325
4326			hdr->b_l1hdr.b_arc_access = 0;
4327			arc_buf_thaw(buf);
4328
4329			return;
4330		}
4331
4332		kmutex_t *hash_lock = HDR_LOCK(hdr);
4333		mutex_enter(hash_lock);
4334
4335		/*
4336		 * This assignment is only valid as long as the hash_lock is
4337		 * held; we must be careful not to reference state or the
4338		 * b_state field after dropping the lock.
4339 */ 4340 arc_state_t *state = hdr->b_l1hdr.b_state; 4341 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4342 ASSERT3P(state, !=, arc_anon); 4343 4344 /* this buffer is not on any list */ 4345 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4346 4347 if (HDR_HAS_L2HDR(hdr)) { 4348 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4349 4350 /* 4351 * We have to recheck this conditional again now that 4352 * we're holding the l2ad_mtx to prevent a race with 4353 * another thread which might be concurrently calling 4354 * l2arc_evict(). In that case, l2arc_evict() might have 4355 * destroyed the header's L2 portion as we were waiting 4356 * to acquire the l2ad_mtx. 4357 */ 4358 if (HDR_HAS_L2HDR(hdr)) { 4359 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4360 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 4361 arc_hdr_l2hdr_destroy(hdr); 4362 } 4363 4364 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4365 } 4366 4367 /* 4368 * Do we have more than one buf? 4369 */ 4370 if (hdr->b_l1hdr.b_datacnt > 1) { 4371 arc_buf_hdr_t *nhdr; 4372 arc_buf_t **bufp; 4373 uint64_t blksz = hdr->b_size; 4374 uint64_t spa = hdr->b_spa; 4375 arc_buf_contents_t type = arc_buf_type(hdr); 4376 uint32_t flags = hdr->b_flags; 4377 4378 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4379 /* 4380 * Pull the data off of this hdr and attach it to 4381 * a new anonymous hdr. 4382 */ 4383 (void) remove_reference(hdr, hash_lock, tag); 4384 bufp = &hdr->b_l1hdr.b_buf; 4385 while (*bufp != buf) 4386 bufp = &(*bufp)->b_next; 4387 *bufp = buf->b_next; 4388 buf->b_next = NULL; 4389 4390 ASSERT3P(state, !=, arc_l2c_only); 4391 ASSERT3U(state->arcs_size, >=, hdr->b_size); 4392 atomic_add_64(&state->arcs_size, -hdr->b_size); 4393 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4394 ASSERT3P(state, !=, arc_l2c_only); 4395 uint64_t *size = &state->arcs_lsize[type]; 4396 ASSERT3U(*size, >=, hdr->b_size); 4397 atomic_add_64(size, -hdr->b_size); 4398 } 4399 4400 /* 4401 * We're releasing a duplicate user data buffer, update 4402 * our statistics accordingly. 
4403 */ 4404 if (HDR_ISTYPE_DATA(hdr)) { 4405 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4406 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4407 -hdr->b_size); 4408 } 4409 hdr->b_l1hdr.b_datacnt -= 1; 4410 arc_cksum_verify(buf); 4411#ifdef illumos 4412 arc_buf_unwatch(buf); 4413#endif 4414 4415 mutex_exit(hash_lock); 4416 4417 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4418 nhdr->b_size = blksz; 4419 nhdr->b_spa = spa; 4420 4421 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4422 nhdr->b_flags |= arc_bufc_to_flags(type); 4423 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4424 4425 nhdr->b_l1hdr.b_buf = buf; 4426 nhdr->b_l1hdr.b_datacnt = 1; 4427 nhdr->b_l1hdr.b_state = arc_anon; 4428 nhdr->b_l1hdr.b_arc_access = 0; 4429 nhdr->b_freeze_cksum = NULL; 4430 4431 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4432 buf->b_hdr = nhdr; 4433 mutex_exit(&buf->b_evict_lock); 4434 atomic_add_64(&arc_anon->arcs_size, blksz); 4435 } else { 4436 mutex_exit(&buf->b_evict_lock); 4437 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4438 /* protected by hash lock */ 4439 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4440 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4441 arc_change_state(arc_anon, hdr, hash_lock); 4442 hdr->b_l1hdr.b_arc_access = 0; 4443 mutex_exit(hash_lock); 4444 4445 buf_discard_identity(hdr); 4446 arc_buf_thaw(buf); 4447 } 4448 buf->b_efunc = NULL; 4449 buf->b_private = NULL; 4450} 4451 4452int 4453arc_released(arc_buf_t *buf) 4454{ 4455 int released; 4456 4457 mutex_enter(&buf->b_evict_lock); 4458 released = (buf->b_data != NULL && 4459 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4460 mutex_exit(&buf->b_evict_lock); 4461 return (released); 4462} 4463 4464#ifdef ZFS_DEBUG 4465int 4466arc_referenced(arc_buf_t *buf) 4467{ 4468 int referenced; 4469 4470 mutex_enter(&buf->b_evict_lock); 4471 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4472 mutex_exit(&buf->b_evict_lock); 4473 return (referenced); 4474} 4475#endif 4476 4477static void 4478arc_write_ready(zio_t *zio) 4479{ 4480 arc_write_callback_t *callback = zio->io_private; 4481 arc_buf_t *buf = callback->awcb_buf; 4482 arc_buf_hdr_t *hdr = buf->b_hdr; 4483 4484 ASSERT(HDR_HAS_L1HDR(hdr)); 4485 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4486 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4487 callback->awcb_ready(zio, buf, callback->awcb_private); 4488 4489 /* 4490 * If the IO is already in progress, then this is a re-write 4491 * attempt, so we need to thaw and re-compute the cksum. 4492 * It is the responsibility of the callback to handle the 4493 * accounting for any re-write attempt. 4494 */ 4495 if (HDR_IO_IN_PROGRESS(hdr)) { 4496 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4497 if (hdr->b_freeze_cksum != NULL) { 4498 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4499 hdr->b_freeze_cksum = NULL; 4500 } 4501 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4502 } 4503 arc_cksum_compute(buf, B_FALSE); 4504 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4505} 4506 4507/* 4508 * The SPA calls this callback for each physical write that happens on behalf 4509 * of a logical write. See the comment in dbuf_write_physdone() for details. 
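 *
 * For a single arc_write() zio the callbacks fire in this order:
 * awcb_ready once when the data is finalized, awcb_physdone after
 * each physical write completes, and awcb_done once when the whole
 * logical write completes.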
4510 */ 4511static void 4512arc_write_physdone(zio_t *zio) 4513{ 4514 arc_write_callback_t *cb = zio->io_private; 4515 if (cb->awcb_physdone != NULL) 4516 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4517} 4518 4519static void 4520arc_write_done(zio_t *zio) 4521{ 4522 arc_write_callback_t *callback = zio->io_private; 4523 arc_buf_t *buf = callback->awcb_buf; 4524 arc_buf_hdr_t *hdr = buf->b_hdr; 4525 4526 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4527 4528 if (zio->io_error == 0) { 4529 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4530 buf_discard_identity(hdr); 4531 } else { 4532 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4533 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4534 } 4535 } else { 4536 ASSERT(BUF_EMPTY(hdr)); 4537 } 4538 4539 /* 4540 * If the block to be written was all-zero or compressed enough to be 4541 * embedded in the BP, no write was performed so there will be no 4542 * dva/birth/checksum. The buffer must therefore remain anonymous 4543 * (and uncached). 4544 */ 4545 if (!BUF_EMPTY(hdr)) { 4546 arc_buf_hdr_t *exists; 4547 kmutex_t *hash_lock; 4548 4549 ASSERT(zio->io_error == 0); 4550 4551 arc_cksum_verify(buf); 4552 4553 exists = buf_hash_insert(hdr, &hash_lock); 4554 if (exists != NULL) { 4555 /* 4556 * This can only happen if we overwrite for 4557 * sync-to-convergence, because we remove 4558 * buffers from the hash table when we arc_free(). 4559 */ 4560 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4561 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4562 panic("bad overwrite, hdr=%p exists=%p", 4563 (void *)hdr, (void *)exists); 4564 ASSERT(refcount_is_zero( 4565 &exists->b_l1hdr.b_refcnt)); 4566 arc_change_state(arc_anon, exists, hash_lock); 4567 mutex_exit(hash_lock); 4568 arc_hdr_destroy(exists); 4569 exists = buf_hash_insert(hdr, &hash_lock); 4570 ASSERT3P(exists, ==, NULL); 4571 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4572 /* nopwrite */ 4573 ASSERT(zio->io_prop.zp_nopwrite); 4574 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4575 panic("bad nopwrite, hdr=%p exists=%p", 4576 (void *)hdr, (void *)exists); 4577 } else { 4578 /* Dedup */ 4579 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4580 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4581 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4582 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4583 } 4584 } 4585 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4586 /* if it's not anon, we are doing a scrub */ 4587 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4588 arc_access(hdr, hash_lock); 4589 mutex_exit(hash_lock); 4590 } else { 4591 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4592 } 4593 4594 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4595 callback->awcb_done(zio, buf, callback->awcb_private); 4596 4597 kmem_free(callback, sizeof (arc_write_callback_t)); 4598} 4599 4600zio_t * 4601arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4602 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4603 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4604 arc_done_func_t *done, void *private, zio_priority_t priority, 4605 int zio_flags, const zbookmark_phys_t *zb) 4606{ 4607 arc_buf_hdr_t *hdr = buf->b_hdr; 4608 arc_write_callback_t *callback; 4609 zio_t *zio; 4610 4611 ASSERT(ready != NULL); 4612 ASSERT(done != NULL); 4613 ASSERT(!HDR_IO_ERROR(hdr)); 4614 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4615 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4616 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4617 if (l2arc) 4618 hdr->b_flags |= ARC_FLAG_L2CACHE; 4619 if (l2arc_compress) 4620 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 
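	/*
	 * Bundle the caller's callbacks and buffer into one record that
	 * zio_write() will hand back to arc_write_ready(),
	 * arc_write_physdone() and arc_write_done() via io_private.
	 */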
4621 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4622 callback->awcb_ready = ready; 4623 callback->awcb_physdone = physdone; 4624 callback->awcb_done = done; 4625 callback->awcb_private = private; 4626 callback->awcb_buf = buf; 4627 4628 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4629 arc_write_ready, arc_write_physdone, arc_write_done, callback, 4630 priority, zio_flags, zb); 4631 4632 return (zio); 4633} 4634 4635static int 4636arc_memory_throttle(uint64_t reserve, uint64_t txg) 4637{ 4638#ifdef _KERNEL 4639 uint64_t available_memory = ptob(freemem); 4640 static uint64_t page_load = 0; 4641 static uint64_t last_txg = 0; 4642 4643#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4644 available_memory = 4645 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4646#endif 4647 4648 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 4649 return (0); 4650 4651 if (txg > last_txg) { 4652 last_txg = txg; 4653 page_load = 0; 4654 } 4655 /* 4656 * If we are in pageout, we know that memory is already tight, 4657 * the arc is already going to be evicting, so we just want to 4658 * continue to let page writes occur as quickly as possible. 4659 */ 4660 if (curproc == pageproc) { 4661 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4662 return (SET_ERROR(ERESTART)); 4663 /* Note: reserve is inflated, so we deflate */ 4664 page_load += reserve / 8; 4665 return (0); 4666 } else if (page_load > 0 && arc_reclaim_needed()) { 4667 /* memory is low, delay before restarting */ 4668 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4669 return (SET_ERROR(EAGAIN)); 4670 } 4671 page_load = 0; 4672#endif 4673 return (0); 4674} 4675 4676static void 4677arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4678 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4679{ 4680 size->value.ui64 = state->arcs_size; 4681 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4682 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 4683} 4684 4685static int 4686arc_kstat_update(kstat_t *ksp, int rw) 4687{ 4688 arc_stats_t *as = ksp->ks_data; 4689 4690 if (rw == KSTAT_WRITE) { 4691 return (EACCES); 4692 } else { 4693 arc_kstat_update_state(arc_anon, 4694 &as->arcstat_anon_size, 4695 &as->arcstat_anon_evictable_data, 4696 &as->arcstat_anon_evictable_metadata); 4697 arc_kstat_update_state(arc_mru, 4698 &as->arcstat_mru_size, 4699 &as->arcstat_mru_evictable_data, 4700 &as->arcstat_mru_evictable_metadata); 4701 arc_kstat_update_state(arc_mru_ghost, 4702 &as->arcstat_mru_ghost_size, 4703 &as->arcstat_mru_ghost_evictable_data, 4704 &as->arcstat_mru_ghost_evictable_metadata); 4705 arc_kstat_update_state(arc_mfu, 4706 &as->arcstat_mfu_size, 4707 &as->arcstat_mfu_evictable_data, 4708 &as->arcstat_mfu_evictable_metadata); 4709 arc_kstat_update_state(arc_mfu_ghost, 4710 &as->arcstat_mfu_ghost_size, 4711 &as->arcstat_mfu_ghost_evictable_data, 4712 &as->arcstat_mfu_ghost_evictable_metadata); 4713 } 4714 4715 return (0); 4716} 4717 4718void 4719arc_tempreserve_clear(uint64_t reserve) 4720{ 4721 atomic_add_64(&arc_tempreserve, -reserve); 4722 ASSERT((int64_t)arc_tempreserve >= 0); 4723} 4724 4725int 4726arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4727{ 4728 int error; 4729 uint64_t anon_size; 4730 4731 if (reserve > arc_c/4 && !arc_no_grow) { 4732 arc_c = MIN(arc_c_max, reserve * 4); 4733 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4734 } 4735 if (reserve > arc_c) 4736 return (SET_ERROR(ENOMEM)); 4737 4738 /* 4739 * 
Don't count loaned bufs as in-flight dirty data to prevent long
4740	 * network delays from blocking transactions that are ready to be
4741	 * assigned to a txg.
4742	 */
4743	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4744
4745	/*
4746	 * Writes will almost always require additional memory allocations
4747	 * in order to compress/encrypt/etc. the data.  We therefore need to
4748	 * make sure that there is sufficient available memory for this.
4749	 */
4750	error = arc_memory_throttle(reserve, txg);
4751	if (error != 0)
4752		return (error);
4753
4754	/*
4755	 * Throttle writes when the amount of dirty data in the cache
4756	 * gets too large.  We try to keep the cache less than half full
4757	 * of dirty blocks so that our sync times don't grow too large.
4758	 * Note: if two requests come in concurrently, we might let them
4759	 * both succeed, when one of them should fail.  Not a huge deal.
4760	 */
4761
4762	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4763	    anon_size > arc_c / 4) {
4764		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4765		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4766		    arc_tempreserve>>10,
4767		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4768		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4769		    reserve>>10, arc_c>>10);
4770		return (SET_ERROR(ERESTART));
4771	}
4772	atomic_add_64(&arc_tempreserve, reserve);
4773	return (0);
4774	}
4775
4776	static kmutex_t arc_lowmem_lock;
4777	#ifdef _KERNEL
4778	static eventhandler_tag arc_event_lowmem = NULL;
4779
4780	static void
4781	arc_lowmem(void *arg __unused, int howto __unused)
4782	{
4783
4784		/* Serialize access via arc_lowmem_lock. */
4785		mutex_enter(&arc_lowmem_lock);
4786		mutex_enter(&arc_reclaim_thr_lock);
4787		needfree = 1;
4788		DTRACE_PROBE(arc__needfree);
4789		cv_signal(&arc_reclaim_thr_cv);
4790
4791		/*
4792		 * It is unsafe to block here in arbitrary threads, because we can come
4793		 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4794		 * with the ARC reclaim thread.
4795		 */
4796		if (curproc == pageproc) {
4797			while (needfree)
4798				msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4799		}
4800		mutex_exit(&arc_reclaim_thr_lock);
4801		mutex_exit(&arc_lowmem_lock);
4802	}
4803	#endif
4804
4805	void
4806	arc_init(void)
4807	{
4808		int i, prefetch_tunable_set = 0;
4809
4810		mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4811		cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4812		mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4813
4814		/* Convert seconds to clock ticks */
4815		arc_min_prefetch_lifespan = 1 * hz;
4816
4817		/* Start out with 1/8 of all memory */
4818		arc_c = kmem_size() / 8;
4819
4820	#ifdef illumos
4821	#ifdef _KERNEL
4822		/*
4823		 * On architectures where the physical memory can be larger
4824		 * than the addressable space (Intel in 32-bit mode), we may
4825		 * need to limit the cache to 1/8 of VM size.
4826		 */
4827		arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4828	#endif
4829	#endif	/* illumos */
4830		/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4831		arc_c_min = MAX(arc_c / 4, 16 << 20);
4832		/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
4833		if (arc_c * 8 >= 1 << 30)
4834			arc_c_max = (arc_c * 8) - (1 << 30);
4835		else
4836			arc_c_max = arc_c_min;
4837		arc_c_max = MAX(arc_c * 5, arc_c_max);
4838
4839	#ifdef _KERNEL
4840		/*
4841		 * Allow the tunables to override our calculations if they are
4842		 * reasonable (i.e.
over 16MB)
4843		 */
4844		if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size())
4845			arc_c_max = zfs_arc_max;
4846		if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max)
4847			arc_c_min = zfs_arc_min;
4848	#endif
4849
4850		arc_c = arc_c_max;
4851		arc_p = (arc_c >> 1);
4852
4853		/* limit meta-data to 1/4 of the arc capacity */
4854		arc_meta_limit = arc_c_max / 4;
4855
4856		/* Allow the tunable to override if it is reasonable */
4857		if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4858			arc_meta_limit = zfs_arc_meta_limit;
4859
4860		if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4861			arc_c_min = arc_meta_limit / 2;
4862
4863		if (zfs_arc_meta_min > 0) {
4864			arc_meta_min = zfs_arc_meta_min;
4865		} else {
4866			arc_meta_min = arc_c_min / 2;
4867		}
4868
4869		if (zfs_arc_grow_retry > 0)
4870			arc_grow_retry = zfs_arc_grow_retry;
4871
4872		if (zfs_arc_shrink_shift > 0)
4873			arc_shrink_shift = zfs_arc_shrink_shift;
4874
4875		if (zfs_arc_p_min_shift > 0)
4876			arc_p_min_shift = zfs_arc_p_min_shift;
4877
4878		/* if kmem_flags are set, let's try to use less memory */
4879		if (kmem_debugging())
4880			arc_c = arc_c / 2;
4881		if (arc_c < arc_c_min)
4882			arc_c = arc_c_min;
4883
4884		zfs_arc_min = arc_c_min;
4885		zfs_arc_max = arc_c_max;
4886
4887		arc_anon = &ARC_anon;
4888		arc_mru = &ARC_mru;
4889		arc_mru_ghost = &ARC_mru_ghost;
4890		arc_mfu = &ARC_mfu;
4891		arc_mfu_ghost = &ARC_mfu_ghost;
4892		arc_l2c_only = &ARC_l2c_only;
4893		arc_size = 0;
4894
4895		for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4896			mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4897			    NULL, MUTEX_DEFAULT, NULL);
4898			mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4899			    NULL, MUTEX_DEFAULT, NULL);
4900			mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4901			    NULL, MUTEX_DEFAULT, NULL);
4902			mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4903			    NULL, MUTEX_DEFAULT, NULL);
4904			mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4905			    NULL, MUTEX_DEFAULT, NULL);
4906			mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4907			    NULL, MUTEX_DEFAULT, NULL);
4908
4909			list_create(&arc_mru->arcs_lists[i],
4910			    sizeof (arc_buf_hdr_t),
4911			    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4912			list_create(&arc_mru_ghost->arcs_lists[i],
4913			    sizeof (arc_buf_hdr_t),
4914			    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4915			list_create(&arc_mfu->arcs_lists[i],
4916			    sizeof (arc_buf_hdr_t),
4917			    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4918			list_create(&arc_mfu_ghost->arcs_lists[i],
4919			    sizeof (arc_buf_hdr_t),
4920			    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4924			list_create(&arc_l2c_only->arcs_lists[i],
4925			    sizeof (arc_buf_hdr_t),
4926			    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4927		}
4928
4929		buf_init();
4930
4931		arc_thread_exit = 0;
4932		arc_eviction_list = NULL;
4933		mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4934		bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4935
4936		arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4937		    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4938
4939		if (arc_ksp != NULL) {
4940			arc_ksp->ks_data = &arc_stats;
4941			arc_ksp->ks_update = arc_kstat_update;
4942			kstat_install(arc_ksp);
4943		}
4944
4945		(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4946		    TS_RUN, minclsyspri);
4947
4948	#ifdef _KERNEL
4949		arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4950		    EVENTHANDLER_PRI_FIRST);
4951	#endif
4952 4953 arc_dead = FALSE; 4954 arc_warm = B_FALSE; 4955 4956 /* 4957 * Calculate maximum amount of dirty data per pool. 4958 * 4959 * If it has been set by /etc/system, take that. 4960 * Otherwise, use a percentage of physical memory defined by 4961 * zfs_dirty_data_max_percent (default 10%) with a cap at 4962 * zfs_dirty_data_max_max (default 4GB). 4963 */ 4964 if (zfs_dirty_data_max == 0) { 4965 zfs_dirty_data_max = ptob(physmem) * 4966 zfs_dirty_data_max_percent / 100; 4967 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4968 zfs_dirty_data_max_max); 4969 } 4970 4971#ifdef _KERNEL 4972 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4973 prefetch_tunable_set = 1; 4974 4975#ifdef __i386__ 4976 if (prefetch_tunable_set == 0) { 4977 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4978 "-- to enable,\n"); 4979 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4980 "to /boot/loader.conf.\n"); 4981 zfs_prefetch_disable = 1; 4982 } 4983#else 4984 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4985 prefetch_tunable_set == 0) { 4986 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4987 "than 4GB of RAM is present;\n" 4988 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4989 "to /boot/loader.conf.\n"); 4990 zfs_prefetch_disable = 1; 4991 } 4992#endif 4993 /* Warn about ZFS memory and address space requirements. */ 4994 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4995 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4996 "expect unstable behavior.\n"); 4997 } 4998 if (kmem_size() < 512 * (1 << 20)) { 4999 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 5000 "expect unstable behavior.\n"); 5001 printf(" Consider tuning vm.kmem_size and " 5002 "vm.kmem_size_max\n"); 5003 printf(" in /boot/loader.conf.\n"); 5004 } 5005#endif 5006} 5007 5008void 5009arc_fini(void) 5010{ 5011 int i; 5012 5013 mutex_enter(&arc_reclaim_thr_lock); 5014 arc_thread_exit = 1; 5015 cv_signal(&arc_reclaim_thr_cv); 5016 while (arc_thread_exit != 0) 5017 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 5018 mutex_exit(&arc_reclaim_thr_lock); 5019 5020 arc_flush(NULL); 5021 5022 arc_dead = TRUE; 5023 5024 if (arc_ksp != NULL) { 5025 kstat_delete(arc_ksp); 5026 arc_ksp = NULL; 5027 } 5028 5029 mutex_destroy(&arc_eviction_mtx); 5030 mutex_destroy(&arc_reclaim_thr_lock); 5031 cv_destroy(&arc_reclaim_thr_cv); 5032 5033 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 5034 list_destroy(&arc_mru->arcs_lists[i]); 5035 list_destroy(&arc_mru_ghost->arcs_lists[i]); 5036 list_destroy(&arc_mfu->arcs_lists[i]); 5037 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 5038 list_destroy(&arc_l2c_only->arcs_lists[i]); 5039 5040 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 5041 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 5042 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 5043 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 5044 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 5045 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 5046 } 5047 5048 buf_fini(); 5049 5050 ASSERT0(arc_loaned_bytes); 5051 5052 mutex_destroy(&arc_lowmem_lock); 5053#ifdef _KERNEL 5054 if (arc_event_lowmem != NULL) 5055 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 5056#endif 5057} 5058 5059/* 5060 * Level 2 ARC 5061 * 5062 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 5063 * It uses dedicated storage devices to hold cached data, which are populated 5064 * using large infrequent writes. 
The main role of this cache is to boost 5065 * the performance of random read workloads. The intended L2ARC devices 5066 * include short-stroked disks, solid state disks, and other media with 5067 * substantially faster read latency than disk. 5068 * 5069 * +-----------------------+ 5070 * | ARC | 5071 * +-----------------------+ 5072 * | ^ ^ 5073 * | | | 5074 * l2arc_feed_thread() arc_read() 5075 * | | | 5076 * | l2arc read | 5077 * V | | 5078 * +---------------+ | 5079 * | L2ARC | | 5080 * +---------------+ | 5081 * | ^ | 5082 * l2arc_write() | | 5083 * | | | 5084 * V | | 5085 * +-------+ +-------+ 5086 * | vdev | | vdev | 5087 * | cache | | cache | 5088 * +-------+ +-------+ 5089 * +=========+ .-----. 5090 * : L2ARC : |-_____-| 5091 * : devices : | Disks | 5092 * +=========+ `-_____-' 5093 * 5094 * Read requests are satisfied from the following sources, in order: 5095 * 5096 * 1) ARC 5097 * 2) vdev cache of L2ARC devices 5098 * 3) L2ARC devices 5099 * 4) vdev cache of disks 5100 * 5) disks 5101 * 5102 * Some L2ARC device types exhibit extremely slow write performance. 5103 * To accommodate for this there are some significant differences between 5104 * the L2ARC and traditional cache design: 5105 * 5106 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5107 * the ARC behave as usual, freeing buffers and placing headers on ghost 5108 * lists. The ARC does not send buffers to the L2ARC during eviction as 5109 * this would add inflated write latencies for all ARC memory pressure. 5110 * 5111 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5112 * It does this by periodically scanning buffers from the eviction-end of 5113 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5114 * not already there. It scans until a headroom of buffers is satisfied, 5115 * which itself is a buffer for ARC eviction. If a compressible buffer is 5116 * found during scanning and selected for writing to an L2ARC device, we 5117 * temporarily boost scanning headroom during the next scan cycle to make 5118 * sure we adapt to compression effects (which might significantly reduce 5119 * the data volume we write to L2ARC). The thread that does this is 5120 * l2arc_feed_thread(), illustrated below; example sizes are included to 5121 * provide a better sense of ratio than this diagram: 5122 * 5123 * head --> tail 5124 * +---------------------+----------+ 5125 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5126 * +---------------------+----------+ | o L2ARC eligible 5127 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5128 * +---------------------+----------+ | 5129 * 15.9 Gbytes ^ 32 Mbytes | 5130 * headroom | 5131 * l2arc_feed_thread() 5132 * | 5133 * l2arc write hand <--[oooo]--' 5134 * | 8 Mbyte 5135 * | write max 5136 * V 5137 * +==============================+ 5138 * L2ARC dev |####|#|###|###| |####| ... | 5139 * +==============================+ 5140 * 32 Gbytes 5141 * 5142 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5143 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5144 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5145 * safe to say that this is an uncommon case, since buffers at the end of 5146 * the ARC lists have moved there due to inactivity. 5147 * 5148 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5149 * then the L2ARC simply misses copying some buffers. 
This serves as a 5150 * pressure valve to prevent heavy read workloads from both stalling the ARC 5151 * with waits and clogging the L2ARC with writes. This also helps prevent 5152 * the potential for the L2ARC to churn if it attempts to cache content too 5153 * quickly, such as during backups of the entire pool. 5154 * 5155 * 5. After system boot and before the ARC has filled main memory, there are 5156 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5157 * lists can remain mostly static. Instead of searching from tail of these 5158 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5159 * for eligible buffers, greatly increasing its chance of finding them. 5160 * 5161 * The L2ARC device write speed is also boosted during this time so that 5162 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5163 * there are no L2ARC reads, and no fear of degrading read performance 5164 * through increased writes. 5165 * 5166 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5167 * the vdev queue can aggregate them into larger and fewer writes. Each 5168 * device is written to in a rotor fashion, sweeping writes through 5169 * available space then repeating. 5170 * 5171 * 7. The L2ARC does not store dirty content. It never needs to flush 5172 * write buffers back to disk based storage. 5173 * 5174 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5175 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5176 * 5177 * The performance of the L2ARC can be tweaked by a number of tunables, which 5178 * may be necessary for different workloads: 5179 * 5180 * l2arc_write_max max write bytes per interval 5181 * l2arc_write_boost extra write bytes during device warmup 5182 * l2arc_noprefetch skip caching prefetched buffers 5183 * l2arc_headroom number of max device writes to precache 5184 * l2arc_headroom_boost when we find compressed buffers during ARC 5185 * scanning, we multiply headroom by this 5186 * percentage factor for the next scan cycle, 5187 * since more compressed buffers are likely to 5188 * be present 5189 * l2arc_feed_secs seconds between L2ARC writing 5190 * 5191 * Tunables may be removed or added as future performance improvements are 5192 * integrated, and also may become zpool properties. 5193 * 5194 * There are three key functions that control how the L2ARC warms up: 5195 * 5196 * l2arc_write_eligible() check if a buffer is eligible to cache 5197 * l2arc_write_size() calculate how much to write 5198 * l2arc_write_interval() calculate sleep delay between writes 5199 * 5200 * These three functions determine what to write, how much, and how quickly 5201 * to send writes. 5202 */ 5203 5204static boolean_t 5205l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5206{ 5207 /* 5208 * A buffer is *not* eligible for the L2ARC if it: 5209 * 1. belongs to a different spa. 5210 * 2. is already cached on the L2ARC. 5211 * 3. has an I/O in progress (it may be an incomplete read). 5212 * 4. is flagged not eligible (zfs property). 
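	 *
	 * Each rejection below bumps a matching arcstat counter, so a
	 * kstat dump of arcstats shows which of these four tests is
	 * filtering out candidate buffers.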
5213 */ 5214 if (hdr->b_spa != spa_guid) { 5215 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5216 return (B_FALSE); 5217 } 5218 if (HDR_HAS_L2HDR(hdr)) { 5219 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5220 return (B_FALSE); 5221 } 5222 if (HDR_IO_IN_PROGRESS(hdr)) { 5223 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5224 return (B_FALSE); 5225 } 5226 if (!HDR_L2CACHE(hdr)) { 5227 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5228 return (B_FALSE); 5229 } 5230 5231 return (B_TRUE); 5232} 5233 5234static uint64_t 5235l2arc_write_size(void) 5236{ 5237 uint64_t size; 5238 5239 /* 5240 * Make sure our globals have meaningful values in case the user 5241 * altered them. 5242 */ 5243 size = l2arc_write_max; 5244 if (size == 0) { 5245 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5246 "be greater than zero, resetting it to the default (%d)", 5247 L2ARC_WRITE_SIZE); 5248 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5249 } 5250 5251 if (arc_warm == B_FALSE) 5252 size += l2arc_write_boost; 5253 5254 return (size); 5255 5256} 5257 5258static clock_t 5259l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5260{ 5261 clock_t interval, next, now; 5262 5263 /* 5264 * If the ARC lists are busy, increase our write rate; if the 5265 * lists are stale, idle back. This is achieved by checking 5266 * how much we previously wrote - if it was more than half of 5267 * what we wanted, schedule the next write much sooner. 5268 */ 5269 if (l2arc_feed_again && wrote > (wanted / 2)) 5270 interval = (hz * l2arc_feed_min_ms) / 1000; 5271 else 5272 interval = hz * l2arc_feed_secs; 5273 5274 now = ddi_get_lbolt(); 5275 next = MAX(now, MIN(now + interval, began + interval)); 5276 5277 return (next); 5278} 5279 5280/* 5281 * Cycle through L2ARC devices. This is how L2ARC load balances. 5282 * If a device is returned, this also returns holding the spa config lock. 5283 */ 5284static l2arc_dev_t * 5285l2arc_dev_get_next(void) 5286{ 5287 l2arc_dev_t *first, *next = NULL; 5288 5289 /* 5290 * Lock out the removal of spas (spa_namespace_lock), then removal 5291 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5292 * both locks will be dropped and a spa config lock held instead. 5293 */ 5294 mutex_enter(&spa_namespace_lock); 5295 mutex_enter(&l2arc_dev_mtx); 5296 5297 /* if there are no vdevs, there is nothing to do */ 5298 if (l2arc_ndev == 0) 5299 goto out; 5300 5301 first = NULL; 5302 next = l2arc_dev_last; 5303 do { 5304 /* loop around the list looking for a non-faulted vdev */ 5305 if (next == NULL) { 5306 next = list_head(l2arc_dev_list); 5307 } else { 5308 next = list_next(l2arc_dev_list, next); 5309 if (next == NULL) 5310 next = list_head(l2arc_dev_list); 5311 } 5312 5313 /* if we have come back to the start, bail out */ 5314 if (first == NULL) 5315 first = next; 5316 else if (next == first) 5317 break; 5318 5319 } while (vdev_is_dead(next->l2ad_vdev)); 5320 5321 /* if we were unable to find any usable vdevs, return NULL */ 5322 if (vdev_is_dead(next->l2ad_vdev)) 5323 next = NULL; 5324 5325 l2arc_dev_last = next; 5326 5327out: 5328 mutex_exit(&l2arc_dev_mtx); 5329 5330 /* 5331 * Grab the config lock to prevent the 'next' device from being 5332 * removed while we are writing to it. 5333 */ 5334 if (next != NULL) 5335 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5336 mutex_exit(&spa_namespace_lock); 5337 5338 return (next); 5339} 5340 5341/* 5342 * Free buffers that were tagged for destruction. 
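 *
 * Data buffers that are freed while an in-flight L2ARC write still
 * references them cannot be released immediately; they are queued on
 * l2arc_free_on_write instead, and this function drains that queue
 * after the device write completes (it is called from
 * l2arc_write_done()).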
5343 */ 5344static void 5345l2arc_do_free_on_write() 5346{ 5347 list_t *buflist; 5348 l2arc_data_free_t *df, *df_prev; 5349 5350 mutex_enter(&l2arc_free_on_write_mtx); 5351 buflist = l2arc_free_on_write; 5352 5353 for (df = list_tail(buflist); df; df = df_prev) { 5354 df_prev = list_prev(buflist, df); 5355 ASSERT(df->l2df_data != NULL); 5356 ASSERT(df->l2df_func != NULL); 5357 df->l2df_func(df->l2df_data, df->l2df_size); 5358 list_remove(buflist, df); 5359 kmem_free(df, sizeof (l2arc_data_free_t)); 5360 } 5361 5362 mutex_exit(&l2arc_free_on_write_mtx); 5363} 5364 5365/* 5366 * A write to a cache device has completed. Update all headers to allow 5367 * reads from these buffers to begin. 5368 */ 5369static void 5370l2arc_write_done(zio_t *zio) 5371{ 5372 l2arc_write_callback_t *cb; 5373 l2arc_dev_t *dev; 5374 list_t *buflist; 5375 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5376 kmutex_t *hash_lock; 5377 int64_t bytes_dropped = 0; 5378 5379 cb = zio->io_private; 5380 ASSERT(cb != NULL); 5381 dev = cb->l2wcb_dev; 5382 ASSERT(dev != NULL); 5383 head = cb->l2wcb_head; 5384 ASSERT(head != NULL); 5385 buflist = &dev->l2ad_buflist; 5386 ASSERT(buflist != NULL); 5387 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5388 l2arc_write_callback_t *, cb); 5389 5390 if (zio->io_error != 0) 5391 ARCSTAT_BUMP(arcstat_l2_writes_error); 5392 5393 mutex_enter(&dev->l2ad_mtx); 5394 5395 /* 5396 * All writes completed, or an error was hit. 5397 */ 5398 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5399 hdr_prev = list_prev(buflist, hdr); 5400 5401 hash_lock = HDR_LOCK(hdr); 5402 if (!mutex_tryenter(hash_lock)) { 5403 /* 5404 * This buffer misses out. It may be in a stage 5405 * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5406 * left set, denying reads to this buffer. 5407 */ 5408 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5409 continue; 5410 } 5411 5412 /* 5413 * It's possible that this buffer got evicted from the L1 cache 5414 * before we grabbed the vdev + hash locks, in which case 5415 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5416 * Only free the buffer if we still have an L1 hdr. 5417 */ 5418 if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5419 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5420 l2arc_release_cdata_buf(hdr); 5421 5422 if (zio->io_error != 0) { 5423 /* 5424 * Error - drop L2ARC entry. 5425 */ 5426 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5427 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5428 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5429 5430 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5431 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5432 5433 bytes_dropped += hdr->b_l2hdr.b_asize; 5434 (void) refcount_remove_many(&dev->l2ad_alloc, 5435 hdr->b_l2hdr.b_asize, hdr); 5436 } 5437 5438 /* 5439 * Allow ARC to begin reads to this L2ARC entry. 5440 */ 5441 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5442 5443 mutex_exit(hash_lock); 5444 } 5445 5446 atomic_inc_64(&l2arc_writes_done); 5447 list_remove(buflist, head); 5448 ASSERT(!HDR_HAS_L1HDR(head)); 5449 kmem_cache_free(hdr_l2only_cache, head); 5450 mutex_exit(&dev->l2ad_mtx); 5451 5452 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5453 5454 l2arc_do_free_on_write(); 5455 5456 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5457} 5458 5459/* 5460 * A read to a cache device completed. Validate buffer contents before 5461 * handing over to the regular ARC routines. 
/*
 * A read to a cache device completed. Validate buffer contents before
 * handing over to the regular ARC routines.
 */
static void
l2arc_read_done(zio_t *zio)
{
	l2arc_read_callback_t *cb;
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	int equal;

	ASSERT(zio->io_vd != NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	cb = zio->io_private;
	ASSERT(cb != NULL);
	buf = cb->l2rcb_buf;
	ASSERT(buf != NULL);

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the buffer was compressed, decompress it first.
	 */
	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
	ASSERT(zio->io_data != NULL);

	/*
	 * Check that this buffer survived the L2ARC journey.
	 */
	equal = arc_cksum_equal(buf);
	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
		mutex_exit(hash_lock);
		zio->io_private = buf;
		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0 */
		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0 */
		arc_read_done(zio);
	} else {
		mutex_exit(hash_lock);
		/*
		 * Buffer didn't survive caching. Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			zio->io_error = SET_ERROR(EIO);
		}
		if (!equal)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now. If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
			    buf->b_data, zio->io_size, arc_read_done, buf,
			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}
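/*
 * Editorial summary of the outcome matrix in l2arc_read_done() above:
 *
 *	cksum ok    io_error    L2_EVICTED	action
 *	yes         0           no		hand off to arc_read_done()
 *	no          0           any		set EIO; reissue to main pool
 *	any         != 0        any		count l2_io_error; reissue
 *	any         0           yes		reissue to main pool
 *
 * The reissue is immediate (zio_nowait) only when there is no waiter;
 * otherwise the waiter must reissue in a context where it is safe to
 * block.
 */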
/*
 * This is the list priority from which the L2ARC will search for pages to
 * cache. This is used within loops (0 through 2 * ARC_BUFC_NUMLISTS - 1)
 * to cycle through lists in the desired order. This order can have a
 * significant effect on cache performance.
 *
 * Currently the metadata lists are hit first, MFU then MRU, followed by
 * the data lists. This function returns a locked list, and also returns
 * the lock pointer.
 */
static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
	list_t *list = NULL;
	int idx;

	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);

	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
		idx = list_num;
		list = &arc_mfu->arcs_lists[idx];
		*lock = ARCS_LOCK(arc_mfu, idx);
	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
		list = &arc_mru->arcs_lists[idx];
		*lock = ARCS_LOCK(arc_mru, idx);
	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
	    ARC_BUFC_NUMDATALISTS)) {
		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
		list = &arc_mfu->arcs_lists[idx];
		*lock = ARCS_LOCK(arc_mfu, idx);
	} else {
		idx = list_num - ARC_BUFC_NUMLISTS;
		list = &arc_mru->arcs_lists[idx];
		*lock = ARCS_LOCK(arc_mru, idx);
	}

	ASSERT(!(MUTEX_HELD(*lock)));
	mutex_enter(*lock);
	return (list);
}
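/*
 * Editorial worked example of the list_num mapping above, assuming the
 * usual split where the first ARC_BUFC_NUMMETADATALISTS entries of
 * arcs_lists[] are metadata lists and the remaining
 * ARC_BUFC_NUMDATALISTS are data lists:
 *
 *	list_num range						lists walked
 *	[0, NUMMETADATALISTS)					arc_mfu metadata
 *	[NUMMETADATALISTS, 2 * NUMMETADATALISTS)		arc_mru metadata
 *	[2 * NUMMETADATALISTS, 2 * NUMMETADATALISTS		arc_mfu data
 *	    + NUMDATALISTS)
 *	[2 * NUMMETADATALISTS + NUMDATALISTS,			arc_mru data
 *	    2 * ARC_BUFC_NUMLISTS)
 */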
/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes. This distance may span populated buffers, or it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	arc_buf_hdr_t *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;

	buflist = &dev->l2ad_buflist;

	if (!all && dev->l2ad_first) {
		/*
		 * This is the first sweep through the device. There is
		 * nothing to evict.
		 */
		return;
	}

	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
		/*
		 * When nearing the end of the device, evict to the end
		 * before the device write hand jumps to the start.
		 */
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock. Retry.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		if (HDR_L2_WRITE_HEAD(hdr)) {
			/*
			 * We hit a write head node. Leave it for
			 * l2arc_write_done().
			 */
			list_remove(buflist, hdr);
			mutex_exit(hash_lock);
			continue;
		}

		if (!all && HDR_HAS_L2HDR(hdr) &&
		    (hdr->b_l2hdr.b_daddr > taddr ||
		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		ASSERT(HDR_HAS_L2HDR(hdr));
		if (!HDR_HAS_L1HDR(hdr)) {
			ASSERT(!HDR_L2_READING(hdr));
			/*
			 * This doesn't exist in the ARC. Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_size.
			 */
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		} else {
			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(hdr)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
			}

			arc_hdr_l2hdr_destroy(hdr);
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&dev->l2ad_mtx);
}
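/*
 * Editorial sketch (not compiled): the target-address computation used
 * by l2arc_evict() above. Near the end of the device we clear all the
 * way to l2ad_end, since the hand is about to wrap; the "2 * distance"
 * slack keeps a full write's worth of clean space beyond the wrap
 * point.
 */
#if 0
static uint64_t
evict_target(uint64_t hand, uint64_t end, uint64_t distance)
{
	if (hand >= end - 2 * distance)
		return (end);		/* wrap imminent: clear to the end */
	return (hand + distance);	/* steady state: clear ahead */
}
#endif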
/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 * The headroom_boost is an in-out parameter used to maintain headroom boost
 * state between calls to this function.
 *
 * Returns the number of bytes actually written (which may be smaller than
 * the delta by which the device hand has changed due to alignment).
 */
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
    boolean_t *headroom_boost)
{
	arc_buf_hdr_t *hdr, *hdr_prev, *head;
	list_t *list;
	uint64_t write_asize, write_psize, write_sz, headroom,
	    buf_compress_minsz;
	void *buf_data;
	kmutex_t *list_lock;
	boolean_t full;
	l2arc_write_callback_t *cb;
	zio_t *pio, *wzio;
	uint64_t guid = spa_load_guid(spa);
	const boolean_t do_headroom_boost = *headroom_boost;
	int try;

	ASSERT(dev->l2ad_vdev != NULL);

	/* Lower the flag now, we might want to raise it again later. */
	*headroom_boost = B_FALSE;

	pio = NULL;
	write_sz = write_asize = write_psize = 0;
	full = B_FALSE;
	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
	head->b_flags |= ARC_FLAG_HAS_L2HDR;

	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
	/*
	 * We will want to try to compress buffers that are at least 2x the
	 * device sector size.
	 */
	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;

	/*
	 * Copy buffers for L2ARC writing.
	 */
	mutex_enter(&dev->l2ad_mtx);
	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
		uint64_t passed_sz = 0;

		list = l2arc_list_locked(try, &list_lock);
		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);

		/*
		 * L2ARC fast warmup.
		 *
		 * Until the ARC is warm and starts to evict, read from the
		 * head of the ARC lists rather than the tail.
		 */
		if (arc_warm == B_FALSE)
			hdr = list_head(list);
		else
			hdr = list_tail(list);
		if (hdr == NULL)
			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);

		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
		if (do_headroom_boost)
			headroom = (headroom * l2arc_headroom_boost) / 100;

		for (; hdr; hdr = hdr_prev) {
			kmutex_t *hash_lock;
			uint64_t buf_sz;

			if (arc_warm == B_FALSE)
				hdr_prev = list_next(list, hdr);
			else
				hdr_prev = list_prev(list, hdr);
			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
			    hdr->b_size);

			hash_lock = HDR_LOCK(hdr);
			if (!mutex_tryenter(hash_lock)) {
				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
				/*
				 * Skip this buffer rather than waiting.
				 */
				continue;
			}

			passed_sz += hdr->b_size;
			if (passed_sz > headroom) {
				/*
				 * Searched too far.
				 */
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
				break;
			}

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
				continue;
			}

			if ((write_sz + hdr->b_size) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_full);
				break;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				list_insert_head(&dev->l2ad_buflist, head);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
				ARCSTAT_BUMP(arcstat_l2_write_pios);
			}

			/*
			 * Create and add a new L2ARC header.
			 */
			hdr->b_l2hdr.b_dev = dev;
			hdr->b_flags |= ARC_FLAG_L2_WRITING;
			/*
			 * Temporarily stash the data buffer in b_tmp_cdata.
			 * The subsequent write step will pick it up from
			 * there. This is because we can't access
			 * b_l1hdr.b_buf without holding the hash_lock, which
			 * we in turn can't access without holding the ARC
			 * list locks (which we want to avoid during
			 * compression/writing).
			 */
			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
			hdr->b_l2hdr.b_asize = hdr->b_size;
			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;

			/*
			 * Explicitly set the b_daddr field to a known
			 * value which means "invalid address". This
			 * enables us to differentiate which stage of
			 * l2arc_write_buffers() the particular header
			 * is in (e.g. this loop, or the one below).
			 * ARC_FLAG_L2_WRITING is not enough to make
			 * this distinction, and we need to know in
			 * order to do proper l2arc vdev accounting in
			 * arc_release() and arc_hdr_destroy().
			 *
			 * Note, we can't use a new flag to distinguish
			 * the two stages because we don't hold the
			 * header's hash_lock below, in the second stage
			 * of this function. Thus, we can't simply
			 * change the b_flags field to denote that the
			 * IO has been sent. We can change the b_daddr
			 * field of the L2 portion, though, since we'll
			 * be holding the l2ad_mtx; which is why we're
			 * using it to denote the header's state change.
			 */
			hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;

			buf_sz = hdr->b_size;
			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;

			list_insert_head(&dev->l2ad_buflist, hdr);

			/*
			 * Compute and store the buffer cksum before
			 * writing. On debug the cksum is verified first.
			 */
			arc_cksum_verify(hdr->b_l1hdr.b_buf);
			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);

			mutex_exit(hash_lock);

			write_sz += buf_sz;
		}

		mutex_exit(list_lock);

		if (full == B_TRUE)
			break;
	}

	/* No buffers selected for writing? */
	if (pio == NULL) {
		ASSERT0(write_sz);
		mutex_exit(&dev->l2ad_mtx);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		return (0);
	}
	/*
	 * Now start writing the buffers. We're starting at the write head
	 * and work backwards, retracing the course of the buffer selector
	 * loop above.
	 */
	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
		uint64_t buf_sz;

		/*
		 * We shouldn't need to lock the buffer here, since we flagged
		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
		 * take care to only access its L2 cache parameters. In
		 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
		 * ARC eviction.
		 */
		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;

		if ((HDR_L2COMPRESS(hdr)) &&
		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
			if (l2arc_compress_buf(hdr)) {
				/*
				 * If compression succeeded, enable headroom
				 * boost on the next scan cycle.
				 */
				*headroom_boost = B_TRUE;
			}
		}

		/*
		 * Pick up the buffer data we had previously stashed away
		 * (and now potentially also compressed).
		 */
		buf_data = hdr->b_l1hdr.b_tmp_cdata;
		buf_sz = hdr->b_l2hdr.b_asize;

		/*
		 * If the data has not been compressed, then clear b_tmp_cdata
		 * to make sure that it points only to a temporary compression
		 * buffer.
		 */
		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
			hdr->b_l1hdr.b_tmp_cdata = NULL;

		/*
		 * We need to do this regardless of whether buf_sz is zero,
		 * otherwise, when this l2hdr is evicted we'll
		 * remove a reference that was never added.
		 */
		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);

		/* Compression may have squashed the buffer to zero length. */
		if (buf_sz != 0) {
			uint64_t buf_p_sz;

			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			write_asize += buf_sz;

			/*
			 * Keep the clock hand suitably device-aligned.
			 */
			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
			write_psize += buf_p_sz;
			dev->l2ad_hand += buf_p_sz;
		}
	}

	mutex_exit(&dev->l2ad_mtx);

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_asize);
}
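/*
 * Editorial worked example of the hand alignment above: on a cache
 * device with vdev_ashift == 12 (4K sectors), an uncompressed 2560-byte
 * buffer adds 2560 to write_asize but advances the hand by
 * vdev_psize_to_asize() == 4096. write_psize therefore tracks the true
 * device consumption, and the returned write_asize can be smaller than
 * the distance the hand moved, as the block comment for this function
 * notes.
 */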
/*
 * Compresses an L2ARC buffer.
 * The data to be compressed must be prefilled in b_l1hdr.b_tmp_cdata and its
 * size in l2hdr->b_asize. This routine tries to compress the data and
 * depending on the compression result there are three possible outcomes:
 * *) The buffer was incompressible. The original l2hdr contents were left
 *    untouched and are ready for writing to an L2 device.
 * *) The buffer was all-zeros, so there is no need to write it to an L2
 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
 *    data buffer which holds the compressed data to be written, and b_asize
 *    tells us how much data there is. b_compress is set to the appropriate
 *    compression algorithm. Once writing is done, invoke
 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
 *
 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
 * buffer was incompressible).
 */
static boolean_t
l2arc_compress_buf(arc_buf_hdr_t *hdr)
{
	void *cdata;
	size_t csize, len, rounded;
	l2arc_buf_hdr_t *l2hdr;

	ASSERT(HDR_HAS_L2HDR(hdr));
	l2hdr = &hdr->b_l2hdr;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);

	len = l2hdr->b_asize;
	cdata = zio_data_buf_alloc(len);
	ASSERT3P(cdata, !=, NULL);
	csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
	    cdata, l2hdr->b_asize);

	if (csize == 0) {
		/* zero block, indicate that there's nothing to write */
		zio_data_buf_free(cdata, len);
		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
		l2hdr->b_asize = 0;
		hdr->b_l1hdr.b_tmp_cdata = NULL;
		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
		return (B_TRUE);
	}

	rounded = P2ROUNDUP(csize,
	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
	if (rounded < len) {
		/*
		 * Compression succeeded, we'll keep the cdata around for
		 * writing and release it afterwards.
		 */
		if (rounded > csize) {
			bzero((char *)cdata + csize, rounded - csize);
			csize = rounded;
		}
		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
		l2hdr->b_asize = csize;
		hdr->b_l1hdr.b_tmp_cdata = cdata;
		ARCSTAT_BUMP(arcstat_l2_compress_successes);
		return (B_TRUE);
	} else {
		/*
		 * Compression did not save space, release the compressed
		 * buffer. l2hdr will be left unmodified.
		 */
		zio_data_buf_free(cdata, len);
		ARCSTAT_BUMP(arcstat_l2_compress_failures);
		return (B_FALSE);
	}
}
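/*
 * Editorial worked example of the acceptance test above, assuming
 * vdev_ashift == 9 (512-byte sectors) and a 4096-byte buffer:
 *
 *	LZ4 output 3000 bytes: P2ROUNDUP(3000, 512) == 3072 < 4096,
 *	    so the compressed copy is kept and zero-padded by 72 bytes.
 *	LZ4 output 3600 bytes: P2ROUNDUP(3600, 512) == 4096, which is
 *	    not smaller than the original, so the copy is discarded and
 *	    the buffer is written uncompressed.
 */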
/*
 * Decompresses a zio read back from an l2arc device. On success, the
 * underlying zio's io_data buffer is overwritten by the uncompressed
 * version. On decompression error (corrupt compressed stream), the
 * zio->io_error value is set to signal an I/O error.
 *
 * Please note that the compressed data stream is not checksummed, so
 * if the underlying device is experiencing data corruption, we may feed
 * corrupt data to the decompressor, so the decompressor needs to be
 * able to handle this situation (LZ4 does).
 */
static void
l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
{
	ASSERT(L2ARC_IS_VALID_COMPRESS(c));

	if (zio->io_error != 0) {
		/*
		 * An I/O error has occurred, just restore the original io
		 * size in preparation for a main pool read.
		 */
		zio->io_orig_size = zio->io_size = hdr->b_size;
		return;
	}

	if (c == ZIO_COMPRESS_EMPTY) {
		/*
		 * An empty buffer results in a null zio, which means we
		 * need to fill its io_data after we're done restoring the
		 * buffer's contents.
		 */
		ASSERT(hdr->b_l1hdr.b_buf != NULL);
		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
	} else {
		ASSERT(zio->io_data != NULL);
		/*
		 * We copy the compressed data from the start of the arc buffer
		 * (the zio_read will have pulled in only what we need, the
		 * rest is garbage which we will overwrite at decompression)
		 * and then decompress back to the ARC data buffer. This way we
		 * can minimize copying by simply decompressing back over the
		 * original compressed data (rather than decompressing to an
		 * aux buffer and then copying back the uncompressed buffer,
		 * which is likely to be much larger).
		 */
		uint64_t csize;
		void *cdata;

		csize = zio->io_size;
		cdata = zio_data_buf_alloc(csize);
		bcopy(zio->io_data, cdata, csize);
		if (zio_decompress_data(c, cdata, zio->io_data, csize,
		    hdr->b_size) != 0)
			zio->io_error = EIO;
		zio_data_buf_free(cdata, csize);
	}

	/* Restore the expected uncompressed IO size. */
	zio->io_orig_size = zio->io_size = hdr->b_size;
}

/*
 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
 * This buffer serves as a temporary holder of compressed data while
 * the buffer entry is being written to an l2arc device. Once that is
 * done, we can dispose of it.
 */
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
		/*
		 * If the data was compressed, then we've allocated a
		 * temporary buffer for it, so now we need to release it.
		 */
		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
		    hdr->b_size);
		hdr->b_l1hdr.b_tmp_cdata = NULL;
	} else {
		ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
	}
}
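/*
 * Editorial summary of the b_tmp_cdata lifecycle across a write cycle:
 *
 *	1. l2arc_write_buffers(), pass 1: b_tmp_cdata = b_buf->b_data
 *	   (borrowed, not allocated), b_compress = ZIO_COMPRESS_OFF.
 *	2. l2arc_compress_buf(): may swap in an allocated LZ4 copy
 *	   (b_compress = LZ4), or NULL the field for all-zero data
 *	   (b_compress = EMPTY).
 *	3. Pass 2 issues the write from b_tmp_cdata, and NULLs the field
 *	   when no allocated copy exists.
 *	4. l2arc_write_done() calls l2arc_release_cdata_buf(), which
 *	   frees the copy only when compression actually allocated one.
 */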
/*
 * This thread feeds the L2ARC at regular intervals. This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();
	boolean_t headroom_boost = B_FALSE;

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa. This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal. l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC. By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	vdev_ashift_optimize(vd);

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	refcount_create(&adddev->l2ad_alloc);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}
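/*
 * Editorial note on the geometry set up above: the writable region is
 * [l2ad_start, l2ad_end) == [VDEV_LABEL_START_SIZE,
 * VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd)), the hand begins at
 * l2ad_start, and l2ad_first suppresses eviction until the hand wraps
 * for the first time (see l2arc_evict() and the wrap logic in
 * l2arc_write_buffers()).
 */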
/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT(remdev != NULL);

	/*
	 * Remove device from global list
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references. L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	mutex_destroy(&remdev->l2ad_mtx);
	refcount_destroy(&remdev->l2ad_alloc);
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini().
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}
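/*
 * Editorial sketch (not compiled): the stop/exit handshake between
 * l2arc_stop() and l2arc_feed_thread() above, reduced to its
 * essentials. "lock", "cv" and "exit_flag" are hypothetical stand-ins
 * for l2arc_feed_thr_lock, l2arc_feed_thr_cv and l2arc_thread_exit.
 * The stopper raises the flag and waits; the thread notices the flag
 * on wakeup, clears it, broadcasts, and exits.
 */
#if 0
/* stopper, e.g. l2arc_stop() */
mutex_enter(&lock);
exit_flag = 1;
cv_signal(&cv);				/* wake the sleeping thread */
while (exit_flag != 0)
	cv_wait(&cv, &lock);		/* wait for acknowledgement */
mutex_exit(&lock);

/* thread, after its while (exit_flag == 0) loop terminates */
exit_flag = 0;
cv_broadcast(&cv);			/* acknowledge and release stopper */
thread_exit();
#endif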