arc.c revision 288557
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory.  This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about.  Our cache is not so simple.  At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them.  Blocks are only evictable
 *    when there are no external references active.  This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space.  In these circumstances we are unable to adjust the cache
 *    size.  To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss.  Our model has a variable sized cache.  It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 *    elements of the cache are therefore exactly the same size.  So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict.  In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes).  We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
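
/*
 * Illustrative sketch only (not part of this file): the "evict a set of
 * blocks" idea from point 3 above.  Because cache blocks range from 512
 * bytes to 128K, making room for a miss means walking an evictable list
 * and freeing headers until roughly the requested number of bytes has
 * been recovered, rather than evicting one fixed-size page.  The helpers
 * named below (pick_evictable_tail(), evict_one()) are hypothetical.
 */
#if 0
static uint64_t
evict_bytes_sketch(list_t *list, uint64_t bytes_needed)
{
	uint64_t freed = 0;
	arc_buf_hdr_t *hdr;

	while (freed < bytes_needed &&
	    (hdr = pick_evictable_tail(list)) != NULL) {
		freed += hdr->b_size;	/* blocks are variable-sized */
		evict_one(hdr);		/* hypothetical helper */
	}
	return (freed);
}
#endif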

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <vm/vm_pageout.h>
#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
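
/*
 * Illustrative sketch only (not part of this file): the lock ordering rule
 * from the locking model comment above.  While an arc list lock is held, a
 * hash table lock may only be taken with mutex_tryenter(); on failure the
 * caller skips the buffer instead of blocking, since blocking here could
 * deadlock against a thread that holds the locks in the opposite order.
 * This is a fragment from a hypothetical eviction loop; ARCSTAT_BUMP() and
 * arcstat_mutex_miss are defined later in this file.
 */
#if 0
	if (!mutex_tryenter(hash_lock)) {
		ARCSTAT_BUMP(arcstat_mutex_miss);
		continue;		/* try the next buffer on the list */
	}
	/* ... operate on the header while hash_lock is held ... */
	mutex_exit(hash_lock);
#endif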

/*
 * The number of iterations through arc_evict_*() before we
 * drop & reacquire the lock.
 */
int arc_evict_iterations = 100;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern int zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);

#ifdef _KERNEL
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
    "Maximum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
    "Minimum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}
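
/*
 * Illustrative sketch only (not part of this file): reading one of the
 * tunables declared above from userland with sysctlbyname(3).  The
 * "vfs.zfs.arc_max" name matches the SYSCTL_UQUAD entry above; error
 * handling is kept minimal.  Disabled because this is userland code
 * shown inside a kernel source file purely for illustration.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t arc_max;
	size_t len = sizeof (arc_max);

	if (sysctlbyname("vfs.zfs.arc_max", &arc_max, &len, NULL, 0) == 0)
		printf("vfs.zfs.arc_max = %ju bytes\n", (uintmax_t)arc_max);
	return (0);
}
#endif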

/*
 * Must be declared here, before the definition of the corresponding kstat
 * macro; otherwise that macro, which uses the same name, will confuse the
 * compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */
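
/*
 * Illustrative sketch only (not part of this file): a rough outline of how
 * the states above are used when a cached or ghost-listed buffer is hit.
 * This is not the actual arc_access() logic; grow_target()/shrink_target()
 * stand in for the adjustments the ARC makes to its MRU target size (arc_p),
 * and state/new_state are free variables in this fragment.
 */
#if 0
	if (state == arc_mru) {
		new_state = arc_mfu;	/* repeated use: promote to MFU */
	} else if (state == arc_mru_ghost) {
		grow_target();		/* MRU was recently too small; grow arc_p */
		new_state = arc_mfu;
	} else if (state == arc_mfu_ghost) {
		shrink_target();	/* MFU was recently too small; shrink arc_p */
		new_state = arc_mfu;
	}
#endif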

#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
struct arcs_lock {
	kmutex_t	arcs_lock;
#ifdef _KERNEL
	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

/*
 * must be power of two for mask use to work
 */
#define ARC_BUFC_NUMDATALISTS		16
#define ARC_BUFC_NUMMETADATALISTS	16
#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)

typedef struct arc_state {
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	list_t	arcs_lists[ARC_BUFC_NUMLISTS];	/* list of evictable buffers */
	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
} arc_state_t;

#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_stolen;
	kstat_named_t arcstat_recycle_miss;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers.  This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA.  This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
403 */ 404 kstat_named_t arcstat_data_size; 405 /* 406 * Number of bytes consumed by ARC buffers of type equal to 407 * ARC_BUFC_METADATA. This is generally consumed by buffers 408 * backing on disk data that is used for internal ZFS 409 * structures (e.g. ZAP, dnode, indirect blocks, etc). 410 */ 411 kstat_named_t arcstat_metadata_size; 412 /* 413 * Number of bytes consumed by various buffers and structures 414 * not actually backed with ARC buffers. This includes bonus 415 * buffers (allocated directly via zio_buf_* functions), 416 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 417 * cache), and dnode_t structures (allocated via dnode_t cache). 418 */ 419 kstat_named_t arcstat_other_size; 420 /* 421 * Total number of bytes consumed by ARC buffers residing in the 422 * arc_anon state. This includes *all* buffers in the arc_anon 423 * state; e.g. data, metadata, evictable, and unevictable buffers 424 * are all included in this value. 425 */ 426 kstat_named_t arcstat_anon_size; 427 /* 428 * Number of bytes consumed by ARC buffers that meet the 429 * following criteria: backing buffers of type ARC_BUFC_DATA, 430 * residing in the arc_anon state, and are eligible for eviction 431 * (e.g. have no outstanding holds on the buffer). 432 */ 433 kstat_named_t arcstat_anon_evictable_data; 434 /* 435 * Number of bytes consumed by ARC buffers that meet the 436 * following criteria: backing buffers of type ARC_BUFC_METADATA, 437 * residing in the arc_anon state, and are eligible for eviction 438 * (e.g. have no outstanding holds on the buffer). 439 */ 440 kstat_named_t arcstat_anon_evictable_metadata; 441 /* 442 * Total number of bytes consumed by ARC buffers residing in the 443 * arc_mru state. This includes *all* buffers in the arc_mru 444 * state; e.g. data, metadata, evictable, and unevictable buffers 445 * are all included in this value. 446 */ 447 kstat_named_t arcstat_mru_size; 448 /* 449 * Number of bytes consumed by ARC buffers that meet the 450 * following criteria: backing buffers of type ARC_BUFC_DATA, 451 * residing in the arc_mru state, and are eligible for eviction 452 * (e.g. have no outstanding holds on the buffer). 453 */ 454 kstat_named_t arcstat_mru_evictable_data; 455 /* 456 * Number of bytes consumed by ARC buffers that meet the 457 * following criteria: backing buffers of type ARC_BUFC_METADATA, 458 * residing in the arc_mru state, and are eligible for eviction 459 * (e.g. have no outstanding holds on the buffer). 460 */ 461 kstat_named_t arcstat_mru_evictable_metadata; 462 /* 463 * Total number of bytes that *would have been* consumed by ARC 464 * buffers in the arc_mru_ghost state. The key thing to note 465 * here, is the fact that this size doesn't actually indicate 466 * RAM consumption. The ghost lists only consist of headers and 467 * don't actually have ARC buffers linked off of these headers. 468 * Thus, *if* the headers had associated ARC buffers, these 469 * buffers *would have* consumed this number of bytes. 470 */ 471 kstat_named_t arcstat_mru_ghost_size; 472 /* 473 * Number of bytes that *would have been* consumed by ARC 474 * buffers that are eligible for eviction, of type 475 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 476 */ 477 kstat_named_t arcstat_mru_ghost_evictable_data; 478 /* 479 * Number of bytes that *would have been* consumed by ARC 480 * buffers that are eligible for eviction, of type 481 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 
482 */ 483 kstat_named_t arcstat_mru_ghost_evictable_metadata; 484 /* 485 * Total number of bytes consumed by ARC buffers residing in the 486 * arc_mfu state. This includes *all* buffers in the arc_mfu 487 * state; e.g. data, metadata, evictable, and unevictable buffers 488 * are all included in this value. 489 */ 490 kstat_named_t arcstat_mfu_size; 491 /* 492 * Number of bytes consumed by ARC buffers that are eligible for 493 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 494 * state. 495 */ 496 kstat_named_t arcstat_mfu_evictable_data; 497 /* 498 * Number of bytes consumed by ARC buffers that are eligible for 499 * eviction, of type ARC_BUFC_METADATA, and reside in the 500 * arc_mfu state. 501 */ 502 kstat_named_t arcstat_mfu_evictable_metadata; 503 /* 504 * Total number of bytes that *would have been* consumed by ARC 505 * buffers in the arc_mfu_ghost state. See the comment above 506 * arcstat_mru_ghost_size for more details. 507 */ 508 kstat_named_t arcstat_mfu_ghost_size; 509 /* 510 * Number of bytes that *would have been* consumed by ARC 511 * buffers that are eligible for eviction, of type 512 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 513 */ 514 kstat_named_t arcstat_mfu_ghost_evictable_data; 515 /* 516 * Number of bytes that *would have been* consumed by ARC 517 * buffers that are eligible for eviction, of type 518 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 519 */ 520 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 521 kstat_named_t arcstat_l2_hits; 522 kstat_named_t arcstat_l2_misses; 523 kstat_named_t arcstat_l2_feeds; 524 kstat_named_t arcstat_l2_rw_clash; 525 kstat_named_t arcstat_l2_read_bytes; 526 kstat_named_t arcstat_l2_write_bytes; 527 kstat_named_t arcstat_l2_writes_sent; 528 kstat_named_t arcstat_l2_writes_done; 529 kstat_named_t arcstat_l2_writes_error; 530 kstat_named_t arcstat_l2_writes_hdr_miss; 531 kstat_named_t arcstat_l2_evict_lock_retry; 532 kstat_named_t arcstat_l2_evict_reading; 533 kstat_named_t arcstat_l2_evict_l1cached; 534 kstat_named_t arcstat_l2_free_on_write; 535 kstat_named_t arcstat_l2_cdata_free_on_write; 536 kstat_named_t arcstat_l2_abort_lowmem; 537 kstat_named_t arcstat_l2_cksum_bad; 538 kstat_named_t arcstat_l2_io_error; 539 kstat_named_t arcstat_l2_size; 540 kstat_named_t arcstat_l2_asize; 541 kstat_named_t arcstat_l2_hdr_size; 542 kstat_named_t arcstat_l2_compress_successes; 543 kstat_named_t arcstat_l2_compress_zeros; 544 kstat_named_t arcstat_l2_compress_failures; 545 kstat_named_t arcstat_l2_write_trylock_fail; 546 kstat_named_t arcstat_l2_write_passed_headroom; 547 kstat_named_t arcstat_l2_write_spa_mismatch; 548 kstat_named_t arcstat_l2_write_in_l2; 549 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 550 kstat_named_t arcstat_l2_write_not_cacheable; 551 kstat_named_t arcstat_l2_write_full; 552 kstat_named_t arcstat_l2_write_buffer_iter; 553 kstat_named_t arcstat_l2_write_pios; 554 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 555 kstat_named_t arcstat_l2_write_buffer_list_iter; 556 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 557 kstat_named_t arcstat_memory_throttle_count; 558 kstat_named_t arcstat_duplicate_buffers; 559 kstat_named_t arcstat_duplicate_buffers_size; 560 kstat_named_t arcstat_duplicate_reads; 561 kstat_named_t arcstat_meta_used; 562 kstat_named_t arcstat_meta_limit; 563 kstat_named_t arcstat_meta_max; 564 kstat_named_t arcstat_meta_min; 565} arc_stats_t; 566 567static arc_stats_t arc_stats = { 568 { "hits", KSTAT_DATA_UINT64 }, 569 { "misses", 
KSTAT_DATA_UINT64 }, 570 { "demand_data_hits", KSTAT_DATA_UINT64 }, 571 { "demand_data_misses", KSTAT_DATA_UINT64 }, 572 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 573 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 574 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 575 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 576 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 577 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 578 { "mru_hits", KSTAT_DATA_UINT64 }, 579 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 580 { "mfu_hits", KSTAT_DATA_UINT64 }, 581 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 582 { "allocated", KSTAT_DATA_UINT64 }, 583 { "deleted", KSTAT_DATA_UINT64 }, 584 { "stolen", KSTAT_DATA_UINT64 }, 585 { "recycle_miss", KSTAT_DATA_UINT64 }, 586 { "mutex_miss", KSTAT_DATA_UINT64 }, 587 { "evict_skip", KSTAT_DATA_UINT64 }, 588 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 589 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 590 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 591 { "hash_elements", KSTAT_DATA_UINT64 }, 592 { "hash_elements_max", KSTAT_DATA_UINT64 }, 593 { "hash_collisions", KSTAT_DATA_UINT64 }, 594 { "hash_chains", KSTAT_DATA_UINT64 }, 595 { "hash_chain_max", KSTAT_DATA_UINT64 }, 596 { "p", KSTAT_DATA_UINT64 }, 597 { "c", KSTAT_DATA_UINT64 }, 598 { "c_min", KSTAT_DATA_UINT64 }, 599 { "c_max", KSTAT_DATA_UINT64 }, 600 { "size", KSTAT_DATA_UINT64 }, 601 { "hdr_size", KSTAT_DATA_UINT64 }, 602 { "data_size", KSTAT_DATA_UINT64 }, 603 { "metadata_size", KSTAT_DATA_UINT64 }, 604 { "other_size", KSTAT_DATA_UINT64 }, 605 { "anon_size", KSTAT_DATA_UINT64 }, 606 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 607 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 608 { "mru_size", KSTAT_DATA_UINT64 }, 609 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 610 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 611 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 612 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 613 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 614 { "mfu_size", KSTAT_DATA_UINT64 }, 615 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 616 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 617 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 618 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 619 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 620 { "l2_hits", KSTAT_DATA_UINT64 }, 621 { "l2_misses", KSTAT_DATA_UINT64 }, 622 { "l2_feeds", KSTAT_DATA_UINT64 }, 623 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 624 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 625 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 626 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 627 { "l2_writes_done", KSTAT_DATA_UINT64 }, 628 { "l2_writes_error", KSTAT_DATA_UINT64 }, 629 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 630 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 631 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 632 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 633 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 634 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 635 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 636 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 637 { "l2_io_error", KSTAT_DATA_UINT64 }, 638 { "l2_size", KSTAT_DATA_UINT64 }, 639 { "l2_asize", KSTAT_DATA_UINT64 }, 640 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 641 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 642 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 643 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 644 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 645 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 646 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 647 { "l2_write_in_l2", 
KSTAT_DATA_UINT64 }, 648 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 649 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 650 { "l2_write_full", KSTAT_DATA_UINT64 }, 651 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 652 { "l2_write_pios", KSTAT_DATA_UINT64 }, 653 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 654 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 655 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 656 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 657 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 658 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 659 { "duplicate_reads", KSTAT_DATA_UINT64 }, 660 { "arc_meta_used", KSTAT_DATA_UINT64 }, 661 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 662 { "arc_meta_max", KSTAT_DATA_UINT64 }, 663 { "arc_meta_min", KSTAT_DATA_UINT64 } 664}; 665 666#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 667 668#define ARCSTAT_INCR(stat, val) \ 669 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 670 671#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 672#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 673 674#define ARCSTAT_MAX(stat, val) { \ 675 uint64_t m; \ 676 while ((val) > (m = arc_stats.stat.value.ui64) && \ 677 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 678 continue; \ 679} 680 681#define ARCSTAT_MAXSTAT(stat) \ 682 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 683 684/* 685 * We define a macro to allow ARC hits/misses to be easily broken down by 686 * two separate conditions, giving a total of four different subtypes for 687 * each of hits and misses (so eight statistics total). 688 */ 689#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 690 if (cond1) { \ 691 if (cond2) { \ 692 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 693 } else { \ 694 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 695 } \ 696 } else { \ 697 if (cond2) { \ 698 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 699 } else { \ 700 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 701 } \ 702 } 703 704kstat_t *arc_ksp; 705static arc_state_t *arc_anon; 706static arc_state_t *arc_mru; 707static arc_state_t *arc_mru_ghost; 708static arc_state_t *arc_mfu; 709static arc_state_t *arc_mfu_ghost; 710static arc_state_t *arc_l2c_only; 711 712/* 713 * There are several ARC variables that are critical to export as kstats -- 714 * but we don't want to have to grovel around in the kstat whenever we wish to 715 * manipulate them. For these variables, we therefore define them to be in 716 * terms of the statistic variable. This assures that we are not introducing 717 * the possibility of inconsistency by having shadow copies of the variables, 718 * while still allowing the code to be readable. 
719 */ 720#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 721#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 722#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 723#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 724#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 725#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 726#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 727#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 728#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 729 730#define L2ARC_IS_VALID_COMPRESS(_c_) \ 731 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 732 733static int arc_no_grow; /* Don't try to grow cache size */ 734static uint64_t arc_tempreserve; 735static uint64_t arc_loaned_bytes; 736 737typedef struct arc_callback arc_callback_t; 738 739struct arc_callback { 740 void *acb_private; 741 arc_done_func_t *acb_done; 742 arc_buf_t *acb_buf; 743 zio_t *acb_zio_dummy; 744 arc_callback_t *acb_next; 745}; 746 747typedef struct arc_write_callback arc_write_callback_t; 748 749struct arc_write_callback { 750 void *awcb_private; 751 arc_done_func_t *awcb_ready; 752 arc_done_func_t *awcb_physdone; 753 arc_done_func_t *awcb_done; 754 arc_buf_t *awcb_buf; 755}; 756 757/* 758 * ARC buffers are separated into multiple structs as a memory saving measure: 759 * - Common fields struct, always defined, and embedded within it: 760 * - L2-only fields, always allocated but undefined when not in L2ARC 761 * - L1-only fields, only allocated when in L1ARC 762 * 763 * Buffer in L1 Buffer only in L2 764 * +------------------------+ +------------------------+ 765 * | arc_buf_hdr_t | | arc_buf_hdr_t | 766 * | | | | 767 * | | | | 768 * | | | | 769 * +------------------------+ +------------------------+ 770 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 771 * | (undefined if L1-only) | | | 772 * +------------------------+ +------------------------+ 773 * | l1arc_buf_hdr_t | 774 * | | 775 * | | 776 * | | 777 * | | 778 * +------------------------+ 779 * 780 * Because it's possible for the L2ARC to become extremely large, we can wind 781 * up eating a lot of memory in L2ARC buffer headers, so the size of a header 782 * is minimized by only allocating the fields necessary for an L1-cached buffer 783 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and 784 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple 785 * words in pointers. arc_hdr_realloc() is used to switch a header between 786 * these two allocation states. 787 */ 788typedef struct l1arc_buf_hdr { 789 kmutex_t b_freeze_lock; 790#ifdef ZFS_DEBUG 791 /* 792 * used for debugging wtih kmem_flags - by allocating and freeing 793 * b_thawed when the buffer is thawed, we get a record of the stack 794 * trace that thawed it. 
795 */ 796 void *b_thawed; 797#endif 798 799 arc_buf_t *b_buf; 800 uint32_t b_datacnt; 801 /* for waiting on writes to complete */ 802 kcondvar_t b_cv; 803 804 /* protected by arc state mutex */ 805 arc_state_t *b_state; 806 list_node_t b_arc_node; 807 808 /* updated atomically */ 809 clock_t b_arc_access; 810 811 /* self protecting */ 812 refcount_t b_refcnt; 813 814 arc_callback_t *b_acb; 815 /* temporary buffer holder for in-flight compressed data */ 816 void *b_tmp_cdata; 817} l1arc_buf_hdr_t; 818 819typedef struct l2arc_dev l2arc_dev_t; 820 821typedef struct l2arc_buf_hdr { 822 /* protected by arc_buf_hdr mutex */ 823 l2arc_dev_t *b_dev; /* L2ARC device */ 824 uint64_t b_daddr; /* disk address, offset byte */ 825 /* real alloc'd buffer size depending on b_compress applied */ 826 int32_t b_asize; 827 828 list_node_t b_l2node; 829} l2arc_buf_hdr_t; 830 831struct arc_buf_hdr { 832 /* protected by hash lock */ 833 dva_t b_dva; 834 uint64_t b_birth; 835 /* 836 * Even though this checksum is only set/verified when a buffer is in 837 * the L1 cache, it needs to be in the set of common fields because it 838 * must be preserved from the time before a buffer is written out to 839 * L2ARC until after it is read back in. 840 */ 841 zio_cksum_t *b_freeze_cksum; 842 843 arc_buf_hdr_t *b_hash_next; 844 arc_flags_t b_flags; 845 846 /* immutable */ 847 int32_t b_size; 848 uint64_t b_spa; 849 850 /* L2ARC fields. Undefined when not in L2ARC. */ 851 l2arc_buf_hdr_t b_l2hdr; 852 /* L1ARC fields. Undefined when in l2arc_only state */ 853 l1arc_buf_hdr_t b_l1hdr; 854}; 855 856#ifdef _KERNEL 857static int 858sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 859{ 860 uint64_t val; 861 int err; 862 863 val = arc_meta_limit; 864 err = sysctl_handle_64(oidp, &val, 0, req); 865 if (err != 0 || req->newptr == NULL) 866 return (err); 867 868 if (val <= 0 || val > arc_c_max) 869 return (EINVAL); 870 871 arc_meta_limit = val; 872 return (0); 873} 874#endif 875 876static arc_buf_t *arc_eviction_list; 877static kmutex_t arc_eviction_mtx; 878static arc_buf_hdr_t arc_eviction_hdr; 879 880#define GHOST_STATE(state) \ 881 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 882 (state) == arc_l2c_only) 883 884#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 885#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 886#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 887#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 888#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 889#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 890 891#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 892#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 893#define HDR_L2_READING(hdr) \ 894 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 895 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 896#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 897#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 898#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 899 900#define HDR_ISTYPE_METADATA(hdr) \ 901 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 902#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 903 904#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 905#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 906 907/* For storing compression mode in b_flags */ 908#define HDR_COMPRESS_OFFSET 24 909#define HDR_COMPRESS_NBITS 7 910 
#define HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))

/*
 * Other sizes
 */

#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
 * address.  This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out, which happens in the second stage of
 * l2arc_write_buffers().
975 */ 976#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) 977 978#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 979#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 980 981/* L2ARC Performance Tunables */ 982uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 983uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 984uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 985uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 986uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 987uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 988boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 989boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 990boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 991 992SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 993 &l2arc_write_max, 0, "max write size"); 994SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 995 &l2arc_write_boost, 0, "extra write during warmup"); 996SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 997 &l2arc_headroom, 0, "number of dev writes"); 998SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 999 &l2arc_feed_secs, 0, "interval seconds"); 1000SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 1001 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 1002 1003SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 1004 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 1005SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 1006 &l2arc_feed_again, 0, "turbo warmup"); 1007SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 1008 &l2arc_norw, 0, "no reads during writes"); 1009 1010SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 1011 &ARC_anon.arcs_size, 0, "size of anonymous state"); 1012SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 1013 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 1014SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 1015 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 1016 1017SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1018 &ARC_mru.arcs_size, 0, "size of mru state"); 1019SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 1020 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 1021SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 1022 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 1023 1024SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1025 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 1026SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 1027 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1028 "size of metadata in mru ghost state"); 1029SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 1030 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1031 "size of data in mru ghost state"); 1032 1033SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1034 &ARC_mfu.arcs_size, 0, "size of mfu state"); 1035SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 1036 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 1037SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 1038 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 1039 1040SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1041 
&ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 1042SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 1043 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1044 "size of metadata in mfu ghost state"); 1045SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 1046 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1047 "size of data in mfu ghost state"); 1048 1049SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1050 &ARC_l2c_only.arcs_size, 0, "size of mru state"); 1051 1052/* 1053 * L2ARC Internals 1054 */ 1055struct l2arc_dev { 1056 vdev_t *l2ad_vdev; /* vdev */ 1057 spa_t *l2ad_spa; /* spa */ 1058 uint64_t l2ad_hand; /* next write location */ 1059 uint64_t l2ad_start; /* first addr on device */ 1060 uint64_t l2ad_end; /* last addr on device */ 1061 boolean_t l2ad_first; /* first sweep through */ 1062 boolean_t l2ad_writing; /* currently writing */ 1063 kmutex_t l2ad_mtx; /* lock for buffer list */ 1064 list_t l2ad_buflist; /* buffer list */ 1065 list_node_t l2ad_node; /* device list node */ 1066 refcount_t l2ad_alloc; /* allocated bytes */ 1067}; 1068 1069static list_t L2ARC_dev_list; /* device list */ 1070static list_t *l2arc_dev_list; /* device list pointer */ 1071static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1072static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1073static list_t L2ARC_free_on_write; /* free after write buf list */ 1074static list_t *l2arc_free_on_write; /* free after write list ptr */ 1075static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1076static uint64_t l2arc_ndev; /* number of devices */ 1077 1078typedef struct l2arc_read_callback { 1079 arc_buf_t *l2rcb_buf; /* read buffer */ 1080 spa_t *l2rcb_spa; /* spa */ 1081 blkptr_t l2rcb_bp; /* original blkptr */ 1082 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1083 int l2rcb_flags; /* original flags */ 1084 enum zio_compress l2rcb_compress; /* applied compress */ 1085} l2arc_read_callback_t; 1086 1087typedef struct l2arc_write_callback { 1088 l2arc_dev_t *l2wcb_dev; /* device info */ 1089 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1090} l2arc_write_callback_t; 1091 1092typedef struct l2arc_data_free { 1093 /* protected by l2arc_free_on_write_mtx */ 1094 void *l2df_data; 1095 size_t l2df_size; 1096 void (*l2df_func)(void *, size_t); 1097 list_node_t l2df_list_node; 1098} l2arc_data_free_t; 1099 1100static kmutex_t l2arc_feed_thr_lock; 1101static kcondvar_t l2arc_feed_thr_cv; 1102static uint8_t l2arc_thread_exit; 1103 1104static void arc_get_data_buf(arc_buf_t *); 1105static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1106static int arc_evict_needed(arc_buf_contents_t); 1107static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); 1108static void arc_buf_watch(arc_buf_t *); 1109 1110static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1111static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1112 1113static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1114static void l2arc_read_done(zio_t *); 1115 1116static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 1117static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1118static void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1119 1120static uint64_t 1121buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1122{ 1123 uint8_t *vdva = (uint8_t *)dva; 1124 uint64_t crc = -1ULL; 1125 int i; 1126 1127 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1128 1129 for (i = 0; i < sizeof (dva_t); i++) 1130 crc = (crc >> 8) ^ 
zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1131 1132 crc ^= (spa>>8) ^ birth; 1133 1134 return (crc); 1135} 1136 1137#define BUF_EMPTY(buf) \ 1138 ((buf)->b_dva.dva_word[0] == 0 && \ 1139 (buf)->b_dva.dva_word[1] == 0) 1140 1141#define BUF_EQUAL(spa, dva, birth, buf) \ 1142 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1143 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1144 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1145 1146static void 1147buf_discard_identity(arc_buf_hdr_t *hdr) 1148{ 1149 hdr->b_dva.dva_word[0] = 0; 1150 hdr->b_dva.dva_word[1] = 0; 1151 hdr->b_birth = 0; 1152} 1153 1154static arc_buf_hdr_t * 1155buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1156{ 1157 const dva_t *dva = BP_IDENTITY(bp); 1158 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1159 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1160 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1161 arc_buf_hdr_t *hdr; 1162 1163 mutex_enter(hash_lock); 1164 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1165 hdr = hdr->b_hash_next) { 1166 if (BUF_EQUAL(spa, dva, birth, hdr)) { 1167 *lockp = hash_lock; 1168 return (hdr); 1169 } 1170 } 1171 mutex_exit(hash_lock); 1172 *lockp = NULL; 1173 return (NULL); 1174} 1175 1176/* 1177 * Insert an entry into the hash table. If there is already an element 1178 * equal to elem in the hash table, then the already existing element 1179 * will be returned and the new element will not be inserted. 1180 * Otherwise returns NULL. 1181 * If lockp == NULL, the caller is assumed to already hold the hash lock. 1182 */ 1183static arc_buf_hdr_t * 1184buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1185{ 1186 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1187 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1188 arc_buf_hdr_t *fhdr; 1189 uint32_t i; 1190 1191 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1192 ASSERT(hdr->b_birth != 0); 1193 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1194 1195 if (lockp != NULL) { 1196 *lockp = hash_lock; 1197 mutex_enter(hash_lock); 1198 } else { 1199 ASSERT(MUTEX_HELD(hash_lock)); 1200 } 1201 1202 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1203 fhdr = fhdr->b_hash_next, i++) { 1204 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1205 return (fhdr); 1206 } 1207 1208 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1209 buf_hash_table.ht_table[idx] = hdr; 1210 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1211 1212 /* collect some hash table performance data */ 1213 if (i > 0) { 1214 ARCSTAT_BUMP(arcstat_hash_collisions); 1215 if (i == 1) 1216 ARCSTAT_BUMP(arcstat_hash_chains); 1217 1218 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1219 } 1220 1221 ARCSTAT_BUMP(arcstat_hash_elements); 1222 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1223 1224 return (NULL); 1225} 1226 1227static void 1228buf_hash_remove(arc_buf_hdr_t *hdr) 1229{ 1230 arc_buf_hdr_t *fhdr, **hdrp; 1231 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1232 1233 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1234 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1235 1236 hdrp = &buf_hash_table.ht_table[idx]; 1237 while ((fhdr = *hdrp) != hdr) { 1238 ASSERT(fhdr != NULL); 1239 hdrp = &fhdr->b_hash_next; 1240 } 1241 *hdrp = hdr->b_hash_next; 1242 hdr->b_hash_next = NULL; 1243 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1244 1245 /* collect some hash table performance data */ 1246 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1247 1248 if (buf_hash_table.ht_table[idx] && 1249 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1250 
ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1251} 1252 1253/* 1254 * Global data structures and functions for the buf kmem cache. 1255 */ 1256static kmem_cache_t *hdr_full_cache; 1257static kmem_cache_t *hdr_l2only_cache; 1258static kmem_cache_t *buf_cache; 1259 1260static void 1261buf_fini(void) 1262{ 1263 int i; 1264 1265 kmem_free(buf_hash_table.ht_table, 1266 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1267 for (i = 0; i < BUF_LOCKS; i++) 1268 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1269 kmem_cache_destroy(hdr_full_cache); 1270 kmem_cache_destroy(hdr_l2only_cache); 1271 kmem_cache_destroy(buf_cache); 1272} 1273 1274/* 1275 * Constructor callback - called when the cache is empty 1276 * and a new buf is requested. 1277 */ 1278/* ARGSUSED */ 1279static int 1280hdr_full_cons(void *vbuf, void *unused, int kmflag) 1281{ 1282 arc_buf_hdr_t *hdr = vbuf; 1283 1284 bzero(hdr, HDR_FULL_SIZE); 1285 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1286 refcount_create(&hdr->b_l1hdr.b_refcnt); 1287 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1288 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1289 1290 return (0); 1291} 1292 1293/* ARGSUSED */ 1294static int 1295hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1296{ 1297 arc_buf_hdr_t *hdr = vbuf; 1298 1299 bzero(hdr, HDR_L2ONLY_SIZE); 1300 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1301 1302 return (0); 1303} 1304 1305/* ARGSUSED */ 1306static int 1307buf_cons(void *vbuf, void *unused, int kmflag) 1308{ 1309 arc_buf_t *buf = vbuf; 1310 1311 bzero(buf, sizeof (arc_buf_t)); 1312 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1313 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1314 1315 return (0); 1316} 1317 1318/* 1319 * Destructor callback - called when a cached buf is 1320 * no longer required. 1321 */ 1322/* ARGSUSED */ 1323static void 1324hdr_full_dest(void *vbuf, void *unused) 1325{ 1326 arc_buf_hdr_t *hdr = vbuf; 1327 1328 ASSERT(BUF_EMPTY(hdr)); 1329 cv_destroy(&hdr->b_l1hdr.b_cv); 1330 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1331 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1332 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1333} 1334 1335/* ARGSUSED */ 1336static void 1337hdr_l2only_dest(void *vbuf, void *unused) 1338{ 1339 arc_buf_hdr_t *hdr = vbuf; 1340 1341 ASSERT(BUF_EMPTY(hdr)); 1342 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1343} 1344 1345/* ARGSUSED */ 1346static void 1347buf_dest(void *vbuf, void *unused) 1348{ 1349 arc_buf_t *buf = vbuf; 1350 1351 mutex_destroy(&buf->b_evict_lock); 1352 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1353} 1354 1355/* 1356 * Reclaim callback -- invoked when memory is low. 1357 */ 1358/* ARGSUSED */ 1359static void 1360hdr_recl(void *unused) 1361{ 1362 dprintf("hdr_recl called\n"); 1363 /* 1364 * umem calls the reclaim func when we destroy the buf cache, 1365 * which is after we do arc_fini(). 1366 */ 1367 if (!arc_dead) 1368 cv_signal(&arc_reclaim_thr_cv); 1369} 1370 1371static void 1372buf_init(void) 1373{ 1374 uint64_t *ct; 1375 uint64_t hsize = 1ULL << 12; 1376 int i, j; 1377 1378 /* 1379 * The hash table is big enough to fill all of physical memory 1380 * with an average block size of zfs_arc_average_blocksize (default 8K). 1381 * By default, the table will take up 1382 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
1383 */ 1384 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1385 hsize <<= 1; 1386retry: 1387 buf_hash_table.ht_mask = hsize - 1; 1388 buf_hash_table.ht_table = 1389 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1390 if (buf_hash_table.ht_table == NULL) { 1391 ASSERT(hsize > (1ULL << 8)); 1392 hsize >>= 1; 1393 goto retry; 1394 } 1395 1396 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1397 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1398 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1399 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1400 NULL, NULL, 0); 1401 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1402 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1403 1404 for (i = 0; i < 256; i++) 1405 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1406 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1407 1408 for (i = 0; i < BUF_LOCKS; i++) { 1409 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1410 NULL, MUTEX_DEFAULT, NULL); 1411 } 1412} 1413 1414/* 1415 * Transition between the two allocation states for the arc_buf_hdr struct. 1416 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1417 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1418 * version is used when a cache buffer is only in the L2ARC in order to reduce 1419 * memory usage. 1420 */ 1421static arc_buf_hdr_t * 1422arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1423{ 1424 ASSERT(HDR_HAS_L2HDR(hdr)); 1425 1426 arc_buf_hdr_t *nhdr; 1427 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1428 1429 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1430 (old == hdr_l2only_cache && new == hdr_full_cache)); 1431 1432 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1433 1434 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1435 buf_hash_remove(hdr); 1436 1437 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1438 1439 if (new == hdr_full_cache) { 1440 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1441 /* 1442 * arc_access and arc_change_state need to be aware that a 1443 * header has just come out of L2ARC, so we set its state to 1444 * l2c_only even though it's about to change. 1445 */ 1446 nhdr->b_l1hdr.b_state = arc_l2c_only; 1447 } else { 1448 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1449 ASSERT0(hdr->b_l1hdr.b_datacnt); 1450 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1451 /* 1452 * We might be removing the L1hdr of a buffer which was just 1453 * written out to L2ARC. If such a buffer is compressed then we 1454 * need to free its b_tmp_cdata before destroying the header. 1455 */ 1456 if (hdr->b_l1hdr.b_tmp_cdata != NULL && 1457 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 1458 l2arc_release_cdata_buf(hdr); 1459 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1460 } 1461 /* 1462 * The header has been reallocated so we need to re-insert it into any 1463 * lists it was on. 1464 */ 1465 (void) buf_hash_insert(nhdr, NULL); 1466 1467 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1468 1469 mutex_enter(&dev->l2ad_mtx); 1470 1471 /* 1472 * We must place the realloc'ed header back into the list at 1473 * the same spot. Otherwise, if it's placed earlier in the list, 1474 * l2arc_write_buffers() could find it during the function's 1475 * write phase, and try to write it out to the l2arc. 
1476 */ 1477 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1478 list_remove(&dev->l2ad_buflist, hdr); 1479 1480 mutex_exit(&dev->l2ad_mtx); 1481 1482 /* 1483 * Since we're using the pointer address as the tag when 1484 * incrementing and decrementing the l2ad_alloc refcount, we 1485 * must remove the old pointer (that we're about to destroy) and 1486 * add the new pointer to the refcount. Otherwise we'd remove 1487 * the wrong pointer address when calling arc_hdr_destroy() later. 1488 */ 1489 1490 (void) refcount_remove_many(&dev->l2ad_alloc, 1491 hdr->b_l2hdr.b_asize, hdr); 1492 1493 (void) refcount_add_many(&dev->l2ad_alloc, 1494 nhdr->b_l2hdr.b_asize, nhdr); 1495 1496 buf_discard_identity(hdr); 1497 hdr->b_freeze_cksum = NULL; 1498 kmem_cache_free(old, hdr); 1499 1500 return (nhdr); 1501} 1502 1503 1504#define ARC_MINTIME (hz>>4) /* 62 ms */ 1505 1506static void 1507arc_cksum_verify(arc_buf_t *buf) 1508{ 1509 zio_cksum_t zc; 1510 1511 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1512 return; 1513 1514 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1515 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1516 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1517 return; 1518 } 1519 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1520 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1521 panic("buffer modified while frozen!"); 1522 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1523} 1524 1525static int 1526arc_cksum_equal(arc_buf_t *buf) 1527{ 1528 zio_cksum_t zc; 1529 int equal; 1530 1531 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1532 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1533 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1534 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1535 1536 return (equal); 1537} 1538 1539static void 1540arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1541{ 1542 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1543 return; 1544 1545 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1546 if (buf->b_hdr->b_freeze_cksum != NULL) { 1547 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1548 return; 1549 } 1550 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1551 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1552 buf->b_hdr->b_freeze_cksum); 1553 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1554#ifdef illumos 1555 arc_buf_watch(buf); 1556#endif /* illumos */ 1557} 1558 1559#ifdef illumos 1560#ifndef _KERNEL 1561typedef struct procctl { 1562 long cmd; 1563 prwatch_t prwatch; 1564} procctl_t; 1565#endif 1566 1567/* ARGSUSED */ 1568static void 1569arc_buf_unwatch(arc_buf_t *buf) 1570{ 1571#ifndef _KERNEL 1572 if (arc_watch) { 1573 int result; 1574 procctl_t ctl; 1575 ctl.cmd = PCWATCH; 1576 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1577 ctl.prwatch.pr_size = 0; 1578 ctl.prwatch.pr_wflags = 0; 1579 result = write(arc_procfd, &ctl, sizeof (ctl)); 1580 ASSERT3U(result, ==, sizeof (ctl)); 1581 } 1582#endif 1583} 1584 1585/* ARGSUSED */ 1586static void 1587arc_buf_watch(arc_buf_t *buf) 1588{ 1589#ifndef _KERNEL 1590 if (arc_watch) { 1591 int result; 1592 procctl_t ctl; 1593 ctl.cmd = PCWATCH; 1594 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1595 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1596 ctl.prwatch.pr_wflags = WA_WRITE; 1597 result = write(arc_procfd, &ctl, sizeof (ctl)); 1598 ASSERT3U(result, ==, sizeof (ctl)); 1599 } 1600#endif 1601} 1602#endif /* illumos */ 1603 1604static arc_buf_contents_t 1605arc_buf_type(arc_buf_hdr_t *hdr) 1606{ 1607 if 
(HDR_ISTYPE_METADATA(hdr)) { 1608 return (ARC_BUFC_METADATA); 1609 } else { 1610 return (ARC_BUFC_DATA); 1611 } 1612} 1613 1614static uint32_t 1615arc_bufc_to_flags(arc_buf_contents_t type) 1616{ 1617 switch (type) { 1618 case ARC_BUFC_DATA: 1619 /* metadata field is 0 if buffer contains normal data */ 1620 return (0); 1621 case ARC_BUFC_METADATA: 1622 return (ARC_FLAG_BUFC_METADATA); 1623 default: 1624 break; 1625 } 1626 panic("undefined ARC buffer type!"); 1627 return ((uint32_t)-1); 1628} 1629 1630void 1631arc_buf_thaw(arc_buf_t *buf) 1632{ 1633 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1634 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1635 panic("modifying non-anon buffer!"); 1636 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1637 panic("modifying buffer while i/o in progress!"); 1638 arc_cksum_verify(buf); 1639 } 1640 1641 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1642 if (buf->b_hdr->b_freeze_cksum != NULL) { 1643 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1644 buf->b_hdr->b_freeze_cksum = NULL; 1645 } 1646 1647#ifdef ZFS_DEBUG 1648 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1649 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1650 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1651 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1652 } 1653#endif 1654 1655 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1656 1657#ifdef illumos 1658 arc_buf_unwatch(buf); 1659#endif /* illumos */ 1660} 1661 1662void 1663arc_buf_freeze(arc_buf_t *buf) 1664{ 1665 kmutex_t *hash_lock; 1666 1667 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1668 return; 1669 1670 hash_lock = HDR_LOCK(buf->b_hdr); 1671 mutex_enter(hash_lock); 1672 1673 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1674 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1675 arc_cksum_compute(buf, B_FALSE); 1676 mutex_exit(hash_lock); 1677 1678} 1679 1680static void 1681get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock) 1682{ 1683 uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1684 1685 if (arc_buf_type(hdr) == ARC_BUFC_METADATA) 1686 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1687 else { 1688 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1689 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1690 } 1691 1692 *list = &state->arcs_lists[buf_hashid]; 1693 *lock = ARCS_LOCK(state, buf_hashid); 1694} 1695 1696 1697static void 1698add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1699{ 1700 ASSERT(HDR_HAS_L1HDR(hdr)); 1701 ASSERT(MUTEX_HELD(hash_lock)); 1702 arc_state_t *state = hdr->b_l1hdr.b_state; 1703 1704 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1705 (state != arc_anon)) { 1706 /* We don't use the L2-only state list. 
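 * Taking the first reference makes the buffer un-evictable, so it is
 * pulled off the state's list below and the state's evictable-size
 * accounting (arcs_lsize) is reduced. Ghost state headers have no
 * data attached, so only the header's b_size is subtracted for them.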
*/ 1707 if (state != arc_l2c_only) { 1708 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1709 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1710 list_t *list; 1711 kmutex_t *lock; 1712 1713 get_buf_info(hdr, state, &list, &lock); 1714 ASSERT(!MUTEX_HELD(lock)); 1715 mutex_enter(lock); 1716 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1717 list_remove(list, hdr); 1718 if (GHOST_STATE(state)) { 1719 ASSERT0(hdr->b_l1hdr.b_datacnt); 1720 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1721 delta = hdr->b_size; 1722 } 1723 ASSERT(delta > 0); 1724 ASSERT3U(*size, >=, delta); 1725 atomic_add_64(size, -delta); 1726 mutex_exit(lock); 1727 } 1728 /* remove the prefetch flag if we get a reference */ 1729 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1730 } 1731} 1732 1733static int 1734remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1735{ 1736 int cnt; 1737 arc_state_t *state = hdr->b_l1hdr.b_state; 1738 1739 ASSERT(HDR_HAS_L1HDR(hdr)); 1740 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1741 ASSERT(!GHOST_STATE(state)); 1742 1743 /* 1744 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1745 * check to prevent usage of the arc_l2c_only list. 1746 */ 1747 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1748 (state != arc_anon)) { 1749 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1750 list_t *list; 1751 kmutex_t *lock; 1752 1753 get_buf_info(hdr, state, &list, &lock); 1754 ASSERT(!MUTEX_HELD(lock)); 1755 mutex_enter(lock); 1756 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1757 list_insert_head(list, hdr); 1758 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1759 atomic_add_64(size, hdr->b_size * 1760 hdr->b_l1hdr.b_datacnt); 1761 mutex_exit(lock); 1762 } 1763 return (cnt); 1764} 1765 1766/* 1767 * Move the supplied buffer to the indicated state. The mutex 1768 * for the buffer must be held by the caller. 1769 */ 1770static void 1771arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1772 kmutex_t *hash_lock) 1773{ 1774 arc_state_t *old_state; 1775 int64_t refcnt; 1776 uint32_t datacnt; 1777 uint64_t from_delta, to_delta; 1778 arc_buf_contents_t buftype = arc_buf_type(hdr); 1779 list_t *list; 1780 kmutex_t *lock; 1781 1782 /* 1783 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1784 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1785 * L1 hdr doesn't always exist when we change state to arc_anon before 1786 * destroying a header, in which case reallocating to add the L1 hdr is 1787 * pointless. 1788 */ 1789 if (HDR_HAS_L1HDR(hdr)) { 1790 old_state = hdr->b_l1hdr.b_state; 1791 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1792 datacnt = hdr->b_l1hdr.b_datacnt; 1793 } else { 1794 old_state = arc_l2c_only; 1795 refcnt = 0; 1796 datacnt = 0; 1797 } 1798 1799 ASSERT(MUTEX_HELD(hash_lock)); 1800 ASSERT3P(new_state, !=, old_state); 1801 ASSERT(refcnt == 0 || datacnt > 0); 1802 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1803 ASSERT(old_state != arc_anon || datacnt <= 1); 1804 1805 from_delta = to_delta = datacnt * hdr->b_size; 1806 1807 /* 1808 * If this buffer is evictable, transfer it from the 1809 * old state list to the new state list. 
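 * Only unreferenced headers sit on the state lists (referenced ones
 * were already pulled off in add_reference()), hence the refcnt == 0
 * check. Ghost states account only the header's b_size, since ghost
 * headers have no attached data buffers.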
1810 */ 1811 if (refcnt == 0) { 1812 if (old_state != arc_anon && old_state != arc_l2c_only) { 1813 int use_mutex; 1814 uint64_t *size = &old_state->arcs_lsize[buftype]; 1815 1816 get_buf_info(hdr, old_state, &list, &lock); 1817 use_mutex = !MUTEX_HELD(lock); 1818 if (use_mutex) 1819 mutex_enter(lock); 1820 1821 ASSERT(HDR_HAS_L1HDR(hdr)); 1822 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1823 list_remove(list, hdr); 1824 1825 /* 1826 * If prefetching out of the ghost cache, 1827 * we will have a non-zero datacnt. 1828 */ 1829 if (GHOST_STATE(old_state) && datacnt == 0) { 1830 /* ghost elements have a ghost size */ 1831 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1832 from_delta = hdr->b_size; 1833 } 1834 ASSERT3U(*size, >=, from_delta); 1835 atomic_add_64(size, -from_delta); 1836 1837 if (use_mutex) 1838 mutex_exit(lock); 1839 } 1840 if (new_state != arc_anon && new_state != arc_l2c_only) { 1841 int use_mutex; 1842 uint64_t *size = &new_state->arcs_lsize[buftype]; 1843 1844 /* 1845 * An L1 header always exists here, since if we're 1846 * moving to some L1-cached state (i.e. not l2c_only or 1847 * anonymous), we realloc the header to add an L1hdr 1848 * beforehand. 1849 */ 1850 ASSERT(HDR_HAS_L1HDR(hdr)); 1851 get_buf_info(hdr, new_state, &list, &lock); 1852 use_mutex = !MUTEX_HELD(lock); 1853 if (use_mutex) 1854 mutex_enter(lock); 1855 1856 list_insert_head(list, hdr); 1857 1858 /* ghost elements have a ghost size */ 1859 if (GHOST_STATE(new_state)) { 1860 ASSERT(datacnt == 0); 1861 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1862 to_delta = hdr->b_size; 1863 } 1864 atomic_add_64(size, to_delta); 1865 1866 if (use_mutex) 1867 mutex_exit(lock); 1868 } 1869 } 1870 1871 ASSERT(!BUF_EMPTY(hdr)); 1872 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1873 buf_hash_remove(hdr); 1874 1875 /* adjust state sizes (ignore arc_l2c_only) */ 1876 if (to_delta && new_state != arc_l2c_only) 1877 atomic_add_64(&new_state->arcs_size, to_delta); 1878 if (from_delta && old_state != arc_l2c_only) { 1879 ASSERT3U(old_state->arcs_size, >=, from_delta); 1880 atomic_add_64(&old_state->arcs_size, -from_delta); 1881 } 1882 if (HDR_HAS_L1HDR(hdr)) 1883 hdr->b_l1hdr.b_state = new_state; 1884 1885 /* 1886 * L2 headers should never be on the L2 state list since they don't 1887 * have L1 headers allocated. 
1888 */ 1889#ifdef illumos 1890 ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1891 list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1892#endif 1893} 1894 1895void 1896arc_space_consume(uint64_t space, arc_space_type_t type) 1897{ 1898 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1899 1900 switch (type) { 1901 case ARC_SPACE_DATA: 1902 ARCSTAT_INCR(arcstat_data_size, space); 1903 break; 1904 case ARC_SPACE_META: 1905 ARCSTAT_INCR(arcstat_metadata_size, space); 1906 break; 1907 case ARC_SPACE_OTHER: 1908 ARCSTAT_INCR(arcstat_other_size, space); 1909 break; 1910 case ARC_SPACE_HDRS: 1911 ARCSTAT_INCR(arcstat_hdr_size, space); 1912 break; 1913 case ARC_SPACE_L2HDRS: 1914 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1915 break; 1916 } 1917 1918 if (type != ARC_SPACE_DATA) 1919 ARCSTAT_INCR(arcstat_meta_used, space); 1920 1921 atomic_add_64(&arc_size, space); 1922} 1923 1924void 1925arc_space_return(uint64_t space, arc_space_type_t type) 1926{ 1927 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1928 1929 switch (type) { 1930 case ARC_SPACE_DATA: 1931 ARCSTAT_INCR(arcstat_data_size, -space); 1932 break; 1933 case ARC_SPACE_META: 1934 ARCSTAT_INCR(arcstat_metadata_size, -space); 1935 break; 1936 case ARC_SPACE_OTHER: 1937 ARCSTAT_INCR(arcstat_other_size, -space); 1938 break; 1939 case ARC_SPACE_HDRS: 1940 ARCSTAT_INCR(arcstat_hdr_size, -space); 1941 break; 1942 case ARC_SPACE_L2HDRS: 1943 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1944 break; 1945 } 1946 1947 if (type != ARC_SPACE_DATA) { 1948 ASSERT(arc_meta_used >= space); 1949 if (arc_meta_max < arc_meta_used) 1950 arc_meta_max = arc_meta_used; 1951 ARCSTAT_INCR(arcstat_meta_used, -space); 1952 } 1953 1954 ASSERT(arc_size >= space); 1955 atomic_add_64(&arc_size, -space); 1956} 1957 1958arc_buf_t * 1959arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1960{ 1961 arc_buf_hdr_t *hdr; 1962 arc_buf_t *buf; 1963 1964 ASSERT3U(size, >, 0); 1965 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1966 ASSERT(BUF_EMPTY(hdr)); 1967 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1968 hdr->b_size = size; 1969 hdr->b_spa = spa_load_guid(spa); 1970 1971 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1972 buf->b_hdr = hdr; 1973 buf->b_data = NULL; 1974 buf->b_efunc = NULL; 1975 buf->b_private = NULL; 1976 buf->b_next = NULL; 1977 1978 hdr->b_flags = arc_bufc_to_flags(type); 1979 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1980 1981 hdr->b_l1hdr.b_buf = buf; 1982 hdr->b_l1hdr.b_state = arc_anon; 1983 hdr->b_l1hdr.b_arc_access = 0; 1984 hdr->b_l1hdr.b_datacnt = 1; 1985 1986 arc_get_data_buf(buf); 1987 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1988 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1989 1990 return (buf); 1991} 1992 1993static char *arc_onloan_tag = "onloan"; 1994 1995/* 1996 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1997 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1998 * buffers must be returned to the arc before they can be used by the DMU or 1999 * freed. 2000 */ 2001arc_buf_t * 2002arc_loan_buf(spa_t *spa, int size) 2003{ 2004 arc_buf_t *buf; 2005 2006 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 2007 2008 atomic_add_64(&arc_loaned_bytes, size); 2009 return (buf); 2010} 2011 2012/* 2013 * Return a loaned arc buffer to the arc. 
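 * The reference held under arc_onloan_tag is transferred back to the
 * caller's tag and arc_loaned_bytes is decremented, so the buffer is
 * once again accounted as in-flight data by arc_tempreserve_space().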
2014 */ 2015void 2016arc_return_buf(arc_buf_t *buf, void *tag) 2017{ 2018 arc_buf_hdr_t *hdr = buf->b_hdr; 2019 2020 ASSERT(buf->b_data != NULL); 2021 ASSERT(HDR_HAS_L1HDR(hdr)); 2022 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2023 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2024 2025 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 2026} 2027 2028/* Detach an arc_buf from a dbuf (tag) */ 2029void 2030arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2031{ 2032 arc_buf_hdr_t *hdr = buf->b_hdr; 2033 2034 ASSERT(buf->b_data != NULL); 2035 ASSERT(HDR_HAS_L1HDR(hdr)); 2036 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2037 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2038 buf->b_efunc = NULL; 2039 buf->b_private = NULL; 2040 2041 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2042} 2043 2044static arc_buf_t * 2045arc_buf_clone(arc_buf_t *from) 2046{ 2047 arc_buf_t *buf; 2048 arc_buf_hdr_t *hdr = from->b_hdr; 2049 uint64_t size = hdr->b_size; 2050 2051 ASSERT(HDR_HAS_L1HDR(hdr)); 2052 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2053 2054 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2055 buf->b_hdr = hdr; 2056 buf->b_data = NULL; 2057 buf->b_efunc = NULL; 2058 buf->b_private = NULL; 2059 buf->b_next = hdr->b_l1hdr.b_buf; 2060 hdr->b_l1hdr.b_buf = buf; 2061 arc_get_data_buf(buf); 2062 bcopy(from->b_data, buf->b_data, size); 2063 2064 /* 2065 * This buffer already exists in the arc so create a duplicate 2066 * copy for the caller. If the buffer is associated with user data 2067 * then track the size and number of duplicates. These stats will be 2068 * updated as duplicate buffers are created and destroyed. 2069 */ 2070 if (HDR_ISTYPE_DATA(hdr)) { 2071 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2072 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2073 } 2074 hdr->b_l1hdr.b_datacnt += 1; 2075 return (buf); 2076} 2077 2078void 2079arc_buf_add_ref(arc_buf_t *buf, void* tag) 2080{ 2081 arc_buf_hdr_t *hdr; 2082 kmutex_t *hash_lock; 2083 2084 /* 2085 * Check to see if this buffer is evicted. Callers 2086 * must verify b_data != NULL to know if the add_ref 2087 * was successful. 2088 */ 2089 mutex_enter(&buf->b_evict_lock); 2090 if (buf->b_data == NULL) { 2091 mutex_exit(&buf->b_evict_lock); 2092 return; 2093 } 2094 hash_lock = HDR_LOCK(buf->b_hdr); 2095 mutex_enter(hash_lock); 2096 hdr = buf->b_hdr; 2097 ASSERT(HDR_HAS_L1HDR(hdr)); 2098 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2099 mutex_exit(&buf->b_evict_lock); 2100 2101 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2102 hdr->b_l1hdr.b_state == arc_mfu); 2103 2104 add_reference(hdr, hash_lock, tag); 2105 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2106 arc_access(hdr, hash_lock); 2107 mutex_exit(hash_lock); 2108 ARCSTAT_BUMP(arcstat_hits); 2109 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2110 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2111 data, metadata, hits); 2112} 2113 2114static void 2115arc_buf_free_on_write(void *data, size_t size, 2116 void (*free_func)(void *, size_t)) 2117{ 2118 l2arc_data_free_t *df; 2119 2120 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 2121 df->l2df_data = data; 2122 df->l2df_size = size; 2123 df->l2df_func = free_func; 2124 mutex_enter(&l2arc_free_on_write_mtx); 2125 list_insert_head(l2arc_free_on_write, df); 2126 mutex_exit(&l2arc_free_on_write_mtx); 2127} 2128 2129/* 2130 * Free the arc data buffer. If it is an l2arc write in progress, 2131 * the buffer is placed on l2arc_free_on_write to be freed later. 
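 * The deferral is needed because an in-flight L2ARC write may still be
 * reading from b_data; entries queued on l2arc_free_on_write are freed
 * once that write has completed.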
2132 */ 2133 static void 2134 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2135 { 2136 arc_buf_hdr_t *hdr = buf->b_hdr; 2137 2138 if (HDR_L2_WRITING(hdr)) { 2139 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2140 ARCSTAT_BUMP(arcstat_l2_free_on_write); 2141 } else { 2142 free_func(buf->b_data, hdr->b_size); 2143 } 2144 } 2145 2146 /* 2147 * Free up buf->b_data and if 'remove' is set, then pull the 2148 * arc_buf_t off of the arc_buf_hdr_t's list and free it. 2149 */ 2150 static void 2151 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2152 { 2153 ASSERT(HDR_HAS_L2HDR(hdr)); 2154 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2155 2156 /* 2157 * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2158 * that doesn't exist, the header is in the arc_l2c_only state, 2159 * and there isn't anything to free (it's already been freed). 2160 */ 2161 if (!HDR_HAS_L1HDR(hdr)) 2162 return; 2163 2164 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2165 return; 2166 2167 ASSERT(HDR_L2_WRITING(hdr)); 2168 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, 2169 zio_data_buf_free); 2170 2171 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2172 hdr->b_l1hdr.b_tmp_cdata = NULL; 2173 } 2174 2175 static void 2176 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 2177 { 2178 arc_buf_t **bufp; 2179 2180 /* free up data associated with the buf */ 2181 if (buf->b_data != NULL) { 2182 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2183 uint64_t size = buf->b_hdr->b_size; 2184 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2185 2186 arc_cksum_verify(buf); 2187#ifdef illumos 2188 arc_buf_unwatch(buf); 2189#endif /* illumos */ 2190 2191 if (!recycle) { 2192 if (type == ARC_BUFC_METADATA) { 2193 arc_buf_data_free(buf, zio_buf_free); 2194 arc_space_return(size, ARC_SPACE_META); 2195 } else { 2196 ASSERT(type == ARC_BUFC_DATA); 2197 arc_buf_data_free(buf, zio_data_buf_free); 2198 arc_space_return(size, ARC_SPACE_DATA); 2199 } 2200 } 2201 if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2202 uint64_t *cnt = &state->arcs_lsize[type]; 2203 2204 ASSERT(refcount_is_zero( 2205 &buf->b_hdr->b_l1hdr.b_refcnt)); 2206 ASSERT(state != arc_anon && state != arc_l2c_only); 2207 2208 ASSERT3U(*cnt, >=, size); 2209 atomic_add_64(cnt, -size); 2210 } 2211 ASSERT3U(state->arcs_size, >=, size); 2212 atomic_add_64(&state->arcs_size, -size); 2213 buf->b_data = NULL; 2214 2215 /* 2216 * If we're destroying a duplicate buffer make sure 2217 * that the appropriate statistics are updated.
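 * (These are the duplicate-buffer counters bumped in arc_buf_clone().)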
2218 */ 2219 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2220 HDR_ISTYPE_DATA(buf->b_hdr)) { 2221 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2222 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2223 } 2224 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2225 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2226 } 2227 2228 /* only remove the buf if requested */ 2229 if (!remove) 2230 return; 2231 2232 /* remove the buf from the hdr list */ 2233 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2234 bufp = &(*bufp)->b_next) 2235 continue; 2236 *bufp = buf->b_next; 2237 buf->b_next = NULL; 2238 2239 ASSERT(buf->b_efunc == NULL); 2240 2241 /* clean up the buf */ 2242 buf->b_hdr = NULL; 2243 kmem_cache_free(buf_cache, buf); 2244} 2245 2246static void 2247arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 2248{ 2249 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2250 l2arc_dev_t *dev = l2hdr->b_dev; 2251 2252 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 2253 ASSERT(HDR_HAS_L2HDR(hdr)); 2254 2255 list_remove(&dev->l2ad_buflist, hdr); 2256 2257 /* 2258 * We don't want to leak the b_tmp_cdata buffer that was 2259 * allocated in l2arc_write_buffers() 2260 */ 2261 arc_buf_l2_cdata_free(hdr); 2262 2263 /* 2264 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then 2265 * this header is being processed by l2arc_write_buffers() (i.e. 2266 * it's in the first stage of l2arc_write_buffers()). 2267 * Re-affirming that truth here, just to serve as a reminder. If 2268 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or 2269 * may not have its HDR_L2_WRITING flag set. (the write may have 2270 * completed, in which case HDR_L2_WRITING will be false and the 2271 * b_daddr field will point to the address of the buffer on disk). 2272 */ 2273 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); 2274 2275 /* 2276 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with 2277 * l2arc_write_buffers(). Since we've just removed this header 2278 * from the l2arc buffer list, this header will never reach the 2279 * second stage of l2arc_write_buffers(), which increments the 2280 * accounting stats for this header. Thus, we must be careful 2281 * not to decrement them for this header either. 2282 */ 2283 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { 2284 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2285 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2286 2287 vdev_space_update(dev->l2ad_vdev, 2288 -l2hdr->b_asize, 0, 0); 2289 2290 (void) refcount_remove_many(&dev->l2ad_alloc, 2291 l2hdr->b_asize, hdr); 2292 } 2293 2294 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2295} 2296 2297static void 2298arc_hdr_destroy(arc_buf_hdr_t *hdr) 2299{ 2300 if (HDR_HAS_L1HDR(hdr)) { 2301 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2302 hdr->b_l1hdr.b_datacnt > 0); 2303 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2304 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2305 } 2306 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2307 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2308 2309 if (HDR_HAS_L2HDR(hdr)) { 2310 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 2311 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 2312 2313 if (!buflist_held) 2314 mutex_enter(&dev->l2ad_mtx); 2315 2316 /* 2317 * Even though we checked this conditional above, we 2318 * need to check this again now that we have the 2319 * l2ad_mtx. This is because we could be racing with 2320 * another thread calling l2arc_evict() which might have 2321 * destroyed this header's L2 portion as we were waiting 2322 * to acquire the l2ad_mtx. If that happens, we don't 2323 * want to re-destroy the header's L2 portion. 
2324 */ 2325 if (HDR_HAS_L2HDR(hdr)) { 2326 trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 2327 hdr->b_l2hdr.b_asize, 0); 2328 arc_hdr_l2hdr_destroy(hdr); 2329 } 2330 2331 if (!buflist_held) 2332 mutex_exit(&dev->l2ad_mtx); 2333 } 2334 2335 if (!BUF_EMPTY(hdr)) 2336 buf_discard_identity(hdr); 2337 if (hdr->b_freeze_cksum != NULL) { 2338 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2339 hdr->b_freeze_cksum = NULL; 2340 } 2341 2342 if (HDR_HAS_L1HDR(hdr)) { 2343 while (hdr->b_l1hdr.b_buf) { 2344 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2345 2346 if (buf->b_efunc != NULL) { 2347 mutex_enter(&arc_eviction_mtx); 2348 mutex_enter(&buf->b_evict_lock); 2349 ASSERT(buf->b_hdr != NULL); 2350 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2351 FALSE); 2352 hdr->b_l1hdr.b_buf = buf->b_next; 2353 buf->b_hdr = &arc_eviction_hdr; 2354 buf->b_next = arc_eviction_list; 2355 arc_eviction_list = buf; 2356 mutex_exit(&buf->b_evict_lock); 2357 mutex_exit(&arc_eviction_mtx); 2358 } else { 2359 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2360 TRUE); 2361 } 2362 } 2363#ifdef ZFS_DEBUG 2364 if (hdr->b_l1hdr.b_thawed != NULL) { 2365 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2366 hdr->b_l1hdr.b_thawed = NULL; 2367 } 2368#endif 2369 } 2370 2371 ASSERT3P(hdr->b_hash_next, ==, NULL); 2372 if (HDR_HAS_L1HDR(hdr)) { 2373 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 2374 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2375 kmem_cache_free(hdr_full_cache, hdr); 2376 } else { 2377 kmem_cache_free(hdr_l2only_cache, hdr); 2378 } 2379} 2380 2381void 2382arc_buf_free(arc_buf_t *buf, void *tag) 2383{ 2384 arc_buf_hdr_t *hdr = buf->b_hdr; 2385 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2386 2387 ASSERT(buf->b_efunc == NULL); 2388 ASSERT(buf->b_data != NULL); 2389 2390 if (hashed) { 2391 kmutex_t *hash_lock = HDR_LOCK(hdr); 2392 2393 mutex_enter(hash_lock); 2394 hdr = buf->b_hdr; 2395 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2396 2397 (void) remove_reference(hdr, hash_lock, tag); 2398 if (hdr->b_l1hdr.b_datacnt > 1) { 2399 arc_buf_destroy(buf, FALSE, TRUE); 2400 } else { 2401 ASSERT(buf == hdr->b_l1hdr.b_buf); 2402 ASSERT(buf->b_efunc == NULL); 2403 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2404 } 2405 mutex_exit(hash_lock); 2406 } else if (HDR_IO_IN_PROGRESS(hdr)) { 2407 int destroy_hdr; 2408 /* 2409 * We are in the middle of an async write. Don't destroy 2410 * this buffer unless the write completes before we finish 2411 * decrementing the reference count. 
2412 */ 2413 mutex_enter(&arc_eviction_mtx); 2414 (void) remove_reference(hdr, NULL, tag); 2415 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2416 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2417 mutex_exit(&arc_eviction_mtx); 2418 if (destroy_hdr) 2419 arc_hdr_destroy(hdr); 2420 } else { 2421 if (remove_reference(hdr, NULL, tag) > 0) 2422 arc_buf_destroy(buf, FALSE, TRUE); 2423 else 2424 arc_hdr_destroy(hdr); 2425 } 2426} 2427 2428boolean_t 2429arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2430{ 2431 arc_buf_hdr_t *hdr = buf->b_hdr; 2432 kmutex_t *hash_lock = HDR_LOCK(hdr); 2433 boolean_t no_callback = (buf->b_efunc == NULL); 2434 2435 if (hdr->b_l1hdr.b_state == arc_anon) { 2436 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2437 arc_buf_free(buf, tag); 2438 return (no_callback); 2439 } 2440 2441 mutex_enter(hash_lock); 2442 hdr = buf->b_hdr; 2443 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2444 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2445 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2446 ASSERT(buf->b_data != NULL); 2447 2448 (void) remove_reference(hdr, hash_lock, tag); 2449 if (hdr->b_l1hdr.b_datacnt > 1) { 2450 if (no_callback) 2451 arc_buf_destroy(buf, FALSE, TRUE); 2452 } else if (no_callback) { 2453 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2454 ASSERT(buf->b_efunc == NULL); 2455 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2456 } 2457 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2458 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2459 mutex_exit(hash_lock); 2460 return (no_callback); 2461} 2462 2463int32_t 2464arc_buf_size(arc_buf_t *buf) 2465{ 2466 return (buf->b_hdr->b_size); 2467} 2468 2469/* 2470 * Called from the DMU to determine if the current buffer should be 2471 * evicted. In order to ensure proper locking, the eviction must be initiated 2472 * from the DMU. Return true if the buffer is associated with user data and 2473 * duplicate buffers still exist. 2474 */ 2475boolean_t 2476arc_buf_eviction_needed(arc_buf_t *buf) 2477{ 2478 arc_buf_hdr_t *hdr; 2479 boolean_t evict_needed = B_FALSE; 2480 2481 if (zfs_disable_dup_eviction) 2482 return (B_FALSE); 2483 2484 mutex_enter(&buf->b_evict_lock); 2485 hdr = buf->b_hdr; 2486 if (hdr == NULL) { 2487 /* 2488 * We are in arc_do_user_evicts(); let that function 2489 * perform the eviction. 2490 */ 2491 ASSERT(buf->b_data == NULL); 2492 mutex_exit(&buf->b_evict_lock); 2493 return (B_FALSE); 2494 } else if (buf->b_data == NULL) { 2495 /* 2496 * We have already been added to the arc eviction list; 2497 * recommend eviction. 2498 */ 2499 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2500 mutex_exit(&buf->b_evict_lock); 2501 return (B_TRUE); 2502 } 2503 2504 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2505 evict_needed = B_TRUE; 2506 2507 mutex_exit(&buf->b_evict_lock); 2508 return (evict_needed); 2509} 2510 2511/* 2512 * Evict buffers from list until we've removed the specified number of 2513 * bytes. Move the removed buffers to the appropriate evict state. 2514 * If the recycle flag is set, then attempt to "recycle" a buffer: 2515 * - look for a buffer to evict that is `bytes' long. 2516 * - return the data block from this buffer rather than freeing it. 2517 * This flag is used by callers that are trying to make space for a 2518 * new buffer in a full arc cache. 2519 * 2520 * This function makes a "best effort". It skips over any buffers 2521 * it can't get a hash_lock on, and so may not catch all candidates. 2522 * It may also return without evicting as much space as requested. 
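 * While the list lock is dropped to let other threads run, a marker
 * header (b_spa == 0) is left in place so the scan can resume where it
 * stopped; the loop itself skips such markers.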
2523 */ 2524 static void * 2525 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 2526 arc_buf_contents_t type) 2527 { 2528 arc_state_t *evicted_state; 2529 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 2530 int64_t bytes_remaining; 2531 arc_buf_hdr_t *hdr, *hdr_prev = NULL; 2532 list_t *evicted_list, *list, *evicted_list_start, *list_start; 2533 kmutex_t *lock, *evicted_lock; 2534 kmutex_t *hash_lock; 2535 boolean_t have_lock; 2536 void *stolen = NULL; 2537 arc_buf_hdr_t marker = { 0 }; 2538 int count = 0; 2539 static int evict_metadata_offset, evict_data_offset; 2540 int i, idx, offset, list_count, lists; 2541 2542 ASSERT(state == arc_mru || state == arc_mfu); 2543 2544 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2545 2546 /* 2547 * Decide which "type" (data vs metadata) to recycle from. 2548 * 2549 * If we are over the metadata limit, recycle from metadata. 2550 * If we are under the metadata minimum, recycle from data. 2551 * Otherwise, recycle from whichever type has the oldest (least 2552 * recently accessed) header. This is not yet implemented. 2553 */ 2554 if (recycle) { 2555 arc_buf_contents_t realtype; 2556 if (state->arcs_lsize[ARC_BUFC_DATA] == 0) { 2557 realtype = ARC_BUFC_METADATA; 2558 } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) { 2559 realtype = ARC_BUFC_DATA; 2560 } else if (arc_meta_used >= arc_meta_limit) { 2561 realtype = ARC_BUFC_METADATA; 2562 } else if (arc_meta_used <= arc_meta_min) { 2563 realtype = ARC_BUFC_DATA; 2564#ifdef illumos 2565 } else if (HDR_HAS_L1HDR(data_hdr) && 2566 HDR_HAS_L1HDR(metadata_hdr) && 2567 data_hdr->b_l1hdr.b_arc_access < 2568 metadata_hdr->b_l1hdr.b_arc_access) { 2569 realtype = ARC_BUFC_DATA; 2570 } else { 2571 realtype = ARC_BUFC_METADATA; 2572#else 2573 } else { 2574 /* TODO */ 2575 realtype = type; 2576#endif 2577 } 2578 if (realtype != type) { 2579 /* 2580 * If we want to evict from a different list, 2581 * we can not recycle, because DATA vs METADATA 2582 * buffers are segregated into different kmem 2583 * caches (and vmem arenas). 2584 */ 2585 type = realtype; 2586 recycle = B_FALSE; 2587 } 2588 } 2589 2590 if (type == ARC_BUFC_METADATA) { 2591 offset = 0; 2592 list_count = ARC_BUFC_NUMMETADATALISTS; 2593 list_start = &state->arcs_lists[0]; 2594 evicted_list_start = &evicted_state->arcs_lists[0]; 2595 idx = evict_metadata_offset; 2596 } else { 2597 offset = ARC_BUFC_NUMMETADATALISTS; 2598 list_start = &state->arcs_lists[offset]; 2599 evicted_list_start = &evicted_state->arcs_lists[offset]; 2600 list_count = ARC_BUFC_NUMDATALISTS; 2601 idx = evict_data_offset; 2602 } 2603 bytes_remaining = evicted_state->arcs_lsize[type]; 2604 lists = 0; 2605 2606evict_start: 2607 list = &list_start[idx]; 2608 evicted_list = &evicted_list_start[idx]; 2609 lock = ARCS_LOCK(state, (offset + idx)); 2610 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 2611 2612 /* 2613 * The ghost list lock must be acquired first in order to prevent 2614 * a 3-party deadlock: 2615 * 2616 * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by 2617 * l2ad_mtx in arc_hdr_realloc 2618 * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx 2619 * - arc_evict acquires arc_*->arcs_mtx, followed by 2620 * arc_*_ghost->arcs_mtx and forms a deadlock cycle. 2621 * 2622 * This situation is avoided by acquiring the ghost list lock first.
2623 */ 2624 mutex_enter(evicted_lock); 2625 mutex_enter(lock); 2626 2627 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2628 hdr_prev = list_prev(list, hdr); 2629 if (HDR_HAS_L1HDR(hdr)) { 2630 bytes_remaining -= 2631 (hdr->b_size * hdr->b_l1hdr.b_datacnt); 2632 } 2633 /* prefetch buffers have a minimum lifespan */ 2634 if (HDR_IO_IN_PROGRESS(hdr) || 2635 (spa && hdr->b_spa != spa) || 2636 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2637 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2638 arc_min_prefetch_lifespan)) { 2639 skipped++; 2640 continue; 2641 } 2642 /* "lookahead" for better eviction candidate */ 2643 if (recycle && hdr->b_size != bytes && 2644 hdr_prev && hdr_prev->b_size == bytes) 2645 continue; 2646 2647 /* ignore markers */ 2648 if (hdr->b_spa == 0) 2649 continue; 2650 2651 /* 2652 * It may take a long time to evict all the bufs requested. 2653 * To avoid blocking all arc activity, periodically drop 2654 * the arcs_mtx and give other threads a chance to run 2655 * before reacquiring the lock. 2656 * 2657 * If we are looking for a buffer to recycle, we are in 2658 * the hot code path, so don't sleep. 2659 */ 2660 if (!recycle && count++ > arc_evict_iterations) { 2661 list_insert_after(list, hdr, &marker); 2662 mutex_exit(lock); 2663 mutex_exit(evicted_lock); 2664 kpreempt(KPREEMPT_SYNC); 2665 mutex_enter(evicted_lock); 2666 mutex_enter(lock); 2667 hdr_prev = list_prev(list, &marker); 2668 list_remove(list, &marker); 2669 count = 0; 2670 continue; 2671 } 2672 2673 hash_lock = HDR_LOCK(hdr); 2674 have_lock = MUTEX_HELD(hash_lock); 2675 if (have_lock || mutex_tryenter(hash_lock)) { 2676 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2677 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2678 while (hdr->b_l1hdr.b_buf) { 2679 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2680 if (!mutex_tryenter(&buf->b_evict_lock)) { 2681 missed += 1; 2682 break; 2683 } 2684 if (buf->b_data != NULL) { 2685 bytes_evicted += hdr->b_size; 2686 if (recycle && 2687 arc_buf_type(hdr) == type && 2688 hdr->b_size == bytes && 2689 !HDR_L2_WRITING(hdr)) { 2690 stolen = buf->b_data; 2691 recycle = FALSE; 2692 } 2693 } 2694 if (buf->b_efunc != NULL) { 2695 mutex_enter(&arc_eviction_mtx); 2696 arc_buf_destroy(buf, 2697 buf->b_data == stolen, FALSE); 2698 hdr->b_l1hdr.b_buf = buf->b_next; 2699 buf->b_hdr = &arc_eviction_hdr; 2700 buf->b_next = arc_eviction_list; 2701 arc_eviction_list = buf; 2702 mutex_exit(&arc_eviction_mtx); 2703 mutex_exit(&buf->b_evict_lock); 2704 } else { 2705 mutex_exit(&buf->b_evict_lock); 2706 arc_buf_destroy(buf, 2707 buf->b_data == stolen, TRUE); 2708 } 2709 } 2710 2711 if (HDR_HAS_L2HDR(hdr)) { 2712 ARCSTAT_INCR(arcstat_evict_l2_cached, 2713 hdr->b_size); 2714 } else { 2715 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 2716 ARCSTAT_INCR(arcstat_evict_l2_eligible, 2717 hdr->b_size); 2718 } else { 2719 ARCSTAT_INCR( 2720 arcstat_evict_l2_ineligible, 2721 hdr->b_size); 2722 } 2723 } 2724 2725 if (hdr->b_l1hdr.b_datacnt == 0) { 2726 arc_change_state(evicted_state, hdr, hash_lock); 2727 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2728 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2729 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2730 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2731 } 2732 if (!have_lock) 2733 mutex_exit(hash_lock); 2734 if (bytes >= 0 && bytes_evicted >= bytes) 2735 break; 2736 if (bytes_remaining > 0) { 2737 mutex_exit(evicted_lock); 2738 mutex_exit(lock); 2739 idx = ((idx + 1) & (list_count - 1)); 2740 lists++; 2741 goto evict_start; 2742 } 2743 } else { 2744 missed += 1; 2745 } 
2746 } 2747 2748 mutex_exit(lock); 2749 mutex_exit(evicted_lock); 2750 2751 idx = ((idx + 1) & (list_count - 1)); 2752 lists++; 2753 2754 if (bytes_evicted < bytes) { 2755 if (lists < list_count) 2756 goto evict_start; 2757 else 2758 dprintf("only evicted %lld bytes from %x", 2759 (longlong_t)bytes_evicted, state); 2760 } 2761 if (type == ARC_BUFC_METADATA) 2762 evict_metadata_offset = idx; 2763 else 2764 evict_data_offset = idx; 2765 2766 if (skipped) 2767 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2768 2769 if (missed) 2770 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2771 2772 /* 2773 * Note: we have just evicted some data into the ghost state, 2774 * potentially putting the ghost size over the desired size. Rather 2775 * than evicting from the ghost list in this hot code path, leave 2776 * this chore to the arc_reclaim_thread(). 2777 */ 2778 2779 if (stolen) 2780 ARCSTAT_BUMP(arcstat_stolen); 2781 return (stolen); 2782 } 2783 2784 /* 2785 * Remove buffers from list until we've removed the specified number of 2786 * bytes. Destroy the buffers that are removed. 2787 */ 2788 static void 2789 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2790 { 2791 arc_buf_hdr_t *hdr, *hdr_prev; 2792 arc_buf_hdr_t marker = { 0 }; 2793 list_t *list, *list_start; 2794 kmutex_t *hash_lock, *lock; 2795 uint64_t bytes_deleted = 0; 2796 uint64_t bufs_skipped = 0; 2797 int count = 0; 2798 static int evict_offset; 2799 int list_count, idx = evict_offset; 2800 int offset, lists = 0; 2801 2802 ASSERT(GHOST_STATE(state)); 2803 2804 /* 2805 * data lists come after metadata lists 2806 */ 2807 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2808 list_count = ARC_BUFC_NUMDATALISTS; 2809 offset = ARC_BUFC_NUMMETADATALISTS; 2810 2811evict_start: 2812 list = &list_start[idx]; 2813 lock = ARCS_LOCK(state, idx + offset); 2814 2815 mutex_enter(lock); 2816 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2817 hdr_prev = list_prev(list, hdr); 2818 if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) 2819 panic("invalid hdr=%p", (void *)hdr); 2820 if (spa && hdr->b_spa != spa) 2821 continue; 2822 2823 /* ignore markers */ 2824 if (hdr->b_spa == 0) 2825 continue; 2826 2827 hash_lock = HDR_LOCK(hdr); 2828 /* caller may be trying to modify this buffer, skip it */ 2829 if (MUTEX_HELD(hash_lock)) 2830 continue; 2831 2832 /* 2833 * It may take a long time to evict all the bufs requested. 2834 * To avoid blocking all arc activity, periodically drop 2835 * the arcs_mtx and give other threads a chance to run 2836 * before reacquiring the lock. 2837 */ 2838 if (count++ > arc_evict_iterations) { 2839 list_insert_after(list, hdr, &marker); 2840 mutex_exit(lock); 2841 kpreempt(KPREEMPT_SYNC); 2842 mutex_enter(lock); 2843 hdr_prev = list_prev(list, &marker); 2844 list_remove(list, &marker); 2845 count = 0; 2846 continue; 2847 } 2848 if (mutex_tryenter(hash_lock)) { 2849 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2850 ASSERT(!HDR_HAS_L1HDR(hdr) || 2851 hdr->b_l1hdr.b_buf == NULL); 2852 ARCSTAT_BUMP(arcstat_deleted); 2853 bytes_deleted += hdr->b_size; 2854 2855 if (HDR_HAS_L2HDR(hdr)) { 2856 /* 2857 * This buffer is cached on the 2nd Level ARC; 2858 * don't destroy the header. 2859 */ 2860 arc_change_state(arc_l2c_only, hdr, hash_lock); 2861 /* 2862 * dropping from L1+L2 cached to L2-only, 2863 * realloc to remove the L1 header.
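 * arc_hdr_realloc() switches the header from hdr_full_cache to the
 * smaller hdr_l2only_cache, so only the L2-related fields remain
 * allocated.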
2864 */ 2865 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2866 hdr_l2only_cache); 2867 mutex_exit(hash_lock); 2868 } else { 2869 arc_change_state(arc_anon, hdr, hash_lock); 2870 mutex_exit(hash_lock); 2871 arc_hdr_destroy(hdr); 2872 } 2873 2874 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2875 if (bytes >= 0 && bytes_deleted >= bytes) 2876 break; 2877 } else if (bytes < 0) { 2878 /* 2879 * Insert a list marker and then wait for the 2880 * hash lock to become available. Once its 2881 * available, restart from where we left off. 2882 */ 2883 list_insert_after(list, hdr, &marker); 2884 mutex_exit(lock); 2885 mutex_enter(hash_lock); 2886 mutex_exit(hash_lock); 2887 mutex_enter(lock); 2888 hdr_prev = list_prev(list, &marker); 2889 list_remove(list, &marker); 2890 } else { 2891 bufs_skipped += 1; 2892 } 2893 2894 } 2895 mutex_exit(lock); 2896 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2897 lists++; 2898 2899 if (lists < list_count) 2900 goto evict_start; 2901 2902 evict_offset = idx; 2903 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2904 (bytes < 0 || bytes_deleted < bytes)) { 2905 list_start = &state->arcs_lists[0]; 2906 list_count = ARC_BUFC_NUMMETADATALISTS; 2907 offset = lists = 0; 2908 goto evict_start; 2909 } 2910 2911 if (bufs_skipped) { 2912 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2913 ASSERT(bytes >= 0); 2914 } 2915 2916 if (bytes_deleted < bytes) 2917 dprintf("only deleted %lld bytes from %p", 2918 (longlong_t)bytes_deleted, state); 2919} 2920 2921static void 2922arc_adjust(void) 2923{ 2924 int64_t adjustment, delta; 2925 2926 /* 2927 * Adjust MRU size 2928 */ 2929 2930 adjustment = MIN((int64_t)(arc_size - arc_c), 2931 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2932 arc_p)); 2933 2934 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2935 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2936 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2937 adjustment -= delta; 2938 } 2939 2940 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2941 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2942 (void) arc_evict(arc_mru, 0, delta, FALSE, 2943 ARC_BUFC_METADATA); 2944 } 2945 2946 /* 2947 * Adjust MFU size 2948 */ 2949 2950 adjustment = arc_size - arc_c; 2951 2952 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2953 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2954 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2955 adjustment -= delta; 2956 } 2957 2958 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2959 int64_t delta = MIN(adjustment, 2960 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2961 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2962 ARC_BUFC_METADATA); 2963 } 2964 2965 /* 2966 * Adjust ghost lists 2967 */ 2968 2969 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2970 2971 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2972 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2973 arc_evict_ghost(arc_mru_ghost, 0, delta); 2974 } 2975 2976 adjustment = 2977 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2978 2979 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2980 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2981 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2982 } 2983} 2984 2985static void 2986arc_do_user_evicts(void) 2987{ 2988 static arc_buf_t *tmp_arc_eviction_list; 2989 2990 /* 2991 * Move list over to avoid LOR 2992 */ 
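/*
 * The pending list is detached while arc_eviction_mtx is held and then
 * walked without it, so each b_efunc callback runs with no ARC locks
 * held.  If more buffers are queued while the detached list is being
 * processed, the loop below starts over.
 */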
2993restart: 2994 mutex_enter(&arc_eviction_mtx); 2995 tmp_arc_eviction_list = arc_eviction_list; 2996 arc_eviction_list = NULL; 2997 mutex_exit(&arc_eviction_mtx); 2998 2999 while (tmp_arc_eviction_list != NULL) { 3000 arc_buf_t *buf = tmp_arc_eviction_list; 3001 tmp_arc_eviction_list = buf->b_next; 3002 mutex_enter(&buf->b_evict_lock); 3003 buf->b_hdr = NULL; 3004 mutex_exit(&buf->b_evict_lock); 3005 3006 if (buf->b_efunc != NULL) 3007 VERIFY0(buf->b_efunc(buf->b_private)); 3008 3009 buf->b_efunc = NULL; 3010 buf->b_private = NULL; 3011 kmem_cache_free(buf_cache, buf); 3012 } 3013 3014 if (arc_eviction_list != NULL) 3015 goto restart; 3016} 3017 3018/* 3019 * Flush all *evictable* data from the cache for the given spa. 3020 * NOTE: this will not touch "active" (i.e. referenced) data. 3021 */ 3022void 3023arc_flush(spa_t *spa) 3024{ 3025 uint64_t guid = 0; 3026 3027 if (spa != NULL) 3028 guid = spa_load_guid(spa); 3029 3030 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 3031 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 3032 if (spa != NULL) 3033 break; 3034 } 3035 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 3036 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 3037 if (spa != NULL) 3038 break; 3039 } 3040 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 3041 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 3042 if (spa != NULL) 3043 break; 3044 } 3045 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 3046 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 3047 if (spa != NULL) 3048 break; 3049 } 3050 3051 arc_evict_ghost(arc_mru_ghost, guid, -1); 3052 arc_evict_ghost(arc_mfu_ghost, guid, -1); 3053 3054 mutex_enter(&arc_reclaim_thr_lock); 3055 arc_do_user_evicts(); 3056 mutex_exit(&arc_reclaim_thr_lock); 3057 ASSERT(spa || arc_eviction_list == NULL); 3058} 3059 3060void 3061arc_shrink(void) 3062{ 3063 3064 if (arc_c > arc_c_min) { 3065 uint64_t to_free; 3066 3067 to_free = arc_c >> arc_shrink_shift; 3068 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 3069 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 3070 if (arc_c > arc_c_min + to_free) 3071 atomic_add_64(&arc_c, -to_free); 3072 else 3073 arc_c = arc_c_min; 3074 3075 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 3076 if (arc_c > arc_size) 3077 arc_c = MAX(arc_size, arc_c_min); 3078 if (arc_p > arc_c) 3079 arc_p = (arc_c >> 1); 3080 3081 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3082 arc_p); 3083 3084 ASSERT(arc_c >= arc_c_min); 3085 ASSERT((int64_t)arc_p >= 0); 3086 } 3087 3088 if (arc_size > arc_c) { 3089 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3090 uint64_t, arc_c); 3091 arc_adjust(); 3092 } 3093} 3094 3095static int needfree = 0; 3096 3097static int 3098arc_reclaim_needed(void) 3099{ 3100 3101#ifdef _KERNEL 3102 3103 if (needfree) { 3104 DTRACE_PROBE(arc__reclaim_needfree); 3105 return (1); 3106 } 3107 3108 /* 3109 * Cooperate with pagedaemon when it's time for it to scan 3110 * and reclaim some pages. 3111 */ 3112 if (freemem < zfs_arc_free_target) { 3113 DTRACE_PROBE2(arc__reclaim_freemem, uint64_t, 3114 freemem, uint64_t, zfs_arc_free_target); 3115 return (1); 3116 } 3117 3118#ifdef sun 3119 /* 3120 * take 'desfree' extra pages, so we reclaim sooner, rather than later 3121 */ 3122 extra = desfree; 3123 3124 /* 3125 * check that we're out of range of the pageout scanner. It starts to 3126 * schedule paging if freemem is less than lotsfree and needfree. 
3127 * lotsfree is the high-water mark for pageout, and needfree is the 3128 * number of needed free pages. We add extra pages here to make sure 3129 * the scanner doesn't start up while we're freeing memory. 3130 */ 3131 if (freemem < lotsfree + needfree + extra) 3132 return (1); 3133 3134 /* 3135 * check to make sure that swapfs has enough space so that anon 3136 * reservations can still succeed. anon_resvmem() checks that the 3137 * availrmem is greater than swapfs_minfree, and the number of reserved 3138 * swap pages. We also add a bit of extra here just to prevent 3139 * circumstances from getting really dire. 3140 */ 3141 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 3142 return (1); 3143 3144 /* 3145 * Check that we have enough availrmem that memory locking (e.g., via 3146 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3147 * stores the number of pages that cannot be locked; when availrmem 3148 * drops below pages_pp_maximum, page locking mechanisms such as 3149 * page_pp_lock() will fail.) 3150 */ 3151 if (availrmem <= pages_pp_maximum) 3152 return (1); 3153 3154#endif /* sun */ 3155#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3156 /* 3157 * If we're on an i386 platform, it's possible that we'll exhaust the 3158 * kernel heap space before we ever run out of available physical 3159 * memory. Most checks of the size of the heap_area compare against 3160 * tune.t_minarmem, which is the minimum available real memory that we 3161 * can have in the system. However, this is generally fixed at 25 pages 3162 * which is so low that it's useless. In this comparison, we seek to 3163 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3164 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3165 * free) 3166 */ 3167 if (vmem_size(heap_arena, VMEM_FREE) < 3168 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) { 3169 DTRACE_PROBE2(arc__reclaim_used, uint64_t, 3170 vmem_size(heap_arena, VMEM_FREE), uint64_t, 3171 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2); 3172 return (1); 3173 } 3174#define zio_arena NULL 3175#else 3176#define zio_arena heap_arena 3177#endif 3178 3179 /* 3180 * If zio data pages are being allocated out of a separate heap segment, 3181 * then enforce that the size of available vmem for this arena remains 3182 * above about 1/16th free. 3183 * 3184 * Note: The 1/16th arena free requirement was put in place 3185 * to aggressively evict memory from the arc in order to avoid 3186 * memory fragmentation issues. 3187 */ 3188 if (zio_arena != NULL && 3189 vmem_size(zio_arena, VMEM_FREE) < 3190 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 3191 return (1); 3192 3193 /* 3194 * Above limits know nothing about real level of KVA fragmentation. 3195 * Start aggressive reclamation if too little sequential KVA left. 
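 * VMEM_MAXFREE is used as the size of the largest remaining contiguous
 * span in the heap arena; once it falls below zfs_max_recordsize, a
 * maximum-sized buffer may no longer be mappable without heavy
 * defragmentation, so reclamation starts early.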
3196 */ 3197 if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) { 3198 DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t, 3199 vmem_size(heap_arena, VMEM_MAXFREE), 3200 uint64_t, zfs_max_recordsize); 3201 return (1); 3202 } 3203 3204#else /* _KERNEL */ 3205 if (spa_get_random(100) == 0) 3206 return (1); 3207#endif /* _KERNEL */ 3208 DTRACE_PROBE(arc__reclaim_no); 3209 3210 return (0); 3211} 3212 3213extern kmem_cache_t *zio_buf_cache[]; 3214extern kmem_cache_t *zio_data_buf_cache[]; 3215extern kmem_cache_t *range_seg_cache; 3216 3217static __noinline void 3218arc_kmem_reap_now(arc_reclaim_strategy_t strat) 3219{ 3220 size_t i; 3221 kmem_cache_t *prev_cache = NULL; 3222 kmem_cache_t *prev_data_cache = NULL; 3223 3224 DTRACE_PROBE(arc__kmem_reap_start); 3225#ifdef _KERNEL 3226 if (arc_meta_used >= arc_meta_limit) { 3227 /* 3228 * We are exceeding our meta-data cache limit. 3229 * Purge some DNLC entries to release holds on meta-data. 3230 */ 3231 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3232 } 3233#if defined(__i386) 3234 /* 3235 * Reclaim unused memory from all kmem caches. 3236 */ 3237 kmem_reap(); 3238#endif 3239#endif 3240 3241 /* 3242 * An aggressive reclamation will shrink the cache size as well as 3243 * reap free buffers from the arc kmem caches. 3244 */ 3245 if (strat == ARC_RECLAIM_AGGR) 3246 arc_shrink(); 3247 3248 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3249 if (zio_buf_cache[i] != prev_cache) { 3250 prev_cache = zio_buf_cache[i]; 3251 kmem_cache_reap_now(zio_buf_cache[i]); 3252 } 3253 if (zio_data_buf_cache[i] != prev_data_cache) { 3254 prev_data_cache = zio_data_buf_cache[i]; 3255 kmem_cache_reap_now(zio_data_buf_cache[i]); 3256 } 3257 } 3258 kmem_cache_reap_now(buf_cache); 3259 kmem_cache_reap_now(hdr_full_cache); 3260 kmem_cache_reap_now(hdr_l2only_cache); 3261 kmem_cache_reap_now(range_seg_cache); 3262 3263#ifdef sun 3264 /* 3265 * Ask the vmem arena to reclaim unused memory from its 3266 * quantum caches. 3267 */ 3268 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 3269 vmem_qcache_reap(zio_arena); 3270#endif 3271 DTRACE_PROBE(arc__kmem_reap_end); 3272} 3273 3274static void 3275arc_reclaim_thread(void *dummy __unused) 3276{ 3277 clock_t growtime = 0; 3278 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 3279 callb_cpr_t cpr; 3280 3281 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 3282 3283 mutex_enter(&arc_reclaim_thr_lock); 3284 while (arc_thread_exit == 0) { 3285 if (arc_reclaim_needed()) { 3286 3287 if (arc_no_grow) { 3288 if (last_reclaim == ARC_RECLAIM_CONS) { 3289 DTRACE_PROBE(arc__reclaim_aggr_no_grow); 3290 last_reclaim = ARC_RECLAIM_AGGR; 3291 } else { 3292 last_reclaim = ARC_RECLAIM_CONS; 3293 } 3294 } else { 3295 arc_no_grow = TRUE; 3296 last_reclaim = ARC_RECLAIM_AGGR; 3297 DTRACE_PROBE(arc__reclaim_aggr); 3298 membar_producer(); 3299 } 3300 3301 /* reset the growth delay for every reclaim */ 3302 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 3303 3304 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 3305 /* 3306 * If needfree is TRUE our vm_lowmem hook 3307 * was called and in that case we must free some 3308 * memory, so switch to aggressive mode. 
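 * Aggressive mode additionally makes arc_kmem_reap_now() call
 * arc_shrink(), cutting the target cache size instead of only reaping
 * the kmem caches.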
3309 */ 3310 arc_no_grow = TRUE; 3311 last_reclaim = ARC_RECLAIM_AGGR; 3312 } 3313 arc_kmem_reap_now(last_reclaim); 3314 arc_warm = B_TRUE; 3315 3316 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 3317 arc_no_grow = FALSE; 3318 } 3319 3320 arc_adjust(); 3321 3322 if (arc_eviction_list != NULL) 3323 arc_do_user_evicts(); 3324 3325#ifdef _KERNEL 3326 if (needfree) { 3327 needfree = 0; 3328 wakeup(&needfree); 3329 } 3330#endif 3331 3332 /* 3333 * This is necessary in order for the mdb ::arc dcmd to 3334 * show up to date information. Since the ::arc command 3335 * does not call the kstat's update function, without 3336 * this call, the command may show stale stats for the 3337 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3338 * with this change, the data might be up to 1 second 3339 * out of date; but that should suffice. The arc_state_t 3340 * structures can be queried directly if more accurate 3341 * information is needed. 3342 */ 3343 if (arc_ksp != NULL) 3344 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3345 3346 /* block until needed, or one second, whichever is shorter */ 3347 CALLB_CPR_SAFE_BEGIN(&cpr); 3348 (void) cv_timedwait(&arc_reclaim_thr_cv, 3349 &arc_reclaim_thr_lock, hz); 3350 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 3351 } 3352 3353 arc_thread_exit = 0; 3354 cv_broadcast(&arc_reclaim_thr_cv); 3355 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 3356 thread_exit(); 3357 } 3358 3359 /* 3360 * Adapt arc info given the number of bytes we are trying to add and 3361 * the state that we are coming from. This function is only called 3362 * when we are adding new content to the cache. 3363 */ 3364 static void 3365 arc_adapt(int bytes, arc_state_t *state) 3366 { 3367 int mult; 3368 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3369 3370 if (state == arc_l2c_only) 3371 return; 3372 3373 ASSERT(bytes > 0); 3374 /* 3375 * Adapt the target size of the MRU list: 3376 * - if we just hit in the MRU ghost list, then increase 3377 * the target size of the MRU list. 3378 * - if we just hit in the MFU ghost list, then increase 3379 * the target size of the MFU list by decreasing the 3380 * target size of the MRU list. 3381 */ 3382 if (state == arc_mru_ghost) { 3383 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 3384 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 3385 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3386 3387 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3388 } else if (state == arc_mfu_ghost) { 3389 uint64_t delta; 3390 3391 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3392 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 3393 mult = MIN(mult, 10); 3394 3395 delta = MIN(bytes * mult, arc_p); 3396 arc_p = MAX(arc_p_min, arc_p - delta); 3397 } 3398 ASSERT((int64_t)arc_p >= 0); 3399 3400 if (arc_reclaim_needed()) { 3401 cv_signal(&arc_reclaim_thr_cv); 3402 return; 3403 } 3404 3405 if (arc_no_grow) 3406 return; 3407 3408 if (arc_c >= arc_c_max) 3409 return; 3410 3411 /* 3412 * If we're within (2 * maxblocksize) bytes of the target 3413 * cache size, increment the target cache size 3414 */ 3415 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3416 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3417 atomic_add_64(&arc_c, (int64_t)bytes); 3418 if (arc_c > arc_c_max) 3419 arc_c = arc_c_max; 3420 else if (state == arc_anon) 3421 atomic_add_64(&arc_p, (int64_t)bytes); 3422 if (arc_p > arc_c) 3423 arc_p = arc_c; 3424 } 3425 ASSERT((int64_t)arc_p >= 0); 3426} 3427 3428/* 3429 * Check if the cache has reached its limits and eviction is required 3430 * prior to insert. 3431 */ 3432static int 3433arc_evict_needed(arc_buf_contents_t type) 3434{ 3435 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 3436 return (1); 3437 3438 if (arc_reclaim_needed()) 3439 return (1); 3440 3441 return (arc_size > arc_c); 3442} 3443 3444/* 3445 * The buffer, supplied as the first argument, needs a data block. 3446 * So, if we are at cache max, determine which cache should be victimized. 3447 * We have the following cases: 3448 * 3449 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 3450 * In this situation if we're out of space, but the resident size of the MFU is 3451 * under the limit, victimize the MFU cache to satisfy this insertion request. 3452 * 3453 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 3454 * Here, we've used up all of the available space for the MRU, so we need to 3455 * evict from our own cache instead. Evict from the set of resident MRU 3456 * entries. 3457 * 3458 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 3459 * c minus p represents the MFU space in the cache, since p is the size of the 3460 * cache that is dedicated to the MRU. In this situation there's still space on 3461 * the MFU side, so the MRU side needs to be victimized. 3462 * 3463 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 3464 * MFU's resident set is consuming more space than it has been allotted. In 3465 * this situation, we must victimize our own cache, the MFU, for this insertion. 3466 */ 3467static void 3468arc_get_data_buf(arc_buf_t *buf) 3469{ 3470 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3471 uint64_t size = buf->b_hdr->b_size; 3472 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3473 3474 arc_adapt(size, state); 3475 3476 /* 3477 * We have not yet reached cache maximum size, 3478 * just allocate a new buffer. 3479 */ 3480 if (!arc_evict_needed(type)) { 3481 if (type == ARC_BUFC_METADATA) { 3482 buf->b_data = zio_buf_alloc(size); 3483 arc_space_consume(size, ARC_SPACE_META); 3484 } else { 3485 ASSERT(type == ARC_BUFC_DATA); 3486 buf->b_data = zio_data_buf_alloc(size); 3487 arc_space_consume(size, ARC_SPACE_DATA); 3488 } 3489 goto out; 3490 } 3491 3492 /* 3493 * If we are prefetching from the mfu ghost list, this buffer 3494 * will end up on the mru list; so steal space from there. 3495 */ 3496 if (state == arc_mfu_ghost) 3497 state = HDR_PREFETCH(buf->b_hdr) ? 
arc_mru : arc_mfu; 3498 else if (state == arc_mru_ghost) 3499 state = arc_mru; 3500 3501 if (state == arc_mru || state == arc_anon) { 3502 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 3503 state = (arc_mfu->arcs_lsize[type] >= size && 3504 arc_p > mru_used) ? arc_mfu : arc_mru; 3505 } else { 3506 /* MFU cases */ 3507 uint64_t mfu_space = arc_c - arc_p; 3508 state = (arc_mru->arcs_lsize[type] >= size && 3509 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 3510 } 3511 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 3512 if (type == ARC_BUFC_METADATA) { 3513 buf->b_data = zio_buf_alloc(size); 3514 arc_space_consume(size, ARC_SPACE_META); 3515 } else { 3516 ASSERT(type == ARC_BUFC_DATA); 3517 buf->b_data = zio_data_buf_alloc(size); 3518 arc_space_consume(size, ARC_SPACE_DATA); 3519 } 3520 ARCSTAT_BUMP(arcstat_recycle_miss); 3521 } 3522 ASSERT(buf->b_data != NULL); 3523out: 3524 /* 3525 * Update the state size. Note that ghost states have a 3526 * "ghost size" and so don't need to be updated. 3527 */ 3528 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3529 arc_buf_hdr_t *hdr = buf->b_hdr; 3530 3531 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); 3532 if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { 3533 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3534 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3535 size); 3536 } 3537 /* 3538 * If we are growing the cache, and we are adding anonymous 3539 * data, and we have outgrown arc_p, update arc_p 3540 */ 3541 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3542 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 3543 arc_p = MIN(arc_c, arc_p + size); 3544 } 3545 ARCSTAT_BUMP(arcstat_allocated); 3546} 3547 3548/* 3549 * This routine is called whenever a buffer is accessed. 3550 * NOTE: the hash lock is dropped in this function. 3551 */ 3552static void 3553arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3554{ 3555 clock_t now; 3556 3557 ASSERT(MUTEX_HELD(hash_lock)); 3558 ASSERT(HDR_HAS_L1HDR(hdr)); 3559 3560 if (hdr->b_l1hdr.b_state == arc_anon) { 3561 /* 3562 * This buffer is not in the cache, and does not 3563 * appear in our "ghost" list. Add the new buffer 3564 * to the MRU state. 3565 */ 3566 3567 ASSERT0(hdr->b_l1hdr.b_arc_access); 3568 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3569 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3570 arc_change_state(arc_mru, hdr, hash_lock); 3571 3572 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3573 now = ddi_get_lbolt(); 3574 3575 /* 3576 * If this buffer is here because of a prefetch, then either: 3577 * - clear the flag if this is a "referencing" read 3578 * (any subsequent access will bump this into the MFU state). 3579 * or 3580 * - move the buffer to the head of the list if this is 3581 * another prefetch (to make it less likely to be evicted). 3582 */ 3583 if (HDR_PREFETCH(hdr)) { 3584 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3585 ASSERT(list_link_active( 3586 &hdr->b_l1hdr.b_arc_node)); 3587 } else { 3588 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3589 ARCSTAT_BUMP(arcstat_mru_hits); 3590 } 3591 hdr->b_l1hdr.b_arc_access = now; 3592 return; 3593 } 3594 3595 /* 3596 * This buffer has been "accessed" only once so far, 3597 * but it is still in the cache. Move it to the MFU 3598 * state. 3599 */ 3600 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3601 /* 3602 * More than 125ms have passed since we 3603 * instantiated this buffer. Move it to the 3604 * most frequently used state. 
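 * (The actual threshold is ARC_MINTIME, defined earlier as (hz>>4),
 * i.e. roughly 62ms.)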
3605 */ 3606 hdr->b_l1hdr.b_arc_access = now; 3607 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3608 arc_change_state(arc_mfu, hdr, hash_lock); 3609 } 3610 ARCSTAT_BUMP(arcstat_mru_hits); 3611 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3612 arc_state_t *new_state; 3613 /* 3614 * This buffer has been "accessed" recently, but 3615 * was evicted from the cache. Move it to the 3616 * MFU state. 3617 */ 3618 3619 if (HDR_PREFETCH(hdr)) { 3620 new_state = arc_mru; 3621 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3622 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3623 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3624 } else { 3625 new_state = arc_mfu; 3626 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3627 } 3628 3629 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3630 arc_change_state(new_state, hdr, hash_lock); 3631 3632 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3633 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3634 /* 3635 * This buffer has been accessed more than once and is 3636 * still in the cache. Keep it in the MFU state. 3637 * 3638 * NOTE: an add_reference() that occurred when we did 3639 * the arc_read() will have kicked this off the list. 3640 * If it was a prefetch, we will explicitly move it to 3641 * the head of the list now. 3642 */ 3643 if ((HDR_PREFETCH(hdr)) != 0) { 3644 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3645 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 3646 } 3647 ARCSTAT_BUMP(arcstat_mfu_hits); 3648 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3649 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3650 arc_state_t *new_state = arc_mfu; 3651 /* 3652 * This buffer has been accessed more than once but has 3653 * been evicted from the cache. Move it back to the 3654 * MFU state. 3655 */ 3656 3657 if (HDR_PREFETCH(hdr)) { 3658 /* 3659 * This is a prefetch access... 3660 * move this block back to the MRU state. 3661 */ 3662 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3663 new_state = arc_mru; 3664 } 3665 3666 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3667 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3668 arc_change_state(new_state, hdr, hash_lock); 3669 3670 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3671 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3672 /* 3673 * This buffer is on the 2nd Level ARC. 
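		 * By the time we get here an L1 header has been re-attached
		 * (see the ghost-cache path in arc_read()), so simply move
		 * the buffer to the MFU state.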
3674 */ 3675 3676 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3677 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3678 arc_change_state(arc_mfu, hdr, hash_lock); 3679 } else { 3680 ASSERT(!"invalid arc state"); 3681 } 3682} 3683 3684/* a generic arc_done_func_t which you can use */ 3685/* ARGSUSED */ 3686void 3687arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3688{ 3689 if (zio == NULL || zio->io_error == 0) 3690 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3691 VERIFY(arc_buf_remove_ref(buf, arg)); 3692} 3693 3694/* a generic arc_done_func_t */ 3695void 3696arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3697{ 3698 arc_buf_t **bufp = arg; 3699 if (zio && zio->io_error) { 3700 VERIFY(arc_buf_remove_ref(buf, arg)); 3701 *bufp = NULL; 3702 } else { 3703 *bufp = buf; 3704 ASSERT(buf->b_data); 3705 } 3706} 3707 3708static void 3709arc_read_done(zio_t *zio) 3710{ 3711 arc_buf_hdr_t *hdr; 3712 arc_buf_t *buf; 3713 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3714 kmutex_t *hash_lock = NULL; 3715 arc_callback_t *callback_list, *acb; 3716 int freeable = FALSE; 3717 3718 buf = zio->io_private; 3719 hdr = buf->b_hdr; 3720 3721 /* 3722 * The hdr was inserted into hash-table and removed from lists 3723 * prior to starting I/O. We should find this header, since 3724 * it's in the hash table, and it should be legit since it's 3725 * not possible to evict it during the I/O. The only possible 3726 * reason for it not to be found is if we were freed during the 3727 * read. 3728 */ 3729 if (HDR_IN_HASH_TABLE(hdr)) { 3730 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3731 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3732 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3733 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3734 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3735 3736 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3737 &hash_lock); 3738 3739 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3740 hash_lock == NULL) || 3741 (found == hdr && 3742 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3743 (found == hdr && HDR_L2_READING(hdr))); 3744 } 3745 3746 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3747 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3748 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3749 3750 /* byteswap if necessary */ 3751 callback_list = hdr->b_l1hdr.b_acb; 3752 ASSERT(callback_list != NULL); 3753 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3754 dmu_object_byteswap_t bswap = 3755 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3756 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3757 byteswap_uint64_array : 3758 dmu_ot_byteswap[bswap].ob_func; 3759 func(buf->b_data, hdr->b_size); 3760 } 3761 3762 arc_cksum_compute(buf, B_FALSE); 3763#ifdef illumos 3764 arc_buf_watch(buf); 3765#endif /* illumos */ 3766 3767 if (hash_lock && zio->io_error == 0 && 3768 hdr->b_l1hdr.b_state == arc_anon) { 3769 /* 3770 * Only call arc_access on anonymous buffers. This is because 3771 * if we've issued an I/O for an evicted buffer, we've already 3772 * called arc_access (to prevent any simultaneous readers from 3773 * getting confused). 
		 */
		arc_access(hdr, hash_lock);
	}

	/* create copies of the data buffer for the callers */
	abuf = buf;
	for (acb = callback_list; acb; acb = acb->acb_next) {
		if (acb->acb_done) {
			if (abuf == NULL) {
				ARCSTAT_BUMP(arcstat_duplicate_reads);
				abuf = arc_buf_clone(buf);
			}
			acb->acb_buf = abuf;
			abuf = NULL;
		}
	}
	hdr->b_l1hdr.b_acb = NULL;
	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
	ASSERT(!HDR_BUF_AVAILABLE(hdr));
	if (abuf == buf) {
		ASSERT(buf->b_efunc == NULL);
		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
	}

	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
	    callback_list != NULL);

	if (zio->io_error != 0) {
		hdr->b_flags |= ARC_FLAG_IO_ERROR;
		if (hdr->b_l1hdr.b_state != arc_anon)
			arc_change_state(arc_anon, hdr, hash_lock);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
	}

	/*
	 * Broadcast before we drop the hash_lock to avoid the possibility
	 * that the hdr (and hence the cv) might be freed before we get to
	 * the cv_broadcast().
	 */
	cv_broadcast(&hdr->b_l1hdr.b_cv);

	if (hash_lock != NULL) {
		mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
	}

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done)
			acb->acb_done(zio, acb->acb_buf, acb->acb_private);

		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_destroy(hdr);
}

/*
 * "Read" the block at the specified DVA (in bp) via the
 * cache.  If the block is found in the cache, invoke the provided
 * callback immediately and return.  Note that the `zio' parameter
 * in the callback will be NULL in this case, since no IO was
 * required.  If the block is not in the cache, pass the read request
 * on to the spa with a substitute callback function, so that the
 * requested block will be added to the cache.
 *
 * If a read request arrives for a block that has a read in-progress,
 * either wait for the in-progress read to complete (and return the
 * results); or, if this is a read with a "done" func, add a record
 * to the read to invoke the "done" func when the read completes,
 * and return; or just return.
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
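 *
 * Callers select the blocking behaviour with *arc_flags: with
 * ARC_FLAG_WAIT the call does not return until the read has completed,
 * while with ARC_FLAG_NOWAIT the result is delivered solely through the
 * "done" callback.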
3866 */ 3867int 3868arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3869 void *private, zio_priority_t priority, int zio_flags, 3870 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 3871{ 3872 arc_buf_hdr_t *hdr = NULL; 3873 arc_buf_t *buf = NULL; 3874 kmutex_t *hash_lock = NULL; 3875 zio_t *rzio; 3876 uint64_t guid = spa_load_guid(spa); 3877 3878 ASSERT(!BP_IS_EMBEDDED(bp) || 3879 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 3880 3881top: 3882 if (!BP_IS_EMBEDDED(bp)) { 3883 /* 3884 * Embedded BP's have no DVA and require no I/O to "read". 3885 * Create an anonymous arc buf to back it. 3886 */ 3887 hdr = buf_hash_find(guid, bp, &hash_lock); 3888 } 3889 3890 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 3891 3892 *arc_flags |= ARC_FLAG_CACHED; 3893 3894 if (HDR_IO_IN_PROGRESS(hdr)) { 3895 3896 if (*arc_flags & ARC_FLAG_WAIT) { 3897 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 3898 mutex_exit(hash_lock); 3899 goto top; 3900 } 3901 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 3902 3903 if (done) { 3904 arc_callback_t *acb = NULL; 3905 3906 acb = kmem_zalloc(sizeof (arc_callback_t), 3907 KM_SLEEP); 3908 acb->acb_done = done; 3909 acb->acb_private = private; 3910 if (pio != NULL) 3911 acb->acb_zio_dummy = zio_null(pio, 3912 spa, NULL, NULL, NULL, zio_flags); 3913 3914 ASSERT(acb->acb_done != NULL); 3915 acb->acb_next = hdr->b_l1hdr.b_acb; 3916 hdr->b_l1hdr.b_acb = acb; 3917 add_reference(hdr, hash_lock, private); 3918 mutex_exit(hash_lock); 3919 return (0); 3920 } 3921 mutex_exit(hash_lock); 3922 return (0); 3923 } 3924 3925 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 3926 hdr->b_l1hdr.b_state == arc_mfu); 3927 3928 if (done) { 3929 add_reference(hdr, hash_lock, private); 3930 /* 3931 * If this block is already in use, create a new 3932 * copy of the data so that we will be guaranteed 3933 * that arc_release() will always succeed. 
3934 */ 3935 buf = hdr->b_l1hdr.b_buf; 3936 ASSERT(buf); 3937 ASSERT(buf->b_data); 3938 if (HDR_BUF_AVAILABLE(hdr)) { 3939 ASSERT(buf->b_efunc == NULL); 3940 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3941 } else { 3942 buf = arc_buf_clone(buf); 3943 } 3944 3945 } else if (*arc_flags & ARC_FLAG_PREFETCH && 3946 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3947 hdr->b_flags |= ARC_FLAG_PREFETCH; 3948 } 3949 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3950 arc_access(hdr, hash_lock); 3951 if (*arc_flags & ARC_FLAG_L2CACHE) 3952 hdr->b_flags |= ARC_FLAG_L2CACHE; 3953 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3954 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3955 mutex_exit(hash_lock); 3956 ARCSTAT_BUMP(arcstat_hits); 3957 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3958 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3959 data, metadata, hits); 3960 3961 if (done) 3962 done(NULL, buf, private); 3963 } else { 3964 uint64_t size = BP_GET_LSIZE(bp); 3965 arc_callback_t *acb; 3966 vdev_t *vd = NULL; 3967 uint64_t addr = 0; 3968 boolean_t devw = B_FALSE; 3969 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3970 int32_t b_asize = 0; 3971 3972 if (hdr == NULL) { 3973 /* this block is not in the cache */ 3974 arc_buf_hdr_t *exists = NULL; 3975 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3976 buf = arc_buf_alloc(spa, size, private, type); 3977 hdr = buf->b_hdr; 3978 if (!BP_IS_EMBEDDED(bp)) { 3979 hdr->b_dva = *BP_IDENTITY(bp); 3980 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3981 exists = buf_hash_insert(hdr, &hash_lock); 3982 } 3983 if (exists != NULL) { 3984 /* somebody beat us to the hash insert */ 3985 mutex_exit(hash_lock); 3986 buf_discard_identity(hdr); 3987 (void) arc_buf_remove_ref(buf, private); 3988 goto top; /* restart the IO request */ 3989 } 3990 3991 /* if this is a prefetch, we don't have a reference */ 3992 if (*arc_flags & ARC_FLAG_PREFETCH) { 3993 (void) remove_reference(hdr, hash_lock, 3994 private); 3995 hdr->b_flags |= ARC_FLAG_PREFETCH; 3996 } 3997 if (*arc_flags & ARC_FLAG_L2CACHE) 3998 hdr->b_flags |= ARC_FLAG_L2CACHE; 3999 if (*arc_flags & ARC_FLAG_L2COMPRESS) 4000 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4001 if (BP_GET_LEVEL(bp) > 0) 4002 hdr->b_flags |= ARC_FLAG_INDIRECT; 4003 } else { 4004 /* 4005 * This block is in the ghost cache. If it was L2-only 4006 * (and thus didn't have an L1 hdr), we realloc the 4007 * header to add an L1 hdr. 
4008 */ 4009 if (!HDR_HAS_L1HDR(hdr)) { 4010 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 4011 hdr_full_cache); 4012 } 4013 4014 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 4015 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4016 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4017 ASSERT(hdr->b_l1hdr.b_buf == NULL); 4018 4019 /* if this is a prefetch, we don't have a reference */ 4020 if (*arc_flags & ARC_FLAG_PREFETCH) 4021 hdr->b_flags |= ARC_FLAG_PREFETCH; 4022 else 4023 add_reference(hdr, hash_lock, private); 4024 if (*arc_flags & ARC_FLAG_L2CACHE) 4025 hdr->b_flags |= ARC_FLAG_L2CACHE; 4026 if (*arc_flags & ARC_FLAG_L2COMPRESS) 4027 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4028 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 4029 buf->b_hdr = hdr; 4030 buf->b_data = NULL; 4031 buf->b_efunc = NULL; 4032 buf->b_private = NULL; 4033 buf->b_next = NULL; 4034 hdr->b_l1hdr.b_buf = buf; 4035 ASSERT0(hdr->b_l1hdr.b_datacnt); 4036 hdr->b_l1hdr.b_datacnt = 1; 4037 arc_get_data_buf(buf); 4038 arc_access(hdr, hash_lock); 4039 } 4040 4041 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 4042 4043 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 4044 acb->acb_done = done; 4045 acb->acb_private = private; 4046 4047 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4048 hdr->b_l1hdr.b_acb = acb; 4049 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4050 4051 if (HDR_HAS_L2HDR(hdr) && 4052 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 4053 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 4054 addr = hdr->b_l2hdr.b_daddr; 4055 b_compress = HDR_GET_COMPRESS(hdr); 4056 b_asize = hdr->b_l2hdr.b_asize; 4057 /* 4058 * Lock out device removal. 4059 */ 4060 if (vdev_is_dead(vd) || 4061 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 4062 vd = NULL; 4063 } 4064 4065 if (hash_lock != NULL) 4066 mutex_exit(hash_lock); 4067 4068 /* 4069 * At this point, we have a level 1 cache miss. Try again in 4070 * L2ARC if possible. 4071 */ 4072 ASSERT3U(hdr->b_size, ==, size); 4073 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 4074 uint64_t, size, zbookmark_phys_t *, zb); 4075 ARCSTAT_BUMP(arcstat_misses); 4076 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4077 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4078 data, metadata, misses); 4079#ifdef _KERNEL 4080 curthread->td_ru.ru_inblock++; 4081#endif 4082 4083 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4084 /* 4085 * Read from the L2ARC if the following are true: 4086 * 1. The L2ARC vdev was previously cached. 4087 * 2. This buffer still has L2ARC metadata. 4088 * 3. This buffer isn't currently writing to the L2ARC. 4089 * 4. The L2ARC entry wasn't evicted, which may 4090 * also have invalidated the vdev. 4091 * 5. This isn't prefetch and l2arc_noprefetch is set. 4092 */ 4093 if (HDR_HAS_L2HDR(hdr) && 4094 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4095 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4096 l2arc_read_callback_t *cb; 4097 4098 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4099 ARCSTAT_BUMP(arcstat_l2_hits); 4100 4101 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4102 KM_SLEEP); 4103 cb->l2rcb_buf = buf; 4104 cb->l2rcb_spa = spa; 4105 cb->l2rcb_bp = *bp; 4106 cb->l2rcb_zb = *zb; 4107 cb->l2rcb_flags = zio_flags; 4108 cb->l2rcb_compress = b_compress; 4109 4110 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4111 addr + size < vd->vdev_psize - 4112 VDEV_LABEL_END_SIZE); 4113 4114 /* 4115 * l2arc read. The SCL_L2ARC lock will be 4116 * released by l2arc_read_done(). 4117 * Issue a null zio if the underlying buffer 4118 * was squashed to zero size by compression. 
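 * (ZIO_COMPRESS_EMPTY means the block compressed away to nothing, so
 * l2arc_read_done() can reconstruct it without touching the device.)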
4119 */ 4120 if (b_compress == ZIO_COMPRESS_EMPTY) { 4121 rzio = zio_null(pio, spa, vd, 4122 l2arc_read_done, cb, 4123 zio_flags | ZIO_FLAG_DONT_CACHE | 4124 ZIO_FLAG_CANFAIL | 4125 ZIO_FLAG_DONT_PROPAGATE | 4126 ZIO_FLAG_DONT_RETRY); 4127 } else { 4128 rzio = zio_read_phys(pio, vd, addr, 4129 b_asize, buf->b_data, 4130 ZIO_CHECKSUM_OFF, 4131 l2arc_read_done, cb, priority, 4132 zio_flags | ZIO_FLAG_DONT_CACHE | 4133 ZIO_FLAG_CANFAIL | 4134 ZIO_FLAG_DONT_PROPAGATE | 4135 ZIO_FLAG_DONT_RETRY, B_FALSE); 4136 } 4137 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4138 zio_t *, rzio); 4139 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4140 4141 if (*arc_flags & ARC_FLAG_NOWAIT) { 4142 zio_nowait(rzio); 4143 return (0); 4144 } 4145 4146 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4147 if (zio_wait(rzio) == 0) 4148 return (0); 4149 4150 /* l2arc read error; goto zio_read() */ 4151 } else { 4152 DTRACE_PROBE1(l2arc__miss, 4153 arc_buf_hdr_t *, hdr); 4154 ARCSTAT_BUMP(arcstat_l2_misses); 4155 if (HDR_L2_WRITING(hdr)) 4156 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4157 spa_config_exit(spa, SCL_L2ARC, vd); 4158 } 4159 } else { 4160 if (vd != NULL) 4161 spa_config_exit(spa, SCL_L2ARC, vd); 4162 if (l2arc_ndev != 0) { 4163 DTRACE_PROBE1(l2arc__miss, 4164 arc_buf_hdr_t *, hdr); 4165 ARCSTAT_BUMP(arcstat_l2_misses); 4166 } 4167 } 4168 4169 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4170 arc_read_done, buf, priority, zio_flags, zb); 4171 4172 if (*arc_flags & ARC_FLAG_WAIT) 4173 return (zio_wait(rzio)); 4174 4175 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4176 zio_nowait(rzio); 4177 } 4178 return (0); 4179} 4180 4181void 4182arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4183{ 4184 ASSERT(buf->b_hdr != NULL); 4185 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4186 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4187 func == NULL); 4188 ASSERT(buf->b_efunc == NULL); 4189 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4190 4191 buf->b_efunc = func; 4192 buf->b_private = private; 4193} 4194 4195/* 4196 * Notify the arc that a block was freed, and thus will never be used again. 4197 */ 4198void 4199arc_freed(spa_t *spa, const blkptr_t *bp) 4200{ 4201 arc_buf_hdr_t *hdr; 4202 kmutex_t *hash_lock; 4203 uint64_t guid = spa_load_guid(spa); 4204 4205 ASSERT(!BP_IS_EMBEDDED(bp)); 4206 4207 hdr = buf_hash_find(guid, bp, &hash_lock); 4208 if (hdr == NULL) 4209 return; 4210 if (HDR_BUF_AVAILABLE(hdr)) { 4211 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4212 add_reference(hdr, hash_lock, FTAG); 4213 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4214 mutex_exit(hash_lock); 4215 4216 arc_release(buf, FTAG); 4217 (void) arc_buf_remove_ref(buf, FTAG); 4218 } else { 4219 mutex_exit(hash_lock); 4220 } 4221 4222} 4223 4224/* 4225 * Clear the user eviction callback set by arc_set_callback(), first calling 4226 * it if it exists. Because the presence of a callback keeps an arc_buf cached 4227 * clearing the callback may result in the arc_buf being destroyed. However, 4228 * it will not result in the *last* arc_buf being destroyed, hence the data 4229 * will remain cached in the ARC. We make a copy of the arc buffer here so 4230 * that we can process the callback without holding any locks. 4231 * 4232 * It's possible that the callback is already in the process of being cleared 4233 * by another thread. In this case we can not clear the callback. 4234 * 4235 * Returns B_TRUE if the callback was successfully called and cleared. 
4236 */ 4237boolean_t 4238arc_clear_callback(arc_buf_t *buf) 4239{ 4240 arc_buf_hdr_t *hdr; 4241 kmutex_t *hash_lock; 4242 arc_evict_func_t *efunc = buf->b_efunc; 4243 void *private = buf->b_private; 4244 list_t *list, *evicted_list; 4245 kmutex_t *lock, *evicted_lock; 4246 4247 mutex_enter(&buf->b_evict_lock); 4248 hdr = buf->b_hdr; 4249 if (hdr == NULL) { 4250 /* 4251 * We are in arc_do_user_evicts(). 4252 */ 4253 ASSERT(buf->b_data == NULL); 4254 mutex_exit(&buf->b_evict_lock); 4255 return (B_FALSE); 4256 } else if (buf->b_data == NULL) { 4257 /* 4258 * We are on the eviction list; process this buffer now 4259 * but let arc_do_user_evicts() do the reaping. 4260 */ 4261 buf->b_efunc = NULL; 4262 mutex_exit(&buf->b_evict_lock); 4263 VERIFY0(efunc(private)); 4264 return (B_TRUE); 4265 } 4266 hash_lock = HDR_LOCK(hdr); 4267 mutex_enter(hash_lock); 4268 hdr = buf->b_hdr; 4269 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4270 4271 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4272 hdr->b_l1hdr.b_datacnt); 4273 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4274 hdr->b_l1hdr.b_state == arc_mfu); 4275 4276 buf->b_efunc = NULL; 4277 buf->b_private = NULL; 4278 4279 if (hdr->b_l1hdr.b_datacnt > 1) { 4280 mutex_exit(&buf->b_evict_lock); 4281 arc_buf_destroy(buf, FALSE, TRUE); 4282 } else { 4283 ASSERT(buf == hdr->b_l1hdr.b_buf); 4284 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4285 mutex_exit(&buf->b_evict_lock); 4286 } 4287 4288 mutex_exit(hash_lock); 4289 VERIFY0(efunc(private)); 4290 return (B_TRUE); 4291} 4292 4293/* 4294 * Release this buffer from the cache, making it an anonymous buffer. This 4295 * must be done after a read and prior to modifying the buffer contents. 4296 * If the buffer has more than one reference, we must make 4297 * a new hdr for the buffer. 4298 */ 4299void 4300arc_release(arc_buf_t *buf, void *tag) 4301{ 4302 arc_buf_hdr_t *hdr = buf->b_hdr; 4303 4304 /* 4305 * It would be nice to assert that if it's DMU metadata (level > 4306 * 0 || it's the dnode file), then it must be syncing context. 4307 * But we don't know that information at this level. 4308 */ 4309 4310 mutex_enter(&buf->b_evict_lock); 4311 /* 4312 * We don't grab the hash lock prior to this check, because if 4313 * the buffer's header is in the arc_anon state, it won't be 4314 * linked into the hash table. 4315 */ 4316 if (hdr->b_l1hdr.b_state == arc_anon) { 4317 mutex_exit(&buf->b_evict_lock); 4318 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4319 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4320 ASSERT(!HDR_HAS_L2HDR(hdr)); 4321 ASSERT(BUF_EMPTY(hdr)); 4322 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4323 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4324 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4325 4326 ASSERT3P(buf->b_efunc, ==, NULL); 4327 ASSERT3P(buf->b_private, ==, NULL); 4328 4329 hdr->b_l1hdr.b_arc_access = 0; 4330 arc_buf_thaw(buf); 4331 4332 return; 4333 } 4334 4335 kmutex_t *hash_lock = HDR_LOCK(hdr); 4336 mutex_enter(hash_lock); 4337 4338 /* 4339 * This assignment is only valid as long as the hash_lock is 4340 * held, we must be careful not to reference state or the 4341 * b_state field after dropping the lock. 
4342 */ 4343 arc_state_t *state = hdr->b_l1hdr.b_state; 4344 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4345 ASSERT3P(state, !=, arc_anon); 4346 4347 /* this buffer is not on any list */ 4348 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4349 4350 if (HDR_HAS_L2HDR(hdr)) { 4351 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4352 4353 /* 4354 * We have to recheck this conditional again now that 4355 * we're holding the l2ad_mtx to prevent a race with 4356 * another thread which might be concurrently calling 4357 * l2arc_evict(). In that case, l2arc_evict() might have 4358 * destroyed the header's L2 portion as we were waiting 4359 * to acquire the l2ad_mtx. 4360 */ 4361 if (HDR_HAS_L2HDR(hdr)) { 4362 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4363 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 4364 arc_hdr_l2hdr_destroy(hdr); 4365 } 4366 4367 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4368 } 4369 4370 /* 4371 * Do we have more than one buf? 4372 */ 4373 if (hdr->b_l1hdr.b_datacnt > 1) { 4374 arc_buf_hdr_t *nhdr; 4375 arc_buf_t **bufp; 4376 uint64_t blksz = hdr->b_size; 4377 uint64_t spa = hdr->b_spa; 4378 arc_buf_contents_t type = arc_buf_type(hdr); 4379 uint32_t flags = hdr->b_flags; 4380 4381 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4382 /* 4383 * Pull the data off of this hdr and attach it to 4384 * a new anonymous hdr. 4385 */ 4386 (void) remove_reference(hdr, hash_lock, tag); 4387 bufp = &hdr->b_l1hdr.b_buf; 4388 while (*bufp != buf) 4389 bufp = &(*bufp)->b_next; 4390 *bufp = buf->b_next; 4391 buf->b_next = NULL; 4392 4393 ASSERT3P(state, !=, arc_l2c_only); 4394 ASSERT3U(state->arcs_size, >=, hdr->b_size); 4395 atomic_add_64(&state->arcs_size, -hdr->b_size); 4396 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4397 ASSERT3P(state, !=, arc_l2c_only); 4398 uint64_t *size = &state->arcs_lsize[type]; 4399 ASSERT3U(*size, >=, hdr->b_size); 4400 atomic_add_64(size, -hdr->b_size); 4401 } 4402 4403 /* 4404 * We're releasing a duplicate user data buffer, update 4405 * our statistics accordingly. 
4406 */ 4407 if (HDR_ISTYPE_DATA(hdr)) { 4408 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4409 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4410 -hdr->b_size); 4411 } 4412 hdr->b_l1hdr.b_datacnt -= 1; 4413 arc_cksum_verify(buf); 4414#ifdef illumos 4415 arc_buf_unwatch(buf); 4416#endif /* illumos */ 4417 4418 mutex_exit(hash_lock); 4419 4420 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4421 nhdr->b_size = blksz; 4422 nhdr->b_spa = spa; 4423 4424 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4425 nhdr->b_flags |= arc_bufc_to_flags(type); 4426 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4427 4428 nhdr->b_l1hdr.b_buf = buf; 4429 nhdr->b_l1hdr.b_datacnt = 1; 4430 nhdr->b_l1hdr.b_state = arc_anon; 4431 nhdr->b_l1hdr.b_arc_access = 0; 4432 nhdr->b_freeze_cksum = NULL; 4433 4434 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4435 buf->b_hdr = nhdr; 4436 mutex_exit(&buf->b_evict_lock); 4437 atomic_add_64(&arc_anon->arcs_size, blksz); 4438 } else { 4439 mutex_exit(&buf->b_evict_lock); 4440 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4441 /* protected by hash lock */ 4442 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4443 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4444 arc_change_state(arc_anon, hdr, hash_lock); 4445 hdr->b_l1hdr.b_arc_access = 0; 4446 mutex_exit(hash_lock); 4447 4448 buf_discard_identity(hdr); 4449 arc_buf_thaw(buf); 4450 } 4451 buf->b_efunc = NULL; 4452 buf->b_private = NULL; 4453} 4454 4455int 4456arc_released(arc_buf_t *buf) 4457{ 4458 int released; 4459 4460 mutex_enter(&buf->b_evict_lock); 4461 released = (buf->b_data != NULL && 4462 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4463 mutex_exit(&buf->b_evict_lock); 4464 return (released); 4465} 4466 4467#ifdef ZFS_DEBUG 4468int 4469arc_referenced(arc_buf_t *buf) 4470{ 4471 int referenced; 4472 4473 mutex_enter(&buf->b_evict_lock); 4474 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4475 mutex_exit(&buf->b_evict_lock); 4476 return (referenced); 4477} 4478#endif 4479 4480static void 4481arc_write_ready(zio_t *zio) 4482{ 4483 arc_write_callback_t *callback = zio->io_private; 4484 arc_buf_t *buf = callback->awcb_buf; 4485 arc_buf_hdr_t *hdr = buf->b_hdr; 4486 4487 ASSERT(HDR_HAS_L1HDR(hdr)); 4488 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4489 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4490 callback->awcb_ready(zio, buf, callback->awcb_private); 4491 4492 /* 4493 * If the IO is already in progress, then this is a re-write 4494 * attempt, so we need to thaw and re-compute the cksum. 4495 * It is the responsibility of the callback to handle the 4496 * accounting for any re-write attempt. 4497 */ 4498 if (HDR_IO_IN_PROGRESS(hdr)) { 4499 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4500 if (hdr->b_freeze_cksum != NULL) { 4501 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4502 hdr->b_freeze_cksum = NULL; 4503 } 4504 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4505 } 4506 arc_cksum_compute(buf, B_FALSE); 4507 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4508} 4509 4510/* 4511 * The SPA calls this callback for each physical write that happens on behalf 4512 * of a logical write. See the comment in dbuf_write_physdone() for details. 
4513 */ 4514static void 4515arc_write_physdone(zio_t *zio) 4516{ 4517 arc_write_callback_t *cb = zio->io_private; 4518 if (cb->awcb_physdone != NULL) 4519 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4520} 4521 4522static void 4523arc_write_done(zio_t *zio) 4524{ 4525 arc_write_callback_t *callback = zio->io_private; 4526 arc_buf_t *buf = callback->awcb_buf; 4527 arc_buf_hdr_t *hdr = buf->b_hdr; 4528 4529 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4530 4531 if (zio->io_error == 0) { 4532 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4533 buf_discard_identity(hdr); 4534 } else { 4535 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4536 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4537 } 4538 } else { 4539 ASSERT(BUF_EMPTY(hdr)); 4540 } 4541 4542 /* 4543 * If the block to be written was all-zero or compressed enough to be 4544 * embedded in the BP, no write was performed so there will be no 4545 * dva/birth/checksum. The buffer must therefore remain anonymous 4546 * (and uncached). 4547 */ 4548 if (!BUF_EMPTY(hdr)) { 4549 arc_buf_hdr_t *exists; 4550 kmutex_t *hash_lock; 4551 4552 ASSERT(zio->io_error == 0); 4553 4554 arc_cksum_verify(buf); 4555 4556 exists = buf_hash_insert(hdr, &hash_lock); 4557 if (exists != NULL) { 4558 /* 4559 * This can only happen if we overwrite for 4560 * sync-to-convergence, because we remove 4561 * buffers from the hash table when we arc_free(). 4562 */ 4563 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4564 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4565 panic("bad overwrite, hdr=%p exists=%p", 4566 (void *)hdr, (void *)exists); 4567 ASSERT(refcount_is_zero( 4568 &exists->b_l1hdr.b_refcnt)); 4569 arc_change_state(arc_anon, exists, hash_lock); 4570 mutex_exit(hash_lock); 4571 arc_hdr_destroy(exists); 4572 exists = buf_hash_insert(hdr, &hash_lock); 4573 ASSERT3P(exists, ==, NULL); 4574 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4575 /* nopwrite */ 4576 ASSERT(zio->io_prop.zp_nopwrite); 4577 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4578 panic("bad nopwrite, hdr=%p exists=%p", 4579 (void *)hdr, (void *)exists); 4580 } else { 4581 /* Dedup */ 4582 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4583 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4584 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4585 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4586 } 4587 } 4588 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4589 /* if it's not anon, we are doing a scrub */ 4590 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4591 arc_access(hdr, hash_lock); 4592 mutex_exit(hash_lock); 4593 } else { 4594 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4595 } 4596 4597 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4598 callback->awcb_done(zio, buf, callback->awcb_private); 4599 4600 kmem_free(callback, sizeof (arc_write_callback_t)); 4601} 4602 4603zio_t * 4604arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4605 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4606 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4607 arc_done_func_t *done, void *private, zio_priority_t priority, 4608 int zio_flags, const zbookmark_phys_t *zb) 4609{ 4610 arc_buf_hdr_t *hdr = buf->b_hdr; 4611 arc_write_callback_t *callback; 4612 zio_t *zio; 4613 4614 ASSERT(ready != NULL); 4615 ASSERT(done != NULL); 4616 ASSERT(!HDR_IO_ERROR(hdr)); 4617 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4618 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4619 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4620 if (l2arc) 4621 hdr->b_flags |= ARC_FLAG_L2CACHE; 4622 if (l2arc_compress) 4623 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 
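	/*
	 * Wrap the caller's ready/physdone/done callbacks so the ARC can
	 * update the header (checksum, on-disk identity, hash-table
	 * insertion) as the write progresses; see arc_write_ready() and
	 * arc_write_done() above.
	 */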
4624 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4625 callback->awcb_ready = ready; 4626 callback->awcb_physdone = physdone; 4627 callback->awcb_done = done; 4628 callback->awcb_private = private; 4629 callback->awcb_buf = buf; 4630 4631 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4632 arc_write_ready, arc_write_physdone, arc_write_done, callback, 4633 priority, zio_flags, zb); 4634 4635 return (zio); 4636} 4637 4638static int 4639arc_memory_throttle(uint64_t reserve, uint64_t txg) 4640{ 4641#ifdef _KERNEL 4642 uint64_t available_memory = ptob(freemem); 4643 static uint64_t page_load = 0; 4644 static uint64_t last_txg = 0; 4645 4646#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4647 available_memory = 4648 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4649#endif 4650 4651 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 4652 return (0); 4653 4654 if (txg > last_txg) { 4655 last_txg = txg; 4656 page_load = 0; 4657 } 4658 /* 4659 * If we are in pageout, we know that memory is already tight, 4660 * the arc is already going to be evicting, so we just want to 4661 * continue to let page writes occur as quickly as possible. 4662 */ 4663 if (curproc == pageproc) { 4664 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4665 return (SET_ERROR(ERESTART)); 4666 /* Note: reserve is inflated, so we deflate */ 4667 page_load += reserve / 8; 4668 return (0); 4669 } else if (page_load > 0 && arc_reclaim_needed()) { 4670 /* memory is low, delay before restarting */ 4671 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4672 return (SET_ERROR(EAGAIN)); 4673 } 4674 page_load = 0; 4675#endif 4676 return (0); 4677} 4678 4679static void 4680arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4681 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4682{ 4683 size->value.ui64 = state->arcs_size; 4684 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4685 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 4686} 4687 4688static int 4689arc_kstat_update(kstat_t *ksp, int rw) 4690{ 4691 arc_stats_t *as = ksp->ks_data; 4692 4693 if (rw == KSTAT_WRITE) { 4694 return (EACCES); 4695 } else { 4696 arc_kstat_update_state(arc_anon, 4697 &as->arcstat_anon_size, 4698 &as->arcstat_anon_evictable_data, 4699 &as->arcstat_anon_evictable_metadata); 4700 arc_kstat_update_state(arc_mru, 4701 &as->arcstat_mru_size, 4702 &as->arcstat_mru_evictable_data, 4703 &as->arcstat_mru_evictable_metadata); 4704 arc_kstat_update_state(arc_mru_ghost, 4705 &as->arcstat_mru_ghost_size, 4706 &as->arcstat_mru_ghost_evictable_data, 4707 &as->arcstat_mru_ghost_evictable_metadata); 4708 arc_kstat_update_state(arc_mfu, 4709 &as->arcstat_mfu_size, 4710 &as->arcstat_mfu_evictable_data, 4711 &as->arcstat_mfu_evictable_metadata); 4712 arc_kstat_update_state(arc_mfu_ghost, 4713 &as->arcstat_mfu_ghost_size, 4714 &as->arcstat_mfu_ghost_evictable_data, 4715 &as->arcstat_mfu_ghost_evictable_metadata); 4716 } 4717 4718 return (0); 4719} 4720 4721void 4722arc_tempreserve_clear(uint64_t reserve) 4723{ 4724 atomic_add_64(&arc_tempreserve, -reserve); 4725 ASSERT((int64_t)arc_tempreserve >= 0); 4726} 4727 4728int 4729arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4730{ 4731 int error; 4732 uint64_t anon_size; 4733 4734 if (reserve > arc_c/4 && !arc_no_grow) { 4735 arc_c = MIN(arc_c_max, reserve * 4); 4736 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4737 } 4738 if (reserve > arc_c) 4739 return (SET_ERROR(ENOMEM)); 4740 4741 /* 4742 * 
Don't count loaned bufs as in flight dirty data to prevent long 4743 * network delays from blocking transactions that are ready to be 4744 * assigned to a txg. 4745 */ 4746 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 4747 4748 /* 4749 * Writes will, almost always, require additional memory allocations 4750 * in order to compress/encrypt/etc the data. We therefore need to 4751 * make sure that there is sufficient available memory for this. 4752 */ 4753 error = arc_memory_throttle(reserve, txg); 4754 if (error != 0) 4755 return (error); 4756 4757 /* 4758 * Throttle writes when the amount of dirty data in the cache 4759 * gets too large. We try to keep the cache less than half full 4760 * of dirty blocks so that our sync times don't grow too large. 4761 * Note: if two requests come in concurrently, we might let them 4762 * both succeed, when one of them should fail. Not a huge deal. 4763 */ 4764 4765 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 4766 anon_size > arc_c / 4) { 4767 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 4768 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 4769 arc_tempreserve>>10, 4770 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 4771 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 4772 reserve>>10, arc_c>>10); 4773 return (SET_ERROR(ERESTART)); 4774 } 4775 atomic_add_64(&arc_tempreserve, reserve); 4776 return (0); 4777} 4778 4779static kmutex_t arc_lowmem_lock; 4780#ifdef _KERNEL 4781static eventhandler_tag arc_event_lowmem = NULL; 4782 4783static void 4784arc_lowmem(void *arg __unused, int howto __unused) 4785{ 4786 4787 /* Serialize access via arc_lowmem_lock. */ 4788 mutex_enter(&arc_lowmem_lock); 4789 mutex_enter(&arc_reclaim_thr_lock); 4790 needfree = 1; 4791 DTRACE_PROBE(arc__needfree); 4792 cv_signal(&arc_reclaim_thr_cv); 4793 4794 /* 4795 * It is unsafe to block here in arbitrary threads, because we can come 4796 * here from ARC itself and may hold ARC locks and thus risk a deadlock 4797 * with ARC reclaim thread. 4798 */ 4799 if (curproc == pageproc) { 4800 while (needfree) 4801 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 4802 } 4803 mutex_exit(&arc_reclaim_thr_lock); 4804 mutex_exit(&arc_lowmem_lock); 4805} 4806#endif 4807 4808void 4809arc_init(void) 4810{ 4811 int i, prefetch_tunable_set = 0; 4812 4813 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4814 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 4815 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 4816 4817 /* Convert seconds to clock ticks */ 4818 arc_min_prefetch_lifespan = 1 * hz; 4819 4820 /* Start out with 1/8 of all memory */ 4821 arc_c = kmem_size() / 8; 4822 4823#ifdef sun 4824#ifdef _KERNEL 4825 /* 4826 * On architectures where the physical memory can be larger 4827 * than the addressable space (intel in 32-bit mode), we may 4828 * need to limit the cache to 1/8 of VM size. 4829 */ 4830 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 4831#endif 4832#endif /* sun */ 4833 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 4834 arc_c_min = MAX(arc_c / 4, 16 << 20); 4835 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 4836 if (arc_c * 8 >= 1 << 30) 4837 arc_c_max = (arc_c * 8) - (1 << 30); 4838 else 4839 arc_c_max = arc_c_min; 4840 arc_c_max = MAX(arc_c * 5, arc_c_max); 4841 4842#ifdef _KERNEL 4843 /* 4844 * Allow the tunables to override our calculations if they are 4845 * reasonable (ie. 
over 16MB) 4846 */ 4847 if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) 4848 arc_c_max = zfs_arc_max; 4849 if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) 4850 arc_c_min = zfs_arc_min; 4851#endif 4852 4853 arc_c = arc_c_max; 4854 arc_p = (arc_c >> 1); 4855 4856 /* limit meta-data to 1/4 of the arc capacity */ 4857 arc_meta_limit = arc_c_max / 4; 4858 4859 /* Allow the tunable to override if it is reasonable */ 4860 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4861 arc_meta_limit = zfs_arc_meta_limit; 4862 4863 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4864 arc_c_min = arc_meta_limit / 2; 4865 4866 if (zfs_arc_meta_min > 0) { 4867 arc_meta_min = zfs_arc_meta_min; 4868 } else { 4869 arc_meta_min = arc_c_min / 2; 4870 } 4871 4872 if (zfs_arc_grow_retry > 0) 4873 arc_grow_retry = zfs_arc_grow_retry; 4874 4875 if (zfs_arc_shrink_shift > 0) 4876 arc_shrink_shift = zfs_arc_shrink_shift; 4877 4878 if (zfs_arc_p_min_shift > 0) 4879 arc_p_min_shift = zfs_arc_p_min_shift; 4880 4881 /* if kmem_flags are set, lets try to use less memory */ 4882 if (kmem_debugging()) 4883 arc_c = arc_c / 2; 4884 if (arc_c < arc_c_min) 4885 arc_c = arc_c_min; 4886 4887 zfs_arc_min = arc_c_min; 4888 zfs_arc_max = arc_c_max; 4889 4890 arc_anon = &ARC_anon; 4891 arc_mru = &ARC_mru; 4892 arc_mru_ghost = &ARC_mru_ghost; 4893 arc_mfu = &ARC_mfu; 4894 arc_mfu_ghost = &ARC_mfu_ghost; 4895 arc_l2c_only = &ARC_l2c_only; 4896 arc_size = 0; 4897 4898 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4899 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 4900 NULL, MUTEX_DEFAULT, NULL); 4901 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 4902 NULL, MUTEX_DEFAULT, NULL); 4903 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 4904 NULL, MUTEX_DEFAULT, NULL); 4905 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 4906 NULL, MUTEX_DEFAULT, NULL); 4907 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 4908 NULL, MUTEX_DEFAULT, NULL); 4909 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 4910 NULL, MUTEX_DEFAULT, NULL); 4911 4912 list_create(&arc_mru->arcs_lists[i], 4913 sizeof (arc_buf_hdr_t), 4914 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4915 list_create(&arc_mru_ghost->arcs_lists[i], 4916 sizeof (arc_buf_hdr_t), 4917 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4918 list_create(&arc_mfu->arcs_lists[i], 4919 sizeof (arc_buf_hdr_t), 4920 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4921 list_create(&arc_mfu_ghost->arcs_lists[i], 4922 sizeof (arc_buf_hdr_t), 4923 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4924 list_create(&arc_mfu_ghost->arcs_lists[i], 4925 sizeof (arc_buf_hdr_t), 4926 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4927 list_create(&arc_l2c_only->arcs_lists[i], 4928 sizeof (arc_buf_hdr_t), 4929 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4930 } 4931 4932 buf_init(); 4933 4934 arc_thread_exit = 0; 4935 arc_eviction_list = NULL; 4936 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4937 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4938 4939 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4940 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4941 4942 if (arc_ksp != NULL) { 4943 arc_ksp->ks_data = &arc_stats; 4944 arc_ksp->ks_update = arc_kstat_update; 4945 kstat_install(arc_ksp); 4946 } 4947 4948 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4949 TS_RUN, minclsyspri); 4950 4951#ifdef _KERNEL 4952 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 4953 EVENTHANDLER_PRI_FIRST); 4954#endif 
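
	/*
	 * arc_warm stays B_FALSE until the ARC itself has warmed up; while
	 * it is clear, l2arc_write_size() adds l2arc_write_boost to each
	 * L2ARC write so that cache devices fill faster (see the L2ARC
	 * comments below).
	 */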
4955 4956 arc_dead = FALSE; 4957 arc_warm = B_FALSE; 4958 4959 /* 4960 * Calculate maximum amount of dirty data per pool. 4961 * 4962 * If it has been set by /etc/system, take that. 4963 * Otherwise, use a percentage of physical memory defined by 4964 * zfs_dirty_data_max_percent (default 10%) with a cap at 4965 * zfs_dirty_data_max_max (default 4GB). 4966 */ 4967 if (zfs_dirty_data_max == 0) { 4968 zfs_dirty_data_max = ptob(physmem) * 4969 zfs_dirty_data_max_percent / 100; 4970 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4971 zfs_dirty_data_max_max); 4972 } 4973 4974#ifdef _KERNEL 4975 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4976 prefetch_tunable_set = 1; 4977 4978#ifdef __i386__ 4979 if (prefetch_tunable_set == 0) { 4980 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4981 "-- to enable,\n"); 4982 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4983 "to /boot/loader.conf.\n"); 4984 zfs_prefetch_disable = 1; 4985 } 4986#else 4987 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4988 prefetch_tunable_set == 0) { 4989 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4990 "than 4GB of RAM is present;\n" 4991 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4992 "to /boot/loader.conf.\n"); 4993 zfs_prefetch_disable = 1; 4994 } 4995#endif 4996 /* Warn about ZFS memory and address space requirements. */ 4997 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4998 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4999 "expect unstable behavior.\n"); 5000 } 5001 if (kmem_size() < 512 * (1 << 20)) { 5002 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 5003 "expect unstable behavior.\n"); 5004 printf(" Consider tuning vm.kmem_size and " 5005 "vm.kmem_size_max\n"); 5006 printf(" in /boot/loader.conf.\n"); 5007 } 5008#endif 5009} 5010 5011void 5012arc_fini(void) 5013{ 5014 int i; 5015 5016 mutex_enter(&arc_reclaim_thr_lock); 5017 arc_thread_exit = 1; 5018 cv_signal(&arc_reclaim_thr_cv); 5019 while (arc_thread_exit != 0) 5020 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 5021 mutex_exit(&arc_reclaim_thr_lock); 5022 5023 arc_flush(NULL); 5024 5025 arc_dead = TRUE; 5026 5027 if (arc_ksp != NULL) { 5028 kstat_delete(arc_ksp); 5029 arc_ksp = NULL; 5030 } 5031 5032 mutex_destroy(&arc_eviction_mtx); 5033 mutex_destroy(&arc_reclaim_thr_lock); 5034 cv_destroy(&arc_reclaim_thr_cv); 5035 5036 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 5037 list_destroy(&arc_mru->arcs_lists[i]); 5038 list_destroy(&arc_mru_ghost->arcs_lists[i]); 5039 list_destroy(&arc_mfu->arcs_lists[i]); 5040 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 5041 list_destroy(&arc_l2c_only->arcs_lists[i]); 5042 5043 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 5044 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 5045 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 5046 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 5047 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 5048 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 5049 } 5050 5051 buf_fini(); 5052 5053 ASSERT0(arc_loaned_bytes); 5054 5055 mutex_destroy(&arc_lowmem_lock); 5056#ifdef _KERNEL 5057 if (arc_event_lowmem != NULL) 5058 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 5059#endif 5060} 5061 5062/* 5063 * Level 2 ARC 5064 * 5065 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 5066 * It uses dedicated storage devices to hold cached data, which are populated 5067 * using large infrequent writes. 
The main role of this cache is to boost 5068 * the performance of random read workloads. The intended L2ARC devices 5069 * include short-stroked disks, solid state disks, and other media with 5070 * substantially faster read latency than disk. 5071 * 5072 * +-----------------------+ 5073 * | ARC | 5074 * +-----------------------+ 5075 * | ^ ^ 5076 * | | | 5077 * l2arc_feed_thread() arc_read() 5078 * | | | 5079 * | l2arc read | 5080 * V | | 5081 * +---------------+ | 5082 * | L2ARC | | 5083 * +---------------+ | 5084 * | ^ | 5085 * l2arc_write() | | 5086 * | | | 5087 * V | | 5088 * +-------+ +-------+ 5089 * | vdev | | vdev | 5090 * | cache | | cache | 5091 * +-------+ +-------+ 5092 * +=========+ .-----. 5093 * : L2ARC : |-_____-| 5094 * : devices : | Disks | 5095 * +=========+ `-_____-' 5096 * 5097 * Read requests are satisfied from the following sources, in order: 5098 * 5099 * 1) ARC 5100 * 2) vdev cache of L2ARC devices 5101 * 3) L2ARC devices 5102 * 4) vdev cache of disks 5103 * 5) disks 5104 * 5105 * Some L2ARC device types exhibit extremely slow write performance. 5106 * To accommodate for this there are some significant differences between 5107 * the L2ARC and traditional cache design: 5108 * 5109 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5110 * the ARC behave as usual, freeing buffers and placing headers on ghost 5111 * lists. The ARC does not send buffers to the L2ARC during eviction as 5112 * this would add inflated write latencies for all ARC memory pressure. 5113 * 5114 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5115 * It does this by periodically scanning buffers from the eviction-end of 5116 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5117 * not already there. It scans until a headroom of buffers is satisfied, 5118 * which itself is a buffer for ARC eviction. If a compressible buffer is 5119 * found during scanning and selected for writing to an L2ARC device, we 5120 * temporarily boost scanning headroom during the next scan cycle to make 5121 * sure we adapt to compression effects (which might significantly reduce 5122 * the data volume we write to L2ARC). The thread that does this is 5123 * l2arc_feed_thread(), illustrated below; example sizes are included to 5124 * provide a better sense of ratio than this diagram: 5125 * 5126 * head --> tail 5127 * +---------------------+----------+ 5128 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5129 * +---------------------+----------+ | o L2ARC eligible 5130 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5131 * +---------------------+----------+ | 5132 * 15.9 Gbytes ^ 32 Mbytes | 5133 * headroom | 5134 * l2arc_feed_thread() 5135 * | 5136 * l2arc write hand <--[oooo]--' 5137 * | 8 Mbyte 5138 * | write max 5139 * V 5140 * +==============================+ 5141 * L2ARC dev |####|#|###|###| |####| ... | 5142 * +==============================+ 5143 * 32 Gbytes 5144 * 5145 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5146 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5147 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5148 * safe to say that this is an uncommon case, since buffers at the end of 5149 * the ARC lists have moved there due to inactivity. 5150 * 5151 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5152 * then the L2ARC simply misses copying some buffers. 
This serves as a 5153 * pressure valve to prevent heavy read workloads from both stalling the ARC 5154 * with waits and clogging the L2ARC with writes. This also helps prevent 5155 * the potential for the L2ARC to churn if it attempts to cache content too 5156 * quickly, such as during backups of the entire pool. 5157 * 5158 * 5. After system boot and before the ARC has filled main memory, there are 5159 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5160 * lists can remain mostly static. Instead of searching from tail of these 5161 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5162 * for eligible buffers, greatly increasing its chance of finding them. 5163 * 5164 * The L2ARC device write speed is also boosted during this time so that 5165 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5166 * there are no L2ARC reads, and no fear of degrading read performance 5167 * through increased writes. 5168 * 5169 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5170 * the vdev queue can aggregate them into larger and fewer writes. Each 5171 * device is written to in a rotor fashion, sweeping writes through 5172 * available space then repeating. 5173 * 5174 * 7. The L2ARC does not store dirty content. It never needs to flush 5175 * write buffers back to disk based storage. 5176 * 5177 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5178 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5179 * 5180 * The performance of the L2ARC can be tweaked by a number of tunables, which 5181 * may be necessary for different workloads: 5182 * 5183 * l2arc_write_max max write bytes per interval 5184 * l2arc_write_boost extra write bytes during device warmup 5185 * l2arc_noprefetch skip caching prefetched buffers 5186 * l2arc_headroom number of max device writes to precache 5187 * l2arc_headroom_boost when we find compressed buffers during ARC 5188 * scanning, we multiply headroom by this 5189 * percentage factor for the next scan cycle, 5190 * since more compressed buffers are likely to 5191 * be present 5192 * l2arc_feed_secs seconds between L2ARC writing 5193 * 5194 * Tunables may be removed or added as future performance improvements are 5195 * integrated, and also may become zpool properties. 5196 * 5197 * There are three key functions that control how the L2ARC warms up: 5198 * 5199 * l2arc_write_eligible() check if a buffer is eligible to cache 5200 * l2arc_write_size() calculate how much to write 5201 * l2arc_write_interval() calculate sleep delay between writes 5202 * 5203 * These three functions determine what to write, how much, and how quickly 5204 * to send writes. 5205 */ 5206 5207static boolean_t 5208l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5209{ 5210 /* 5211 * A buffer is *not* eligible for the L2ARC if it: 5212 * 1. belongs to a different spa. 5213 * 2. is already cached on the L2ARC. 5214 * 3. has an I/O in progress (it may be an incomplete read). 5215 * 4. is flagged not eligible (zfs property). 
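 *
 * (The eligibility flag in case 4 is driven by the "secondarycache"
 * dataset property, which sets or clears ARC_FLAG_L2CACHE on the header.)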
5216 */ 5217 if (hdr->b_spa != spa_guid) { 5218 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5219 return (B_FALSE); 5220 } 5221 if (HDR_HAS_L2HDR(hdr)) { 5222 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5223 return (B_FALSE); 5224 } 5225 if (HDR_IO_IN_PROGRESS(hdr)) { 5226 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5227 return (B_FALSE); 5228 } 5229 if (!HDR_L2CACHE(hdr)) { 5230 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5231 return (B_FALSE); 5232 } 5233 5234 return (B_TRUE); 5235} 5236 5237static uint64_t 5238l2arc_write_size(void) 5239{ 5240 uint64_t size; 5241 5242 /* 5243 * Make sure our globals have meaningful values in case the user 5244 * altered them. 5245 */ 5246 size = l2arc_write_max; 5247 if (size == 0) { 5248 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5249 "be greater than zero, resetting it to the default (%d)", 5250 L2ARC_WRITE_SIZE); 5251 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5252 } 5253 5254 if (arc_warm == B_FALSE) 5255 size += l2arc_write_boost; 5256 5257 return (size); 5258 5259} 5260 5261static clock_t 5262l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5263{ 5264 clock_t interval, next, now; 5265 5266 /* 5267 * If the ARC lists are busy, increase our write rate; if the 5268 * lists are stale, idle back. This is achieved by checking 5269 * how much we previously wrote - if it was more than half of 5270 * what we wanted, schedule the next write much sooner. 5271 */ 5272 if (l2arc_feed_again && wrote > (wanted / 2)) 5273 interval = (hz * l2arc_feed_min_ms) / 1000; 5274 else 5275 interval = hz * l2arc_feed_secs; 5276 5277 now = ddi_get_lbolt(); 5278 next = MAX(now, MIN(now + interval, began + interval)); 5279 5280 return (next); 5281} 5282 5283/* 5284 * Cycle through L2ARC devices. This is how L2ARC load balances. 5285 * If a device is returned, this also returns holding the spa config lock. 5286 */ 5287static l2arc_dev_t * 5288l2arc_dev_get_next(void) 5289{ 5290 l2arc_dev_t *first, *next = NULL; 5291 5292 /* 5293 * Lock out the removal of spas (spa_namespace_lock), then removal 5294 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5295 * both locks will be dropped and a spa config lock held instead. 5296 */ 5297 mutex_enter(&spa_namespace_lock); 5298 mutex_enter(&l2arc_dev_mtx); 5299 5300 /* if there are no vdevs, there is nothing to do */ 5301 if (l2arc_ndev == 0) 5302 goto out; 5303 5304 first = NULL; 5305 next = l2arc_dev_last; 5306 do { 5307 /* loop around the list looking for a non-faulted vdev */ 5308 if (next == NULL) { 5309 next = list_head(l2arc_dev_list); 5310 } else { 5311 next = list_next(l2arc_dev_list, next); 5312 if (next == NULL) 5313 next = list_head(l2arc_dev_list); 5314 } 5315 5316 /* if we have come back to the start, bail out */ 5317 if (first == NULL) 5318 first = next; 5319 else if (next == first) 5320 break; 5321 5322 } while (vdev_is_dead(next->l2ad_vdev)); 5323 5324 /* if we were unable to find any usable vdevs, return NULL */ 5325 if (vdev_is_dead(next->l2ad_vdev)) 5326 next = NULL; 5327 5328 l2arc_dev_last = next; 5329 5330out: 5331 mutex_exit(&l2arc_dev_mtx); 5332 5333 /* 5334 * Grab the config lock to prevent the 'next' device from being 5335 * removed while we are writing to it. 5336 */ 5337 if (next != NULL) 5338 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5339 mutex_exit(&spa_namespace_lock); 5340 5341 return (next); 5342} 5343 5344/* 5345 * Free buffers that were tagged for destruction. 
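 * These frees were deferred because the data was still being written to
 * an L2ARC device when the buffer was released; l2arc_write_done() calls
 * this once the write has completed.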
5346 */ 5347static void 5348l2arc_do_free_on_write() 5349{ 5350 list_t *buflist; 5351 l2arc_data_free_t *df, *df_prev; 5352 5353 mutex_enter(&l2arc_free_on_write_mtx); 5354 buflist = l2arc_free_on_write; 5355 5356 for (df = list_tail(buflist); df; df = df_prev) { 5357 df_prev = list_prev(buflist, df); 5358 ASSERT(df->l2df_data != NULL); 5359 ASSERT(df->l2df_func != NULL); 5360 df->l2df_func(df->l2df_data, df->l2df_size); 5361 list_remove(buflist, df); 5362 kmem_free(df, sizeof (l2arc_data_free_t)); 5363 } 5364 5365 mutex_exit(&l2arc_free_on_write_mtx); 5366} 5367 5368/* 5369 * A write to a cache device has completed. Update all headers to allow 5370 * reads from these buffers to begin. 5371 */ 5372static void 5373l2arc_write_done(zio_t *zio) 5374{ 5375 l2arc_write_callback_t *cb; 5376 l2arc_dev_t *dev; 5377 list_t *buflist; 5378 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5379 kmutex_t *hash_lock; 5380 int64_t bytes_dropped = 0; 5381 5382 cb = zio->io_private; 5383 ASSERT(cb != NULL); 5384 dev = cb->l2wcb_dev; 5385 ASSERT(dev != NULL); 5386 head = cb->l2wcb_head; 5387 ASSERT(head != NULL); 5388 buflist = &dev->l2ad_buflist; 5389 ASSERT(buflist != NULL); 5390 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5391 l2arc_write_callback_t *, cb); 5392 5393 if (zio->io_error != 0) 5394 ARCSTAT_BUMP(arcstat_l2_writes_error); 5395 5396 mutex_enter(&dev->l2ad_mtx); 5397 5398 /* 5399 * All writes completed, or an error was hit. 5400 */ 5401 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5402 hdr_prev = list_prev(buflist, hdr); 5403 5404 hash_lock = HDR_LOCK(hdr); 5405 if (!mutex_tryenter(hash_lock)) { 5406 /* 5407 * This buffer misses out. It may be in a stage 5408 * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5409 * left set, denying reads to this buffer. 5410 */ 5411 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5412 continue; 5413 } 5414 5415 /* 5416 * It's possible that this buffer got evicted from the L1 cache 5417 * before we grabbed the vdev + hash locks, in which case 5418 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5419 * Only free the buffer if we still have an L1 hdr. 5420 */ 5421 if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5422 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5423 l2arc_release_cdata_buf(hdr); 5424 5425 if (zio->io_error != 0) { 5426 /* 5427 * Error - drop L2ARC entry. 5428 */ 5429 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5430 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5431 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5432 5433 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5434 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5435 5436 bytes_dropped += hdr->b_l2hdr.b_asize; 5437 (void) refcount_remove_many(&dev->l2ad_alloc, 5438 hdr->b_l2hdr.b_asize, hdr); 5439 } 5440 5441 /* 5442 * Allow ARC to begin reads to this L2ARC entry. 5443 */ 5444 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5445 5446 mutex_exit(hash_lock); 5447 } 5448 5449 atomic_inc_64(&l2arc_writes_done); 5450 list_remove(buflist, head); 5451 ASSERT(!HDR_HAS_L1HDR(head)); 5452 kmem_cache_free(hdr_l2only_cache, head); 5453 mutex_exit(&dev->l2ad_mtx); 5454 5455 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5456 5457 l2arc_do_free_on_write(); 5458 5459 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5460} 5461 5462/* 5463 * A read to a cache device completed. Validate buffer contents before 5464 * handing over to the regular ARC routines. 
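 * If the contents fail the checksum (or the device read failed), the
 * request is transparently reissued to the original pool storage.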
5465 */
5466static void
5467l2arc_read_done(zio_t *zio)
5468{
5469 l2arc_read_callback_t *cb;
5470 arc_buf_hdr_t *hdr;
5471 arc_buf_t *buf;
5472 kmutex_t *hash_lock;
5473 int equal;
5474
5475 ASSERT(zio->io_vd != NULL);
5476 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5477
5478 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5479
5480 cb = zio->io_private;
5481 ASSERT(cb != NULL);
5482 buf = cb->l2rcb_buf;
5483 ASSERT(buf != NULL);
5484
5485 hash_lock = HDR_LOCK(buf->b_hdr);
5486 mutex_enter(hash_lock);
5487 hdr = buf->b_hdr;
5488 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5489
5490 /*
5491 * If the buffer was compressed, decompress it first.
5492 */
5493 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5494 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5495 ASSERT(zio->io_data != NULL);
5496
5497 /*
5498 * Check this survived the L2ARC journey.
5499 */
5500 equal = arc_cksum_equal(buf);
5501 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5502 mutex_exit(hash_lock);
5503 zio->io_private = buf;
5504 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
5505 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
5506 arc_read_done(zio);
5507 } else {
5508 mutex_exit(hash_lock);
5509 /*
5510 * Buffer didn't survive caching. Increment stats and
5511 * reissue to the original storage device.
5512 */
5513 if (zio->io_error != 0) {
5514 ARCSTAT_BUMP(arcstat_l2_io_error);
5515 } else {
5516 zio->io_error = SET_ERROR(EIO);
5517 }
5518 if (!equal)
5519 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5520
5521 /*
5522 * If there's no waiter, issue an async i/o to the primary
5523 * storage now. If there *is* a waiter, the caller must
5524 * issue the i/o in a context where it's OK to block.
5525 */
5526 if (zio->io_waiter == NULL) {
5527 zio_t *pio = zio_unique_parent(zio);
5528
5529 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5530
5531 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5532 buf->b_data, zio->io_size, arc_read_done, buf,
5533 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5534 }
5535 }
5536
5537 kmem_free(cb, sizeof (l2arc_read_callback_t));
5538}
5539
5540/*
5541 * This is the list priority from which the L2ARC will search for pages to
5542 * cache. This is used within loops (0 .. 2 * ARC_BUFC_NUMLISTS - 1) to cycle
5543 * through lists in the desired order. This order can have a significant
5544 * effect on cache performance.
5545 *
5546 * Currently the metadata lists are hit first, MFU then MRU, followed by
5547 * the data lists. This function returns a locked list, and also returns
5548 * the lock pointer.
5549 */ 5550static list_t * 5551l2arc_list_locked(int list_num, kmutex_t **lock) 5552{ 5553 list_t *list = NULL; 5554 int idx; 5555 5556 ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); 5557 5558 if (list_num < ARC_BUFC_NUMMETADATALISTS) { 5559 idx = list_num; 5560 list = &arc_mfu->arcs_lists[idx]; 5561 *lock = ARCS_LOCK(arc_mfu, idx); 5562 } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { 5563 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 5564 list = &arc_mru->arcs_lists[idx]; 5565 *lock = ARCS_LOCK(arc_mru, idx); 5566 } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + 5567 ARC_BUFC_NUMDATALISTS)) { 5568 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 5569 list = &arc_mfu->arcs_lists[idx]; 5570 *lock = ARCS_LOCK(arc_mfu, idx); 5571 } else { 5572 idx = list_num - ARC_BUFC_NUMLISTS; 5573 list = &arc_mru->arcs_lists[idx]; 5574 *lock = ARCS_LOCK(arc_mru, idx); 5575 } 5576 5577 ASSERT(!(MUTEX_HELD(*lock))); 5578 mutex_enter(*lock); 5579 return (list); 5580} 5581 5582/* 5583 * Evict buffers from the device write hand to the distance specified in 5584 * bytes. This distance may span populated buffers, it may span nothing. 5585 * This is clearing a region on the L2ARC device ready for writing. 5586 * If the 'all' boolean is set, every buffer is evicted. 5587 */ 5588static void 5589l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 5590{ 5591 list_t *buflist; 5592 arc_buf_hdr_t *hdr, *hdr_prev; 5593 kmutex_t *hash_lock; 5594 uint64_t taddr; 5595 5596 buflist = &dev->l2ad_buflist; 5597 5598 if (!all && dev->l2ad_first) { 5599 /* 5600 * This is the first sweep through the device. There is 5601 * nothing to evict. 5602 */ 5603 return; 5604 } 5605 5606 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 5607 /* 5608 * When nearing the end of the device, evict to the end 5609 * before the device write hand jumps to the start. 5610 */ 5611 taddr = dev->l2ad_end; 5612 } else { 5613 taddr = dev->l2ad_hand + distance; 5614 } 5615 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 5616 uint64_t, taddr, boolean_t, all); 5617 5618top: 5619 mutex_enter(&dev->l2ad_mtx); 5620 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 5621 hdr_prev = list_prev(buflist, hdr); 5622 5623 hash_lock = HDR_LOCK(hdr); 5624 if (!mutex_tryenter(hash_lock)) { 5625 /* 5626 * Missed the hash lock. Retry. 5627 */ 5628 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 5629 mutex_exit(&dev->l2ad_mtx); 5630 mutex_enter(hash_lock); 5631 mutex_exit(hash_lock); 5632 goto top; 5633 } 5634 5635 if (HDR_L2_WRITE_HEAD(hdr)) { 5636 /* 5637 * We hit a write head node. Leave it for 5638 * l2arc_write_done(). 5639 */ 5640 list_remove(buflist, hdr); 5641 mutex_exit(hash_lock); 5642 continue; 5643 } 5644 5645 if (!all && HDR_HAS_L2HDR(hdr) && 5646 (hdr->b_l2hdr.b_daddr > taddr || 5647 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 5648 /* 5649 * We've evicted to the target address, 5650 * or the end of the device. 5651 */ 5652 mutex_exit(hash_lock); 5653 break; 5654 } 5655 5656 ASSERT(HDR_HAS_L2HDR(hdr)); 5657 if (!HDR_HAS_L1HDR(hdr)) { 5658 ASSERT(!HDR_L2_READING(hdr)); 5659 /* 5660 * This doesn't exist in the ARC. Destroy. 5661 * arc_hdr_destroy() will call list_remove() 5662 * and decrement arcstat_l2_size. 
5663 */ 5664 arc_change_state(arc_anon, hdr, hash_lock); 5665 arc_hdr_destroy(hdr); 5666 } else { 5667 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 5668 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 5669 /* 5670 * Invalidate issued or about to be issued 5671 * reads, since we may be about to write 5672 * over this location. 5673 */ 5674 if (HDR_L2_READING(hdr)) { 5675 ARCSTAT_BUMP(arcstat_l2_evict_reading); 5676 hdr->b_flags |= ARC_FLAG_L2_EVICTED; 5677 } 5678 5679 arc_hdr_l2hdr_destroy(hdr); 5680 } 5681 mutex_exit(hash_lock); 5682 } 5683 mutex_exit(&dev->l2ad_mtx); 5684} 5685 5686/* 5687 * Find and write ARC buffers to the L2ARC device. 5688 * 5689 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 5690 * for reading until they have completed writing. 5691 * The headroom_boost is an in-out parameter used to maintain headroom boost 5692 * state between calls to this function. 5693 * 5694 * Returns the number of bytes actually written (which may be smaller than 5695 * the delta by which the device hand has changed due to alignment). 5696 */ 5697static uint64_t 5698l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 5699 boolean_t *headroom_boost) 5700{ 5701 arc_buf_hdr_t *hdr, *hdr_prev, *head; 5702 list_t *list; 5703 uint64_t write_asize, write_sz, headroom, buf_compress_minsz; 5704 void *buf_data; 5705 kmutex_t *list_lock; 5706 boolean_t full; 5707 l2arc_write_callback_t *cb; 5708 zio_t *pio, *wzio; 5709 uint64_t guid = spa_load_guid(spa); 5710 const boolean_t do_headroom_boost = *headroom_boost; 5711 int try; 5712 5713 ASSERT(dev->l2ad_vdev != NULL); 5714 5715 /* Lower the flag now, we might want to raise it again later. */ 5716 *headroom_boost = B_FALSE; 5717 5718 pio = NULL; 5719 write_sz = write_asize = 0; 5720 full = B_FALSE; 5721 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 5722 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 5723 head->b_flags |= ARC_FLAG_HAS_L2HDR; 5724 5725 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 5726 /* 5727 * We will want to try to compress buffers that are at least 2x the 5728 * device sector size. 5729 */ 5730 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 5731 5732 /* 5733 * Copy buffers for L2ARC writing. 5734 */ 5735 mutex_enter(&dev->l2ad_mtx); 5736 for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { 5737 uint64_t passed_sz = 0; 5738 5739 list = l2arc_list_locked(try, &list_lock); 5740 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 5741 5742 /* 5743 * L2ARC fast warmup. 5744 * 5745 * Until the ARC is warm and starts to evict, read from the 5746 * head of the ARC lists rather than the tail. 5747 */ 5748 if (arc_warm == B_FALSE) 5749 hdr = list_head(list); 5750 else 5751 hdr = list_tail(list); 5752 if (hdr == NULL) 5753 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 5754 5755 headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS; 5756 if (do_headroom_boost) 5757 headroom = (headroom * l2arc_headroom_boost) / 100; 5758 5759 for (; hdr; hdr = hdr_prev) { 5760 kmutex_t *hash_lock; 5761 uint64_t buf_sz; 5762 uint64_t buf_a_sz; 5763 5764 if (arc_warm == B_FALSE) 5765 hdr_prev = list_next(list, hdr); 5766 else 5767 hdr_prev = list_prev(list, hdr); 5768 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); 5769 5770 hash_lock = HDR_LOCK(hdr); 5771 if (!mutex_tryenter(hash_lock)) { 5772 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 5773 /* 5774 * Skip this buffer rather than waiting. 
5775 */
5776 continue;
5777 }
5778
5779 passed_sz += hdr->b_size;
5780 if (passed_sz > headroom) {
5781 /*
5782 * Searched too far.
5783 */
5784 mutex_exit(hash_lock);
5785 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5786 break;
5787 }
5788
5789 if (!l2arc_write_eligible(guid, hdr)) {
5790 mutex_exit(hash_lock);
5791 continue;
5792 }
5793
5794 /*
5795 * Assume that the buffer is not going to be compressed
5796 * and could take more space on disk because of a larger
5797 * disk block size.
5798 */
5799 buf_sz = hdr->b_size;
5800 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5801
5802 if ((write_asize + buf_a_sz) > target_sz) {
5803 full = B_TRUE;
5804 mutex_exit(hash_lock);
5805 ARCSTAT_BUMP(arcstat_l2_write_full);
5806 break;
5807 }
5808
5809 if (pio == NULL) {
5810 /*
5811 * Insert a dummy header on the buflist so
5812 * l2arc_write_done() can find where the
5813 * write buffers begin without searching.
5814 */
5815 list_insert_head(&dev->l2ad_buflist, head);
5816
5817 cb = kmem_alloc(
5818 sizeof (l2arc_write_callback_t), KM_SLEEP);
5819 cb->l2wcb_dev = dev;
5820 cb->l2wcb_head = head;
5821 pio = zio_root(spa, l2arc_write_done, cb,
5822 ZIO_FLAG_CANFAIL);
5823 ARCSTAT_BUMP(arcstat_l2_write_pios);
5824 }
5825
5826 /*
5827 * Create and add a new L2ARC header.
5828 */
5829 hdr->b_l2hdr.b_dev = dev;
5830 hdr->b_flags |= ARC_FLAG_L2_WRITING;
5831 /*
5832 * Temporarily stash the data buffer in b_tmp_cdata.
5833 * The subsequent write step will pick it up from
5834 * there. This is because we can't access b_l1hdr.b_buf
5835 * without holding the hash_lock, which we in turn
5836 * can't access without holding the ARC list locks
5837 * (which we want to avoid during compression/writing).
5838 */
5839 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
5840 hdr->b_l2hdr.b_asize = hdr->b_size;
5841 hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5842
5843 /*
5844 * Explicitly set the b_daddr field to a known
5845 * value which means "invalid address". This
5846 * enables us to differentiate which stage of
5847 * l2arc_write_buffers() the particular header
5848 * is in (e.g. this loop, or the one below).
5849 * ARC_FLAG_L2_WRITING is not enough to make
5850 * this distinction, and we need to know in
5851 * order to do proper l2arc vdev accounting in
5852 * arc_release() and arc_hdr_destroy().
5853 *
5854 * Note, we can't use a new flag to distinguish
5855 * the two stages because we don't hold the
5856 * header's hash_lock below, in the second stage
5857 * of this function. Thus, we can't simply
5858 * change the b_flags field to denote that the
5859 * IO has been sent. We can change the b_daddr
5860 * field of the L2 portion, though, since we'll
5861 * be holding the l2ad_mtx, which is why we're
5862 * using it to denote the header's state change.
5863 */
5864 hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
5865 hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5866
5867 list_insert_head(&dev->l2ad_buflist, hdr);
5868
5869 /*
5870 * Compute and store the buffer cksum before
5871 * writing. In debug builds the cksum is verified first.
5872 */
5873 arc_cksum_verify(hdr->b_l1hdr.b_buf);
5874 arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
5875
5876 mutex_exit(hash_lock);
5877
5878 write_sz += buf_sz;
5879 write_asize += buf_a_sz;
5880 }
5881
5882 mutex_exit(list_lock);
5883
5884 if (full == B_TRUE)
5885 break;
5886 }
5887
5888 /* No buffers selected for writing? */
5889 if (pio == NULL) {
5890 ASSERT0(write_sz);
5891 mutex_exit(&dev->l2ad_mtx);
5892 ASSERT(!HDR_HAS_L1HDR(head));
5893 kmem_cache_free(hdr_l2only_cache, head);
5894 return (0);
5895 }
5896
5897 /*
5898 * Note that elsewhere in this file arcstat_l2_asize
5899 * and the used space on l2ad_vdev are updated using b_asize,
5900 * which is not necessarily rounded up to the device block size.
5901 * To keep accounting consistent we do the same here as well:
5902 * stats_size accumulates the sum of b_asize of the written buffers,
5903 * while write_asize accumulates the sum of b_asize rounded up
5904 * to the device block size.
5905 * The latter sum is used only to validate the correctness of the code.
5906 * (An illustrative, userland-only sketch of this rounding is appended at the end of this file.) */
5907 uint64_t stats_size = 0;
5908 write_asize = 0;
5909
5910 /*
5911 * Now start writing the buffers. We're starting at the write head
5912 * and work backwards, retracing the course of the buffer selector
5913 * loop above.
5914 */
5915 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
5916 hdr = list_prev(&dev->l2ad_buflist, hdr)) {
5917 uint64_t buf_sz;
5918
5919 /*
5920 * We shouldn't need to lock the buffer here, since we flagged
5921 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
5922 * take care to only access its L2 cache parameters. In
5923 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
5924 * ARC eviction.
5925 */
5926 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
5927
5928 if ((HDR_L2COMPRESS(hdr)) &&
5929 hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
5930 if (l2arc_compress_buf(hdr)) {
5931 /*
5932 * If compression succeeded, enable headroom
5933 * boost on the next scan cycle.
5934 */
5935 *headroom_boost = B_TRUE;
5936 }
5937 }
5938
5939 /*
5940 * Pick up the buffer data we had previously stashed away
5941 * (and now potentially also compressed).
5942 */
5943 buf_data = hdr->b_l1hdr.b_tmp_cdata;
5944 buf_sz = hdr->b_l2hdr.b_asize;
5945
5946 /*
5947 * If the data has not been compressed, then clear b_tmp_cdata
5948 * to make sure that it points only to a temporary compression
5949 * buffer.
5950 */
5951 if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
5952 hdr->b_l1hdr.b_tmp_cdata = NULL;
5953
5954 /*
5955 * We need to do this regardless of whether buf_sz is zero
5956 * or not; otherwise, when this l2hdr is evicted we'll
5957 * remove a reference that was never added.
5958 */
5959 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
5960
5961 /* Compression may have squashed the buffer to zero length. */
5962 if (buf_sz != 0) {
5963 uint64_t buf_a_sz;
5964
5965 wzio = zio_write_phys(pio, dev->l2ad_vdev,
5966 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5967 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5968 ZIO_FLAG_CANFAIL, B_FALSE);
5969
5970 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5971 zio_t *, wzio);
5972 (void) zio_nowait(wzio);
5973
5974 stats_size += buf_sz;
5975
5976 /*
5977 * Keep the clock hand suitably device-aligned.
5978 */
5979 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5980 write_asize += buf_a_sz;
5981 dev->l2ad_hand += buf_a_sz;
5982 }
5983 }
5984
5985 mutex_exit(&dev->l2ad_mtx);
5986
5987 ASSERT3U(write_asize, <=, target_sz);
5988 ARCSTAT_BUMP(arcstat_l2_writes_sent);
5989 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5990 ARCSTAT_INCR(arcstat_l2_size, write_sz);
5991 ARCSTAT_INCR(arcstat_l2_asize, stats_size);
5992 vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
5993
5994 /*
5995 * Bump device hand to the device start if it is approaching the end.
5996 * l2arc_evict() will already have evicted ahead for this case. 5997 */ 5998 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5999 dev->l2ad_hand = dev->l2ad_start; 6000 dev->l2ad_first = B_FALSE; 6001 } 6002 6003 dev->l2ad_writing = B_TRUE; 6004 (void) zio_wait(pio); 6005 dev->l2ad_writing = B_FALSE; 6006 6007 return (write_asize); 6008} 6009 6010/* 6011 * Compresses an L2ARC buffer. 6012 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its 6013 * size in l2hdr->b_asize. This routine tries to compress the data and 6014 * depending on the compression result there are three possible outcomes: 6015 * *) The buffer was incompressible. The original l2hdr contents were left 6016 * untouched and are ready for writing to an L2 device. 6017 * *) The buffer was all-zeros, so there is no need to write it to an L2 6018 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 6019 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 6020 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 6021 * data buffer which holds the compressed data to be written, and b_asize 6022 * tells us how much data there is. b_compress is set to the appropriate 6023 * compression algorithm. Once writing is done, invoke 6024 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 6025 * 6026 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 6027 * buffer was incompressible). 6028 */ 6029static boolean_t 6030l2arc_compress_buf(arc_buf_hdr_t *hdr) 6031{ 6032 void *cdata; 6033 size_t csize, len, rounded; 6034 ASSERT(HDR_HAS_L2HDR(hdr)); 6035 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 6036 6037 ASSERT(HDR_HAS_L1HDR(hdr)); 6038 ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); 6039 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6040 6041 len = l2hdr->b_asize; 6042 cdata = zio_data_buf_alloc(len); 6043 ASSERT3P(cdata, !=, NULL); 6044 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, 6045 cdata, l2hdr->b_asize); 6046 6047 if (csize == 0) { 6048 /* zero block, indicate that there's nothing to write */ 6049 zio_data_buf_free(cdata, len); 6050 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); 6051 l2hdr->b_asize = 0; 6052 hdr->b_l1hdr.b_tmp_cdata = NULL; 6053 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 6054 return (B_TRUE); 6055 } 6056 6057 rounded = P2ROUNDUP(csize, 6058 (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); 6059 if (rounded < len) { 6060 /* 6061 * Compression succeeded, we'll keep the cdata around for 6062 * writing and release it afterwards. 6063 */ 6064 if (rounded > csize) { 6065 bzero((char *)cdata + csize, rounded - csize); 6066 csize = rounded; 6067 } 6068 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); 6069 l2hdr->b_asize = csize; 6070 hdr->b_l1hdr.b_tmp_cdata = cdata; 6071 ARCSTAT_BUMP(arcstat_l2_compress_successes); 6072 return (B_TRUE); 6073 } else { 6074 /* 6075 * Compression failed, release the compressed buffer. 6076 * l2hdr will be left unmodified. 6077 */ 6078 zio_data_buf_free(cdata, len); 6079 ARCSTAT_BUMP(arcstat_l2_compress_failures); 6080 return (B_FALSE); 6081 } 6082} 6083 6084/* 6085 * Decompresses a zio read back from an l2arc device. On success, the 6086 * underlying zio's io_data buffer is overwritten by the uncompressed 6087 * version. On decompression error (corrupt compressed stream), the 6088 * zio->io_error value is set to signal an I/O error. 
6089 *
6090 * Please note that the compressed data stream is not checksummed, so
6091 * if the underlying device is experiencing data corruption, we may feed
6092 * corrupt data to the decompressor, which therefore needs to be
6093 * able to handle this situation (LZ4 does).
6094 */
6095static void
6096l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6097{
6098 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6099
6100 if (zio->io_error != 0) {
6101 /*
6102 * An io error has occurred; just restore the original io
6103 * size in preparation for a main pool read.
6104 */
6105 zio->io_orig_size = zio->io_size = hdr->b_size;
6106 return;
6107 }
6108
6109 if (c == ZIO_COMPRESS_EMPTY) {
6110 /*
6111 * An empty buffer results in a null zio, which means we
6112 * need to fill its io_data after we're done restoring the
6113 * buffer's contents.
6114 */
6115 ASSERT(hdr->b_l1hdr.b_buf != NULL);
6116 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6117 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6118 } else {
6119 ASSERT(zio->io_data != NULL);
6120 /*
6121 * We copy the compressed data from the start of the arc buffer
6122 * (the zio_read will have pulled in only what we need, the
6123 * rest is garbage which we will overwrite at decompression)
6124 * and then decompress back to the ARC data buffer. This way we
6125 * can minimize copying by simply decompressing back over the
6126 * original compressed data (rather than decompressing to an
6127 * aux buffer and then copying back the uncompressed buffer,
6128 * which is likely to be much larger).
6129 */
6130 uint64_t csize;
6131 void *cdata;
6132
6133 csize = zio->io_size;
6134 cdata = zio_data_buf_alloc(csize);
6135 bcopy(zio->io_data, cdata, csize);
6136 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6137 hdr->b_size) != 0)
6138 zio->io_error = EIO;
6139 zio_data_buf_free(cdata, csize);
6140 }
6141
6142 /* Restore the expected uncompressed IO size. */
6143 zio->io_orig_size = zio->io_size = hdr->b_size;
6144}
6145
6146/*
6147 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6148 * This buffer serves as a temporary holder of compressed data while
6149 * the buffer entry is being written to an l2arc device. Once that is
6150 * done, we can dispose of it.
6151 */
6152static void
6153l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6154{
6155 ASSERT(HDR_HAS_L1HDR(hdr));
6156 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
6157 /*
6158 * If the data was compressed, then we've allocated a
6159 * temporary buffer for it, so now we need to release it.
6160 */
6161 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6162 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6163 hdr->b_size);
6164 hdr->b_l1hdr.b_tmp_cdata = NULL;
6165 } else {
6166 ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
6167 }
6168}
6169
6170/*
6171 * This thread feeds the L2ARC at regular intervals. This is the beating
6172 * heart of the L2ARC.
6173 */ 6174static void 6175l2arc_feed_thread(void *dummy __unused) 6176{ 6177 callb_cpr_t cpr; 6178 l2arc_dev_t *dev; 6179 spa_t *spa; 6180 uint64_t size, wrote; 6181 clock_t begin, next = ddi_get_lbolt(); 6182 boolean_t headroom_boost = B_FALSE; 6183 6184 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 6185 6186 mutex_enter(&l2arc_feed_thr_lock); 6187 6188 while (l2arc_thread_exit == 0) { 6189 CALLB_CPR_SAFE_BEGIN(&cpr); 6190 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 6191 next - ddi_get_lbolt()); 6192 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 6193 next = ddi_get_lbolt() + hz; 6194 6195 /* 6196 * Quick check for L2ARC devices. 6197 */ 6198 mutex_enter(&l2arc_dev_mtx); 6199 if (l2arc_ndev == 0) { 6200 mutex_exit(&l2arc_dev_mtx); 6201 continue; 6202 } 6203 mutex_exit(&l2arc_dev_mtx); 6204 begin = ddi_get_lbolt(); 6205 6206 /* 6207 * This selects the next l2arc device to write to, and in 6208 * doing so the next spa to feed from: dev->l2ad_spa. This 6209 * will return NULL if there are now no l2arc devices or if 6210 * they are all faulted. 6211 * 6212 * If a device is returned, its spa's config lock is also 6213 * held to prevent device removal. l2arc_dev_get_next() 6214 * will grab and release l2arc_dev_mtx. 6215 */ 6216 if ((dev = l2arc_dev_get_next()) == NULL) 6217 continue; 6218 6219 spa = dev->l2ad_spa; 6220 ASSERT(spa != NULL); 6221 6222 /* 6223 * If the pool is read-only then force the feed thread to 6224 * sleep a little longer. 6225 */ 6226 if (!spa_writeable(spa)) { 6227 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 6228 spa_config_exit(spa, SCL_L2ARC, dev); 6229 continue; 6230 } 6231 6232 /* 6233 * Avoid contributing to memory pressure. 6234 */ 6235 if (arc_reclaim_needed()) { 6236 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 6237 spa_config_exit(spa, SCL_L2ARC, dev); 6238 continue; 6239 } 6240 6241 ARCSTAT_BUMP(arcstat_l2_feeds); 6242 6243 size = l2arc_write_size(); 6244 6245 /* 6246 * Evict L2ARC buffers that will be overwritten. 6247 */ 6248 l2arc_evict(dev, size, B_FALSE); 6249 6250 /* 6251 * Write ARC buffers. 6252 */ 6253 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 6254 6255 /* 6256 * Calculate interval between writes. 6257 */ 6258 next = l2arc_write_interval(begin, size, wrote); 6259 spa_config_exit(spa, SCL_L2ARC, dev); 6260 } 6261 6262 l2arc_thread_exit = 0; 6263 cv_broadcast(&l2arc_feed_thr_cv); 6264 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 6265 thread_exit(); 6266} 6267 6268boolean_t 6269l2arc_vdev_present(vdev_t *vd) 6270{ 6271 l2arc_dev_t *dev; 6272 6273 mutex_enter(&l2arc_dev_mtx); 6274 for (dev = list_head(l2arc_dev_list); dev != NULL; 6275 dev = list_next(l2arc_dev_list, dev)) { 6276 if (dev->l2ad_vdev == vd) 6277 break; 6278 } 6279 mutex_exit(&l2arc_dev_mtx); 6280 6281 return (dev != NULL); 6282} 6283 6284/* 6285 * Add a vdev for use by the L2ARC. By this point the spa has already 6286 * validated the vdev and opened it. 6287 */ 6288void 6289l2arc_add_vdev(spa_t *spa, vdev_t *vd) 6290{ 6291 l2arc_dev_t *adddev; 6292 6293 ASSERT(!l2arc_vdev_present(vd)); 6294 6295 vdev_ashift_optimize(vd); 6296 6297 /* 6298 * Create a new l2arc device entry. 
6299 */ 6300 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 6301 adddev->l2ad_spa = spa; 6302 adddev->l2ad_vdev = vd; 6303 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 6304 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 6305 adddev->l2ad_hand = adddev->l2ad_start; 6306 adddev->l2ad_first = B_TRUE; 6307 adddev->l2ad_writing = B_FALSE; 6308 6309 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 6310 /* 6311 * This is a list of all ARC buffers that are still valid on the 6312 * device. 6313 */ 6314 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 6315 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 6316 6317 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 6318 refcount_create(&adddev->l2ad_alloc); 6319 6320 /* 6321 * Add device to global list 6322 */ 6323 mutex_enter(&l2arc_dev_mtx); 6324 list_insert_head(l2arc_dev_list, adddev); 6325 atomic_inc_64(&l2arc_ndev); 6326 mutex_exit(&l2arc_dev_mtx); 6327} 6328 6329/* 6330 * Remove a vdev from the L2ARC. 6331 */ 6332void 6333l2arc_remove_vdev(vdev_t *vd) 6334{ 6335 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 6336 6337 /* 6338 * Find the device by vdev 6339 */ 6340 mutex_enter(&l2arc_dev_mtx); 6341 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 6342 nextdev = list_next(l2arc_dev_list, dev); 6343 if (vd == dev->l2ad_vdev) { 6344 remdev = dev; 6345 break; 6346 } 6347 } 6348 ASSERT(remdev != NULL); 6349 6350 /* 6351 * Remove device from global list 6352 */ 6353 list_remove(l2arc_dev_list, remdev); 6354 l2arc_dev_last = NULL; /* may have been invalidated */ 6355 atomic_dec_64(&l2arc_ndev); 6356 mutex_exit(&l2arc_dev_mtx); 6357 6358 /* 6359 * Clear all buflists and ARC references. L2ARC device flush. 6360 */ 6361 l2arc_evict(remdev, 0, B_TRUE); 6362 list_destroy(&remdev->l2ad_buflist); 6363 mutex_destroy(&remdev->l2ad_mtx); 6364 refcount_destroy(&remdev->l2ad_alloc); 6365 kmem_free(remdev, sizeof (l2arc_dev_t)); 6366} 6367 6368void 6369l2arc_init(void) 6370{ 6371 l2arc_thread_exit = 0; 6372 l2arc_ndev = 0; 6373 l2arc_writes_sent = 0; 6374 l2arc_writes_done = 0; 6375 6376 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 6377 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 6378 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 6379 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 6380 6381 l2arc_dev_list = &L2ARC_dev_list; 6382 l2arc_free_on_write = &L2ARC_free_on_write; 6383 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 6384 offsetof(l2arc_dev_t, l2ad_node)); 6385 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 6386 offsetof(l2arc_data_free_t, l2df_list_node)); 6387} 6388 6389void 6390l2arc_fini(void) 6391{ 6392 /* 6393 * This is called from dmu_fini(), which is called from spa_fini(); 6394 * Because of this, we can assume that all l2arc devices have 6395 * already been removed when the pools themselves were removed. 
6396 */ 6397 6398 l2arc_do_free_on_write(); 6399 6400 mutex_destroy(&l2arc_feed_thr_lock); 6401 cv_destroy(&l2arc_feed_thr_cv); 6402 mutex_destroy(&l2arc_dev_mtx); 6403 mutex_destroy(&l2arc_free_on_write_mtx); 6404 6405 list_destroy(l2arc_dev_list); 6406 list_destroy(l2arc_free_on_write); 6407} 6408 6409void 6410l2arc_start(void) 6411{ 6412 if (!(spa_mode_global & FWRITE)) 6413 return; 6414 6415 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 6416 TS_RUN, minclsyspri); 6417} 6418 6419void 6420l2arc_stop(void) 6421{ 6422 if (!(spa_mode_global & FWRITE)) 6423 return; 6424 6425 mutex_enter(&l2arc_feed_thr_lock); 6426 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 6427 l2arc_thread_exit = 1; 6428 while (l2arc_thread_exit != 0) 6429 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 6430 mutex_exit(&l2arc_feed_thr_lock); 6431} 6432
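
/*
 * Illustrative, userland-only sketch; not part of arc.c and not built with
 * it.  It restates the rounding rule referenced in the accounting comment in
 * l2arc_write_buffers() and applied by l2arc_compress_buf() above: a
 * compressed copy is worth keeping only when its size, rounded up to the
 * device block size (1 << ashift), is still smaller than the uncompressed
 * length.  The helper names, sample sizes and ashift value below are
 * invented for the example; only the arithmetic mirrors the code above.
 */
#include <stdint.h>
#include <stdio.h>

/* Round size up to the next multiple of the device block (1 << ashift). */
static uint64_t
example_asize(uint64_t size, unsigned ashift)
{
	uint64_t block = (uint64_t)1 << ashift;	/* always a power of two */

	return ((size + block - 1) & ~(block - 1));
}

/*
 * Keep a compressed copy only if its allocated (rounded-up) size is still
 * smaller than the uncompressed length, as l2arc_compress_buf() does.
 */
static int
example_keep_compressed(uint64_t csize, uint64_t len, unsigned ashift)
{
	return (example_asize(csize, ashift) < len);
}

int
main(void)
{
	unsigned ashift = 12;		/* 4 KB device blocks (assumed) */
	uint64_t len = 16384;		/* uncompressed buffer size */
	uint64_t csize = 9000;		/* hypothetical compressed size */

	printf("csize %ju rounds up to %ju; keep compressed copy: %s\n",
	    (uintmax_t)csize, (uintmax_t)example_asize(csize, ashift),
	    example_keep_compressed(csize, len, ashift) ? "yes" : "no");
	return (0);
}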
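
/*
 * A second illustrative, userland-only sketch; not part of arc.c.  It
 * restates the pacing arithmetic of l2arc_write_interval() above: when the
 * previous pass wrote more than half of what was wanted, the next feed is
 * scheduled after the short minimum interval, otherwise after the regular
 * interval, and the result is clamped so it neither lands in the past nor
 * drifts beyond "began + interval".  The l2arc_feed_again gate is omitted,
 * and the tick rate and tunable values are assumptions made up for the
 * example.
 */
#include <stdint.h>
#include <stdio.h>

#define	EXAMPLE_HZ		1000	/* ticks per second (assumed) */
#define	EXAMPLE_FEED_SECS	1	/* regular feed interval, seconds */
#define	EXAMPLE_FEED_MIN_MS	200	/* minimum feed interval, milliseconds */

static int64_t
example_max(int64_t a, int64_t b)
{
	return (a > b ? a : b);
}

static int64_t
example_min(int64_t a, int64_t b)
{
	return (a < b ? a : b);
}

/* Compute the tick at which the next feed pass should run. */
static int64_t
example_write_interval(int64_t began, uint64_t wanted, uint64_t wrote,
    int64_t now)
{
	int64_t interval;

	if (wrote > (wanted / 2))
		interval = (EXAMPLE_HZ * EXAMPLE_FEED_MIN_MS) / 1000;
	else
		interval = EXAMPLE_HZ * EXAMPLE_FEED_SECS;

	return (example_max(now, example_min(now + interval,
	    began + interval)));
}

int
main(void)
{
	int64_t began = 10000;	/* tick at which the pass started */
	int64_t now = 10050;	/* current tick */

	/* Busy lists: wrote most of what was wanted, feed again soon. */
	printf("busy: next feed at tick %jd\n",
	    (intmax_t)example_write_interval(began, 8 << 20, 6 << 20, now));

	/* Stale lists: wrote little, idle back to the regular interval. */
	printf("stale: next feed at tick %jd\n",
	    (intmax_t)example_write_interval(began, 8 << 20, 1 << 20, now));
	return (0);
}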