arc.c revision 288548
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, Joyent, Inc. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. 26 * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 27 */ 28 29/* 30 * DVA-based Adjustable Replacement Cache 31 * 32 * While much of the theory of operation used here is 33 * based on the self-tuning, low overhead replacement cache 34 * presented by Megiddo and Modha at FAST 2003, there are some 35 * significant differences: 36 * 37 * 1. The Megiddo and Modha model assumes any page is evictable. 38 * Pages in its cache cannot be "locked" into memory. This makes 39 * the eviction algorithm simple: evict the last page in the list. 40 * This also makes the performance characteristics easy to reason 41 * about. Our cache is not so simple. At any given moment, some 42 * subset of the blocks in the cache are un-evictable because we 43 * have handed out a reference to them. Blocks are only evictable 44 * when there are no external references active. This makes 45 * eviction far more problematic: we choose to evict the evictable 46 * blocks that are the "lowest" in the list. 47 * 48 * There are times when it is not possible to evict the requested 49 * space. In these circumstances we are unable to adjust the cache 50 * size. To prevent the cache growing unbounded at these times we 51 * implement a "cache throttle" that slows the flow of new data 52 * into the cache until we can make space available. 53 * 54 * 2. The Megiddo and Modha model assumes a fixed cache size. 55 * Pages are evicted when the cache is full and there is a cache 56 * miss. Our model has a variable sized cache. It grows with 57 * high use, but also tries to react to memory pressure from the 58 * operating system: decreasing its size when system memory is 59 * tight. 60 * 61 * 3. The Megiddo and Modha model assumes a fixed page size. All 62 * elements of the cache are therefore exactly the same size. So 63 * when adjusting the cache size following a cache miss, it's simply 64 * a matter of choosing a single page to evict. In our model, we 65 * have variable sized cache blocks (ranging from 512 bytes to 66 * 128K bytes). We therefore choose a set of blocks to evict to make 67 * space for a cache miss that approximates as closely as possible 68 * the space used by the new block. 69 * 70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 71 * by N. Megiddo & D. 
Modha, FAST 2003 72 */ 73 74/* 75 * The locking model: 76 * 77 * A new reference to a cache buffer can be obtained in two 78 * ways: 1) via a hash table lookup using the DVA as a key, 79 * or 2) via one of the ARC lists. The arc_read() interface 80 * uses method 1, while the internal arc algorithms for 81 * adjusting the cache use method 2. We therefore provide two 82 * types of locks: 1) the hash table lock array, and 2) the 83 * arc list locks. 84 * 85 * Buffers do not have their own mutexes; rather, they rely on the 86 * hash table mutexes for the bulk of their protection (i.e. most 87 * fields in the arc_buf_hdr_t are protected by these mutexes). 88 * 89 * buf_hash_find() returns the appropriate mutex (held) when it 90 * locates the requested buffer in the hash table. It returns 91 * NULL for the mutex if the buffer was not in the table. 92 * 93 * buf_hash_remove() expects the appropriate hash mutex to be 94 * already held before it is invoked. 95 * 96 * Each arc state also has a mutex which is used to protect the 97 * buffer list associated with the state. When attempting to 98 * obtain a hash table lock while holding an arc list lock you 99 * must use mutex_tryenter() to avoid deadlock. Also note that 100 * the active state mutex must be held before the ghost state mutex. 101 * 102 * Arc buffers may have an associated eviction callback function. 103 * This function will be invoked prior to removing the buffer (e.g. 104 * in arc_do_user_evicts()). Note however that the data associated 105 * with the buffer may be evicted prior to the callback. The callback 106 * must be made with *no locks held* (to prevent deadlock). Additionally, 107 * the users of callbacks must ensure that their private data is 108 * protected from simultaneous callbacks from arc_clear_callback() 109 * and arc_do_user_evicts(). 110 * 111 * Note that the majority of the performance stats are manipulated 112 * with atomic operations. 113 * 114 * The L2ARC uses the l2ad_mtx on each vdev for the following: 115 * 116 * - L2ARC buflist creation 117 * - L2ARC buflist eviction 118 * - L2ARC write completion, which walks L2ARC buflists 119 * - ARC header destruction, as it removes from L2ARC buflists 120 * - ARC header release, as it removes from L2ARC buflists 121 */ 122 123#include <sys/spa.h> 124#include <sys/zio.h> 125#include <sys/zio_compress.h> 126#include <sys/zfs_context.h> 127#include <sys/arc.h> 128#include <sys/refcount.h> 129#include <sys/vdev.h> 130#include <sys/vdev_impl.h> 131#include <sys/dsl_pool.h> 132#ifdef _KERNEL 133#include <sys/dnlc.h> 134#endif 135#include <sys/callb.h> 136#include <sys/kstat.h> 137#include <sys/trim_map.h> 138#include <zfs_fletcher.h> 139#include <sys/sdt.h> 140 141#include <vm/vm_pageout.h> 142#include <machine/vmparam.h> 143 144#ifdef illumos 145#ifndef _KERNEL 146/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 147boolean_t arc_watch = B_FALSE; 148int arc_procfd; 149#endif 150#endif /* illumos */ 151 152static kmutex_t arc_reclaim_thr_lock; 153static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 154static uint8_t arc_thread_exit; 155 156#define ARC_REDUCE_DNLC_PERCENT 3 157uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 158 159typedef enum arc_reclaim_strategy { 160 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 161 ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 162} arc_reclaim_strategy_t; 164/* 165 * The number of iterations through arc_evict_*() before we 166 * drop & reacquire the lock. 
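 *
 * As a rough illustration of how the locking rules above fit together
 * (mutex_tryenter() on a hash lock while a list lock is held, and a
 * bounded number of iterations before the list lock is dropped), an
 * eviction walk can be sketched as follows.  This is illustrative
 * pseudocode only, not a copy of the eviction code; next_hdr() and
 * try_evict_one() are invented names for the example:
 *
 *	mutex_enter(list_lock);
 *	for (count = 0; hdr != NULL; hdr = next_hdr()) {
 *		hash_lock = HDR_LOCK(hdr);
 *		if (!mutex_tryenter(hash_lock)) {
 *			ARCSTAT_BUMP(arcstat_mutex_miss);
 *			continue;		(never block here)
 *		}
 *		try_evict_one(hdr);
 *		mutex_exit(hash_lock);
 *		if (++count >= arc_evict_iterations) {
 *			mutex_exit(list_lock);	(let waiters in)
 *			mutex_enter(list_lock);
 *			count = 0;
 *		}
 *	}
 *	mutex_exit(list_lock);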
167 */ 168int arc_evict_iterations = 100; 169 170/* number of seconds before growing cache again */ 171static int arc_grow_retry = 60; 172 173/* shift of arc_c for calculating both min and max arc_p */ 174static int arc_p_min_shift = 4; 175 176/* log2(fraction of arc to reclaim) */ 177static int arc_shrink_shift = 5; 178 179/* 180 * minimum lifespan of a prefetch block in clock ticks 181 * (initialized in arc_init()) 182 */ 183static int arc_min_prefetch_lifespan; 184 185/* 186 * If this percent of memory is free, don't throttle. 187 */ 188int arc_lotsfree_percent = 10; 189 190static int arc_dead; 191extern int zfs_prefetch_disable; 192 193/* 194 * The arc has filled available memory and has now warmed up. 195 */ 196static boolean_t arc_warm; 197 198uint64_t zfs_arc_max; 199uint64_t zfs_arc_min; 200uint64_t zfs_arc_meta_limit = 0; 201uint64_t zfs_arc_meta_min = 0; 202int zfs_arc_grow_retry = 0; 203int zfs_arc_shrink_shift = 0; 204int zfs_arc_p_min_shift = 0; 205int zfs_disable_dup_eviction = 0; 206uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ 207u_int zfs_arc_free_target = 0; 208 209static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); 210static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); 211 212#ifdef _KERNEL 213static void 214arc_free_target_init(void *unused __unused) 215{ 216 217 zfs_arc_free_target = vm_pageout_wakeup_thresh; 218} 219SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, 220 arc_free_target_init, NULL); 221 222TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 223TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); 224TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 225TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); 226TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize); 227TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); 228SYSCTL_DECL(_vfs_zfs); 229SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 230 "Maximum ARC size"); 231SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 232 "Minimum ARC size"); 233SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, 234 &zfs_arc_average_blocksize, 0, 235 "ARC average blocksize"); 236SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, 237 &arc_shrink_shift, 0, 238 "log2(fraction of arc to reclaim)"); 239 240/* 241 * We don't have a tunable for arc_free_target due to the dependency on 242 * pagedaemon initialisation. 243 */ 244SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, 245 CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), 246 sysctl_vfs_zfs_arc_free_target, "IU", 247 "Desired number of free pages below which ARC triggers reclaim"); 248 249static int 250sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) 251{ 252 u_int val; 253 int err; 254 255 val = zfs_arc_free_target; 256 err = sysctl_handle_int(oidp, &val, 0, req); 257 if (err != 0 || req->newptr == NULL) 258 return (err); 259 260 if (val < minfree) 261 return (EINVAL); 262 if (val > cnt.v_page_count) 263 return (EINVAL); 264 265 zfs_arc_free_target = val; 266 267 return (0); 268} 269 270/* 271 * Must be declared here, before the definition of corresponding kstat 272 * macro which uses the same names will confuse the compiler. 
273 */ 274SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, 275 CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 276 sysctl_vfs_zfs_arc_meta_limit, "QU", 277 "ARC metadata limit"); 278#endif 279 280/* 281 * Note that buffers can be in one of 6 states: 282 * ARC_anon - anonymous (discussed below) 283 * ARC_mru - recently used, currently cached 284 * ARC_mru_ghost - recently used, no longer in cache 285 * ARC_mfu - frequently used, currently cached 286 * ARC_mfu_ghost - frequently used, no longer in cache 287 * ARC_l2c_only - exists in L2ARC but not other states 288 * When there are no active references to a buffer, it is 289 * linked onto a list in one of these arc states. These are 290 * the only buffers that can be evicted or deleted. Within each 291 * state there are multiple lists, one for meta-data and one for 292 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 293 * etc.) is tracked separately so that it can be managed more 294 * explicitly: favored over data, limited explicitly. 295 * 296 * Anonymous buffers are buffers that are not associated with 297 * a DVA. These are buffers that hold dirty block copies 298 * before they are written to stable storage. By definition, 299 * they are "ref'd" and are considered part of arc_mru 300 * that cannot be freed. Generally, they will acquire a DVA 301 * as they are written and migrate onto the arc_mru list. 302 * 303 * The ARC_l2c_only state is for buffers that are in the second 304 * level ARC but no longer in any of the ARC_m* lists. The second 305 * level ARC itself may also contain buffers that are in any of 306 * the ARC_m* states - meaning that a buffer can exist in two 307 * places. The reason for the ARC_l2c_only state is to keep the 308 * buffer header in the hash table, so that reads that hit the 309 * second level ARC benefit from these fast lookups. 
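 *
 * As an informal sketch (not an exhaustive list of transitions), a
 * buffer typically moves between these states as follows:
 *
 *	anon --(write assigns a DVA)--------------> mru
 *	mru  --(second access)--------------------> mfu
 *	mru  --(evicted, data freed)--------------> mru_ghost  (header only)
 *	mfu  --(evicted, data freed)--------------> mfu_ghost  (header only)
 *	mru_ghost/mfu_ghost --(hit, data re-read)-> mfu
 *	any  --(only remaining copy is on L2ARC)--> l2c_only
 *
 * Hits in the ghost states are what let the ARC decide whether it has
 * been favoring recency or frequency too heavily, and are used to
 * adjust the MRU target size arc_p.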
310 */ 311 312#define ARCS_LOCK_PAD CACHE_LINE_SIZE 313struct arcs_lock { 314 kmutex_t arcs_lock; 315#ifdef _KERNEL 316 unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; 317#endif 318}; 319 320/* 321 * must be power of two for mask use to work 322 * 323 */ 324#define ARC_BUFC_NUMDATALISTS 16 325#define ARC_BUFC_NUMMETADATALISTS 16 326#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) 327 328typedef struct arc_state { 329 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 330 uint64_t arcs_size; /* total amount of data in this state */ 331 list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ 332 struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); 333} arc_state_t; 334 335#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) 336 337/* The 6 states: */ 338static arc_state_t ARC_anon; 339static arc_state_t ARC_mru; 340static arc_state_t ARC_mru_ghost; 341static arc_state_t ARC_mfu; 342static arc_state_t ARC_mfu_ghost; 343static arc_state_t ARC_l2c_only; 344 345typedef struct arc_stats { 346 kstat_named_t arcstat_hits; 347 kstat_named_t arcstat_misses; 348 kstat_named_t arcstat_demand_data_hits; 349 kstat_named_t arcstat_demand_data_misses; 350 kstat_named_t arcstat_demand_metadata_hits; 351 kstat_named_t arcstat_demand_metadata_misses; 352 kstat_named_t arcstat_prefetch_data_hits; 353 kstat_named_t arcstat_prefetch_data_misses; 354 kstat_named_t arcstat_prefetch_metadata_hits; 355 kstat_named_t arcstat_prefetch_metadata_misses; 356 kstat_named_t arcstat_mru_hits; 357 kstat_named_t arcstat_mru_ghost_hits; 358 kstat_named_t arcstat_mfu_hits; 359 kstat_named_t arcstat_mfu_ghost_hits; 360 kstat_named_t arcstat_allocated; 361 kstat_named_t arcstat_deleted; 362 kstat_named_t arcstat_stolen; 363 kstat_named_t arcstat_recycle_miss; 364 /* 365 * Number of buffers that could not be evicted because the hash lock 366 * was held by another thread. The lock may not necessarily be held 367 * by something using the same buffer, since hash locks are shared 368 * by multiple buffers. 369 */ 370 kstat_named_t arcstat_mutex_miss; 371 /* 372 * Number of buffers skipped because they have I/O in progress, are 373 * indrect prefetch buffers that have not lived long enough, or are 374 * not from the spa we're trying to evict from. 375 */ 376 kstat_named_t arcstat_evict_skip; 377 kstat_named_t arcstat_evict_l2_cached; 378 kstat_named_t arcstat_evict_l2_eligible; 379 kstat_named_t arcstat_evict_l2_ineligible; 380 kstat_named_t arcstat_hash_elements; 381 kstat_named_t arcstat_hash_elements_max; 382 kstat_named_t arcstat_hash_collisions; 383 kstat_named_t arcstat_hash_chains; 384 kstat_named_t arcstat_hash_chain_max; 385 kstat_named_t arcstat_p; 386 kstat_named_t arcstat_c; 387 kstat_named_t arcstat_c_min; 388 kstat_named_t arcstat_c_max; 389 kstat_named_t arcstat_size; 390 /* 391 * Number of bytes consumed by internal ARC structures necessary 392 * for tracking purposes; these structures are not actually 393 * backed by ARC buffers. This includes arc_buf_hdr_t structures 394 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 395 * caches), and arc_buf_t structures (allocated via arc_buf_t 396 * cache). 397 */ 398 kstat_named_t arcstat_hdr_size; 399 /* 400 * Number of bytes consumed by ARC buffers of type equal to 401 * ARC_BUFC_DATA. This is generally consumed by buffers backing 402 * on disk user data (e.g. plain file contents). 
403 */ 404 kstat_named_t arcstat_data_size; 405 /* 406 * Number of bytes consumed by ARC buffers of type equal to 407 * ARC_BUFC_METADATA. This is generally consumed by buffers 408 * backing on disk data that is used for internal ZFS 409 * structures (e.g. ZAP, dnode, indirect blocks, etc). 410 */ 411 kstat_named_t arcstat_metadata_size; 412 /* 413 * Number of bytes consumed by various buffers and structures 414 * not actually backed with ARC buffers. This includes bonus 415 * buffers (allocated directly via zio_buf_* functions), 416 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 417 * cache), and dnode_t structures (allocated via dnode_t cache). 418 */ 419 kstat_named_t arcstat_other_size; 420 /* 421 * Total number of bytes consumed by ARC buffers residing in the 422 * arc_anon state. This includes *all* buffers in the arc_anon 423 * state; e.g. data, metadata, evictable, and unevictable buffers 424 * are all included in this value. 425 */ 426 kstat_named_t arcstat_anon_size; 427 /* 428 * Number of bytes consumed by ARC buffers that meet the 429 * following criteria: backing buffers of type ARC_BUFC_DATA, 430 * residing in the arc_anon state, and are eligible for eviction 431 * (e.g. have no outstanding holds on the buffer). 432 */ 433 kstat_named_t arcstat_anon_evictable_data; 434 /* 435 * Number of bytes consumed by ARC buffers that meet the 436 * following criteria: backing buffers of type ARC_BUFC_METADATA, 437 * residing in the arc_anon state, and are eligible for eviction 438 * (e.g. have no outstanding holds on the buffer). 439 */ 440 kstat_named_t arcstat_anon_evictable_metadata; 441 /* 442 * Total number of bytes consumed by ARC buffers residing in the 443 * arc_mru state. This includes *all* buffers in the arc_mru 444 * state; e.g. data, metadata, evictable, and unevictable buffers 445 * are all included in this value. 446 */ 447 kstat_named_t arcstat_mru_size; 448 /* 449 * Number of bytes consumed by ARC buffers that meet the 450 * following criteria: backing buffers of type ARC_BUFC_DATA, 451 * residing in the arc_mru state, and are eligible for eviction 452 * (e.g. have no outstanding holds on the buffer). 453 */ 454 kstat_named_t arcstat_mru_evictable_data; 455 /* 456 * Number of bytes consumed by ARC buffers that meet the 457 * following criteria: backing buffers of type ARC_BUFC_METADATA, 458 * residing in the arc_mru state, and are eligible for eviction 459 * (e.g. have no outstanding holds on the buffer). 460 */ 461 kstat_named_t arcstat_mru_evictable_metadata; 462 /* 463 * Total number of bytes that *would have been* consumed by ARC 464 * buffers in the arc_mru_ghost state. The key thing to note 465 * here, is the fact that this size doesn't actually indicate 466 * RAM consumption. The ghost lists only consist of headers and 467 * don't actually have ARC buffers linked off of these headers. 468 * Thus, *if* the headers had associated ARC buffers, these 469 * buffers *would have* consumed this number of bytes. 470 */ 471 kstat_named_t arcstat_mru_ghost_size; 472 /* 473 * Number of bytes that *would have been* consumed by ARC 474 * buffers that are eligible for eviction, of type 475 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 476 */ 477 kstat_named_t arcstat_mru_ghost_evictable_data; 478 /* 479 * Number of bytes that *would have been* consumed by ARC 480 * buffers that are eligible for eviction, of type 481 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 
482 */ 483 kstat_named_t arcstat_mru_ghost_evictable_metadata; 484 /* 485 * Total number of bytes consumed by ARC buffers residing in the 486 * arc_mfu state. This includes *all* buffers in the arc_mfu 487 * state; e.g. data, metadata, evictable, and unevictable buffers 488 * are all included in this value. 489 */ 490 kstat_named_t arcstat_mfu_size; 491 /* 492 * Number of bytes consumed by ARC buffers that are eligible for 493 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 494 * state. 495 */ 496 kstat_named_t arcstat_mfu_evictable_data; 497 /* 498 * Number of bytes consumed by ARC buffers that are eligible for 499 * eviction, of type ARC_BUFC_METADATA, and reside in the 500 * arc_mfu state. 501 */ 502 kstat_named_t arcstat_mfu_evictable_metadata; 503 /* 504 * Total number of bytes that *would have been* consumed by ARC 505 * buffers in the arc_mfu_ghost state. See the comment above 506 * arcstat_mru_ghost_size for more details. 507 */ 508 kstat_named_t arcstat_mfu_ghost_size; 509 /* 510 * Number of bytes that *would have been* consumed by ARC 511 * buffers that are eligible for eviction, of type 512 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 513 */ 514 kstat_named_t arcstat_mfu_ghost_evictable_data; 515 /* 516 * Number of bytes that *would have been* consumed by ARC 517 * buffers that are eligible for eviction, of type 518 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state. 519 */ 520 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 521 kstat_named_t arcstat_l2_hits; 522 kstat_named_t arcstat_l2_misses; 523 kstat_named_t arcstat_l2_feeds; 524 kstat_named_t arcstat_l2_rw_clash; 525 kstat_named_t arcstat_l2_read_bytes; 526 kstat_named_t arcstat_l2_write_bytes; 527 kstat_named_t arcstat_l2_writes_sent; 528 kstat_named_t arcstat_l2_writes_done; 529 kstat_named_t arcstat_l2_writes_error; 530 kstat_named_t arcstat_l2_writes_hdr_miss; 531 kstat_named_t arcstat_l2_evict_lock_retry; 532 kstat_named_t arcstat_l2_evict_reading; 533 kstat_named_t arcstat_l2_evict_l1cached; 534 kstat_named_t arcstat_l2_free_on_write; 535 kstat_named_t arcstat_l2_cdata_free_on_write; 536 kstat_named_t arcstat_l2_abort_lowmem; 537 kstat_named_t arcstat_l2_cksum_bad; 538 kstat_named_t arcstat_l2_io_error; 539 kstat_named_t arcstat_l2_size; 540 kstat_named_t arcstat_l2_asize; 541 kstat_named_t arcstat_l2_hdr_size; 542 kstat_named_t arcstat_l2_compress_successes; 543 kstat_named_t arcstat_l2_compress_zeros; 544 kstat_named_t arcstat_l2_compress_failures; 545 kstat_named_t arcstat_l2_write_trylock_fail; 546 kstat_named_t arcstat_l2_write_passed_headroom; 547 kstat_named_t arcstat_l2_write_spa_mismatch; 548 kstat_named_t arcstat_l2_write_in_l2; 549 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 550 kstat_named_t arcstat_l2_write_not_cacheable; 551 kstat_named_t arcstat_l2_write_full; 552 kstat_named_t arcstat_l2_write_buffer_iter; 553 kstat_named_t arcstat_l2_write_pios; 554 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 555 kstat_named_t arcstat_l2_write_buffer_list_iter; 556 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 557 kstat_named_t arcstat_memory_throttle_count; 558 kstat_named_t arcstat_duplicate_buffers; 559 kstat_named_t arcstat_duplicate_buffers_size; 560 kstat_named_t arcstat_duplicate_reads; 561 kstat_named_t arcstat_meta_used; 562 kstat_named_t arcstat_meta_limit; 563 kstat_named_t arcstat_meta_max; 564 kstat_named_t arcstat_meta_min; 565} arc_stats_t; 566 567static arc_stats_t arc_stats = { 568 { "hits", KSTAT_DATA_UINT64 }, 569 { "misses", 
KSTAT_DATA_UINT64 }, 570 { "demand_data_hits", KSTAT_DATA_UINT64 }, 571 { "demand_data_misses", KSTAT_DATA_UINT64 }, 572 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 573 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 574 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 575 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 576 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 577 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 578 { "mru_hits", KSTAT_DATA_UINT64 }, 579 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 580 { "mfu_hits", KSTAT_DATA_UINT64 }, 581 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 582 { "allocated", KSTAT_DATA_UINT64 }, 583 { "deleted", KSTAT_DATA_UINT64 }, 584 { "stolen", KSTAT_DATA_UINT64 }, 585 { "recycle_miss", KSTAT_DATA_UINT64 }, 586 { "mutex_miss", KSTAT_DATA_UINT64 }, 587 { "evict_skip", KSTAT_DATA_UINT64 }, 588 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 589 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 590 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 591 { "hash_elements", KSTAT_DATA_UINT64 }, 592 { "hash_elements_max", KSTAT_DATA_UINT64 }, 593 { "hash_collisions", KSTAT_DATA_UINT64 }, 594 { "hash_chains", KSTAT_DATA_UINT64 }, 595 { "hash_chain_max", KSTAT_DATA_UINT64 }, 596 { "p", KSTAT_DATA_UINT64 }, 597 { "c", KSTAT_DATA_UINT64 }, 598 { "c_min", KSTAT_DATA_UINT64 }, 599 { "c_max", KSTAT_DATA_UINT64 }, 600 { "size", KSTAT_DATA_UINT64 }, 601 { "hdr_size", KSTAT_DATA_UINT64 }, 602 { "data_size", KSTAT_DATA_UINT64 }, 603 { "metadata_size", KSTAT_DATA_UINT64 }, 604 { "other_size", KSTAT_DATA_UINT64 }, 605 { "anon_size", KSTAT_DATA_UINT64 }, 606 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 607 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 608 { "mru_size", KSTAT_DATA_UINT64 }, 609 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 610 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 611 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 612 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 613 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 614 { "mfu_size", KSTAT_DATA_UINT64 }, 615 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 616 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 617 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 618 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 619 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 620 { "l2_hits", KSTAT_DATA_UINT64 }, 621 { "l2_misses", KSTAT_DATA_UINT64 }, 622 { "l2_feeds", KSTAT_DATA_UINT64 }, 623 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 624 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 625 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 626 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 627 { "l2_writes_done", KSTAT_DATA_UINT64 }, 628 { "l2_writes_error", KSTAT_DATA_UINT64 }, 629 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 630 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 631 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 632 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 633 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 634 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 635 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 636 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 637 { "l2_io_error", KSTAT_DATA_UINT64 }, 638 { "l2_size", KSTAT_DATA_UINT64 }, 639 { "l2_asize", KSTAT_DATA_UINT64 }, 640 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 641 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 642 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 643 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 644 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 645 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 646 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 647 { "l2_write_in_l2", 
KSTAT_DATA_UINT64 }, 648 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 649 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 650 { "l2_write_full", KSTAT_DATA_UINT64 }, 651 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 652 { "l2_write_pios", KSTAT_DATA_UINT64 }, 653 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 654 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 655 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 656 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 657 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 658 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 659 { "duplicate_reads", KSTAT_DATA_UINT64 }, 660 { "arc_meta_used", KSTAT_DATA_UINT64 }, 661 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 662 { "arc_meta_max", KSTAT_DATA_UINT64 }, 663 { "arc_meta_min", KSTAT_DATA_UINT64 } 664}; 665 666#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 667 668#define ARCSTAT_INCR(stat, val) \ 669 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 670 671#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 672#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 673 674#define ARCSTAT_MAX(stat, val) { \ 675 uint64_t m; \ 676 while ((val) > (m = arc_stats.stat.value.ui64) && \ 677 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 678 continue; \ 679} 680 681#define ARCSTAT_MAXSTAT(stat) \ 682 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 683 684/* 685 * We define a macro to allow ARC hits/misses to be easily broken down by 686 * two separate conditions, giving a total of four different subtypes for 687 * each of hits and misses (so eight statistics total). 688 */ 689#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 690 if (cond1) { \ 691 if (cond2) { \ 692 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 693 } else { \ 694 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 695 } \ 696 } else { \ 697 if (cond2) { \ 698 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 699 } else { \ 700 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 701 } \ 702 } 703 704kstat_t *arc_ksp; 705static arc_state_t *arc_anon; 706static arc_state_t *arc_mru; 707static arc_state_t *arc_mru_ghost; 708static arc_state_t *arc_mfu; 709static arc_state_t *arc_mfu_ghost; 710static arc_state_t *arc_l2c_only; 711 712/* 713 * There are several ARC variables that are critical to export as kstats -- 714 * but we don't want to have to grovel around in the kstat whenever we wish to 715 * manipulate them. For these variables, we therefore define them to be in 716 * terms of the statistic variable. This assures that we are not introducing 717 * the possibility of inconsistency by having shadow copies of the variables, 718 * while still allowing the code to be readable. 
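 *
 * For example (a hedged illustration, not a verbatim quote of any call
 * site in this file), arc_size below is simply an alias for
 * ARCSTAT(arcstat_size), and the ARCSTAT_CONDSTAT() macro above turns
 * one statement such as
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * into a bump of exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits, depending on whether the access was
 * demand or prefetch and whether the buffer holds data or metadata.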
719 */ 720#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 721#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 722#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 723#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 724#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 725#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 726#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 727#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 728#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 729 730#define L2ARC_IS_VALID_COMPRESS(_c_) \ 731 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 732 733static int arc_no_grow; /* Don't try to grow cache size */ 734static uint64_t arc_tempreserve; 735static uint64_t arc_loaned_bytes; 736 737typedef struct arc_callback arc_callback_t; 738 739struct arc_callback { 740 void *acb_private; 741 arc_done_func_t *acb_done; 742 arc_buf_t *acb_buf; 743 zio_t *acb_zio_dummy; 744 arc_callback_t *acb_next; 745}; 746 747typedef struct arc_write_callback arc_write_callback_t; 748 749struct arc_write_callback { 750 void *awcb_private; 751 arc_done_func_t *awcb_ready; 752 arc_done_func_t *awcb_physdone; 753 arc_done_func_t *awcb_done; 754 arc_buf_t *awcb_buf; 755}; 756 757/* 758 * ARC buffers are separated into multiple structs as a memory saving measure: 759 * - Common fields struct, always defined, and embedded within it: 760 * - L2-only fields, always allocated but undefined when not in L2ARC 761 * - L1-only fields, only allocated when in L1ARC 762 * 763 * Buffer in L1 Buffer only in L2 764 * +------------------------+ +------------------------+ 765 * | arc_buf_hdr_t | | arc_buf_hdr_t | 766 * | | | | 767 * | | | | 768 * | | | | 769 * +------------------------+ +------------------------+ 770 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 771 * | (undefined if L1-only) | | | 772 * +------------------------+ +------------------------+ 773 * | l1arc_buf_hdr_t | 774 * | | 775 * | | 776 * | | 777 * | | 778 * +------------------------+ 779 * 780 * Because it's possible for the L2ARC to become extremely large, we can wind 781 * up eating a lot of memory in L2ARC buffer headers, so the size of a header 782 * is minimized by only allocating the fields necessary for an L1-cached buffer 783 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and 784 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple 785 * words in pointers. arc_hdr_realloc() is used to switch a header between 786 * these two allocation states. 787 */ 788typedef struct l1arc_buf_hdr { 789 kmutex_t b_freeze_lock; 790#ifdef ZFS_DEBUG 791 /* 792 * used for debugging wtih kmem_flags - by allocating and freeing 793 * b_thawed when the buffer is thawed, we get a record of the stack 794 * trace that thawed it. 
795 */ 796 void *b_thawed; 797#endif 798 799 arc_buf_t *b_buf; 800 uint32_t b_datacnt; 801 /* for waiting on writes to complete */ 802 kcondvar_t b_cv; 803 804 /* protected by arc state mutex */ 805 arc_state_t *b_state; 806 list_node_t b_arc_node; 807 808 /* updated atomically */ 809 clock_t b_arc_access; 810 811 /* self protecting */ 812 refcount_t b_refcnt; 813 814 arc_callback_t *b_acb; 815 /* temporary buffer holder for in-flight compressed data */ 816 void *b_tmp_cdata; 817} l1arc_buf_hdr_t; 818 819typedef struct l2arc_dev l2arc_dev_t; 820 821typedef struct l2arc_buf_hdr { 822 /* protected by arc_buf_hdr mutex */ 823 l2arc_dev_t *b_dev; /* L2ARC device */ 824 uint64_t b_daddr; /* disk address, offset byte */ 825 /* real alloc'd buffer size depending on b_compress applied */ 826 int32_t b_asize; 827 828 list_node_t b_l2node; 829} l2arc_buf_hdr_t; 830 831struct arc_buf_hdr { 832 /* protected by hash lock */ 833 dva_t b_dva; 834 uint64_t b_birth; 835 /* 836 * Even though this checksum is only set/verified when a buffer is in 837 * the L1 cache, it needs to be in the set of common fields because it 838 * must be preserved from the time before a buffer is written out to 839 * L2ARC until after it is read back in. 840 */ 841 zio_cksum_t *b_freeze_cksum; 842 843 arc_buf_hdr_t *b_hash_next; 844 arc_flags_t b_flags; 845 846 /* immutable */ 847 int32_t b_size; 848 uint64_t b_spa; 849 850 /* L2ARC fields. Undefined when not in L2ARC. */ 851 l2arc_buf_hdr_t b_l2hdr; 852 /* L1ARC fields. Undefined when in l2arc_only state */ 853 l1arc_buf_hdr_t b_l1hdr; 854}; 855 856#ifdef _KERNEL 857static int 858sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 859{ 860 uint64_t val; 861 int err; 862 863 val = arc_meta_limit; 864 err = sysctl_handle_64(oidp, &val, 0, req); 865 if (err != 0 || req->newptr == NULL) 866 return (err); 867 868 if (val <= 0 || val > arc_c_max) 869 return (EINVAL); 870 871 arc_meta_limit = val; 872 return (0); 873} 874#endif 875 876static arc_buf_t *arc_eviction_list; 877static kmutex_t arc_eviction_mtx; 878static arc_buf_hdr_t arc_eviction_hdr; 879 880#define GHOST_STATE(state) \ 881 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 882 (state) == arc_l2c_only) 883 884#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 885#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 886#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 887#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 888#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 889#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 890 891#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 892#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 893#define HDR_L2_READING(hdr) \ 894 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 895 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 896#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 897#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 898#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 899 900#define HDR_ISTYPE_METADATA(hdr) \ 901 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 902#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 903 904#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 905#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 906 907/* For storing compression mode in b_flags */ 908#define HDR_COMPRESS_OFFSET 24 909#define HDR_COMPRESS_NBITS 7 910 
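/*
 * Illustrative note: HDR_GET_COMPRESS()/HDR_SET_COMPRESS() below stash
 * the zio_compress value in a 7-bit field at bit 24 of b_flags.  Under
 * the usual BF32_GET/BF32_SET semantics (a sketch only -- see sys/spa.h
 * for the authoritative definitions), reading the field amounts to
 *
 *	(hdr->b_flags >> HDR_COMPRESS_OFFSET) &
 *	    ((1U << HDR_COMPRESS_NBITS) - 1)
 *
 * so the compression mode rides along in the same word as the other
 * header flags instead of needing a member of its own.
 */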
911#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ 912 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) 913#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ 914 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) 915 916/* 917 * Other sizes 918 */ 919 920#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 921#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 922 923/* 924 * Hash table routines 925 */ 926 927#define HT_LOCK_PAD CACHE_LINE_SIZE 928 929struct ht_lock { 930 kmutex_t ht_lock; 931#ifdef _KERNEL 932 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 933#endif 934}; 935 936#define BUF_LOCKS 256 937typedef struct buf_hash_table { 938 uint64_t ht_mask; 939 arc_buf_hdr_t **ht_table; 940 struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 941} buf_hash_table_t; 942 943static buf_hash_table_t buf_hash_table; 944 945#define BUF_HASH_INDEX(spa, dva, birth) \ 946 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 947#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 948#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 949#define HDR_LOCK(hdr) \ 950 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 951 952uint64_t zfs_crc64_table[256]; 953 954/* 955 * Level 2 ARC 956 */ 957 958#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 959#define L2ARC_HEADROOM 2 /* num of writes */ 960/* 961 * If we discover during ARC scan any buffers to be compressed, we boost 962 * our headroom for the next scanning cycle by this percentage multiple. 963 */ 964#define L2ARC_HEADROOM_BOOST 200 965#define L2ARC_FEED_SECS 1 /* caching interval secs */ 966#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 967 968#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 969#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 970 971/* L2ARC Performance Tunables */ 972uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 973uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 974uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 975uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 976uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 977uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 978boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 979boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 980boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 981 982SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 983 &l2arc_write_max, 0, "max write size"); 984SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 985 &l2arc_write_boost, 0, "extra write during warmup"); 986SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 987 &l2arc_headroom, 0, "number of dev writes"); 988SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 989 &l2arc_feed_secs, 0, "interval seconds"); 990SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 991 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 992 993SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 994 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 995SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 996 &l2arc_feed_again, 0, "turbo warmup"); 997SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 998 &l2arc_norw, 0, "no reads during writes"); 999 1000SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 1001 
&ARC_anon.arcs_size, 0, "size of anonymous state"); 1002SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 1003 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in anonymous state"); 1004SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 1005 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state"); 1006 1007SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1008 &ARC_mru.arcs_size, 0, "size of mru state"); 1009SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 1010 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 1011SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 1012 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 1013 1014SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1015 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 1016SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 1017 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1018 "size of metadata in mru ghost state"); 1019SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 1020 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1021 "size of data in mru ghost state"); 1022 1023SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1024 &ARC_mfu.arcs_size, 0, "size of mfu state"); 1025SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 1026 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 1027SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 1028 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 1029 1030SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1031 &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 1032SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 1033 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1034 "size of metadata in mfu ghost state"); 1035SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 1036 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1037 "size of data in mfu ghost state"); 1038 1039SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1040 &ARC_l2c_only.arcs_size, 0, "size of l2c_only state"); 1041 1042/* 1043 * L2ARC Internals 1044 */ 1045struct l2arc_dev { 1046 vdev_t *l2ad_vdev; /* vdev */ 1047 spa_t *l2ad_spa; /* spa */ 1048 uint64_t l2ad_hand; /* next write location */ 1049 uint64_t l2ad_start; /* first addr on device */ 1050 uint64_t l2ad_end; /* last addr on device */ 1051 uint64_t l2ad_evict; /* last addr eviction reached */ 1052 boolean_t l2ad_first; /* first sweep through */ 1053 boolean_t l2ad_writing; /* currently writing */ 1054 kmutex_t l2ad_mtx; /* lock for buffer list */ 1055 list_t l2ad_buflist; /* buffer list */ 1056 list_node_t l2ad_node; /* device list node */ 1057}; 1058 1059static list_t L2ARC_dev_list; /* device list */ 1060static list_t *l2arc_dev_list; /* device list pointer */ 1061static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1062static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1063static list_t L2ARC_free_on_write; /* free after write buf list */ 1064static list_t *l2arc_free_on_write; /* free after write list ptr */ 1065static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1066static uint64_t l2arc_ndev; /* number of devices */ 1067 1068typedef struct l2arc_read_callback { 1069 arc_buf_t *l2rcb_buf; /* read buffer */ 1070 spa_t *l2rcb_spa; /* spa */ 1071 blkptr_t l2rcb_bp; /* original blkptr */ 1072 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1073 int 
l2rcb_flags; /* original flags */ 1074 enum zio_compress l2rcb_compress; /* applied compress */ 1075} l2arc_read_callback_t; 1076 1077typedef struct l2arc_write_callback { 1078 l2arc_dev_t *l2wcb_dev; /* device info */ 1079 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1080} l2arc_write_callback_t; 1081 1082typedef struct l2arc_data_free { 1083 /* protected by l2arc_free_on_write_mtx */ 1084 void *l2df_data; 1085 size_t l2df_size; 1086 void (*l2df_func)(void *, size_t); 1087 list_node_t l2df_list_node; 1088} l2arc_data_free_t; 1089 1090static kmutex_t l2arc_feed_thr_lock; 1091static kcondvar_t l2arc_feed_thr_cv; 1092static uint8_t l2arc_thread_exit; 1093 1094static void arc_get_data_buf(arc_buf_t *); 1095static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1096static int arc_evict_needed(arc_buf_contents_t); 1097static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); 1098static void arc_buf_watch(arc_buf_t *); 1099 1100static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1101static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1102 1103static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1104static void l2arc_read_done(zio_t *); 1105 1106static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 1107static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1108static void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1109 1110static uint64_t 1111buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1112{ 1113 uint8_t *vdva = (uint8_t *)dva; 1114 uint64_t crc = -1ULL; 1115 int i; 1116 1117 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1118 1119 for (i = 0; i < sizeof (dva_t); i++) 1120 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1121 1122 crc ^= (spa>>8) ^ birth; 1123 1124 return (crc); 1125} 1126 1127#define BUF_EMPTY(buf) \ 1128 ((buf)->b_dva.dva_word[0] == 0 && \ 1129 (buf)->b_dva.dva_word[1] == 0) 1130 1131#define BUF_EQUAL(spa, dva, birth, buf) \ 1132 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1133 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1134 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1135 1136static void 1137buf_discard_identity(arc_buf_hdr_t *hdr) 1138{ 1139 hdr->b_dva.dva_word[0] = 0; 1140 hdr->b_dva.dva_word[1] = 0; 1141 hdr->b_birth = 0; 1142} 1143 1144static arc_buf_hdr_t * 1145buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1146{ 1147 const dva_t *dva = BP_IDENTITY(bp); 1148 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1149 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1150 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1151 arc_buf_hdr_t *hdr; 1152 1153 mutex_enter(hash_lock); 1154 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1155 hdr = hdr->b_hash_next) { 1156 if (BUF_EQUAL(spa, dva, birth, hdr)) { 1157 *lockp = hash_lock; 1158 return (hdr); 1159 } 1160 } 1161 mutex_exit(hash_lock); 1162 *lockp = NULL; 1163 return (NULL); 1164} 1165 1166/* 1167 * Insert an entry into the hash table. If there is already an element 1168 * equal to elem in the hash table, then the already existing element 1169 * will be returned and the new element will not be inserted. 1170 * Otherwise returns NULL. 1171 * If lockp == NULL, the caller is assumed to already hold the hash lock. 
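 *
 * A hedged sketch of the typical find-or-insert pattern built on top
 * of these routines (real callers carry more state; "exists" is just
 * an illustrative variable name):
 *
 *	hdr = buf_hash_find(guid, bp, &hash_lock);
 *	if (hdr == NULL) {
 *		hdr = ... allocate and initialize a new header ...;
 *		exists = buf_hash_insert(hdr, &hash_lock);
 *		if (exists != NULL) {
 *			(we lost the race: another thread inserted an
 *			 equal header first, so use "exists" and give
 *			 the freshly allocated header back)
 *		}
 *	}
 *	... work on the header ...
 *	mutex_exit(hash_lock);
 *
 * In either branch the bucket's hash lock is held afterwards, which is
 * what makes the check-then-insert sequence race-free.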
1172 */ 1173static arc_buf_hdr_t * 1174buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1175{ 1176 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1177 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1178 arc_buf_hdr_t *fhdr; 1179 uint32_t i; 1180 1181 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1182 ASSERT(hdr->b_birth != 0); 1183 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1184 1185 if (lockp != NULL) { 1186 *lockp = hash_lock; 1187 mutex_enter(hash_lock); 1188 } else { 1189 ASSERT(MUTEX_HELD(hash_lock)); 1190 } 1191 1192 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1193 fhdr = fhdr->b_hash_next, i++) { 1194 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1195 return (fhdr); 1196 } 1197 1198 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1199 buf_hash_table.ht_table[idx] = hdr; 1200 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1201 1202 /* collect some hash table performance data */ 1203 if (i > 0) { 1204 ARCSTAT_BUMP(arcstat_hash_collisions); 1205 if (i == 1) 1206 ARCSTAT_BUMP(arcstat_hash_chains); 1207 1208 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1209 } 1210 1211 ARCSTAT_BUMP(arcstat_hash_elements); 1212 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1213 1214 return (NULL); 1215} 1216 1217static void 1218buf_hash_remove(arc_buf_hdr_t *hdr) 1219{ 1220 arc_buf_hdr_t *fhdr, **hdrp; 1221 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1222 1223 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1224 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1225 1226 hdrp = &buf_hash_table.ht_table[idx]; 1227 while ((fhdr = *hdrp) != hdr) { 1228 ASSERT(fhdr != NULL); 1229 hdrp = &fhdr->b_hash_next; 1230 } 1231 *hdrp = hdr->b_hash_next; 1232 hdr->b_hash_next = NULL; 1233 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1234 1235 /* collect some hash table performance data */ 1236 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1237 1238 if (buf_hash_table.ht_table[idx] && 1239 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1240 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1241} 1242 1243/* 1244 * Global data structures and functions for the buf kmem cache. 1245 */ 1246static kmem_cache_t *hdr_full_cache; 1247static kmem_cache_t *hdr_l2only_cache; 1248static kmem_cache_t *buf_cache; 1249 1250static void 1251buf_fini(void) 1252{ 1253 int i; 1254 1255 kmem_free(buf_hash_table.ht_table, 1256 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1257 for (i = 0; i < BUF_LOCKS; i++) 1258 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1259 kmem_cache_destroy(hdr_full_cache); 1260 kmem_cache_destroy(hdr_l2only_cache); 1261 kmem_cache_destroy(buf_cache); 1262} 1263 1264/* 1265 * Constructor callback - called when the cache is empty 1266 * and a new buf is requested. 
1267 */ 1268/* ARGSUSED */ 1269static int 1270hdr_full_cons(void *vbuf, void *unused, int kmflag) 1271{ 1272 arc_buf_hdr_t *hdr = vbuf; 1273 1274 bzero(hdr, HDR_FULL_SIZE); 1275 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1276 refcount_create(&hdr->b_l1hdr.b_refcnt); 1277 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1278 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1279 1280 return (0); 1281} 1282 1283/* ARGSUSED */ 1284static int 1285hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1286{ 1287 arc_buf_hdr_t *hdr = vbuf; 1288 1289 bzero(hdr, HDR_L2ONLY_SIZE); 1290 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1291 1292 return (0); 1293} 1294 1295/* ARGSUSED */ 1296static int 1297buf_cons(void *vbuf, void *unused, int kmflag) 1298{ 1299 arc_buf_t *buf = vbuf; 1300 1301 bzero(buf, sizeof (arc_buf_t)); 1302 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1303 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1304 1305 return (0); 1306} 1307 1308/* 1309 * Destructor callback - called when a cached buf is 1310 * no longer required. 1311 */ 1312/* ARGSUSED */ 1313static void 1314hdr_full_dest(void *vbuf, void *unused) 1315{ 1316 arc_buf_hdr_t *hdr = vbuf; 1317 1318 ASSERT(BUF_EMPTY(hdr)); 1319 cv_destroy(&hdr->b_l1hdr.b_cv); 1320 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1321 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1322 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1323} 1324 1325/* ARGSUSED */ 1326static void 1327hdr_l2only_dest(void *vbuf, void *unused) 1328{ 1329 arc_buf_hdr_t *hdr = vbuf; 1330 1331 ASSERT(BUF_EMPTY(hdr)); 1332 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1333} 1334 1335/* ARGSUSED */ 1336static void 1337buf_dest(void *vbuf, void *unused) 1338{ 1339 arc_buf_t *buf = vbuf; 1340 1341 mutex_destroy(&buf->b_evict_lock); 1342 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1343} 1344 1345/* 1346 * Reclaim callback -- invoked when memory is low. 1347 */ 1348/* ARGSUSED */ 1349static void 1350hdr_recl(void *unused) 1351{ 1352 dprintf("hdr_recl called\n"); 1353 /* 1354 * umem calls the reclaim func when we destroy the buf cache, 1355 * which is after we do arc_fini(). 1356 */ 1357 if (!arc_dead) 1358 cv_signal(&arc_reclaim_thr_cv); 1359} 1360 1361static void 1362buf_init(void) 1363{ 1364 uint64_t *ct; 1365 uint64_t hsize = 1ULL << 12; 1366 int i, j; 1367 1368 /* 1369 * The hash table is big enough to fill all of physical memory 1370 * with an average block size of zfs_arc_average_blocksize (default 8K). 1371 * By default, the table will take up 1372 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
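 *
 * A worked example of the loop below (numbers are only illustrative):
 * with 16 GiB of physical memory and the default 8 KiB
 * zfs_arc_average_blocksize, hsize is doubled until
 * hsize * 8K >= 16 GiB, i.e. hsize = 2^21 = 2,097,152 buckets.  With
 * 8-byte pointers that is a 16 MiB array of hash chain heads -- the
 * "1MB per GB" figure quoted above.  If the KM_NOSLEEP allocation of
 * that array fails, the retry path halves hsize until it succeeds.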
1373 */ 1374 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1375 hsize <<= 1; 1376retry: 1377 buf_hash_table.ht_mask = hsize - 1; 1378 buf_hash_table.ht_table = 1379 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1380 if (buf_hash_table.ht_table == NULL) { 1381 ASSERT(hsize > (1ULL << 8)); 1382 hsize >>= 1; 1383 goto retry; 1384 } 1385 1386 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1387 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1388 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1389 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1390 NULL, NULL, 0); 1391 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1392 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1393 1394 for (i = 0; i < 256; i++) 1395 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1396 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1397 1398 for (i = 0; i < BUF_LOCKS; i++) { 1399 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1400 NULL, MUTEX_DEFAULT, NULL); 1401 } 1402} 1403 1404/* 1405 * Transition between the two allocation states for the arc_buf_hdr struct. 1406 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1407 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1408 * version is used when a cache buffer is only in the L2ARC in order to reduce 1409 * memory usage. 1410 */ 1411static arc_buf_hdr_t * 1412arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1413{ 1414 ASSERT(HDR_HAS_L2HDR(hdr)); 1415 1416 arc_buf_hdr_t *nhdr; 1417 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1418 1419 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1420 (old == hdr_l2only_cache && new == hdr_full_cache)); 1421 1422 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1423 1424 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1425 buf_hash_remove(hdr); 1426 1427 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1428 if (new == hdr_full_cache) { 1429 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1430 /* 1431 * arc_access and arc_change_state need to be aware that a 1432 * header has just come out of L2ARC, so we set its state to 1433 * l2c_only even though it's about to change. 1434 */ 1435 nhdr->b_l1hdr.b_state = arc_l2c_only; 1436 } else { 1437 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1438 ASSERT0(hdr->b_l1hdr.b_datacnt); 1439 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1440 /* 1441 * We might be removing the L1hdr of a buffer which was just 1442 * written out to L2ARC. If such a buffer is compressed then we 1443 * need to free its b_tmp_cdata before destroying the header. 1444 */ 1445 if (hdr->b_l1hdr.b_tmp_cdata != NULL && 1446 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 1447 l2arc_release_cdata_buf(hdr); 1448 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1449 } 1450 /* 1451 * The header has been reallocated so we need to re-insert it into any 1452 * lists it was on. 1453 */ 1454 (void) buf_hash_insert(nhdr, NULL); 1455 1456 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1457 1458 mutex_enter(&dev->l2ad_mtx); 1459 1460 /* 1461 * We must place the realloc'ed header back into the list at 1462 * the same spot. Otherwise, if it's placed earlier in the list, 1463 * l2arc_write_buffers() could find it during the function's 1464 * write phase, and try to write it out to the l2arc. 
1465 */ 1466 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1467 list_remove(&dev->l2ad_buflist, hdr); 1468 1469 mutex_exit(&dev->l2ad_mtx); 1470 1471 buf_discard_identity(hdr); 1472 hdr->b_freeze_cksum = NULL; 1473 kmem_cache_free(old, hdr); 1474 1475 return (nhdr); 1476} 1477 1478 1479#define ARC_MINTIME (hz>>4) /* 62 ms */ 1480 1481static void 1482arc_cksum_verify(arc_buf_t *buf) 1483{ 1484 zio_cksum_t zc; 1485 1486 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1487 return; 1488 1489 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1490 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1491 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1492 return; 1493 } 1494 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1495 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1496 panic("buffer modified while frozen!"); 1497 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1498} 1499 1500static int 1501arc_cksum_equal(arc_buf_t *buf) 1502{ 1503 zio_cksum_t zc; 1504 int equal; 1505 1506 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1507 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1508 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1509 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1510 1511 return (equal); 1512} 1513 1514static void 1515arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1516{ 1517 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1518 return; 1519 1520 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1521 if (buf->b_hdr->b_freeze_cksum != NULL) { 1522 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1523 return; 1524 } 1525 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1526 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1527 buf->b_hdr->b_freeze_cksum); 1528 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1529#ifdef illumos 1530 arc_buf_watch(buf); 1531#endif /* illumos */ 1532} 1533 1534#ifdef illumos 1535#ifndef _KERNEL 1536typedef struct procctl { 1537 long cmd; 1538 prwatch_t prwatch; 1539} procctl_t; 1540#endif 1541 1542/* ARGSUSED */ 1543static void 1544arc_buf_unwatch(arc_buf_t *buf) 1545{ 1546#ifndef _KERNEL 1547 if (arc_watch) { 1548 int result; 1549 procctl_t ctl; 1550 ctl.cmd = PCWATCH; 1551 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1552 ctl.prwatch.pr_size = 0; 1553 ctl.prwatch.pr_wflags = 0; 1554 result = write(arc_procfd, &ctl, sizeof (ctl)); 1555 ASSERT3U(result, ==, sizeof (ctl)); 1556 } 1557#endif 1558} 1559 1560/* ARGSUSED */ 1561static void 1562arc_buf_watch(arc_buf_t *buf) 1563{ 1564#ifndef _KERNEL 1565 if (arc_watch) { 1566 int result; 1567 procctl_t ctl; 1568 ctl.cmd = PCWATCH; 1569 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1570 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1571 ctl.prwatch.pr_wflags = WA_WRITE; 1572 result = write(arc_procfd, &ctl, sizeof (ctl)); 1573 ASSERT3U(result, ==, sizeof (ctl)); 1574 } 1575#endif 1576} 1577#endif /* illumos */ 1578 1579static arc_buf_contents_t 1580arc_buf_type(arc_buf_hdr_t *hdr) 1581{ 1582 if (HDR_ISTYPE_METADATA(hdr)) { 1583 return (ARC_BUFC_METADATA); 1584 } else { 1585 return (ARC_BUFC_DATA); 1586 } 1587} 1588 1589static uint32_t 1590arc_bufc_to_flags(arc_buf_contents_t type) 1591{ 1592 switch (type) { 1593 case ARC_BUFC_DATA: 1594 /* metadata field is 0 if buffer contains normal data */ 1595 return (0); 1596 case ARC_BUFC_METADATA: 1597 return (ARC_FLAG_BUFC_METADATA); 1598 default: 1599 break; 1600 } 1601 panic("undefined ARC buffer type!"); 1602 return ((uint32_t)-1); 1603} 1604 1605void 1606arc_buf_thaw(arc_buf_t 
*buf) 1607{ 1608 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1609 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1610 panic("modifying non-anon buffer!"); 1611 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1612 panic("modifying buffer while i/o in progress!"); 1613 arc_cksum_verify(buf); 1614 } 1615 1616 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1617 if (buf->b_hdr->b_freeze_cksum != NULL) { 1618 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1619 buf->b_hdr->b_freeze_cksum = NULL; 1620 } 1621 1622#ifdef ZFS_DEBUG 1623 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1624 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1625 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1626 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1627 } 1628#endif 1629 1630 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1631 1632#ifdef illumos 1633 arc_buf_unwatch(buf); 1634#endif /* illumos */ 1635} 1636 1637void 1638arc_buf_freeze(arc_buf_t *buf) 1639{ 1640 kmutex_t *hash_lock; 1641 1642 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1643 return; 1644 1645 hash_lock = HDR_LOCK(buf->b_hdr); 1646 mutex_enter(hash_lock); 1647 1648 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1649 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1650 arc_cksum_compute(buf, B_FALSE); 1651 mutex_exit(hash_lock); 1652 1653} 1654 1655static void 1656get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock) 1657{ 1658 uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1659 1660 if (arc_buf_type(hdr) == ARC_BUFC_METADATA) 1661 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1662 else { 1663 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1664 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1665 } 1666 1667 *list = &state->arcs_lists[buf_hashid]; 1668 *lock = ARCS_LOCK(state, buf_hashid); 1669} 1670 1671 1672static void 1673add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1674{ 1675 ASSERT(HDR_HAS_L1HDR(hdr)); 1676 ASSERT(MUTEX_HELD(hash_lock)); 1677 arc_state_t *state = hdr->b_l1hdr.b_state; 1678 1679 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1680 (state != arc_anon)) { 1681 /* We don't use the L2-only state list. */ 1682 if (state != arc_l2c_only) { 1683 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1684 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1685 list_t *list; 1686 kmutex_t *lock; 1687 1688 get_buf_info(hdr, state, &list, &lock); 1689 ASSERT(!MUTEX_HELD(lock)); 1690 mutex_enter(lock); 1691 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1692 list_remove(list, hdr); 1693 if (GHOST_STATE(state)) { 1694 ASSERT0(hdr->b_l1hdr.b_datacnt); 1695 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1696 delta = hdr->b_size; 1697 } 1698 ASSERT(delta > 0); 1699 ASSERT3U(*size, >=, delta); 1700 atomic_add_64(size, -delta); 1701 mutex_exit(lock); 1702 } 1703 /* remove the prefetch flag if we get a reference */ 1704 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1705 } 1706} 1707 1708static int 1709remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1710{ 1711 int cnt; 1712 arc_state_t *state = hdr->b_l1hdr.b_state; 1713 1714 ASSERT(HDR_HAS_L1HDR(hdr)); 1715 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1716 ASSERT(!GHOST_STATE(state)); 1717 1718 /* 1719 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1720 * check to prevent usage of the arc_l2c_only list. 
1721 */ 1722 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1723 (state != arc_anon)) { 1724 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1725 list_t *list; 1726 kmutex_t *lock; 1727 1728 get_buf_info(hdr, state, &list, &lock); 1729 ASSERT(!MUTEX_HELD(lock)); 1730 mutex_enter(lock); 1731 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1732 list_insert_head(list, hdr); 1733 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1734 atomic_add_64(size, hdr->b_size * 1735 hdr->b_l1hdr.b_datacnt); 1736 mutex_exit(lock); 1737 } 1738 return (cnt); 1739} 1740 1741/* 1742 * Move the supplied buffer to the indicated state. The mutex 1743 * for the buffer must be held by the caller. 1744 */ 1745static void 1746arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1747 kmutex_t *hash_lock) 1748{ 1749 arc_state_t *old_state; 1750 int64_t refcnt; 1751 uint32_t datacnt; 1752 uint64_t from_delta, to_delta; 1753 arc_buf_contents_t buftype = arc_buf_type(hdr); 1754 list_t *list; 1755 kmutex_t *lock; 1756 1757 /* 1758 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1759 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1760 * L1 hdr doesn't always exist when we change state to arc_anon before 1761 * destroying a header, in which case reallocating to add the L1 hdr is 1762 * pointless. 1763 */ 1764 if (HDR_HAS_L1HDR(hdr)) { 1765 old_state = hdr->b_l1hdr.b_state; 1766 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1767 datacnt = hdr->b_l1hdr.b_datacnt; 1768 } else { 1769 old_state = arc_l2c_only; 1770 refcnt = 0; 1771 datacnt = 0; 1772 } 1773 1774 ASSERT(MUTEX_HELD(hash_lock)); 1775 ASSERT3P(new_state, !=, old_state); 1776 ASSERT(refcnt == 0 || datacnt > 0); 1777 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1778 ASSERT(old_state != arc_anon || datacnt <= 1); 1779 1780 from_delta = to_delta = datacnt * hdr->b_size; 1781 1782 /* 1783 * If this buffer is evictable, transfer it from the 1784 * old state list to the new state list. 1785 */ 1786 if (refcnt == 0) { 1787 if (old_state != arc_anon && old_state != arc_l2c_only) { 1788 int use_mutex; 1789 uint64_t *size = &old_state->arcs_lsize[buftype]; 1790 1791 get_buf_info(hdr, old_state, &list, &lock); 1792 use_mutex = !MUTEX_HELD(lock); 1793 if (use_mutex) 1794 mutex_enter(lock); 1795 1796 ASSERT(HDR_HAS_L1HDR(hdr)); 1797 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1798 list_remove(list, hdr); 1799 1800 /* 1801 * If prefetching out of the ghost cache, 1802 * we will have a non-zero datacnt. 1803 */ 1804 if (GHOST_STATE(old_state) && datacnt == 0) { 1805 /* ghost elements have a ghost size */ 1806 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1807 from_delta = hdr->b_size; 1808 } 1809 ASSERT3U(*size, >=, from_delta); 1810 atomic_add_64(size, -from_delta); 1811 1812 if (use_mutex) 1813 mutex_exit(lock); 1814 } 1815 if (new_state != arc_anon && new_state != arc_l2c_only) { 1816 int use_mutex; 1817 uint64_t *size = &new_state->arcs_lsize[buftype]; 1818 1819 /* 1820 * An L1 header always exists here, since if we're 1821 * moving to some L1-cached state (i.e. not l2c_only or 1822 * anonymous), we realloc the header to add an L1hdr 1823 * beforehand. 
1824 */ 1825 ASSERT(HDR_HAS_L1HDR(hdr)); 1826 get_buf_info(hdr, new_state, &list, &lock); 1827 use_mutex = !MUTEX_HELD(lock); 1828 if (use_mutex) 1829 mutex_enter(lock); 1830 1831 list_insert_head(list, hdr); 1832 1833 /* ghost elements have a ghost size */ 1834 if (GHOST_STATE(new_state)) { 1835 ASSERT(datacnt == 0); 1836 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1837 to_delta = hdr->b_size; 1838 } 1839 atomic_add_64(size, to_delta); 1840 1841 if (use_mutex) 1842 mutex_exit(lock); 1843 } 1844 } 1845 1846 ASSERT(!BUF_EMPTY(hdr)); 1847 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1848 buf_hash_remove(hdr); 1849 1850 /* adjust state sizes (ignore arc_l2c_only) */ 1851 if (to_delta && new_state != arc_l2c_only) 1852 atomic_add_64(&new_state->arcs_size, to_delta); 1853 if (from_delta && old_state != arc_l2c_only) { 1854 ASSERT3U(old_state->arcs_size, >=, from_delta); 1855 atomic_add_64(&old_state->arcs_size, -from_delta); 1856 } 1857 if (HDR_HAS_L1HDR(hdr)) 1858 hdr->b_l1hdr.b_state = new_state; 1859 1860 /* 1861 * L2 headers should never be on the L2 state list since they don't 1862 * have L1 headers allocated. 1863 */ 1864 ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1865 list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1866} 1867 1868void 1869arc_space_consume(uint64_t space, arc_space_type_t type) 1870{ 1871 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1872 1873 switch (type) { 1874 case ARC_SPACE_DATA: 1875 ARCSTAT_INCR(arcstat_data_size, space); 1876 break; 1877 case ARC_SPACE_META: 1878 ARCSTAT_INCR(arcstat_metadata_size, space); 1879 break; 1880 case ARC_SPACE_OTHER: 1881 ARCSTAT_INCR(arcstat_other_size, space); 1882 break; 1883 case ARC_SPACE_HDRS: 1884 ARCSTAT_INCR(arcstat_hdr_size, space); 1885 break; 1886 case ARC_SPACE_L2HDRS: 1887 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1888 break; 1889 } 1890 1891 if (type != ARC_SPACE_DATA) 1892 ARCSTAT_INCR(arcstat_meta_used, space); 1893 1894 atomic_add_64(&arc_size, space); 1895} 1896 1897void 1898arc_space_return(uint64_t space, arc_space_type_t type) 1899{ 1900 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1901 1902 switch (type) { 1903 case ARC_SPACE_DATA: 1904 ARCSTAT_INCR(arcstat_data_size, -space); 1905 break; 1906 case ARC_SPACE_META: 1907 ARCSTAT_INCR(arcstat_metadata_size, -space); 1908 break; 1909 case ARC_SPACE_OTHER: 1910 ARCSTAT_INCR(arcstat_other_size, -space); 1911 break; 1912 case ARC_SPACE_HDRS: 1913 ARCSTAT_INCR(arcstat_hdr_size, -space); 1914 break; 1915 case ARC_SPACE_L2HDRS: 1916 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1917 break; 1918 } 1919 1920 if (type != ARC_SPACE_DATA) { 1921 ASSERT(arc_meta_used >= space); 1922 if (arc_meta_max < arc_meta_used) 1923 arc_meta_max = arc_meta_used; 1924 ARCSTAT_INCR(arcstat_meta_used, -space); 1925 } 1926 1927 ASSERT(arc_size >= space); 1928 atomic_add_64(&arc_size, -space); 1929} 1930 1931arc_buf_t * 1932arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1933{ 1934 arc_buf_hdr_t *hdr; 1935 arc_buf_t *buf; 1936 1937 ASSERT3U(size, >, 0); 1938 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1939 ASSERT(BUF_EMPTY(hdr)); 1940 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1941 hdr->b_size = size; 1942 hdr->b_spa = spa_load_guid(spa); 1943 1944 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1945 buf->b_hdr = hdr; 1946 buf->b_data = NULL; 1947 buf->b_efunc = NULL; 1948 buf->b_private = NULL; 1949 buf->b_next = NULL; 1950 1951 hdr->b_flags = arc_bufc_to_flags(type); 1952 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1953 1954 
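        /*
         * Fill in the L1-only fields: a freshly allocated buffer starts out
         * anonymous with a single data buffer, and the caller's tag takes
         * the first reference below.
         */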
hdr->b_l1hdr.b_buf = buf; 1955 hdr->b_l1hdr.b_state = arc_anon; 1956 hdr->b_l1hdr.b_arc_access = 0; 1957 hdr->b_l1hdr.b_datacnt = 1; 1958 1959 arc_get_data_buf(buf); 1960 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1961 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1962 1963 return (buf); 1964} 1965 1966static char *arc_onloan_tag = "onloan"; 1967 1968/* 1969 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1970 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1971 * buffers must be returned to the arc before they can be used by the DMU or 1972 * freed. 1973 */ 1974arc_buf_t * 1975arc_loan_buf(spa_t *spa, int size) 1976{ 1977 arc_buf_t *buf; 1978 1979 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1980 1981 atomic_add_64(&arc_loaned_bytes, size); 1982 return (buf); 1983} 1984 1985/* 1986 * Return a loaned arc buffer to the arc. 1987 */ 1988void 1989arc_return_buf(arc_buf_t *buf, void *tag) 1990{ 1991 arc_buf_hdr_t *hdr = buf->b_hdr; 1992 1993 ASSERT(buf->b_data != NULL); 1994 ASSERT(HDR_HAS_L1HDR(hdr)); 1995 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1996 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1997 1998 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1999} 2000 2001/* Detach an arc_buf from a dbuf (tag) */ 2002void 2003arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2004{ 2005 arc_buf_hdr_t *hdr = buf->b_hdr; 2006 2007 ASSERT(buf->b_data != NULL); 2008 ASSERT(HDR_HAS_L1HDR(hdr)); 2009 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2010 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2011 buf->b_efunc = NULL; 2012 buf->b_private = NULL; 2013 2014 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2015} 2016 2017static arc_buf_t * 2018arc_buf_clone(arc_buf_t *from) 2019{ 2020 arc_buf_t *buf; 2021 arc_buf_hdr_t *hdr = from->b_hdr; 2022 uint64_t size = hdr->b_size; 2023 2024 ASSERT(HDR_HAS_L1HDR(hdr)); 2025 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2026 2027 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2028 buf->b_hdr = hdr; 2029 buf->b_data = NULL; 2030 buf->b_efunc = NULL; 2031 buf->b_private = NULL; 2032 buf->b_next = hdr->b_l1hdr.b_buf; 2033 hdr->b_l1hdr.b_buf = buf; 2034 arc_get_data_buf(buf); 2035 bcopy(from->b_data, buf->b_data, size); 2036 2037 /* 2038 * This buffer already exists in the arc so create a duplicate 2039 * copy for the caller. If the buffer is associated with user data 2040 * then track the size and number of duplicates. These stats will be 2041 * updated as duplicate buffers are created and destroyed. 2042 */ 2043 if (HDR_ISTYPE_DATA(hdr)) { 2044 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2045 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2046 } 2047 hdr->b_l1hdr.b_datacnt += 1; 2048 return (buf); 2049} 2050 2051void 2052arc_buf_add_ref(arc_buf_t *buf, void* tag) 2053{ 2054 arc_buf_hdr_t *hdr; 2055 kmutex_t *hash_lock; 2056 2057 /* 2058 * Check to see if this buffer is evicted. Callers 2059 * must verify b_data != NULL to know if the add_ref 2060 * was successful. 
2061 */ 2062 mutex_enter(&buf->b_evict_lock); 2063 if (buf->b_data == NULL) { 2064 mutex_exit(&buf->b_evict_lock); 2065 return; 2066 } 2067 hash_lock = HDR_LOCK(buf->b_hdr); 2068 mutex_enter(hash_lock); 2069 hdr = buf->b_hdr; 2070 ASSERT(HDR_HAS_L1HDR(hdr)); 2071 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2072 mutex_exit(&buf->b_evict_lock); 2073 2074 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2075 hdr->b_l1hdr.b_state == arc_mfu); 2076 2077 add_reference(hdr, hash_lock, tag); 2078 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2079 arc_access(hdr, hash_lock); 2080 mutex_exit(hash_lock); 2081 ARCSTAT_BUMP(arcstat_hits); 2082 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2083 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2084 data, metadata, hits); 2085} 2086 2087static void 2088 arc_buf_free_on_write(void *data, size_t size, 2089 void (*free_func)(void *, size_t)) 2090{ 2091 l2arc_data_free_t *df; 2092 2093 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 2094 df->l2df_data = data; 2095 df->l2df_size = size; 2096 df->l2df_func = free_func; 2097 mutex_enter(&l2arc_free_on_write_mtx); 2098 list_insert_head(l2arc_free_on_write, df); 2099 mutex_exit(&l2arc_free_on_write_mtx); 2100} 2101 2102/* 2103 * Free the arc data buffer. If it is an l2arc write in progress, 2104 * the buffer is placed on l2arc_free_on_write to be freed later. 2105 */ 2106static void 2107arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2108{ 2109 arc_buf_hdr_t *hdr = buf->b_hdr; 2110 2111 if (HDR_L2_WRITING(hdr)) { 2112 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2113 ARCSTAT_BUMP(arcstat_l2_free_on_write); 2114 } else { 2115 free_func(buf->b_data, hdr->b_size); 2116 } 2117} 2118 2119/* 2120 * Free up buf->b_data and if 'remove' is set, then pull the 2121 * arc_buf_t off of the arc_buf_hdr_t's list and free it. 2122 */ 2123static void 2124arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2125{ 2126 ASSERT(HDR_HAS_L2HDR(hdr)); 2127 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2128 2129 /* 2130 * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2131 * that doesn't exist, the header is in the arc_l2c_only state, 2132 * and there isn't anything to free (it's already been freed).
2133 */ 2134 if (!HDR_HAS_L1HDR(hdr)) 2135 return; 2136 2137 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2138 return; 2139 2140 ASSERT(HDR_L2_WRITING(hdr)); 2141 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, 2142 zio_data_buf_free); 2143 2144 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2145 hdr->b_l1hdr.b_tmp_cdata = NULL; 2146} 2147 2148static void 2149arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 2150{ 2151 arc_buf_t **bufp; 2152 2153 /* free up data associated with the buf */ 2154 if (buf->b_data != NULL) { 2155 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2156 uint64_t size = buf->b_hdr->b_size; 2157 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2158 2159 arc_cksum_verify(buf); 2160#ifdef illumos 2161 arc_buf_unwatch(buf); 2162#endif /* illumos */ 2163 2164 if (!recycle) { 2165 if (type == ARC_BUFC_METADATA) { 2166 arc_buf_data_free(buf, zio_buf_free); 2167 arc_space_return(size, ARC_SPACE_META); 2168 } else { 2169 ASSERT(type == ARC_BUFC_DATA); 2170 arc_buf_data_free(buf, zio_data_buf_free); 2171 arc_space_return(size, ARC_SPACE_DATA); 2172 } 2173 } 2174 if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2175 uint64_t *cnt = &state->arcs_lsize[type]; 2176 2177 ASSERT(refcount_is_zero( 2178 &buf->b_hdr->b_l1hdr.b_refcnt)); 2179 ASSERT(state != arc_anon && state != arc_l2c_only); 2180 2181 ASSERT3U(*cnt, >=, size); 2182 atomic_add_64(cnt, -size); 2183 } 2184 ASSERT3U(state->arcs_size, >=, size); 2185 atomic_add_64(&state->arcs_size, -size); 2186 buf->b_data = NULL; 2187 2188 /* 2189 * If we're destroying a duplicate buffer make sure 2190 * that the appropriate statistics are updated. 2191 */ 2192 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2193 HDR_ISTYPE_DATA(buf->b_hdr)) { 2194 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2195 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2196 } 2197 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2198 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2199 } 2200 2201 /* only remove the buf if requested */ 2202 if (!remove) 2203 return; 2204 2205 /* remove the buf from the hdr list */ 2206 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2207 bufp = &(*bufp)->b_next) 2208 continue; 2209 *bufp = buf->b_next; 2210 buf->b_next = NULL; 2211 2212 ASSERT(buf->b_efunc == NULL); 2213 2214 /* clean up the buf */ 2215 buf->b_hdr = NULL; 2216 kmem_cache_free(buf_cache, buf); 2217} 2218 2219static void 2220arc_hdr_destroy(arc_buf_hdr_t *hdr) 2221{ 2222 if (HDR_HAS_L1HDR(hdr)) { 2223 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2224 hdr->b_l1hdr.b_datacnt > 0); 2225 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2226 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2227 } 2228 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2229 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2230 2231 if (HDR_HAS_L2HDR(hdr)) { 2232 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2233 boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); 2234 2235 if (!buflist_held) { 2236 mutex_enter(&l2hdr->b_dev->l2ad_mtx); 2237 l2hdr = &hdr->b_l2hdr; 2238 } 2239 2240 trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, 2241 l2hdr->b_asize, 0); 2242 list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); 2243 2244 /* 2245 * We don't want to leak the b_tmp_cdata buffer that was 2246 * allocated in l2arc_write_buffers() 2247 */ 2248 arc_buf_l2_cdata_free(hdr); 2249 2250 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2251 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2252 2253 if (!buflist_held) 2254 mutex_exit(&l2hdr->b_dev->l2ad_mtx); 2255 2256 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2257 } 2258 2259 
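        /*
         * Any L2ARC state has been torn down above; now release the header's
         * identity and freeze checksum, dispose of any remaining arc_buf_t's,
         * and return the header to the appropriate kmem cache.
         */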
if (!BUF_EMPTY(hdr)) 2260 buf_discard_identity(hdr); 2261 if (hdr->b_freeze_cksum != NULL) { 2262 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2263 hdr->b_freeze_cksum = NULL; 2264 } 2265 2266 if (HDR_HAS_L1HDR(hdr)) { 2267 while (hdr->b_l1hdr.b_buf) { 2268 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2269 2270 if (buf->b_efunc != NULL) { 2271 mutex_enter(&arc_eviction_mtx); 2272 mutex_enter(&buf->b_evict_lock); 2273 ASSERT(buf->b_hdr != NULL); 2274 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2275 FALSE); 2276 hdr->b_l1hdr.b_buf = buf->b_next; 2277 buf->b_hdr = &arc_eviction_hdr; 2278 buf->b_next = arc_eviction_list; 2279 arc_eviction_list = buf; 2280 mutex_exit(&buf->b_evict_lock); 2281 mutex_exit(&arc_eviction_mtx); 2282 } else { 2283 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2284 TRUE); 2285 } 2286 } 2287#ifdef ZFS_DEBUG 2288 if (hdr->b_l1hdr.b_thawed != NULL) { 2289 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2290 hdr->b_l1hdr.b_thawed = NULL; 2291 } 2292#endif 2293 } 2294 2295 ASSERT3P(hdr->b_hash_next, ==, NULL); 2296 if (HDR_HAS_L1HDR(hdr)) { 2297 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 2298 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2299 kmem_cache_free(hdr_full_cache, hdr); 2300 } else { 2301 kmem_cache_free(hdr_l2only_cache, hdr); 2302 } 2303} 2304 2305void 2306arc_buf_free(arc_buf_t *buf, void *tag) 2307{ 2308 arc_buf_hdr_t *hdr = buf->b_hdr; 2309 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2310 2311 ASSERT(buf->b_efunc == NULL); 2312 ASSERT(buf->b_data != NULL); 2313 2314 if (hashed) { 2315 kmutex_t *hash_lock = HDR_LOCK(hdr); 2316 2317 mutex_enter(hash_lock); 2318 hdr = buf->b_hdr; 2319 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2320 2321 (void) remove_reference(hdr, hash_lock, tag); 2322 if (hdr->b_l1hdr.b_datacnt > 1) { 2323 arc_buf_destroy(buf, FALSE, TRUE); 2324 } else { 2325 ASSERT(buf == hdr->b_l1hdr.b_buf); 2326 ASSERT(buf->b_efunc == NULL); 2327 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2328 } 2329 mutex_exit(hash_lock); 2330 } else if (HDR_IO_IN_PROGRESS(hdr)) { 2331 int destroy_hdr; 2332 /* 2333 * We are in the middle of an async write. Don't destroy 2334 * this buffer unless the write completes before we finish 2335 * decrementing the reference count. 
2336 */ 2337 mutex_enter(&arc_eviction_mtx); 2338 (void) remove_reference(hdr, NULL, tag); 2339 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2340 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2341 mutex_exit(&arc_eviction_mtx); 2342 if (destroy_hdr) 2343 arc_hdr_destroy(hdr); 2344 } else { 2345 if (remove_reference(hdr, NULL, tag) > 0) 2346 arc_buf_destroy(buf, FALSE, TRUE); 2347 else 2348 arc_hdr_destroy(hdr); 2349 } 2350} 2351 2352boolean_t 2353arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2354{ 2355 arc_buf_hdr_t *hdr = buf->b_hdr; 2356 kmutex_t *hash_lock = HDR_LOCK(hdr); 2357 boolean_t no_callback = (buf->b_efunc == NULL); 2358 2359 if (hdr->b_l1hdr.b_state == arc_anon) { 2360 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2361 arc_buf_free(buf, tag); 2362 return (no_callback); 2363 } 2364 2365 mutex_enter(hash_lock); 2366 hdr = buf->b_hdr; 2367 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2368 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2369 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2370 ASSERT(buf->b_data != NULL); 2371 2372 (void) remove_reference(hdr, hash_lock, tag); 2373 if (hdr->b_l1hdr.b_datacnt > 1) { 2374 if (no_callback) 2375 arc_buf_destroy(buf, FALSE, TRUE); 2376 } else if (no_callback) { 2377 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2378 ASSERT(buf->b_efunc == NULL); 2379 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2380 } 2381 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2382 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2383 mutex_exit(hash_lock); 2384 return (no_callback); 2385} 2386 2387int32_t 2388arc_buf_size(arc_buf_t *buf) 2389{ 2390 return (buf->b_hdr->b_size); 2391} 2392 2393/* 2394 * Called from the DMU to determine if the current buffer should be 2395 * evicted. In order to ensure proper locking, the eviction must be initiated 2396 * from the DMU. Return true if the buffer is associated with user data and 2397 * duplicate buffers still exist. 2398 */ 2399boolean_t 2400arc_buf_eviction_needed(arc_buf_t *buf) 2401{ 2402 arc_buf_hdr_t *hdr; 2403 boolean_t evict_needed = B_FALSE; 2404 2405 if (zfs_disable_dup_eviction) 2406 return (B_FALSE); 2407 2408 mutex_enter(&buf->b_evict_lock); 2409 hdr = buf->b_hdr; 2410 if (hdr == NULL) { 2411 /* 2412 * We are in arc_do_user_evicts(); let that function 2413 * perform the eviction. 2414 */ 2415 ASSERT(buf->b_data == NULL); 2416 mutex_exit(&buf->b_evict_lock); 2417 return (B_FALSE); 2418 } else if (buf->b_data == NULL) { 2419 /* 2420 * We have already been added to the arc eviction list; 2421 * recommend eviction. 2422 */ 2423 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2424 mutex_exit(&buf->b_evict_lock); 2425 return (B_TRUE); 2426 } 2427 2428 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2429 evict_needed = B_TRUE; 2430 2431 mutex_exit(&buf->b_evict_lock); 2432 return (evict_needed); 2433} 2434 2435/* 2436 * Evict buffers from list until we've removed the specified number of 2437 * bytes. Move the removed buffers to the appropriate evict state. 2438 * If the recycle flag is set, then attempt to "recycle" a buffer: 2439 * - look for a buffer to evict that is `bytes' long. 2440 * - return the data block from this buffer rather than freeing it. 2441 * This flag is used by callers that are trying to make space for a 2442 * new buffer in a full arc cache. 2443 * 2444 * This function makes a "best effort". It skips over any buffers 2445 * it can't get a hash_lock on, and so may not catch all candidates. 2446 * It may also return without evicting as much space as requested. 
2447 */ 2448static void * 2449 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 2450 arc_buf_contents_t type) 2451{ 2452 arc_state_t *evicted_state; 2453 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 2454 int64_t bytes_remaining; 2455 arc_buf_hdr_t *hdr, *hdr_prev = NULL; 2456 list_t *evicted_list, *list, *evicted_list_start, *list_start; 2457 kmutex_t *lock, *evicted_lock; 2458 kmutex_t *hash_lock; 2459 boolean_t have_lock; 2460 void *stolen = NULL; 2461 arc_buf_hdr_t marker = { 0 }; 2462 int count = 0; 2463 static int evict_metadata_offset, evict_data_offset; 2464 int i, idx, offset, list_count, lists; 2465 2466 ASSERT(state == arc_mru || state == arc_mfu); 2467 2468 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2469 2470 /* 2471 * Decide which "type" (data vs metadata) to recycle from. 2472 * 2473 * If we are over the metadata limit, recycle from metadata. 2474 * If we are under the metadata minimum, recycle from data. 2475 * Otherwise, recycle from whichever type has the oldest (least 2476 * recently accessed) header. This is not yet implemented. 2477 */ 2478 if (recycle) { 2479 arc_buf_contents_t realtype; 2480 if (state->arcs_lsize[ARC_BUFC_DATA] == 0) { 2481 realtype = ARC_BUFC_METADATA; 2482 } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) { 2483 realtype = ARC_BUFC_DATA; 2484 } else if (arc_meta_used >= arc_meta_limit) { 2485 realtype = ARC_BUFC_METADATA; 2486 } else if (arc_meta_used <= arc_meta_min) { 2487 realtype = ARC_BUFC_DATA; 2488#ifdef illumos 2489 } else if (HDR_HAS_L1HDR(data_hdr) && 2490 HDR_HAS_L1HDR(metadata_hdr) && 2491 data_hdr->b_l1hdr.b_arc_access < 2492 metadata_hdr->b_l1hdr.b_arc_access) { 2493 realtype = ARC_BUFC_DATA; 2494 } else { 2495 realtype = ARC_BUFC_METADATA; 2496#else 2497 } else { 2498 /* TODO */ 2499 realtype = type; 2500#endif 2501 } 2502 if (realtype != type) { 2503 /* 2504 * If we want to evict from a different list, 2505 * we cannot recycle, because DATA vs METADATA 2506 * buffers are segregated into different kmem 2507 * caches (and vmem arenas). 2508 */ 2509 type = realtype; 2510 recycle = B_FALSE; 2511 } 2512 } 2513 2514 if (type == ARC_BUFC_METADATA) { 2515 offset = 0; 2516 list_count = ARC_BUFC_NUMMETADATALISTS; 2517 list_start = &state->arcs_lists[0]; 2518 evicted_list_start = &evicted_state->arcs_lists[0]; 2519 idx = evict_metadata_offset; 2520 } else { 2521 offset = ARC_BUFC_NUMMETADATALISTS; 2522 list_start = &state->arcs_lists[offset]; 2523 evicted_list_start = &evicted_state->arcs_lists[offset]; 2524 list_count = ARC_BUFC_NUMDATALISTS; 2525 idx = evict_data_offset; 2526 } 2527 bytes_remaining = evicted_state->arcs_lsize[type]; 2528 lists = 0; 2529 2530evict_start: 2531 list = &list_start[idx]; 2532 evicted_list = &evicted_list_start[idx]; 2533 lock = ARCS_LOCK(state, (offset + idx)); 2534 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 2535 2536 /* 2537 * The ghost list lock must be acquired first in order to prevent 2538 * a 3-party deadlock: 2539 * 2540 * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by 2541 * l2ad_mtx in arc_hdr_realloc 2542 * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx 2543 * - arc_evict acquires arc_*->arcs_mtx, followed by 2544 * arc_*_ghost->arcs_mtx and forms a deadlock cycle. 2545 * 2546 * This situation is avoided by acquiring the ghost list lock first.
2547 */ 2548 mutex_enter(evicted_lock); 2549 mutex_enter(lock); 2550 2551 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2552 hdr_prev = list_prev(list, hdr); 2553 if (HDR_HAS_L1HDR(hdr)) { 2554 bytes_remaining -= 2555 (hdr->b_size * hdr->b_l1hdr.b_datacnt); 2556 } 2557 /* prefetch buffers have a minimum lifespan */ 2558 if (HDR_IO_IN_PROGRESS(hdr) || 2559 (spa && hdr->b_spa != spa) || 2560 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2561 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2562 arc_min_prefetch_lifespan)) { 2563 skipped++; 2564 continue; 2565 } 2566 /* "lookahead" for better eviction candidate */ 2567 if (recycle && hdr->b_size != bytes && 2568 hdr_prev && hdr_prev->b_size == bytes) 2569 continue; 2570 2571 /* ignore markers */ 2572 if (hdr->b_spa == 0) 2573 continue; 2574 2575 /* 2576 * It may take a long time to evict all the bufs requested. 2577 * To avoid blocking all arc activity, periodically drop 2578 * the arcs_mtx and give other threads a chance to run 2579 * before reacquiring the lock. 2580 * 2581 * If we are looking for a buffer to recycle, we are in 2582 * the hot code path, so don't sleep. 2583 */ 2584 if (!recycle && count++ > arc_evict_iterations) { 2585 list_insert_after(list, hdr, &marker); 2586 mutex_exit(lock); 2587 mutex_exit(evicted_lock); 2588 kpreempt(KPREEMPT_SYNC); 2589 mutex_enter(evicted_lock); 2590 mutex_enter(lock); 2591 hdr_prev = list_prev(list, &marker); 2592 list_remove(list, &marker); 2593 count = 0; 2594 continue; 2595 } 2596 2597 hash_lock = HDR_LOCK(hdr); 2598 have_lock = MUTEX_HELD(hash_lock); 2599 if (have_lock || mutex_tryenter(hash_lock)) { 2600 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2601 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2602 while (hdr->b_l1hdr.b_buf) { 2603 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2604 if (!mutex_tryenter(&buf->b_evict_lock)) { 2605 missed += 1; 2606 break; 2607 } 2608 if (buf->b_data != NULL) { 2609 bytes_evicted += hdr->b_size; 2610 if (recycle && 2611 arc_buf_type(hdr) == type && 2612 hdr->b_size == bytes && 2613 !HDR_L2_WRITING(hdr)) { 2614 stolen = buf->b_data; 2615 recycle = FALSE; 2616 } 2617 } 2618 if (buf->b_efunc != NULL) { 2619 mutex_enter(&arc_eviction_mtx); 2620 arc_buf_destroy(buf, 2621 buf->b_data == stolen, FALSE); 2622 hdr->b_l1hdr.b_buf = buf->b_next; 2623 buf->b_hdr = &arc_eviction_hdr; 2624 buf->b_next = arc_eviction_list; 2625 arc_eviction_list = buf; 2626 mutex_exit(&arc_eviction_mtx); 2627 mutex_exit(&buf->b_evict_lock); 2628 } else { 2629 mutex_exit(&buf->b_evict_lock); 2630 arc_buf_destroy(buf, 2631 buf->b_data == stolen, TRUE); 2632 } 2633 } 2634 2635 if (HDR_HAS_L2HDR(hdr)) { 2636 ARCSTAT_INCR(arcstat_evict_l2_cached, 2637 hdr->b_size); 2638 } else { 2639 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 2640 ARCSTAT_INCR(arcstat_evict_l2_eligible, 2641 hdr->b_size); 2642 } else { 2643 ARCSTAT_INCR( 2644 arcstat_evict_l2_ineligible, 2645 hdr->b_size); 2646 } 2647 } 2648 2649 if (hdr->b_l1hdr.b_datacnt == 0) { 2650 arc_change_state(evicted_state, hdr, hash_lock); 2651 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2652 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2653 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2654 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2655 } 2656 if (!have_lock) 2657 mutex_exit(hash_lock); 2658 if (bytes >= 0 && bytes_evicted >= bytes) 2659 break; 2660 if (bytes_remaining > 0) { 2661 mutex_exit(evicted_lock); 2662 mutex_exit(lock); 2663 idx = ((idx + 1) & (list_count - 1)); 2664 lists++; 2665 goto evict_start; 2666 } 2667 } else { 2668 missed += 1; 2669 } 
2670 } 2671 2672 mutex_exit(lock); 2673 mutex_exit(evicted_lock); 2674 2675 idx = ((idx + 1) & (list_count - 1)); 2676 lists++; 2677 2678 if (bytes_evicted < bytes) { 2679 if (lists < list_count) 2680 goto evict_start; 2681 else 2682 dprintf("only evicted %lld bytes from %x", 2683 (longlong_t)bytes_evicted, state); 2684 } 2685 if (type == ARC_BUFC_METADATA) 2686 evict_metadata_offset = idx; 2687 else 2688 evict_data_offset = idx; 2689 2690 if (skipped) 2691 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2692 2693 if (missed) 2694 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2695 2696 /* 2697 * Note: we have just evicted some data into the ghost state, 2698 * potentially putting the ghost size over the desired size. Rather 2699 * than evicting from the ghost list in this hot code path, leave 2700 * this chore to the arc_reclaim_thread(). 2701 */ 2702 2703 if (stolen) 2704 ARCSTAT_BUMP(arcstat_stolen); 2705 return (stolen); 2706} 2707 2708/* 2709 * Remove buffers from list until we've removed the specified number of 2710 * bytes. Destroy the buffers that are removed. 2711 */ 2712static void 2713 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2714{ 2715 arc_buf_hdr_t *hdr, *hdr_prev; 2716 arc_buf_hdr_t marker = { 0 }; 2717 list_t *list, *list_start; 2718 kmutex_t *hash_lock, *lock; 2719 uint64_t bytes_deleted = 0; 2720 uint64_t bufs_skipped = 0; 2721 int count = 0; 2722 static int evict_offset; 2723 int list_count, idx = evict_offset; 2724 int offset, lists = 0; 2725 2726 ASSERT(GHOST_STATE(state)); 2727 2728 /* 2729 * data lists come after metadata lists 2730 */ 2731 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2732 list_count = ARC_BUFC_NUMDATALISTS; 2733 offset = ARC_BUFC_NUMMETADATALISTS; 2734 2735evict_start: 2736 list = &list_start[idx]; 2737 lock = ARCS_LOCK(state, idx + offset); 2738 2739 mutex_enter(lock); 2740 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2741 hdr_prev = list_prev(list, hdr); 2742 if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) 2743 panic("invalid hdr=%p", (void *)hdr); 2744 if (spa && hdr->b_spa != spa) 2745 continue; 2746 2747 /* ignore markers */ 2748 if (hdr->b_spa == 0) 2749 continue; 2750 2751 hash_lock = HDR_LOCK(hdr); 2752 /* caller may be trying to modify this buffer, skip it */ 2753 if (MUTEX_HELD(hash_lock)) 2754 continue; 2755 2756 /* 2757 * It may take a long time to evict all the bufs requested. 2758 * To avoid blocking all arc activity, periodically drop 2759 * the arcs_mtx and give other threads a chance to run 2760 * before reacquiring the lock. 2761 */ 2762 if (count++ > arc_evict_iterations) { 2763 list_insert_after(list, hdr, &marker); 2764 mutex_exit(lock); 2765 kpreempt(KPREEMPT_SYNC); 2766 mutex_enter(lock); 2767 hdr_prev = list_prev(list, &marker); 2768 list_remove(list, &marker); 2769 count = 0; 2770 continue; 2771 } 2772 if (mutex_tryenter(hash_lock)) { 2773 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2774 ASSERT(!HDR_HAS_L1HDR(hdr) || 2775 hdr->b_l1hdr.b_buf == NULL); 2776 ARCSTAT_BUMP(arcstat_deleted); 2777 bytes_deleted += hdr->b_size; 2778 2779 if (HDR_HAS_L2HDR(hdr)) { 2780 /* 2781 * This buffer is cached on the 2nd Level ARC; 2782 * don't destroy the header. 2783 */ 2784 arc_change_state(arc_l2c_only, hdr, hash_lock); 2785 /* 2786 * dropping from L1+L2 cached to L2-only, 2787 * realloc to remove the L1 header.
2788 */ 2789 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2790 hdr_l2only_cache); 2791 mutex_exit(hash_lock); 2792 } else { 2793 arc_change_state(arc_anon, hdr, hash_lock); 2794 mutex_exit(hash_lock); 2795 arc_hdr_destroy(hdr); 2796 } 2797 2798 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2799 if (bytes >= 0 && bytes_deleted >= bytes) 2800 break; 2801 } else if (bytes < 0) { 2802 /* 2803 * Insert a list marker and then wait for the 2804 * hash lock to become available. Once it's 2805 * available, restart from where we left off. 2806 */ 2807 list_insert_after(list, hdr, &marker); 2808 mutex_exit(lock); 2809 mutex_enter(hash_lock); 2810 mutex_exit(hash_lock); 2811 mutex_enter(lock); 2812 hdr_prev = list_prev(list, &marker); 2813 list_remove(list, &marker); 2814 } else { 2815 bufs_skipped += 1; 2816 } 2817 2818 } 2819 mutex_exit(lock); 2820 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2821 lists++; 2822 2823 if (lists < list_count) 2824 goto evict_start; 2825 2826 evict_offset = idx; 2827 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2828 (bytes < 0 || bytes_deleted < bytes)) { 2829 list_start = &state->arcs_lists[0]; 2830 list_count = ARC_BUFC_NUMMETADATALISTS; 2831 offset = lists = 0; 2832 goto evict_start; 2833 } 2834 2835 if (bufs_skipped) { 2836 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2837 ASSERT(bytes >= 0); 2838 } 2839 2840 if (bytes_deleted < bytes) 2841 dprintf("only deleted %lld bytes from %p", 2842 (longlong_t)bytes_deleted, state); 2843} 2844 2845static void 2846 arc_adjust(void) 2847{ 2848 int64_t adjustment, delta; 2849 2850 /* 2851 * Adjust MRU size 2852 */ 2853 2854 adjustment = MIN((int64_t)(arc_size - arc_c), 2855 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2856 arc_p)); 2857 2858 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2859 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2860 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2861 adjustment -= delta; 2862 } 2863 2864 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2865 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2866 (void) arc_evict(arc_mru, 0, delta, FALSE, 2867 ARC_BUFC_METADATA); 2868 } 2869 2870 /* 2871 * Adjust MFU size 2872 */ 2873 2874 adjustment = arc_size - arc_c; 2875 2876 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2877 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2878 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2879 adjustment -= delta; 2880 } 2881 2882 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2883 int64_t delta = MIN(adjustment, 2884 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2885 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2886 ARC_BUFC_METADATA); 2887 } 2888 2889 /* 2890 * Adjust ghost lists 2891 */ 2892 2893 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2894 2895 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2896 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2897 arc_evict_ghost(arc_mru_ghost, 0, delta); 2898 } 2899 2900 adjustment = 2901 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2902 2903 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2904 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2905 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2906 } 2907} 2908 2909static void 2910 arc_do_user_evicts(void) 2911{ 2912 static arc_buf_t *tmp_arc_eviction_list; 2913 2914 /* 2915 * Move list over to avoid LOR 2916 */
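        /*
         * Drain a private copy of the list so that arc_eviction_mtx is not
         * held while the b_efunc callbacks run; loop in case more buffers
         * are queued for eviction while we are busy.
         */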
2917restart: 2918 mutex_enter(&arc_eviction_mtx); 2919 tmp_arc_eviction_list = arc_eviction_list; 2920 arc_eviction_list = NULL; 2921 mutex_exit(&arc_eviction_mtx); 2922 2923 while (tmp_arc_eviction_list != NULL) { 2924 arc_buf_t *buf = tmp_arc_eviction_list; 2925 tmp_arc_eviction_list = buf->b_next; 2926 mutex_enter(&buf->b_evict_lock); 2927 buf->b_hdr = NULL; 2928 mutex_exit(&buf->b_evict_lock); 2929 2930 if (buf->b_efunc != NULL) 2931 VERIFY0(buf->b_efunc(buf->b_private)); 2932 2933 buf->b_efunc = NULL; 2934 buf->b_private = NULL; 2935 kmem_cache_free(buf_cache, buf); 2936 } 2937 2938 if (arc_eviction_list != NULL) 2939 goto restart; 2940} 2941 2942/* 2943 * Flush all *evictable* data from the cache for the given spa. 2944 * NOTE: this will not touch "active" (i.e. referenced) data. 2945 */ 2946void 2947arc_flush(spa_t *spa) 2948{ 2949 uint64_t guid = 0; 2950 2951 if (spa != NULL) 2952 guid = spa_load_guid(spa); 2953 2954 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2955 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2956 if (spa != NULL) 2957 break; 2958 } 2959 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2960 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2961 if (spa != NULL) 2962 break; 2963 } 2964 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2965 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2966 if (spa != NULL) 2967 break; 2968 } 2969 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2970 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2971 if (spa != NULL) 2972 break; 2973 } 2974 2975 arc_evict_ghost(arc_mru_ghost, guid, -1); 2976 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2977 2978 mutex_enter(&arc_reclaim_thr_lock); 2979 arc_do_user_evicts(); 2980 mutex_exit(&arc_reclaim_thr_lock); 2981 ASSERT(spa || arc_eviction_list == NULL); 2982} 2983 2984void 2985arc_shrink(void) 2986{ 2987 2988 if (arc_c > arc_c_min) { 2989 uint64_t to_free; 2990 2991 to_free = arc_c >> arc_shrink_shift; 2992 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 2993 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 2994 if (arc_c > arc_c_min + to_free) 2995 atomic_add_64(&arc_c, -to_free); 2996 else 2997 arc_c = arc_c_min; 2998 2999 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 3000 if (arc_c > arc_size) 3001 arc_c = MAX(arc_size, arc_c_min); 3002 if (arc_p > arc_c) 3003 arc_p = (arc_c >> 1); 3004 3005 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3006 arc_p); 3007 3008 ASSERT(arc_c >= arc_c_min); 3009 ASSERT((int64_t)arc_p >= 0); 3010 } 3011 3012 if (arc_size > arc_c) { 3013 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3014 uint64_t, arc_c); 3015 arc_adjust(); 3016 } 3017} 3018 3019static int needfree = 0; 3020 3021static int 3022arc_reclaim_needed(void) 3023{ 3024 3025#ifdef _KERNEL 3026 3027 if (needfree) { 3028 DTRACE_PROBE(arc__reclaim_needfree); 3029 return (1); 3030 } 3031 3032 /* 3033 * Cooperate with pagedaemon when it's time for it to scan 3034 * and reclaim some pages. 3035 */ 3036 if (freemem < zfs_arc_free_target) { 3037 DTRACE_PROBE2(arc__reclaim_freemem, uint64_t, 3038 freemem, uint64_t, zfs_arc_free_target); 3039 return (1); 3040 } 3041 3042#ifdef sun 3043 /* 3044 * take 'desfree' extra pages, so we reclaim sooner, rather than later 3045 */ 3046 extra = desfree; 3047 3048 /* 3049 * check that we're out of range of the pageout scanner. It starts to 3050 * schedule paging if freemem is less than lotsfree and needfree. 
3051 * lotsfree is the high-water mark for pageout, and needfree is the 3052 * number of needed free pages. We add extra pages here to make sure 3053 * the scanner doesn't start up while we're freeing memory. 3054 */ 3055 if (freemem < lotsfree + needfree + extra) 3056 return (1); 3057 3058 /* 3059 * check to make sure that swapfs has enough space so that anon 3060 * reservations can still succeed. anon_resvmem() checks that the 3061 * availrmem is greater than swapfs_minfree, and the number of reserved 3062 * swap pages. We also add a bit of extra here just to prevent 3063 * circumstances from getting really dire. 3064 */ 3065 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 3066 return (1); 3067 3068 /* 3069 * Check that we have enough availrmem that memory locking (e.g., via 3070 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3071 * stores the number of pages that cannot be locked; when availrmem 3072 * drops below pages_pp_maximum, page locking mechanisms such as 3073 * page_pp_lock() will fail.) 3074 */ 3075 if (availrmem <= pages_pp_maximum) 3076 return (1); 3077 3078#endif /* sun */ 3079#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3080 /* 3081 * If we're on an i386 platform, it's possible that we'll exhaust the 3082 * kernel heap space before we ever run out of available physical 3083 * memory. Most checks of the size of the heap_area compare against 3084 * tune.t_minarmem, which is the minimum available real memory that we 3085 * can have in the system. However, this is generally fixed at 25 pages 3086 * which is so low that it's useless. In this comparison, we seek to 3087 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3088 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3089 * free) 3090 */ 3091 if (vmem_size(heap_arena, VMEM_FREE) < 3092 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) { 3093 DTRACE_PROBE2(arc__reclaim_used, uint64_t, 3094 vmem_size(heap_arena, VMEM_FREE), uint64_t, 3095 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2); 3096 return (1); 3097 } 3098#define zio_arena NULL 3099#else 3100#define zio_arena heap_arena 3101#endif 3102 3103 /* 3104 * If zio data pages are being allocated out of a separate heap segment, 3105 * then enforce that the size of available vmem for this arena remains 3106 * above about 1/16th free. 3107 * 3108 * Note: The 1/16th arena free requirement was put in place 3109 * to aggressively evict memory from the arc in order to avoid 3110 * memory fragmentation issues. 3111 */ 3112 if (zio_arena != NULL && 3113 vmem_size(zio_arena, VMEM_FREE) < 3114 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 3115 return (1); 3116 3117 /* 3118 * The above limits know nothing about the real level of KVA fragmentation. 3119 * Start aggressive reclamation if too little sequential KVA is left.
3120 */ 3121 if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) { 3122 DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t, 3123 vmem_size(heap_arena, VMEM_MAXFREE), 3124 uint64_t, zfs_max_recordsize); 3125 return (1); 3126 } 3127 3128#else /* _KERNEL */ 3129 if (spa_get_random(100) == 0) 3130 return (1); 3131#endif /* _KERNEL */ 3132 DTRACE_PROBE(arc__reclaim_no); 3133 3134 return (0); 3135} 3136 3137extern kmem_cache_t *zio_buf_cache[]; 3138extern kmem_cache_t *zio_data_buf_cache[]; 3139extern kmem_cache_t *range_seg_cache; 3140 3141static __noinline void 3142arc_kmem_reap_now(arc_reclaim_strategy_t strat) 3143{ 3144 size_t i; 3145 kmem_cache_t *prev_cache = NULL; 3146 kmem_cache_t *prev_data_cache = NULL; 3147 3148 DTRACE_PROBE(arc__kmem_reap_start); 3149#ifdef _KERNEL 3150 if (arc_meta_used >= arc_meta_limit) { 3151 /* 3152 * We are exceeding our meta-data cache limit. 3153 * Purge some DNLC entries to release holds on meta-data. 3154 */ 3155 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3156 } 3157#if defined(__i386) 3158 /* 3159 * Reclaim unused memory from all kmem caches. 3160 */ 3161 kmem_reap(); 3162#endif 3163#endif 3164 3165 /* 3166 * An aggressive reclamation will shrink the cache size as well as 3167 * reap free buffers from the arc kmem caches. 3168 */ 3169 if (strat == ARC_RECLAIM_AGGR) 3170 arc_shrink(); 3171 3172 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3173 if (zio_buf_cache[i] != prev_cache) { 3174 prev_cache = zio_buf_cache[i]; 3175 kmem_cache_reap_now(zio_buf_cache[i]); 3176 } 3177 if (zio_data_buf_cache[i] != prev_data_cache) { 3178 prev_data_cache = zio_data_buf_cache[i]; 3179 kmem_cache_reap_now(zio_data_buf_cache[i]); 3180 } 3181 } 3182 kmem_cache_reap_now(buf_cache); 3183 kmem_cache_reap_now(hdr_full_cache); 3184 kmem_cache_reap_now(hdr_l2only_cache); 3185 kmem_cache_reap_now(range_seg_cache); 3186 3187#ifdef sun 3188 /* 3189 * Ask the vmem arena to reclaim unused memory from its 3190 * quantum caches. 3191 */ 3192 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 3193 vmem_qcache_reap(zio_arena); 3194#endif 3195 DTRACE_PROBE(arc__kmem_reap_end); 3196} 3197 3198static void 3199arc_reclaim_thread(void *dummy __unused) 3200{ 3201 clock_t growtime = 0; 3202 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 3203 callb_cpr_t cpr; 3204 3205 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 3206 3207 mutex_enter(&arc_reclaim_thr_lock); 3208 while (arc_thread_exit == 0) { 3209 if (arc_reclaim_needed()) { 3210 3211 if (arc_no_grow) { 3212 if (last_reclaim == ARC_RECLAIM_CONS) { 3213 DTRACE_PROBE(arc__reclaim_aggr_no_grow); 3214 last_reclaim = ARC_RECLAIM_AGGR; 3215 } else { 3216 last_reclaim = ARC_RECLAIM_CONS; 3217 } 3218 } else { 3219 arc_no_grow = TRUE; 3220 last_reclaim = ARC_RECLAIM_AGGR; 3221 DTRACE_PROBE(arc__reclaim_aggr); 3222 membar_producer(); 3223 } 3224 3225 /* reset the growth delay for every reclaim */ 3226 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 3227 3228 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 3229 /* 3230 * If needfree is TRUE our vm_lowmem hook 3231 * was called and in that case we must free some 3232 * memory, so switch to aggressive mode. 
3233 */ 3234 arc_no_grow = TRUE; 3235 last_reclaim = ARC_RECLAIM_AGGR; 3236 } 3237 arc_kmem_reap_now(last_reclaim); 3238 arc_warm = B_TRUE; 3239 3240 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 3241 arc_no_grow = FALSE; 3242 } 3243 3244 arc_adjust(); 3245 3246 if (arc_eviction_list != NULL) 3247 arc_do_user_evicts(); 3248 3249#ifdef _KERNEL 3250 if (needfree) { 3251 needfree = 0; 3252 wakeup(&needfree); 3253 } 3254#endif 3255 3256 /* 3257 * This is necessary in order for the mdb ::arc dcmd to 3258 * show up to date information. Since the ::arc command 3259 * does not call the kstat's update function, without 3260 * this call, the command may show stale stats for the 3261 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3262 * with this change, the data might be up to 1 second 3263 * out of date; but that should suffice. The arc_state_t 3264 * structures can be queried directly if more accurate 3265 * information is needed. 3266 */ 3267 if (arc_ksp != NULL) 3268 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3269 3270 /* block until needed, or one second, whichever is shorter */ 3271 CALLB_CPR_SAFE_BEGIN(&cpr); 3272 (void) cv_timedwait(&arc_reclaim_thr_cv, 3273 &arc_reclaim_thr_lock, hz); 3274 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 3275 } 3276 3277 arc_thread_exit = 0; 3278 cv_broadcast(&arc_reclaim_thr_cv); 3279 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 3280 thread_exit(); 3281} 3282 3283/* 3284 * Adapt arc info given the number of bytes we are trying to add and 3285 * the state that we are coming from. This function is only called 3286 * when we are adding new content to the cache. 3287 */ 3288static void 3289 arc_adapt(int bytes, arc_state_t *state) 3290{ 3291 int mult; 3292 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3293 3294 if (state == arc_l2c_only) 3295 return; 3296 3297 ASSERT(bytes > 0); 3298 /* 3299 * Adapt the target size of the MRU list: 3300 * - if we just hit in the MRU ghost list, then increase 3301 * the target size of the MRU list. 3302 * - if we just hit in the MFU ghost list, then increase 3303 * the target size of the MFU list by decreasing the 3304 * target size of the MRU list. 3305 */ 3306 if (state == arc_mru_ghost) { 3307 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 3308 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 3309 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3310 3311 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3312 } else if (state == arc_mfu_ghost) { 3313 uint64_t delta; 3314 3315 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3316 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 3317 mult = MIN(mult, 10); 3318 3319 delta = MIN(bytes * mult, arc_p); 3320 arc_p = MAX(arc_p_min, arc_p - delta); 3321 } 3322 ASSERT((int64_t)arc_p >= 0); 3323 3324 if (arc_reclaim_needed()) { 3325 cv_signal(&arc_reclaim_thr_cv); 3326 return; 3327 } 3328 3329 if (arc_no_grow) 3330 return; 3331 3332 if (arc_c >= arc_c_max) 3333 return; 3334 3335 /* 3336 * If we're within (2 * maxblocksize) bytes of the target 3337 * cache size, increment the target cache size 3338 */ 3339 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3340 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3341 atomic_add_64(&arc_c, (int64_t)bytes); 3342 if (arc_c > arc_c_max) 3343 arc_c = arc_c_max; 3344 else if (state == arc_anon) 3345 atomic_add_64(&arc_p, (int64_t)bytes); 3346 if (arc_p > arc_c) 3347 arc_p = arc_c; 3348 } 3349 ASSERT((int64_t)arc_p >= 0); 3350} 3351 3352/* 3353 * Check if the cache has reached its limits and eviction is required 3354 * prior to insert. 3355 */ 3356static int 3357arc_evict_needed(arc_buf_contents_t type) 3358{ 3359 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 3360 return (1); 3361 3362 if (arc_reclaim_needed()) 3363 return (1); 3364 3365 return (arc_size > arc_c); 3366} 3367 3368/* 3369 * The buffer, supplied as the first argument, needs a data block. 3370 * So, if we are at cache max, determine which cache should be victimized. 3371 * We have the following cases: 3372 * 3373 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 3374 * In this situation if we're out of space, but the resident size of the MFU is 3375 * under the limit, victimize the MFU cache to satisfy this insertion request. 3376 * 3377 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 3378 * Here, we've used up all of the available space for the MRU, so we need to 3379 * evict from our own cache instead. Evict from the set of resident MRU 3380 * entries. 3381 * 3382 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 3383 * c minus p represents the MFU space in the cache, since p is the size of the 3384 * cache that is dedicated to the MRU. In this situation there's still space on 3385 * the MFU side, so the MRU side needs to be victimized. 3386 * 3387 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 3388 * MFU's resident set is consuming more space than it has been allotted. In 3389 * this situation, we must victimize our own cache, the MFU, for this insertion. 3390 */ 3391static void 3392arc_get_data_buf(arc_buf_t *buf) 3393{ 3394 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3395 uint64_t size = buf->b_hdr->b_size; 3396 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3397 3398 arc_adapt(size, state); 3399 3400 /* 3401 * We have not yet reached cache maximum size, 3402 * just allocate a new buffer. 3403 */ 3404 if (!arc_evict_needed(type)) { 3405 if (type == ARC_BUFC_METADATA) { 3406 buf->b_data = zio_buf_alloc(size); 3407 arc_space_consume(size, ARC_SPACE_META); 3408 } else { 3409 ASSERT(type == ARC_BUFC_DATA); 3410 buf->b_data = zio_data_buf_alloc(size); 3411 arc_space_consume(size, ARC_SPACE_DATA); 3412 } 3413 goto out; 3414 } 3415 3416 /* 3417 * If we are prefetching from the mfu ghost list, this buffer 3418 * will end up on the mru list; so steal space from there. 3419 */ 3420 if (state == arc_mfu_ghost) 3421 state = HDR_PREFETCH(buf->b_hdr) ? 
arc_mru : arc_mfu; 3422 else if (state == arc_mru_ghost) 3423 state = arc_mru; 3424 3425 if (state == arc_mru || state == arc_anon) { 3426 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 3427 state = (arc_mfu->arcs_lsize[type] >= size && 3428 arc_p > mru_used) ? arc_mfu : arc_mru; 3429 } else { 3430 /* MFU cases */ 3431 uint64_t mfu_space = arc_c - arc_p; 3432 state = (arc_mru->arcs_lsize[type] >= size && 3433 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 3434 } 3435 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 3436 if (type == ARC_BUFC_METADATA) { 3437 buf->b_data = zio_buf_alloc(size); 3438 arc_space_consume(size, ARC_SPACE_META); 3439 } else { 3440 ASSERT(type == ARC_BUFC_DATA); 3441 buf->b_data = zio_data_buf_alloc(size); 3442 arc_space_consume(size, ARC_SPACE_DATA); 3443 } 3444 ARCSTAT_BUMP(arcstat_recycle_miss); 3445 } 3446 ASSERT(buf->b_data != NULL); 3447out: 3448 /* 3449 * Update the state size. Note that ghost states have a 3450 * "ghost size" and so don't need to be updated. 3451 */ 3452 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3453 arc_buf_hdr_t *hdr = buf->b_hdr; 3454 3455 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); 3456 if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { 3457 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3458 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3459 size); 3460 } 3461 /* 3462 * If we are growing the cache, and we are adding anonymous 3463 * data, and we have outgrown arc_p, update arc_p 3464 */ 3465 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3466 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 3467 arc_p = MIN(arc_c, arc_p + size); 3468 } 3469 ARCSTAT_BUMP(arcstat_allocated); 3470} 3471 3472/* 3473 * This routine is called whenever a buffer is accessed. 3474 * NOTE: the hash lock is dropped in this function. 3475 */ 3476static void 3477arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3478{ 3479 clock_t now; 3480 3481 ASSERT(MUTEX_HELD(hash_lock)); 3482 ASSERT(HDR_HAS_L1HDR(hdr)); 3483 3484 if (hdr->b_l1hdr.b_state == arc_anon) { 3485 /* 3486 * This buffer is not in the cache, and does not 3487 * appear in our "ghost" list. Add the new buffer 3488 * to the MRU state. 3489 */ 3490 3491 ASSERT0(hdr->b_l1hdr.b_arc_access); 3492 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3493 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3494 arc_change_state(arc_mru, hdr, hash_lock); 3495 3496 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3497 now = ddi_get_lbolt(); 3498 3499 /* 3500 * If this buffer is here because of a prefetch, then either: 3501 * - clear the flag if this is a "referencing" read 3502 * (any subsequent access will bump this into the MFU state). 3503 * or 3504 * - move the buffer to the head of the list if this is 3505 * another prefetch (to make it less likely to be evicted). 3506 */ 3507 if (HDR_PREFETCH(hdr)) { 3508 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3509 ASSERT(list_link_active( 3510 &hdr->b_l1hdr.b_arc_node)); 3511 } else { 3512 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3513 ARCSTAT_BUMP(arcstat_mru_hits); 3514 } 3515 hdr->b_l1hdr.b_arc_access = now; 3516 return; 3517 } 3518 3519 /* 3520 * This buffer has been "accessed" only once so far, 3521 * but it is still in the cache. Move it to the MFU 3522 * state. 3523 */ 3524 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3525 /* 3526 * More than 125ms have passed since we 3527 * instantiated this buffer. Move it to the 3528 * most frequently used state. 
3529 */ 3530 hdr->b_l1hdr.b_arc_access = now; 3531 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3532 arc_change_state(arc_mfu, hdr, hash_lock); 3533 } 3534 ARCSTAT_BUMP(arcstat_mru_hits); 3535 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3536 arc_state_t *new_state; 3537 /* 3538 * This buffer has been "accessed" recently, but 3539 * was evicted from the cache. Move it to the 3540 * MFU state. 3541 */ 3542 3543 if (HDR_PREFETCH(hdr)) { 3544 new_state = arc_mru; 3545 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3546 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3547 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3548 } else { 3549 new_state = arc_mfu; 3550 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3551 } 3552 3553 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3554 arc_change_state(new_state, hdr, hash_lock); 3555 3556 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3557 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3558 /* 3559 * This buffer has been accessed more than once and is 3560 * still in the cache. Keep it in the MFU state. 3561 * 3562 * NOTE: an add_reference() that occurred when we did 3563 * the arc_read() will have kicked this off the list. 3564 * If it was a prefetch, we will explicitly move it to 3565 * the head of the list now. 3566 */ 3567 if ((HDR_PREFETCH(hdr)) != 0) { 3568 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3569 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 3570 } 3571 ARCSTAT_BUMP(arcstat_mfu_hits); 3572 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3573 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3574 arc_state_t *new_state = arc_mfu; 3575 /* 3576 * This buffer has been accessed more than once but has 3577 * been evicted from the cache. Move it back to the 3578 * MFU state. 3579 */ 3580 3581 if (HDR_PREFETCH(hdr)) { 3582 /* 3583 * This is a prefetch access... 3584 * move this block back to the MRU state. 3585 */ 3586 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3587 new_state = arc_mru; 3588 } 3589 3590 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3591 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3592 arc_change_state(new_state, hdr, hash_lock); 3593 3594 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3595 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3596 /* 3597 * This buffer is on the 2nd Level ARC. 
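 * Only an L2-resident copy of this block survives; handle the access
 * like a ghost-list hit and move the header to the MFU state.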
3598 */ 3599 3600 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3601 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3602 arc_change_state(arc_mfu, hdr, hash_lock); 3603 } else { 3604 ASSERT(!"invalid arc state"); 3605 } 3606} 3607 3608/* a generic arc_done_func_t which you can use */ 3609/* ARGSUSED */ 3610void 3611arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3612{ 3613 if (zio == NULL || zio->io_error == 0) 3614 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3615 VERIFY(arc_buf_remove_ref(buf, arg)); 3616} 3617 3618/* a generic arc_done_func_t */ 3619void 3620arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3621{ 3622 arc_buf_t **bufp = arg; 3623 if (zio && zio->io_error) { 3624 VERIFY(arc_buf_remove_ref(buf, arg)); 3625 *bufp = NULL; 3626 } else { 3627 *bufp = buf; 3628 ASSERT(buf->b_data); 3629 } 3630} 3631 3632static void 3633arc_read_done(zio_t *zio) 3634{ 3635 arc_buf_hdr_t *hdr; 3636 arc_buf_t *buf; 3637 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3638 kmutex_t *hash_lock = NULL; 3639 arc_callback_t *callback_list, *acb; 3640 int freeable = FALSE; 3641 3642 buf = zio->io_private; 3643 hdr = buf->b_hdr; 3644 3645 /* 3646 * The hdr was inserted into hash-table and removed from lists 3647 * prior to starting I/O. We should find this header, since 3648 * it's in the hash table, and it should be legit since it's 3649 * not possible to evict it during the I/O. The only possible 3650 * reason for it not to be found is if we were freed during the 3651 * read. 3652 */ 3653 if (HDR_IN_HASH_TABLE(hdr)) { 3654 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3655 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3656 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3657 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3658 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3659 3660 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3661 &hash_lock); 3662 3663 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3664 hash_lock == NULL) || 3665 (found == hdr && 3666 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3667 (found == hdr && HDR_L2_READING(hdr))); 3668 } 3669 3670 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3671 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3672 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3673 3674 /* byteswap if necessary */ 3675 callback_list = hdr->b_l1hdr.b_acb; 3676 ASSERT(callback_list != NULL); 3677 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3678 dmu_object_byteswap_t bswap = 3679 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3680 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3681 byteswap_uint64_array : 3682 dmu_ot_byteswap[bswap].ob_func; 3683 func(buf->b_data, hdr->b_size); 3684 } 3685 3686 arc_cksum_compute(buf, B_FALSE); 3687#ifdef illumos 3688 arc_buf_watch(buf); 3689#endif /* illumos */ 3690 3691 if (hash_lock && zio->io_error == 0 && 3692 hdr->b_l1hdr.b_state == arc_anon) { 3693 /* 3694 * Only call arc_access on anonymous buffers. This is because 3695 * if we've issued an I/O for an evicted buffer, we've already 3696 * called arc_access (to prevent any simultaneous readers from 3697 * getting confused). 
3698 */ 3699 arc_access(hdr, hash_lock); 3700 } 3701 3702 /* create copies of the data buffer for the callers */ 3703 abuf = buf; 3704 for (acb = callback_list; acb; acb = acb->acb_next) { 3705 if (acb->acb_done) { 3706 if (abuf == NULL) { 3707 ARCSTAT_BUMP(arcstat_duplicate_reads); 3708 abuf = arc_buf_clone(buf); 3709 } 3710 acb->acb_buf = abuf; 3711 abuf = NULL; 3712 } 3713 } 3714 hdr->b_l1hdr.b_acb = NULL; 3715 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 3716 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3717 if (abuf == buf) { 3718 ASSERT(buf->b_efunc == NULL); 3719 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 3720 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 3721 } 3722 3723 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 3724 callback_list != NULL); 3725 3726 if (zio->io_error != 0) { 3727 hdr->b_flags |= ARC_FLAG_IO_ERROR; 3728 if (hdr->b_l1hdr.b_state != arc_anon) 3729 arc_change_state(arc_anon, hdr, hash_lock); 3730 if (HDR_IN_HASH_TABLE(hdr)) 3731 buf_hash_remove(hdr); 3732 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3733 } 3734 3735 /* 3736 * Broadcast before we drop the hash_lock to avoid the possibility 3737 * that the hdr (and hence the cv) might be freed before we get to 3738 * the cv_broadcast(). 3739 */ 3740 cv_broadcast(&hdr->b_l1hdr.b_cv); 3741 3742 if (hash_lock != NULL) { 3743 mutex_exit(hash_lock); 3744 } else { 3745 /* 3746 * This block was freed while we waited for the read to 3747 * complete. It has been removed from the hash table and 3748 * moved to the anonymous state (so that it won't show up 3749 * in the cache). 3750 */ 3751 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3752 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3753 } 3754 3755 /* execute each callback and free its structure */ 3756 while ((acb = callback_list) != NULL) { 3757 if (acb->acb_done) 3758 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3759 3760 if (acb->acb_zio_dummy != NULL) { 3761 acb->acb_zio_dummy->io_error = zio->io_error; 3762 zio_nowait(acb->acb_zio_dummy); 3763 } 3764 3765 callback_list = acb->acb_next; 3766 kmem_free(acb, sizeof (arc_callback_t)); 3767 } 3768 3769 if (freeable) 3770 arc_hdr_destroy(hdr); 3771} 3772 3773/* 3774 * "Read" the block block at the specified DVA (in bp) via the 3775 * cache. If the block is found in the cache, invoke the provided 3776 * callback immediately and return. Note that the `zio' parameter 3777 * in the callback will be NULL in this case, since no IO was 3778 * required. If the block is not in the cache pass the read request 3779 * on to the spa with a substitute callback function, so that the 3780 * requested block will be added to the cache. 3781 * 3782 * If a read request arrives for a block that has a read in-progress, 3783 * either wait for the in-progress read to complete (and return the 3784 * results); or, if this is a read with a "done" func, add a record 3785 * to the read to invoke the "done" func when the read completes, 3786 * and return; or just return. 3787 * 3788 * arc_read_done() will invoke all the requested "done" functions 3789 * for readers of this block. 
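 *
 * As an illustration only (spa, db_blkptr and zb stand for whatever
 * block pointer and bookmark the caller has, and the priority/flag
 * choices are merely examples; none of them are defined here), a
 * synchronous reader typically pairs arc_read() with the generic
 * arc_getbuf_func() callback defined above:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err;
 *
 *	err = arc_read(NULL, spa, db_blkptr, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	if (err == 0 && abuf != NULL) {
 *		... consume abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}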
3790 */ 3791int 3792arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3793 void *private, zio_priority_t priority, int zio_flags, 3794 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 3795{ 3796 arc_buf_hdr_t *hdr = NULL; 3797 arc_buf_t *buf = NULL; 3798 kmutex_t *hash_lock = NULL; 3799 zio_t *rzio; 3800 uint64_t guid = spa_load_guid(spa); 3801 3802 ASSERT(!BP_IS_EMBEDDED(bp) || 3803 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 3804 3805top: 3806 if (!BP_IS_EMBEDDED(bp)) { 3807 /* 3808 * Embedded BP's have no DVA and require no I/O to "read". 3809 * Create an anonymous arc buf to back it. 3810 */ 3811 hdr = buf_hash_find(guid, bp, &hash_lock); 3812 } 3813 3814 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 3815 3816 *arc_flags |= ARC_FLAG_CACHED; 3817 3818 if (HDR_IO_IN_PROGRESS(hdr)) { 3819 3820 if (*arc_flags & ARC_FLAG_WAIT) { 3821 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 3822 mutex_exit(hash_lock); 3823 goto top; 3824 } 3825 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 3826 3827 if (done) { 3828 arc_callback_t *acb = NULL; 3829 3830 acb = kmem_zalloc(sizeof (arc_callback_t), 3831 KM_SLEEP); 3832 acb->acb_done = done; 3833 acb->acb_private = private; 3834 if (pio != NULL) 3835 acb->acb_zio_dummy = zio_null(pio, 3836 spa, NULL, NULL, NULL, zio_flags); 3837 3838 ASSERT(acb->acb_done != NULL); 3839 acb->acb_next = hdr->b_l1hdr.b_acb; 3840 hdr->b_l1hdr.b_acb = acb; 3841 add_reference(hdr, hash_lock, private); 3842 mutex_exit(hash_lock); 3843 return (0); 3844 } 3845 mutex_exit(hash_lock); 3846 return (0); 3847 } 3848 3849 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 3850 hdr->b_l1hdr.b_state == arc_mfu); 3851 3852 if (done) { 3853 add_reference(hdr, hash_lock, private); 3854 /* 3855 * If this block is already in use, create a new 3856 * copy of the data so that we will be guaranteed 3857 * that arc_release() will always succeed. 
3858 */ 3859 buf = hdr->b_l1hdr.b_buf; 3860 ASSERT(buf); 3861 ASSERT(buf->b_data); 3862 if (HDR_BUF_AVAILABLE(hdr)) { 3863 ASSERT(buf->b_efunc == NULL); 3864 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3865 } else { 3866 buf = arc_buf_clone(buf); 3867 } 3868 3869 } else if (*arc_flags & ARC_FLAG_PREFETCH && 3870 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3871 hdr->b_flags |= ARC_FLAG_PREFETCH; 3872 } 3873 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3874 arc_access(hdr, hash_lock); 3875 if (*arc_flags & ARC_FLAG_L2CACHE) 3876 hdr->b_flags |= ARC_FLAG_L2CACHE; 3877 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3878 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3879 mutex_exit(hash_lock); 3880 ARCSTAT_BUMP(arcstat_hits); 3881 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3882 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3883 data, metadata, hits); 3884 3885 if (done) 3886 done(NULL, buf, private); 3887 } else { 3888 uint64_t size = BP_GET_LSIZE(bp); 3889 arc_callback_t *acb; 3890 vdev_t *vd = NULL; 3891 uint64_t addr = 0; 3892 boolean_t devw = B_FALSE; 3893 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3894 int32_t b_asize = 0; 3895 3896 if (hdr == NULL) { 3897 /* this block is not in the cache */ 3898 arc_buf_hdr_t *exists = NULL; 3899 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3900 buf = arc_buf_alloc(spa, size, private, type); 3901 hdr = buf->b_hdr; 3902 if (!BP_IS_EMBEDDED(bp)) { 3903 hdr->b_dva = *BP_IDENTITY(bp); 3904 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3905 exists = buf_hash_insert(hdr, &hash_lock); 3906 } 3907 if (exists != NULL) { 3908 /* somebody beat us to the hash insert */ 3909 mutex_exit(hash_lock); 3910 buf_discard_identity(hdr); 3911 (void) arc_buf_remove_ref(buf, private); 3912 goto top; /* restart the IO request */ 3913 } 3914 3915 /* if this is a prefetch, we don't have a reference */ 3916 if (*arc_flags & ARC_FLAG_PREFETCH) { 3917 (void) remove_reference(hdr, hash_lock, 3918 private); 3919 hdr->b_flags |= ARC_FLAG_PREFETCH; 3920 } 3921 if (*arc_flags & ARC_FLAG_L2CACHE) 3922 hdr->b_flags |= ARC_FLAG_L2CACHE; 3923 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3924 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3925 if (BP_GET_LEVEL(bp) > 0) 3926 hdr->b_flags |= ARC_FLAG_INDIRECT; 3927 } else { 3928 /* 3929 * This block is in the ghost cache. If it was L2-only 3930 * (and thus didn't have an L1 hdr), we realloc the 3931 * header to add an L1 hdr. 
3932 */ 3933 if (!HDR_HAS_L1HDR(hdr)) { 3934 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 3935 hdr_full_cache); 3936 } 3937 3938 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 3939 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3940 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3941 ASSERT(hdr->b_l1hdr.b_buf == NULL); 3942 3943 /* if this is a prefetch, we don't have a reference */ 3944 if (*arc_flags & ARC_FLAG_PREFETCH) 3945 hdr->b_flags |= ARC_FLAG_PREFETCH; 3946 else 3947 add_reference(hdr, hash_lock, private); 3948 if (*arc_flags & ARC_FLAG_L2CACHE) 3949 hdr->b_flags |= ARC_FLAG_L2CACHE; 3950 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3951 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3952 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3953 buf->b_hdr = hdr; 3954 buf->b_data = NULL; 3955 buf->b_efunc = NULL; 3956 buf->b_private = NULL; 3957 buf->b_next = NULL; 3958 hdr->b_l1hdr.b_buf = buf; 3959 ASSERT0(hdr->b_l1hdr.b_datacnt); 3960 hdr->b_l1hdr.b_datacnt = 1; 3961 arc_get_data_buf(buf); 3962 arc_access(hdr, hash_lock); 3963 } 3964 3965 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 3966 3967 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3968 acb->acb_done = done; 3969 acb->acb_private = private; 3970 3971 ASSERT(hdr->b_l1hdr.b_acb == NULL); 3972 hdr->b_l1hdr.b_acb = acb; 3973 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 3974 3975 if (HDR_HAS_L2HDR(hdr) && 3976 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 3977 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 3978 addr = hdr->b_l2hdr.b_daddr; 3979 b_compress = HDR_GET_COMPRESS(hdr); 3980 b_asize = hdr->b_l2hdr.b_asize; 3981 /* 3982 * Lock out device removal. 3983 */ 3984 if (vdev_is_dead(vd) || 3985 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3986 vd = NULL; 3987 } 3988 3989 if (hash_lock != NULL) 3990 mutex_exit(hash_lock); 3991 3992 /* 3993 * At this point, we have a level 1 cache miss. Try again in 3994 * L2ARC if possible. 3995 */ 3996 ASSERT3U(hdr->b_size, ==, size); 3997 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 3998 uint64_t, size, zbookmark_phys_t *, zb); 3999 ARCSTAT_BUMP(arcstat_misses); 4000 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4001 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4002 data, metadata, misses); 4003#ifdef _KERNEL 4004 curthread->td_ru.ru_inblock++; 4005#endif 4006 4007 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4008 /* 4009 * Read from the L2ARC if the following are true: 4010 * 1. The L2ARC vdev was previously cached. 4011 * 2. This buffer still has L2ARC metadata. 4012 * 3. This buffer isn't currently writing to the L2ARC. 4013 * 4. The L2ARC entry wasn't evicted, which may 4014 * also have invalidated the vdev. 4015 * 5. This isn't prefetch and l2arc_noprefetch is set. 4016 */ 4017 if (HDR_HAS_L2HDR(hdr) && 4018 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4019 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4020 l2arc_read_callback_t *cb; 4021 4022 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4023 ARCSTAT_BUMP(arcstat_l2_hits); 4024 4025 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4026 KM_SLEEP); 4027 cb->l2rcb_buf = buf; 4028 cb->l2rcb_spa = spa; 4029 cb->l2rcb_bp = *bp; 4030 cb->l2rcb_zb = *zb; 4031 cb->l2rcb_flags = zio_flags; 4032 cb->l2rcb_compress = b_compress; 4033 4034 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4035 addr + size < vd->vdev_psize - 4036 VDEV_LABEL_END_SIZE); 4037 4038 /* 4039 * l2arc read. The SCL_L2ARC lock will be 4040 * released by l2arc_read_done(). 4041 * Issue a null zio if the underlying buffer 4042 * was squashed to zero size by compression. 
4043 */ 4044 if (b_compress == ZIO_COMPRESS_EMPTY) { 4045 rzio = zio_null(pio, spa, vd, 4046 l2arc_read_done, cb, 4047 zio_flags | ZIO_FLAG_DONT_CACHE | 4048 ZIO_FLAG_CANFAIL | 4049 ZIO_FLAG_DONT_PROPAGATE | 4050 ZIO_FLAG_DONT_RETRY); 4051 } else { 4052 rzio = zio_read_phys(pio, vd, addr, 4053 b_asize, buf->b_data, 4054 ZIO_CHECKSUM_OFF, 4055 l2arc_read_done, cb, priority, 4056 zio_flags | ZIO_FLAG_DONT_CACHE | 4057 ZIO_FLAG_CANFAIL | 4058 ZIO_FLAG_DONT_PROPAGATE | 4059 ZIO_FLAG_DONT_RETRY, B_FALSE); 4060 } 4061 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4062 zio_t *, rzio); 4063 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4064 4065 if (*arc_flags & ARC_FLAG_NOWAIT) { 4066 zio_nowait(rzio); 4067 return (0); 4068 } 4069 4070 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4071 if (zio_wait(rzio) == 0) 4072 return (0); 4073 4074 /* l2arc read error; goto zio_read() */ 4075 } else { 4076 DTRACE_PROBE1(l2arc__miss, 4077 arc_buf_hdr_t *, hdr); 4078 ARCSTAT_BUMP(arcstat_l2_misses); 4079 if (HDR_L2_WRITING(hdr)) 4080 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4081 spa_config_exit(spa, SCL_L2ARC, vd); 4082 } 4083 } else { 4084 if (vd != NULL) 4085 spa_config_exit(spa, SCL_L2ARC, vd); 4086 if (l2arc_ndev != 0) { 4087 DTRACE_PROBE1(l2arc__miss, 4088 arc_buf_hdr_t *, hdr); 4089 ARCSTAT_BUMP(arcstat_l2_misses); 4090 } 4091 } 4092 4093 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4094 arc_read_done, buf, priority, zio_flags, zb); 4095 4096 if (*arc_flags & ARC_FLAG_WAIT) 4097 return (zio_wait(rzio)); 4098 4099 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4100 zio_nowait(rzio); 4101 } 4102 return (0); 4103} 4104 4105void 4106arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4107{ 4108 ASSERT(buf->b_hdr != NULL); 4109 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4110 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4111 func == NULL); 4112 ASSERT(buf->b_efunc == NULL); 4113 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4114 4115 buf->b_efunc = func; 4116 buf->b_private = private; 4117} 4118 4119/* 4120 * Notify the arc that a block was freed, and thus will never be used again. 4121 */ 4122void 4123arc_freed(spa_t *spa, const blkptr_t *bp) 4124{ 4125 arc_buf_hdr_t *hdr; 4126 kmutex_t *hash_lock; 4127 uint64_t guid = spa_load_guid(spa); 4128 4129 ASSERT(!BP_IS_EMBEDDED(bp)); 4130 4131 hdr = buf_hash_find(guid, bp, &hash_lock); 4132 if (hdr == NULL) 4133 return; 4134 if (HDR_BUF_AVAILABLE(hdr)) { 4135 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4136 add_reference(hdr, hash_lock, FTAG); 4137 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4138 mutex_exit(hash_lock); 4139 4140 arc_release(buf, FTAG); 4141 (void) arc_buf_remove_ref(buf, FTAG); 4142 } else { 4143 mutex_exit(hash_lock); 4144 } 4145 4146} 4147 4148/* 4149 * Clear the user eviction callback set by arc_set_callback(), first calling 4150 * it if it exists. Because the presence of a callback keeps an arc_buf cached 4151 * clearing the callback may result in the arc_buf being destroyed. However, 4152 * it will not result in the *last* arc_buf being destroyed, hence the data 4153 * will remain cached in the ARC. We make a copy of the arc buffer here so 4154 * that we can process the callback without holding any locks. 4155 * 4156 * It's possible that the callback is already in the process of being cleared 4157 * by another thread. In this case we can not clear the callback. 4158 * 4159 * Returns B_TRUE if the callback was successfully called and cleared. 
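 *
 * Illustration only (my_evict_cb and my_private are hypothetical
 * stand-ins for the consumer's callback and cookie): a holder that
 * earlier registered an eviction callback tears it down like this,
 * typically ignoring the return value when it does not care whether
 * the callback actually ran:
 *
 *	arc_set_callback(buf, my_evict_cb, my_private);
 *	...
 *	(void) arc_clear_callback(buf);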
4160 */ 4161boolean_t 4162arc_clear_callback(arc_buf_t *buf) 4163{ 4164 arc_buf_hdr_t *hdr; 4165 kmutex_t *hash_lock; 4166 arc_evict_func_t *efunc = buf->b_efunc; 4167 void *private = buf->b_private; 4168 list_t *list, *evicted_list; 4169 kmutex_t *lock, *evicted_lock; 4170 4171 mutex_enter(&buf->b_evict_lock); 4172 hdr = buf->b_hdr; 4173 if (hdr == NULL) { 4174 /* 4175 * We are in arc_do_user_evicts(). 4176 */ 4177 ASSERT(buf->b_data == NULL); 4178 mutex_exit(&buf->b_evict_lock); 4179 return (B_FALSE); 4180 } else if (buf->b_data == NULL) { 4181 /* 4182 * We are on the eviction list; process this buffer now 4183 * but let arc_do_user_evicts() do the reaping. 4184 */ 4185 buf->b_efunc = NULL; 4186 mutex_exit(&buf->b_evict_lock); 4187 VERIFY0(efunc(private)); 4188 return (B_TRUE); 4189 } 4190 hash_lock = HDR_LOCK(hdr); 4191 mutex_enter(hash_lock); 4192 hdr = buf->b_hdr; 4193 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4194 4195 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4196 hdr->b_l1hdr.b_datacnt); 4197 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4198 hdr->b_l1hdr.b_state == arc_mfu); 4199 4200 buf->b_efunc = NULL; 4201 buf->b_private = NULL; 4202 4203 if (hdr->b_l1hdr.b_datacnt > 1) { 4204 mutex_exit(&buf->b_evict_lock); 4205 arc_buf_destroy(buf, FALSE, TRUE); 4206 } else { 4207 ASSERT(buf == hdr->b_l1hdr.b_buf); 4208 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4209 mutex_exit(&buf->b_evict_lock); 4210 } 4211 4212 mutex_exit(hash_lock); 4213 VERIFY0(efunc(private)); 4214 return (B_TRUE); 4215} 4216 4217/* 4218 * Release this buffer from the cache, making it an anonymous buffer. This 4219 * must be done after a read and prior to modifying the buffer contents. 4220 * If the buffer has more than one reference, we must make 4221 * a new hdr for the buffer. 4222 */ 4223void 4224arc_release(arc_buf_t *buf, void *tag) 4225{ 4226 arc_buf_hdr_t *hdr = buf->b_hdr; 4227 4228 /* 4229 * It would be nice to assert that if it's DMU metadata (level > 4230 * 0 || it's the dnode file), then it must be syncing context. 4231 * But we don't know that information at this level. 4232 */ 4233 4234 mutex_enter(&buf->b_evict_lock); 4235 /* 4236 * We don't grab the hash lock prior to this check, because if 4237 * the buffer's header is in the arc_anon state, it won't be 4238 * linked into the hash table. 4239 */ 4240 if (hdr->b_l1hdr.b_state == arc_anon) { 4241 mutex_exit(&buf->b_evict_lock); 4242 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4243 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4244 ASSERT(!HDR_HAS_L2HDR(hdr)); 4245 ASSERT(BUF_EMPTY(hdr)); 4246 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4247 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4248 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4249 4250 ASSERT3P(buf->b_efunc, ==, NULL); 4251 ASSERT3P(buf->b_private, ==, NULL); 4252 4253 hdr->b_l1hdr.b_arc_access = 0; 4254 arc_buf_thaw(buf); 4255 4256 return; 4257 } 4258 4259 kmutex_t *hash_lock = HDR_LOCK(hdr); 4260 mutex_enter(hash_lock); 4261 4262 /* 4263 * This assignment is only valid as long as the hash_lock is 4264 * held, we must be careful not to reference state or the 4265 * b_state field after dropping the lock. 
4266 */ 4267 arc_state_t *state = hdr->b_l1hdr.b_state; 4268 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4269 ASSERT3P(state, !=, arc_anon); 4270 4271 /* this buffer is not on any list */ 4272 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4273 4274 if (HDR_HAS_L2HDR(hdr)) { 4275 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 4276 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 4277 4278 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4279 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4280 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 4281 list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); 4282 4283 /* 4284 * We don't want to leak the b_tmp_cdata buffer that was 4285 * allocated in l2arc_write_buffers() 4286 */ 4287 arc_buf_l2_cdata_free(hdr); 4288 4289 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4290 4291 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 4292 } 4293 4294 /* 4295 * Do we have more than one buf? 4296 */ 4297 if (hdr->b_l1hdr.b_datacnt > 1) { 4298 arc_buf_hdr_t *nhdr; 4299 arc_buf_t **bufp; 4300 uint64_t blksz = hdr->b_size; 4301 uint64_t spa = hdr->b_spa; 4302 arc_buf_contents_t type = arc_buf_type(hdr); 4303 uint32_t flags = hdr->b_flags; 4304 4305 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4306 /* 4307 * Pull the data off of this hdr and attach it to 4308 * a new anonymous hdr. 4309 */ 4310 (void) remove_reference(hdr, hash_lock, tag); 4311 bufp = &hdr->b_l1hdr.b_buf; 4312 while (*bufp != buf) 4313 bufp = &(*bufp)->b_next; 4314 *bufp = buf->b_next; 4315 buf->b_next = NULL; 4316 4317 ASSERT3P(state, !=, arc_l2c_only); 4318 ASSERT3U(state->arcs_size, >=, hdr->b_size); 4319 atomic_add_64(&state->arcs_size, -hdr->b_size); 4320 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4321 ASSERT3P(state, !=, arc_l2c_only); 4322 uint64_t *size = &state->arcs_lsize[type]; 4323 ASSERT3U(*size, >=, hdr->b_size); 4324 atomic_add_64(size, -hdr->b_size); 4325 } 4326 4327 /* 4328 * We're releasing a duplicate user data buffer, update 4329 * our statistics accordingly. 
4330 */ 4331 if (HDR_ISTYPE_DATA(hdr)) { 4332 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4333 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4334 -hdr->b_size); 4335 } 4336 hdr->b_l1hdr.b_datacnt -= 1; 4337 arc_cksum_verify(buf); 4338#ifdef illumos 4339 arc_buf_unwatch(buf); 4340#endif /* illumos */ 4341 4342 mutex_exit(hash_lock); 4343 4344 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4345 nhdr->b_size = blksz; 4346 nhdr->b_spa = spa; 4347 4348 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4349 nhdr->b_flags |= arc_bufc_to_flags(type); 4350 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4351 4352 nhdr->b_l1hdr.b_buf = buf; 4353 nhdr->b_l1hdr.b_datacnt = 1; 4354 nhdr->b_l1hdr.b_state = arc_anon; 4355 nhdr->b_l1hdr.b_arc_access = 0; 4356 nhdr->b_freeze_cksum = NULL; 4357 4358 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4359 buf->b_hdr = nhdr; 4360 mutex_exit(&buf->b_evict_lock); 4361 atomic_add_64(&arc_anon->arcs_size, blksz); 4362 } else { 4363 mutex_exit(&buf->b_evict_lock); 4364 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4365 /* protected by hash lock */ 4366 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4367 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4368 arc_change_state(arc_anon, hdr, hash_lock); 4369 hdr->b_l1hdr.b_arc_access = 0; 4370 mutex_exit(hash_lock); 4371 4372 buf_discard_identity(hdr); 4373 arc_buf_thaw(buf); 4374 } 4375 buf->b_efunc = NULL; 4376 buf->b_private = NULL; 4377} 4378 4379int 4380arc_released(arc_buf_t *buf) 4381{ 4382 int released; 4383 4384 mutex_enter(&buf->b_evict_lock); 4385 released = (buf->b_data != NULL && 4386 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4387 mutex_exit(&buf->b_evict_lock); 4388 return (released); 4389} 4390 4391#ifdef ZFS_DEBUG 4392int 4393arc_referenced(arc_buf_t *buf) 4394{ 4395 int referenced; 4396 4397 mutex_enter(&buf->b_evict_lock); 4398 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4399 mutex_exit(&buf->b_evict_lock); 4400 return (referenced); 4401} 4402#endif 4403 4404static void 4405arc_write_ready(zio_t *zio) 4406{ 4407 arc_write_callback_t *callback = zio->io_private; 4408 arc_buf_t *buf = callback->awcb_buf; 4409 arc_buf_hdr_t *hdr = buf->b_hdr; 4410 4411 ASSERT(HDR_HAS_L1HDR(hdr)); 4412 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4413 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4414 callback->awcb_ready(zio, buf, callback->awcb_private); 4415 4416 /* 4417 * If the IO is already in progress, then this is a re-write 4418 * attempt, so we need to thaw and re-compute the cksum. 4419 * It is the responsibility of the callback to handle the 4420 * accounting for any re-write attempt. 4421 */ 4422 if (HDR_IO_IN_PROGRESS(hdr)) { 4423 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4424 if (hdr->b_freeze_cksum != NULL) { 4425 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4426 hdr->b_freeze_cksum = NULL; 4427 } 4428 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4429 } 4430 arc_cksum_compute(buf, B_FALSE); 4431 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4432} 4433 4434/* 4435 * The SPA calls this callback for each physical write that happens on behalf 4436 * of a logical write. See the comment in dbuf_write_physdone() for details. 
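 *
 * For orientation only: "physdone" is the middle of the three
 * callbacks a writer passes to arc_write() below.  A caller might
 * wire them up as follows (my_ready, my_physdone, my_done, my_private
 * and the priority/flag choices are placeholders, not names used in
 * this file):
 *
 *	zio = arc_write(pio, spa, txg, bp, buf, l2arc, l2arc_compress,
 *	    &zp, my_ready, my_physdone, my_done, my_private,
 *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 *	zio_nowait(zio);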
4437 */ 4438static void 4439arc_write_physdone(zio_t *zio) 4440{ 4441 arc_write_callback_t *cb = zio->io_private; 4442 if (cb->awcb_physdone != NULL) 4443 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4444} 4445 4446static void 4447arc_write_done(zio_t *zio) 4448{ 4449 arc_write_callback_t *callback = zio->io_private; 4450 arc_buf_t *buf = callback->awcb_buf; 4451 arc_buf_hdr_t *hdr = buf->b_hdr; 4452 4453 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4454 4455 if (zio->io_error == 0) { 4456 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4457 buf_discard_identity(hdr); 4458 } else { 4459 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4460 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4461 } 4462 } else { 4463 ASSERT(BUF_EMPTY(hdr)); 4464 } 4465 4466 /* 4467 * If the block to be written was all-zero or compressed enough to be 4468 * embedded in the BP, no write was performed so there will be no 4469 * dva/birth/checksum. The buffer must therefore remain anonymous 4470 * (and uncached). 4471 */ 4472 if (!BUF_EMPTY(hdr)) { 4473 arc_buf_hdr_t *exists; 4474 kmutex_t *hash_lock; 4475 4476 ASSERT(zio->io_error == 0); 4477 4478 arc_cksum_verify(buf); 4479 4480 exists = buf_hash_insert(hdr, &hash_lock); 4481 if (exists != NULL) { 4482 /* 4483 * This can only happen if we overwrite for 4484 * sync-to-convergence, because we remove 4485 * buffers from the hash table when we arc_free(). 4486 */ 4487 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4488 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4489 panic("bad overwrite, hdr=%p exists=%p", 4490 (void *)hdr, (void *)exists); 4491 ASSERT(refcount_is_zero( 4492 &exists->b_l1hdr.b_refcnt)); 4493 arc_change_state(arc_anon, exists, hash_lock); 4494 mutex_exit(hash_lock); 4495 arc_hdr_destroy(exists); 4496 exists = buf_hash_insert(hdr, &hash_lock); 4497 ASSERT3P(exists, ==, NULL); 4498 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4499 /* nopwrite */ 4500 ASSERT(zio->io_prop.zp_nopwrite); 4501 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4502 panic("bad nopwrite, hdr=%p exists=%p", 4503 (void *)hdr, (void *)exists); 4504 } else { 4505 /* Dedup */ 4506 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4507 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4508 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4509 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4510 } 4511 } 4512 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4513 /* if it's not anon, we are doing a scrub */ 4514 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4515 arc_access(hdr, hash_lock); 4516 mutex_exit(hash_lock); 4517 } else { 4518 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4519 } 4520 4521 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4522 callback->awcb_done(zio, buf, callback->awcb_private); 4523 4524 kmem_free(callback, sizeof (arc_write_callback_t)); 4525} 4526 4527zio_t * 4528arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4529 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4530 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4531 arc_done_func_t *done, void *private, zio_priority_t priority, 4532 int zio_flags, const zbookmark_phys_t *zb) 4533{ 4534 arc_buf_hdr_t *hdr = buf->b_hdr; 4535 arc_write_callback_t *callback; 4536 zio_t *zio; 4537 4538 ASSERT(ready != NULL); 4539 ASSERT(done != NULL); 4540 ASSERT(!HDR_IO_ERROR(hdr)); 4541 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4542 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4543 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4544 if (l2arc) 4545 hdr->b_flags |= ARC_FLAG_L2CACHE; 4546 if (l2arc_compress) 4547 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 
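	/*
	 * Bundle the caller's callbacks and private data so that
	 * arc_write_ready(), arc_write_physdone() and arc_write_done()
	 * can hand them back as the corresponding zio stages complete.
	 */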
4548 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4549 callback->awcb_ready = ready; 4550 callback->awcb_physdone = physdone; 4551 callback->awcb_done = done; 4552 callback->awcb_private = private; 4553 callback->awcb_buf = buf; 4554 4555 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4556 arc_write_ready, arc_write_physdone, arc_write_done, callback, 4557 priority, zio_flags, zb); 4558 4559 return (zio); 4560} 4561 4562static int 4563arc_memory_throttle(uint64_t reserve, uint64_t txg) 4564{ 4565#ifdef _KERNEL 4566 uint64_t available_memory = ptob(freemem); 4567 static uint64_t page_load = 0; 4568 static uint64_t last_txg = 0; 4569 4570#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4571 available_memory = 4572 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4573#endif 4574 4575 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 4576 return (0); 4577 4578 if (txg > last_txg) { 4579 last_txg = txg; 4580 page_load = 0; 4581 } 4582 /* 4583 * If we are in pageout, we know that memory is already tight, 4584 * the arc is already going to be evicting, so we just want to 4585 * continue to let page writes occur as quickly as possible. 4586 */ 4587 if (curproc == pageproc) { 4588 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4589 return (SET_ERROR(ERESTART)); 4590 /* Note: reserve is inflated, so we deflate */ 4591 page_load += reserve / 8; 4592 return (0); 4593 } else if (page_load > 0 && arc_reclaim_needed()) { 4594 /* memory is low, delay before restarting */ 4595 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4596 return (SET_ERROR(EAGAIN)); 4597 } 4598 page_load = 0; 4599#endif 4600 return (0); 4601} 4602 4603static void 4604arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4605 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4606{ 4607 size->value.ui64 = state->arcs_size; 4608 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4609 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 4610} 4611 4612static int 4613arc_kstat_update(kstat_t *ksp, int rw) 4614{ 4615 arc_stats_t *as = ksp->ks_data; 4616 4617 if (rw == KSTAT_WRITE) { 4618 return (EACCES); 4619 } else { 4620 arc_kstat_update_state(arc_anon, 4621 &as->arcstat_anon_size, 4622 &as->arcstat_anon_evictable_data, 4623 &as->arcstat_anon_evictable_metadata); 4624 arc_kstat_update_state(arc_mru, 4625 &as->arcstat_mru_size, 4626 &as->arcstat_mru_evictable_data, 4627 &as->arcstat_mru_evictable_metadata); 4628 arc_kstat_update_state(arc_mru_ghost, 4629 &as->arcstat_mru_ghost_size, 4630 &as->arcstat_mru_ghost_evictable_data, 4631 &as->arcstat_mru_ghost_evictable_metadata); 4632 arc_kstat_update_state(arc_mfu, 4633 &as->arcstat_mfu_size, 4634 &as->arcstat_mfu_evictable_data, 4635 &as->arcstat_mfu_evictable_metadata); 4636 arc_kstat_update_state(arc_mfu_ghost, 4637 &as->arcstat_mfu_ghost_size, 4638 &as->arcstat_mfu_ghost_evictable_data, 4639 &as->arcstat_mfu_ghost_evictable_metadata); 4640 } 4641 4642 return (0); 4643} 4644 4645void 4646arc_tempreserve_clear(uint64_t reserve) 4647{ 4648 atomic_add_64(&arc_tempreserve, -reserve); 4649 ASSERT((int64_t)arc_tempreserve >= 0); 4650} 4651 4652int 4653arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4654{ 4655 int error; 4656 uint64_t anon_size; 4657 4658 if (reserve > arc_c/4 && !arc_no_grow) { 4659 arc_c = MIN(arc_c_max, reserve * 4); 4660 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4661 } 4662 if (reserve > arc_c) 4663 return (SET_ERROR(ENOMEM)); 4664 4665 /* 4666 * 
Don't count loaned bufs as in flight dirty data to prevent long 4667 * network delays from blocking transactions that are ready to be 4668 * assigned to a txg. 4669 */ 4670 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 4671 4672 /* 4673 * Writes will, almost always, require additional memory allocations 4674 * in order to compress/encrypt/etc the data. We therefore need to 4675 * make sure that there is sufficient available memory for this. 4676 */ 4677 error = arc_memory_throttle(reserve, txg); 4678 if (error != 0) 4679 return (error); 4680 4681 /* 4682 * Throttle writes when the amount of dirty data in the cache 4683 * gets too large. We try to keep the cache less than half full 4684 * of dirty blocks so that our sync times don't grow too large. 4685 * Note: if two requests come in concurrently, we might let them 4686 * both succeed, when one of them should fail. Not a huge deal. 4687 */ 4688 4689 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 4690 anon_size > arc_c / 4) { 4691 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 4692 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 4693 arc_tempreserve>>10, 4694 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 4695 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 4696 reserve>>10, arc_c>>10); 4697 return (SET_ERROR(ERESTART)); 4698 } 4699 atomic_add_64(&arc_tempreserve, reserve); 4700 return (0); 4701} 4702 4703static kmutex_t arc_lowmem_lock; 4704#ifdef _KERNEL 4705static eventhandler_tag arc_event_lowmem = NULL; 4706 4707static void 4708arc_lowmem(void *arg __unused, int howto __unused) 4709{ 4710 4711 /* Serialize access via arc_lowmem_lock. */ 4712 mutex_enter(&arc_lowmem_lock); 4713 mutex_enter(&arc_reclaim_thr_lock); 4714 needfree = 1; 4715 DTRACE_PROBE(arc__needfree); 4716 cv_signal(&arc_reclaim_thr_cv); 4717 4718 /* 4719 * It is unsafe to block here in arbitrary threads, because we can come 4720 * here from ARC itself and may hold ARC locks and thus risk a deadlock 4721 * with ARC reclaim thread. 4722 */ 4723 if (curproc == pageproc) { 4724 while (needfree) 4725 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 4726 } 4727 mutex_exit(&arc_reclaim_thr_lock); 4728 mutex_exit(&arc_lowmem_lock); 4729} 4730#endif 4731 4732void 4733arc_init(void) 4734{ 4735 int i, prefetch_tunable_set = 0; 4736 4737 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4738 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 4739 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 4740 4741 /* Convert seconds to clock ticks */ 4742 arc_min_prefetch_lifespan = 1 * hz; 4743 4744 /* Start out with 1/8 of all memory */ 4745 arc_c = kmem_size() / 8; 4746 4747#ifdef sun 4748#ifdef _KERNEL 4749 /* 4750 * On architectures where the physical memory can be larger 4751 * than the addressable space (intel in 32-bit mode), we may 4752 * need to limit the cache to 1/8 of VM size. 4753 */ 4754 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 4755#endif 4756#endif /* sun */ 4757 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 4758 arc_c_min = MAX(arc_c / 4, 16 << 20); 4759 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 4760 if (arc_c * 8 >= 1 << 30) 4761 arc_c_max = (arc_c * 8) - (1 << 30); 4762 else 4763 arc_c_max = arc_c_min; 4764 arc_c_max = MAX(arc_c * 5, arc_c_max); 4765 4766#ifdef _KERNEL 4767 /* 4768 * Allow the tunables to override our calculations if they are 4769 * reasonable (ie. 
over 16MB) 4770 */ 4771 if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) 4772 arc_c_max = zfs_arc_max; 4773 if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) 4774 arc_c_min = zfs_arc_min; 4775#endif 4776 4777 arc_c = arc_c_max; 4778 arc_p = (arc_c >> 1); 4779 4780 /* limit meta-data to 1/4 of the arc capacity */ 4781 arc_meta_limit = arc_c_max / 4; 4782 4783 /* Allow the tunable to override if it is reasonable */ 4784 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4785 arc_meta_limit = zfs_arc_meta_limit; 4786 4787 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4788 arc_c_min = arc_meta_limit / 2; 4789 4790 if (zfs_arc_meta_min > 0) { 4791 arc_meta_min = zfs_arc_meta_min; 4792 } else { 4793 arc_meta_min = arc_c_min / 2; 4794 } 4795 4796 if (zfs_arc_grow_retry > 0) 4797 arc_grow_retry = zfs_arc_grow_retry; 4798 4799 if (zfs_arc_shrink_shift > 0) 4800 arc_shrink_shift = zfs_arc_shrink_shift; 4801 4802 if (zfs_arc_p_min_shift > 0) 4803 arc_p_min_shift = zfs_arc_p_min_shift; 4804 4805 /* if kmem_flags are set, lets try to use less memory */ 4806 if (kmem_debugging()) 4807 arc_c = arc_c / 2; 4808 if (arc_c < arc_c_min) 4809 arc_c = arc_c_min; 4810 4811 zfs_arc_min = arc_c_min; 4812 zfs_arc_max = arc_c_max; 4813 4814 arc_anon = &ARC_anon; 4815 arc_mru = &ARC_mru; 4816 arc_mru_ghost = &ARC_mru_ghost; 4817 arc_mfu = &ARC_mfu; 4818 arc_mfu_ghost = &ARC_mfu_ghost; 4819 arc_l2c_only = &ARC_l2c_only; 4820 arc_size = 0; 4821 4822 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4823 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 4824 NULL, MUTEX_DEFAULT, NULL); 4825 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 4826 NULL, MUTEX_DEFAULT, NULL); 4827 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 4828 NULL, MUTEX_DEFAULT, NULL); 4829 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 4830 NULL, MUTEX_DEFAULT, NULL); 4831 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 4832 NULL, MUTEX_DEFAULT, NULL); 4833 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 4834 NULL, MUTEX_DEFAULT, NULL); 4835 4836 list_create(&arc_mru->arcs_lists[i], 4837 sizeof (arc_buf_hdr_t), 4838 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4839 list_create(&arc_mru_ghost->arcs_lists[i], 4840 sizeof (arc_buf_hdr_t), 4841 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4842 list_create(&arc_mfu->arcs_lists[i], 4843 sizeof (arc_buf_hdr_t), 4844 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4845 list_create(&arc_mfu_ghost->arcs_lists[i], 4846 sizeof (arc_buf_hdr_t), 4847 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4848 list_create(&arc_mfu_ghost->arcs_lists[i], 4849 sizeof (arc_buf_hdr_t), 4850 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4851 list_create(&arc_l2c_only->arcs_lists[i], 4852 sizeof (arc_buf_hdr_t), 4853 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4854 } 4855 4856 buf_init(); 4857 4858 arc_thread_exit = 0; 4859 arc_eviction_list = NULL; 4860 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4861 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4862 4863 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4864 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4865 4866 if (arc_ksp != NULL) { 4867 arc_ksp->ks_data = &arc_stats; 4868 arc_ksp->ks_update = arc_kstat_update; 4869 kstat_install(arc_ksp); 4870 } 4871 4872 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4873 TS_RUN, minclsyspri); 4874 4875#ifdef _KERNEL 4876 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 4877 EVENTHANDLER_PRI_FIRST); 4878#endif 
4879 4880 arc_dead = FALSE; 4881 arc_warm = B_FALSE; 4882 4883 /* 4884 * Calculate maximum amount of dirty data per pool. 4885 * 4886 * If it has been set by /etc/system, take that. 4887 * Otherwise, use a percentage of physical memory defined by 4888 * zfs_dirty_data_max_percent (default 10%) with a cap at 4889 * zfs_dirty_data_max_max (default 4GB). 4890 */ 4891 if (zfs_dirty_data_max == 0) { 4892 zfs_dirty_data_max = ptob(physmem) * 4893 zfs_dirty_data_max_percent / 100; 4894 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4895 zfs_dirty_data_max_max); 4896 } 4897 4898#ifdef _KERNEL 4899 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4900 prefetch_tunable_set = 1; 4901 4902#ifdef __i386__ 4903 if (prefetch_tunable_set == 0) { 4904 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4905 "-- to enable,\n"); 4906 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4907 "to /boot/loader.conf.\n"); 4908 zfs_prefetch_disable = 1; 4909 } 4910#else 4911 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4912 prefetch_tunable_set == 0) { 4913 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4914 "than 4GB of RAM is present;\n" 4915 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4916 "to /boot/loader.conf.\n"); 4917 zfs_prefetch_disable = 1; 4918 } 4919#endif 4920 /* Warn about ZFS memory and address space requirements. */ 4921 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4922 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4923 "expect unstable behavior.\n"); 4924 } 4925 if (kmem_size() < 512 * (1 << 20)) { 4926 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4927 "expect unstable behavior.\n"); 4928 printf(" Consider tuning vm.kmem_size and " 4929 "vm.kmem_size_max\n"); 4930 printf(" in /boot/loader.conf.\n"); 4931 } 4932#endif 4933} 4934 4935void 4936arc_fini(void) 4937{ 4938 int i; 4939 4940 mutex_enter(&arc_reclaim_thr_lock); 4941 arc_thread_exit = 1; 4942 cv_signal(&arc_reclaim_thr_cv); 4943 while (arc_thread_exit != 0) 4944 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4945 mutex_exit(&arc_reclaim_thr_lock); 4946 4947 arc_flush(NULL); 4948 4949 arc_dead = TRUE; 4950 4951 if (arc_ksp != NULL) { 4952 kstat_delete(arc_ksp); 4953 arc_ksp = NULL; 4954 } 4955 4956 mutex_destroy(&arc_eviction_mtx); 4957 mutex_destroy(&arc_reclaim_thr_lock); 4958 cv_destroy(&arc_reclaim_thr_cv); 4959 4960 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4961 list_destroy(&arc_mru->arcs_lists[i]); 4962 list_destroy(&arc_mru_ghost->arcs_lists[i]); 4963 list_destroy(&arc_mfu->arcs_lists[i]); 4964 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 4965 list_destroy(&arc_l2c_only->arcs_lists[i]); 4966 4967 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 4968 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 4969 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 4970 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 4971 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 4972 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 4973 } 4974 4975 buf_fini(); 4976 4977 ASSERT0(arc_loaned_bytes); 4978 4979 mutex_destroy(&arc_lowmem_lock); 4980#ifdef _KERNEL 4981 if (arc_event_lowmem != NULL) 4982 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4983#endif 4984} 4985 4986/* 4987 * Level 2 ARC 4988 * 4989 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4990 * It uses dedicated storage devices to hold cached data, which are populated 4991 * using large infrequent writes. 
The main role of this cache is to boost 4992 * the performance of random read workloads. The intended L2ARC devices 4993 * include short-stroked disks, solid state disks, and other media with 4994 * substantially faster read latency than disk. 4995 * 4996 * +-----------------------+ 4997 * | ARC | 4998 * +-----------------------+ 4999 * | ^ ^ 5000 * | | | 5001 * l2arc_feed_thread() arc_read() 5002 * | | | 5003 * | l2arc read | 5004 * V | | 5005 * +---------------+ | 5006 * | L2ARC | | 5007 * +---------------+ | 5008 * | ^ | 5009 * l2arc_write() | | 5010 * | | | 5011 * V | | 5012 * +-------+ +-------+ 5013 * | vdev | | vdev | 5014 * | cache | | cache | 5015 * +-------+ +-------+ 5016 * +=========+ .-----. 5017 * : L2ARC : |-_____-| 5018 * : devices : | Disks | 5019 * +=========+ `-_____-' 5020 * 5021 * Read requests are satisfied from the following sources, in order: 5022 * 5023 * 1) ARC 5024 * 2) vdev cache of L2ARC devices 5025 * 3) L2ARC devices 5026 * 4) vdev cache of disks 5027 * 5) disks 5028 * 5029 * Some L2ARC device types exhibit extremely slow write performance. 5030 * To accommodate for this there are some significant differences between 5031 * the L2ARC and traditional cache design: 5032 * 5033 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5034 * the ARC behave as usual, freeing buffers and placing headers on ghost 5035 * lists. The ARC does not send buffers to the L2ARC during eviction as 5036 * this would add inflated write latencies for all ARC memory pressure. 5037 * 5038 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5039 * It does this by periodically scanning buffers from the eviction-end of 5040 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5041 * not already there. It scans until a headroom of buffers is satisfied, 5042 * which itself is a buffer for ARC eviction. If a compressible buffer is 5043 * found during scanning and selected for writing to an L2ARC device, we 5044 * temporarily boost scanning headroom during the next scan cycle to make 5045 * sure we adapt to compression effects (which might significantly reduce 5046 * the data volume we write to L2ARC). The thread that does this is 5047 * l2arc_feed_thread(), illustrated below; example sizes are included to 5048 * provide a better sense of ratio than this diagram: 5049 * 5050 * head --> tail 5051 * +---------------------+----------+ 5052 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5053 * +---------------------+----------+ | o L2ARC eligible 5054 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5055 * +---------------------+----------+ | 5056 * 15.9 Gbytes ^ 32 Mbytes | 5057 * headroom | 5058 * l2arc_feed_thread() 5059 * | 5060 * l2arc write hand <--[oooo]--' 5061 * | 8 Mbyte 5062 * | write max 5063 * V 5064 * +==============================+ 5065 * L2ARC dev |####|#|###|###| |####| ... | 5066 * +==============================+ 5067 * 32 Gbytes 5068 * 5069 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5070 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5071 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5072 * safe to say that this is an uncommon case, since buffers at the end of 5073 * the ARC lists have moved there due to inactivity. 5074 * 5075 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5076 * then the L2ARC simply misses copying some buffers. 
This serves as a 5077 * pressure valve to prevent heavy read workloads from both stalling the ARC 5078 * with waits and clogging the L2ARC with writes. This also helps prevent 5079 * the potential for the L2ARC to churn if it attempts to cache content too 5080 * quickly, such as during backups of the entire pool. 5081 * 5082 * 5. After system boot and before the ARC has filled main memory, there are 5083 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5084 * lists can remain mostly static. Instead of searching from tail of these 5085 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5086 * for eligible buffers, greatly increasing its chance of finding them. 5087 * 5088 * The L2ARC device write speed is also boosted during this time so that 5089 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5090 * there are no L2ARC reads, and no fear of degrading read performance 5091 * through increased writes. 5092 * 5093 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5094 * the vdev queue can aggregate them into larger and fewer writes. Each 5095 * device is written to in a rotor fashion, sweeping writes through 5096 * available space then repeating. 5097 * 5098 * 7. The L2ARC does not store dirty content. It never needs to flush 5099 * write buffers back to disk based storage. 5100 * 5101 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5102 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5103 * 5104 * The performance of the L2ARC can be tweaked by a number of tunables, which 5105 * may be necessary for different workloads: 5106 * 5107 * l2arc_write_max max write bytes per interval 5108 * l2arc_write_boost extra write bytes during device warmup 5109 * l2arc_noprefetch skip caching prefetched buffers 5110 * l2arc_headroom number of max device writes to precache 5111 * l2arc_headroom_boost when we find compressed buffers during ARC 5112 * scanning, we multiply headroom by this 5113 * percentage factor for the next scan cycle, 5114 * since more compressed buffers are likely to 5115 * be present 5116 * l2arc_feed_secs seconds between L2ARC writing 5117 * 5118 * Tunables may be removed or added as future performance improvements are 5119 * integrated, and also may become zpool properties. 5120 * 5121 * There are three key functions that control how the L2ARC warms up: 5122 * 5123 * l2arc_write_eligible() check if a buffer is eligible to cache 5124 * l2arc_write_size() calculate how much to write 5125 * l2arc_write_interval() calculate sleep delay between writes 5126 * 5127 * These three functions determine what to write, how much, and how quickly 5128 * to send writes. 5129 */ 5130 5131static boolean_t 5132l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5133{ 5134 /* 5135 * A buffer is *not* eligible for the L2ARC if it: 5136 * 1. belongs to a different spa. 5137 * 2. is already cached on the L2ARC. 5138 * 3. has an I/O in progress (it may be an incomplete read). 5139 * 4. is flagged not eligible (zfs property). 
5140 */ 5141 if (hdr->b_spa != spa_guid) { 5142 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5143 return (B_FALSE); 5144 } 5145 if (HDR_HAS_L2HDR(hdr)) { 5146 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5147 return (B_FALSE); 5148 } 5149 if (HDR_IO_IN_PROGRESS(hdr)) { 5150 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5151 return (B_FALSE); 5152 } 5153 if (!HDR_L2CACHE(hdr)) { 5154 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5155 return (B_FALSE); 5156 } 5157 5158 return (B_TRUE); 5159} 5160 5161static uint64_t 5162l2arc_write_size(void) 5163{ 5164 uint64_t size; 5165 5166 /* 5167 * Make sure our globals have meaningful values in case the user 5168 * altered them. 5169 */ 5170 size = l2arc_write_max; 5171 if (size == 0) { 5172 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5173 "be greater than zero, resetting it to the default (%d)", 5174 L2ARC_WRITE_SIZE); 5175 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5176 } 5177 5178 if (arc_warm == B_FALSE) 5179 size += l2arc_write_boost; 5180 5181 return (size); 5182 5183} 5184 5185static clock_t 5186l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5187{ 5188 clock_t interval, next, now; 5189 5190 /* 5191 * If the ARC lists are busy, increase our write rate; if the 5192 * lists are stale, idle back. This is achieved by checking 5193 * how much we previously wrote - if it was more than half of 5194 * what we wanted, schedule the next write much sooner. 5195 */ 5196 if (l2arc_feed_again && wrote > (wanted / 2)) 5197 interval = (hz * l2arc_feed_min_ms) / 1000; 5198 else 5199 interval = hz * l2arc_feed_secs; 5200 5201 now = ddi_get_lbolt(); 5202 next = MAX(now, MIN(now + interval, began + interval)); 5203 5204 return (next); 5205} 5206 5207/* 5208 * Cycle through L2ARC devices. This is how L2ARC load balances. 5209 * If a device is returned, this also returns holding the spa config lock. 5210 */ 5211static l2arc_dev_t * 5212l2arc_dev_get_next(void) 5213{ 5214 l2arc_dev_t *first, *next = NULL; 5215 5216 /* 5217 * Lock out the removal of spas (spa_namespace_lock), then removal 5218 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5219 * both locks will be dropped and a spa config lock held instead. 5220 */ 5221 mutex_enter(&spa_namespace_lock); 5222 mutex_enter(&l2arc_dev_mtx); 5223 5224 /* if there are no vdevs, there is nothing to do */ 5225 if (l2arc_ndev == 0) 5226 goto out; 5227 5228 first = NULL; 5229 next = l2arc_dev_last; 5230 do { 5231 /* loop around the list looking for a non-faulted vdev */ 5232 if (next == NULL) { 5233 next = list_head(l2arc_dev_list); 5234 } else { 5235 next = list_next(l2arc_dev_list, next); 5236 if (next == NULL) 5237 next = list_head(l2arc_dev_list); 5238 } 5239 5240 /* if we have come back to the start, bail out */ 5241 if (first == NULL) 5242 first = next; 5243 else if (next == first) 5244 break; 5245 5246 } while (vdev_is_dead(next->l2ad_vdev)); 5247 5248 /* if we were unable to find any usable vdevs, return NULL */ 5249 if (vdev_is_dead(next->l2ad_vdev)) 5250 next = NULL; 5251 5252 l2arc_dev_last = next; 5253 5254out: 5255 mutex_exit(&l2arc_dev_mtx); 5256 5257 /* 5258 * Grab the config lock to prevent the 'next' device from being 5259 * removed while we are writing to it. 5260 */ 5261 if (next != NULL) 5262 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5263 mutex_exit(&spa_namespace_lock); 5264 5265 return (next); 5266} 5267 5268/* 5269 * Free buffers that were tagged for destruction. 
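 * These are data buffers that were released while an L2ARC write
 * referencing them was still in flight; rather than being freed
 * immediately, they were queued on the l2arc_free_on_write list and
 * are reclaimed here once it is safe to do so.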
5270 */ 5271static void 5272l2arc_do_free_on_write() 5273{ 5274 list_t *buflist; 5275 l2arc_data_free_t *df, *df_prev; 5276 5277 mutex_enter(&l2arc_free_on_write_mtx); 5278 buflist = l2arc_free_on_write; 5279 5280 for (df = list_tail(buflist); df; df = df_prev) { 5281 df_prev = list_prev(buflist, df); 5282 ASSERT(df->l2df_data != NULL); 5283 ASSERT(df->l2df_func != NULL); 5284 df->l2df_func(df->l2df_data, df->l2df_size); 5285 list_remove(buflist, df); 5286 kmem_free(df, sizeof (l2arc_data_free_t)); 5287 } 5288 5289 mutex_exit(&l2arc_free_on_write_mtx); 5290} 5291 5292/* 5293 * A write to a cache device has completed. Update all headers to allow 5294 * reads from these buffers to begin. 5295 */ 5296static void 5297l2arc_write_done(zio_t *zio) 5298{ 5299 l2arc_write_callback_t *cb; 5300 l2arc_dev_t *dev; 5301 list_t *buflist; 5302 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5303 kmutex_t *hash_lock; 5304 int64_t bytes_dropped = 0; 5305 5306 cb = zio->io_private; 5307 ASSERT(cb != NULL); 5308 dev = cb->l2wcb_dev; 5309 ASSERT(dev != NULL); 5310 head = cb->l2wcb_head; 5311 ASSERT(head != NULL); 5312 buflist = &dev->l2ad_buflist; 5313 ASSERT(buflist != NULL); 5314 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5315 l2arc_write_callback_t *, cb); 5316 5317 if (zio->io_error != 0) 5318 ARCSTAT_BUMP(arcstat_l2_writes_error); 5319 5320 mutex_enter(&dev->l2ad_mtx); 5321 5322 /* 5323 * All writes completed, or an error was hit. 5324 */ 5325 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5326 hdr_prev = list_prev(buflist, hdr); 5327 5328 hash_lock = HDR_LOCK(hdr); 5329 if (!mutex_tryenter(hash_lock)) { 5330 /* 5331 * This buffer misses out. It may be in a stage 5332 * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5333 * left set, denying reads to this buffer. 5334 */ 5335 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5336 continue; 5337 } 5338 5339 /* 5340 * It's possible that this buffer got evicted from the L1 cache 5341 * before we grabbed the vdev + hash locks, in which case 5342 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5343 * Only free the buffer if we still have an L1 hdr. 5344 */ 5345 if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5346 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5347 l2arc_release_cdata_buf(hdr); 5348 5349 if (zio->io_error != 0) { 5350 /* 5351 * Error - drop L2ARC entry. 5352 */ 5353 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5354 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5355 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5356 5357 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5358 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5359 } 5360 5361 /* 5362 * Allow ARC to begin reads to this L2ARC entry. 5363 */ 5364 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5365 5366 mutex_exit(hash_lock); 5367 } 5368 5369 atomic_inc_64(&l2arc_writes_done); 5370 list_remove(buflist, head); 5371 ASSERT(!HDR_HAS_L1HDR(head)); 5372 kmem_cache_free(hdr_l2only_cache, head); 5373 mutex_exit(&dev->l2ad_mtx); 5374 5375 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5376 5377 l2arc_do_free_on_write(); 5378 5379 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5380} 5381 5382/* 5383 * A read to a cache device completed. Validate buffer contents before 5384 * handing over to the regular ARC routines. 
5385 */ 5386static void 5387l2arc_read_done(zio_t *zio) 5388{ 5389 l2arc_read_callback_t *cb; 5390 arc_buf_hdr_t *hdr; 5391 arc_buf_t *buf; 5392 kmutex_t *hash_lock; 5393 int equal; 5394 5395 ASSERT(zio->io_vd != NULL); 5396 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 5397 5398 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 5399 5400 cb = zio->io_private; 5401 ASSERT(cb != NULL); 5402 buf = cb->l2rcb_buf; 5403 ASSERT(buf != NULL); 5404 5405 hash_lock = HDR_LOCK(buf->b_hdr); 5406 mutex_enter(hash_lock); 5407 hdr = buf->b_hdr; 5408 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5409 5410 /* 5411 * If the buffer was compressed, decompress it first. 5412 */ 5413 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 5414 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 5415 ASSERT(zio->io_data != NULL); 5416 5417 /* 5418 * Check this survived the L2ARC journey. 5419 */ 5420 equal = arc_cksum_equal(buf); 5421 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 5422 mutex_exit(hash_lock); 5423 zio->io_private = buf; 5424 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 5425 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 5426 arc_read_done(zio); 5427 } else { 5428 mutex_exit(hash_lock); 5429 /* 5430 * Buffer didn't survive caching. Increment stats and 5431 * reissue to the original storage device. 5432 */ 5433 if (zio->io_error != 0) { 5434 ARCSTAT_BUMP(arcstat_l2_io_error); 5435 } else { 5436 zio->io_error = SET_ERROR(EIO); 5437 } 5438 if (!equal) 5439 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 5440 5441 /* 5442 * If there's no waiter, issue an async i/o to the primary 5443 * storage now. If there *is* a waiter, the caller must 5444 * issue the i/o in a context where it's OK to block. 5445 */ 5446 if (zio->io_waiter == NULL) { 5447 zio_t *pio = zio_unique_parent(zio); 5448 5449 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 5450 5451 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 5452 buf->b_data, zio->io_size, arc_read_done, buf, 5453 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 5454 } 5455 } 5456 5457 kmem_free(cb, sizeof (l2arc_read_callback_t)); 5458} 5459 5460/* 5461 * This is the list priority from which the L2ARC will search for pages to 5462 * cache. This is used within loops (0..3) to cycle through lists in the 5463 * desired order. This order can have a significant effect on cache 5464 * performance. 5465 * 5466 * Currently the metadata lists are hit first, MFU then MRU, followed by 5467 * the data lists. This function returns a locked list, and also returns 5468 * the lock pointer. 
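 *
 * The caller (l2arc_write_buffers()) cycles list_num over
 * 0 .. 2 * ARC_BUFC_NUMLISTS - 1, so every metadata and data sublist of both
 * the MFU and MRU states is eventually visited in the order described above.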
5469 */ 5470static list_t * 5471l2arc_list_locked(int list_num, kmutex_t **lock) 5472{ 5473 list_t *list = NULL; 5474 int idx; 5475 5476 ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); 5477 5478 if (list_num < ARC_BUFC_NUMMETADATALISTS) { 5479 idx = list_num; 5480 list = &arc_mfu->arcs_lists[idx]; 5481 *lock = ARCS_LOCK(arc_mfu, idx); 5482 } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { 5483 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 5484 list = &arc_mru->arcs_lists[idx]; 5485 *lock = ARCS_LOCK(arc_mru, idx); 5486 } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + 5487 ARC_BUFC_NUMDATALISTS)) { 5488 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 5489 list = &arc_mfu->arcs_lists[idx]; 5490 *lock = ARCS_LOCK(arc_mfu, idx); 5491 } else { 5492 idx = list_num - ARC_BUFC_NUMLISTS; 5493 list = &arc_mru->arcs_lists[idx]; 5494 *lock = ARCS_LOCK(arc_mru, idx); 5495 } 5496 5497 ASSERT(!(MUTEX_HELD(*lock))); 5498 mutex_enter(*lock); 5499 return (list); 5500} 5501 5502/* 5503 * Evict buffers from the device write hand to the distance specified in 5504 * bytes. This distance may span populated buffers, it may span nothing. 5505 * This is clearing a region on the L2ARC device ready for writing. 5506 * If the 'all' boolean is set, every buffer is evicted. 5507 */ 5508static void 5509l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 5510{ 5511 list_t *buflist; 5512 arc_buf_hdr_t *hdr, *hdr_prev; 5513 kmutex_t *hash_lock; 5514 uint64_t taddr; 5515 int64_t bytes_evicted = 0; 5516 5517 buflist = &dev->l2ad_buflist; 5518 5519 if (!all && dev->l2ad_first) { 5520 /* 5521 * This is the first sweep through the device. There is 5522 * nothing to evict. 5523 */ 5524 return; 5525 } 5526 5527 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 5528 /* 5529 * When nearing the end of the device, evict to the end 5530 * before the device write hand jumps to the start. 5531 */ 5532 taddr = dev->l2ad_end; 5533 } else { 5534 taddr = dev->l2ad_hand + distance; 5535 } 5536 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 5537 uint64_t, taddr, boolean_t, all); 5538 5539top: 5540 mutex_enter(&dev->l2ad_mtx); 5541 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 5542 hdr_prev = list_prev(buflist, hdr); 5543 5544 hash_lock = HDR_LOCK(hdr); 5545 if (!mutex_tryenter(hash_lock)) { 5546 /* 5547 * Missed the hash lock. Retry. 5548 */ 5549 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 5550 mutex_exit(&dev->l2ad_mtx); 5551 mutex_enter(hash_lock); 5552 mutex_exit(hash_lock); 5553 goto top; 5554 } 5555 5556 if (HDR_L2_WRITE_HEAD(hdr)) { 5557 /* 5558 * We hit a write head node. Leave it for 5559 * l2arc_write_done(). 5560 */ 5561 list_remove(buflist, hdr); 5562 mutex_exit(hash_lock); 5563 continue; 5564 } 5565 5566 if (!all && HDR_HAS_L2HDR(hdr) && 5567 (hdr->b_l2hdr.b_daddr > taddr || 5568 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 5569 /* 5570 * We've evicted to the target address, 5571 * or the end of the device. 5572 */ 5573 mutex_exit(hash_lock); 5574 break; 5575 } 5576 5577 ASSERT(HDR_HAS_L2HDR(hdr)); 5578 if (!HDR_HAS_L1HDR(hdr)) { 5579 ASSERT(!HDR_L2_READING(hdr)); 5580 /* 5581 * This doesn't exist in the ARC. Destroy. 5582 * arc_hdr_destroy() will call list_remove() 5583 * and decrement arcstat_l2_size. 
5584 */ 5585 arc_change_state(arc_anon, hdr, hash_lock); 5586 arc_hdr_destroy(hdr); 5587 } else { 5588 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 5589 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 5590 /* 5591 * Invalidate issued or about to be issued 5592 * reads, since we may be about to write 5593 * over this location. 5594 */ 5595 if (HDR_L2_READING(hdr)) { 5596 ARCSTAT_BUMP(arcstat_l2_evict_reading); 5597 hdr->b_flags |= ARC_FLAG_L2_EVICTED; 5598 } 5599 5600 /* Tell ARC this no longer exists in L2ARC. */ 5601 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5602 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5603 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5604 list_remove(buflist, hdr); 5605 5606 /* This may have been leftover after a failed write. */ 5607 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5608 } 5609 mutex_exit(hash_lock); 5610 } 5611 mutex_exit(&dev->l2ad_mtx); 5612 5613 vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); 5614 dev->l2ad_evict = taddr; 5615} 5616 5617/* 5618 * Find and write ARC buffers to the L2ARC device. 5619 * 5620 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 5621 * for reading until they have completed writing. 5622 * The headroom_boost is an in-out parameter used to maintain headroom boost 5623 * state between calls to this function. 5624 * 5625 * Returns the number of bytes actually written (which may be smaller than 5626 * the delta by which the device hand has changed due to alignment). 5627 */ 5628static uint64_t 5629l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 5630 boolean_t *headroom_boost) 5631{ 5632 arc_buf_hdr_t *hdr, *hdr_prev, *head; 5633 list_t *list; 5634 uint64_t write_asize, write_sz, headroom, buf_compress_minsz; 5635 void *buf_data; 5636 kmutex_t *list_lock; 5637 boolean_t full; 5638 l2arc_write_callback_t *cb; 5639 zio_t *pio, *wzio; 5640 uint64_t guid = spa_load_guid(spa); 5641 const boolean_t do_headroom_boost = *headroom_boost; 5642 int try; 5643 5644 ASSERT(dev->l2ad_vdev != NULL); 5645 5646 /* Lower the flag now, we might want to raise it again later. */ 5647 *headroom_boost = B_FALSE; 5648 5649 pio = NULL; 5650 write_sz = write_asize = 0; 5651 full = B_FALSE; 5652 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 5653 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 5654 head->b_flags |= ARC_FLAG_HAS_L2HDR; 5655 5656 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 5657 /* 5658 * We will want to try to compress buffers that are at least 2x the 5659 * device sector size. 5660 */ 5661 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 5662 5663 /* 5664 * Copy buffers for L2ARC writing. 5665 */ 5666 mutex_enter(&dev->l2ad_mtx); 5667 for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { 5668 uint64_t passed_sz = 0; 5669 5670 list = l2arc_list_locked(try, &list_lock); 5671 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 5672 5673 /* 5674 * L2ARC fast warmup. 5675 * 5676 * Until the ARC is warm and starts to evict, read from the 5677 * head of the ARC lists rather than the tail. 
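	 *
	 * The list tails hold the buffers closest to eviction, which is what
	 * the L2ARC normally wants to stage; before the first ARC eviction
	 * there is no such pressure, so scanning from the head fills the
	 * cache device with recently used data sooner.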
5678 */ 5679 if (arc_warm == B_FALSE) 5680 hdr = list_head(list); 5681 else 5682 hdr = list_tail(list); 5683 if (hdr == NULL) 5684 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 5685 5686 headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS; 5687 if (do_headroom_boost) 5688 headroom = (headroom * l2arc_headroom_boost) / 100; 5689 5690 for (; hdr; hdr = hdr_prev) { 5691 kmutex_t *hash_lock; 5692 uint64_t buf_sz; 5693 uint64_t buf_a_sz; 5694 5695 if (arc_warm == B_FALSE) 5696 hdr_prev = list_next(list, hdr); 5697 else 5698 hdr_prev = list_prev(list, hdr); 5699 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); 5700 5701 hash_lock = HDR_LOCK(hdr); 5702 if (!mutex_tryenter(hash_lock)) { 5703 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 5704 /* 5705 * Skip this buffer rather than waiting. 5706 */ 5707 continue; 5708 } 5709 5710 passed_sz += hdr->b_size; 5711 if (passed_sz > headroom) { 5712 /* 5713 * Searched too far. 5714 */ 5715 mutex_exit(hash_lock); 5716 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 5717 break; 5718 } 5719 5720 if (!l2arc_write_eligible(guid, hdr)) { 5721 mutex_exit(hash_lock); 5722 continue; 5723 } 5724 5725 /* 5726 * Assume that the buffer is not going to be compressed 5727 * and could take more space on disk because of a larger 5728 * disk block size. 5729 */ 5730 buf_sz = hdr->b_size; 5731 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 5732 5733 if ((write_asize + buf_a_sz) > target_sz) { 5734 full = B_TRUE; 5735 mutex_exit(hash_lock); 5736 ARCSTAT_BUMP(arcstat_l2_write_full); 5737 break; 5738 } 5739 5740 if (pio == NULL) { 5741 /* 5742 * Insert a dummy header on the buflist so 5743 * l2arc_write_done() can find where the 5744 * write buffers begin without searching. 5745 */ 5746 list_insert_head(&dev->l2ad_buflist, head); 5747 5748 cb = kmem_alloc( 5749 sizeof (l2arc_write_callback_t), KM_SLEEP); 5750 cb->l2wcb_dev = dev; 5751 cb->l2wcb_head = head; 5752 pio = zio_root(spa, l2arc_write_done, cb, 5753 ZIO_FLAG_CANFAIL); 5754 ARCSTAT_BUMP(arcstat_l2_write_pios); 5755 } 5756 5757 /* 5758 * Create and add a new L2ARC header. 5759 */ 5760 hdr->b_l2hdr.b_dev = dev; 5761 hdr->b_flags |= ARC_FLAG_L2_WRITING; 5762 /* 5763 * Temporarily stash the data buffer in b_tmp_cdata. 5764 * The subsequent write step will pick it up from 5765 * there. This is because we can't access b_l1hdr.b_buf 5766 * without holding the hash_lock, which we in turn 5767 * can't access without holding the ARC list locks 5768 * (which we want to avoid during compression/writing). 5769 */ 5770 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); 5771 hdr->b_l2hdr.b_asize = hdr->b_size; 5772 hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; 5773 5774 hdr->b_flags |= ARC_FLAG_HAS_L2HDR; 5775 5776 list_insert_head(&dev->l2ad_buflist, hdr); 5777 5778 /* 5779 * Compute and store the buffer cksum before 5780 * writing. In debug builds the cksum is verified first. 5781 */ 5782 arc_cksum_verify(hdr->b_l1hdr.b_buf); 5783 arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); 5784 5785 mutex_exit(hash_lock); 5786 5787 write_sz += buf_sz; 5788 write_asize += buf_a_sz; 5789 } 5790 5791 mutex_exit(list_lock); 5792 5793 if (full == B_TRUE) 5794 break; 5795 } 5796 5797 /* No buffers selected for writing?
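	 * If so, no root zio was created; drop the device lock, free the
	 * dummy write-head header and return zero bytes written.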
*/ 5798 if (pio == NULL) { 5799 ASSERT0(write_sz); 5800 mutex_exit(&dev->l2ad_mtx); 5801 ASSERT(!HDR_HAS_L1HDR(head)); 5802 kmem_cache_free(hdr_l2only_cache, head); 5803 return (0); 5804 } 5805 5806 /* 5807 * Note that elsewhere in this file arcstat_l2_asize 5808 * and the used space on l2ad_vdev are updated using b_asize, 5809 * which is not necessarily rounded up to the device block size. 5810 * To keep accounting consistent we do the same here as well: 5811 * stats_size accumulates the sum of b_asize of the written buffers, 5812 * while write_asize accumulates the sum of b_asize rounded up 5813 * to the device block size. 5814 * The latter sum is used only to validate the correctness of the code. 5815 */ 5816 uint64_t stats_size = 0; 5817 write_asize = 0; 5818 5819 /* 5820 * Now start writing the buffers. We start at the write head 5821 * and work backwards, retracing the course of the buffer selector 5822 * loop above. 5823 */ 5824 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; 5825 hdr = list_prev(&dev->l2ad_buflist, hdr)) { 5826 uint64_t buf_sz; 5827 5828 /* 5829 * We shouldn't need to lock the buffer here, since we flagged 5830 * it as ARC_FLAG_L2_WRITING in the previous step, but we must 5831 * take care to only access its L2 cache parameters. In 5832 * particular, hdr->l1hdr.b_buf may be invalid by now due to 5833 * ARC eviction. 5834 */ 5835 hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 5836 5837 if ((HDR_L2COMPRESS(hdr)) && 5838 hdr->b_l2hdr.b_asize >= buf_compress_minsz) { 5839 if (l2arc_compress_buf(hdr)) { 5840 /* 5841 * If compression succeeded, enable headroom 5842 * boost on the next scan cycle. 5843 */ 5844 *headroom_boost = B_TRUE; 5845 } 5846 } 5847 5848 /* 5849 * Pick up the buffer data we had previously stashed away 5850 * (and now potentially also compressed). 5851 */ 5852 buf_data = hdr->b_l1hdr.b_tmp_cdata; 5853 buf_sz = hdr->b_l2hdr.b_asize; 5854 5855 /* 5856 * If the data has not been compressed, then clear b_tmp_cdata 5857 * to make sure that it points only to a temporary compression 5858 * buffer. 5859 */ 5860 if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))) 5861 hdr->b_l1hdr.b_tmp_cdata = NULL; 5862 5863 /* Compression may have squashed the buffer to zero length. */ 5864 if (buf_sz != 0) { 5865 uint64_t buf_a_sz; 5866 5867 wzio = zio_write_phys(pio, dev->l2ad_vdev, 5868 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 5869 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 5870 ZIO_FLAG_CANFAIL, B_FALSE); 5871 5872 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 5873 zio_t *, wzio); 5874 (void) zio_nowait(wzio); 5875 5876 stats_size += buf_sz; 5877 /* 5878 * Keep the clock hand suitably device-aligned. 5879 */ 5880 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 5881 write_asize += buf_a_sz; 5882 dev->l2ad_hand += buf_a_sz; 5883 } 5884 } 5885 5886 mutex_exit(&dev->l2ad_mtx); 5887 5888 ASSERT3U(write_asize, <=, target_sz); 5889 ARCSTAT_BUMP(arcstat_l2_writes_sent); 5890 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 5891 ARCSTAT_INCR(arcstat_l2_size, write_sz); 5892 ARCSTAT_INCR(arcstat_l2_asize, stats_size); 5893 vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); 5894 5895 /* 5896 * Bump device hand to the device start if it is approaching the end. 5897 * l2arc_evict() will already have evicted ahead for this case.
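	 *
	 * This wrap is what makes the cache device behave like a ring buffer:
	 * once the hand resets, l2ad_first is cleared so that later passes
	 * evict ahead of the hand before overwriting older data.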
5898 */ 5899 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5900 dev->l2ad_hand = dev->l2ad_start; 5901 dev->l2ad_evict = dev->l2ad_start; 5902 dev->l2ad_first = B_FALSE; 5903 } 5904 5905 dev->l2ad_writing = B_TRUE; 5906 (void) zio_wait(pio); 5907 dev->l2ad_writing = B_FALSE; 5908 5909 return (write_asize); 5910} 5911 5912/* 5913 * Compresses an L2ARC buffer. 5914 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its 5915 * size in l2hdr->b_asize. This routine tries to compress the data and 5916 * depending on the compression result there are three possible outcomes: 5917 * *) The buffer was incompressible. The original l2hdr contents were left 5918 * untouched and are ready for writing to an L2 device. 5919 * *) The buffer was all-zeros, so there is no need to write it to an L2 5920 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 5921 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 5922 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 5923 * data buffer which holds the compressed data to be written, and b_asize 5924 * tells us how much data there is. b_compress is set to the appropriate 5925 * compression algorithm. Once writing is done, invoke 5926 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 5927 * 5928 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 5929 * buffer was incompressible). 5930 */ 5931static boolean_t 5932l2arc_compress_buf(arc_buf_hdr_t *hdr) 5933{ 5934 void *cdata; 5935 size_t csize, len, rounded; 5936 ASSERT(HDR_HAS_L2HDR(hdr)); 5937 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 5938 5939 ASSERT(HDR_HAS_L1HDR(hdr)); 5940 ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); 5941 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 5942 5943 len = l2hdr->b_asize; 5944 cdata = zio_data_buf_alloc(len); 5945 ASSERT3P(cdata, !=, NULL); 5946 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, 5947 cdata, l2hdr->b_asize); 5948 5949 if (csize == 0) { 5950 /* zero block, indicate that there's nothing to write */ 5951 zio_data_buf_free(cdata, len); 5952 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); 5953 l2hdr->b_asize = 0; 5954 hdr->b_l1hdr.b_tmp_cdata = NULL; 5955 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 5956 return (B_TRUE); 5957 } 5958 5959 rounded = P2ROUNDUP(csize, 5960 (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); 5961 if (rounded < len) { 5962 /* 5963 * Compression succeeded, we'll keep the cdata around for 5964 * writing and release it afterwards. 5965 */ 5966 if (rounded > csize) { 5967 bzero((char *)cdata + csize, rounded - csize); 5968 csize = rounded; 5969 } 5970 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); 5971 l2hdr->b_asize = csize; 5972 hdr->b_l1hdr.b_tmp_cdata = cdata; 5973 ARCSTAT_BUMP(arcstat_l2_compress_successes); 5974 return (B_TRUE); 5975 } else { 5976 /* 5977 * Compression failed, release the compressed buffer. 5978 * l2hdr will be left unmodified. 5979 */ 5980 zio_data_buf_free(cdata, len); 5981 ARCSTAT_BUMP(arcstat_l2_compress_failures); 5982 return (B_FALSE); 5983 } 5984} 5985 5986/* 5987 * Decompresses a zio read back from an l2arc device. On success, the 5988 * underlying zio's io_data buffer is overwritten by the uncompressed 5989 * version. On decompression error (corrupt compressed stream), the 5990 * zio->io_error value is set to signal an I/O error. 
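 *
 * A buffer recorded as ZIO_COMPRESS_EMPTY was all zeros when written, so
 * nothing was actually read from the device; the ARC buffer is simply
 * zero-filled here to reconstruct its contents.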
5991 * 5992 * Please note that the compressed data stream is not checksummed, so 5993 * if the underlying device is experiencing data corruption, we may feed 5994 * corrupt data to the decompressor; the decompressor therefore needs to be 5995 * able to handle this situation (LZ4 does). 5996 */ 5997static void 5998l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 5999{ 6000 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 6001 6002 if (zio->io_error != 0) { 6003 /* 6004 * An I/O error has occurred, just restore the original I/O 6005 * size in preparation for a main pool read. 6006 */ 6007 zio->io_orig_size = zio->io_size = hdr->b_size; 6008 return; 6009 } 6010 6011 if (c == ZIO_COMPRESS_EMPTY) { 6012 /* 6013 * An empty buffer results in a null zio, which means we 6014 * need to fill its io_data after we're done restoring the 6015 * buffer's contents. 6016 */ 6017 ASSERT(hdr->b_l1hdr.b_buf != NULL); 6018 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); 6019 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; 6020 } else { 6021 ASSERT(zio->io_data != NULL); 6022 /* 6023 * We copy the compressed data from the start of the arc buffer 6024 * (the zio_read will have pulled in only what we need, the 6025 * rest is garbage which we will overwrite at decompression) 6026 * and then decompress back to the ARC data buffer. This way we 6027 * can minimize copying by simply decompressing back over the 6028 * original compressed data (rather than decompressing to an 6029 * aux buffer and then copying back the uncompressed buffer, 6030 * which is likely to be much larger). 6031 */ 6032 uint64_t csize; 6033 void *cdata; 6034 6035 csize = zio->io_size; 6036 cdata = zio_data_buf_alloc(csize); 6037 bcopy(zio->io_data, cdata, csize); 6038 if (zio_decompress_data(c, cdata, zio->io_data, csize, 6039 hdr->b_size) != 0) 6040 zio->io_error = EIO; 6041 zio_data_buf_free(cdata, csize); 6042 } 6043 6044 /* Restore the expected uncompressed IO size. */ 6045 zio->io_orig_size = zio->io_size = hdr->b_size; 6046} 6047 6048/* 6049 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 6050 * This buffer serves as a temporary holder of compressed data while 6051 * the buffer entry is being written to an l2arc device. Once that is 6052 * done, we can dispose of it. 6053 */ 6054static void 6055l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) 6056{ 6057 ASSERT(HDR_HAS_L1HDR(hdr)); 6058 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { 6059 /* 6060 * If the data was compressed, then we've allocated a 6061 * temporary buffer for it, so now we need to release it. 6062 */ 6063 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6064 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, 6065 hdr->b_size); 6066 hdr->b_l1hdr.b_tmp_cdata = NULL; 6067 } else { 6068 ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL); 6069 } 6070} 6071 6072/* 6073 * This thread feeds the L2ARC at regular intervals. This is the beating 6074 * heart of the L2ARC.
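 *
 * Each pass selects the next cache device round-robin (via
 * l2arc_dev_get_next()), skips it if the pool is read-only or the system is
 * low on memory, evicts ahead of the write hand, writes a batch of eligible
 * ARC buffers, and then sleeps until the time computed by
 * l2arc_write_interval().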
6075 */ 6076static void 6077l2arc_feed_thread(void *dummy __unused) 6078{ 6079 callb_cpr_t cpr; 6080 l2arc_dev_t *dev; 6081 spa_t *spa; 6082 uint64_t size, wrote; 6083 clock_t begin, next = ddi_get_lbolt(); 6084 boolean_t headroom_boost = B_FALSE; 6085 6086 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 6087 6088 mutex_enter(&l2arc_feed_thr_lock); 6089 6090 while (l2arc_thread_exit == 0) { 6091 CALLB_CPR_SAFE_BEGIN(&cpr); 6092 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 6093 next - ddi_get_lbolt()); 6094 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 6095 next = ddi_get_lbolt() + hz; 6096 6097 /* 6098 * Quick check for L2ARC devices. 6099 */ 6100 mutex_enter(&l2arc_dev_mtx); 6101 if (l2arc_ndev == 0) { 6102 mutex_exit(&l2arc_dev_mtx); 6103 continue; 6104 } 6105 mutex_exit(&l2arc_dev_mtx); 6106 begin = ddi_get_lbolt(); 6107 6108 /* 6109 * This selects the next l2arc device to write to, and in 6110 * doing so the next spa to feed from: dev->l2ad_spa. This 6111 * will return NULL if there are now no l2arc devices or if 6112 * they are all faulted. 6113 * 6114 * If a device is returned, its spa's config lock is also 6115 * held to prevent device removal. l2arc_dev_get_next() 6116 * will grab and release l2arc_dev_mtx. 6117 */ 6118 if ((dev = l2arc_dev_get_next()) == NULL) 6119 continue; 6120 6121 spa = dev->l2ad_spa; 6122 ASSERT(spa != NULL); 6123 6124 /* 6125 * If the pool is read-only then force the feed thread to 6126 * sleep a little longer. 6127 */ 6128 if (!spa_writeable(spa)) { 6129 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 6130 spa_config_exit(spa, SCL_L2ARC, dev); 6131 continue; 6132 } 6133 6134 /* 6135 * Avoid contributing to memory pressure. 6136 */ 6137 if (arc_reclaim_needed()) { 6138 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 6139 spa_config_exit(spa, SCL_L2ARC, dev); 6140 continue; 6141 } 6142 6143 ARCSTAT_BUMP(arcstat_l2_feeds); 6144 6145 size = l2arc_write_size(); 6146 6147 /* 6148 * Evict L2ARC buffers that will be overwritten. 6149 */ 6150 l2arc_evict(dev, size, B_FALSE); 6151 6152 /* 6153 * Write ARC buffers. 6154 */ 6155 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 6156 6157 /* 6158 * Calculate interval between writes. 6159 */ 6160 next = l2arc_write_interval(begin, size, wrote); 6161 spa_config_exit(spa, SCL_L2ARC, dev); 6162 } 6163 6164 l2arc_thread_exit = 0; 6165 cv_broadcast(&l2arc_feed_thr_cv); 6166 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 6167 thread_exit(); 6168} 6169 6170boolean_t 6171l2arc_vdev_present(vdev_t *vd) 6172{ 6173 l2arc_dev_t *dev; 6174 6175 mutex_enter(&l2arc_dev_mtx); 6176 for (dev = list_head(l2arc_dev_list); dev != NULL; 6177 dev = list_next(l2arc_dev_list, dev)) { 6178 if (dev->l2ad_vdev == vd) 6179 break; 6180 } 6181 mutex_exit(&l2arc_dev_mtx); 6182 6183 return (dev != NULL); 6184} 6185 6186/* 6187 * Add a vdev for use by the L2ARC. By this point the spa has already 6188 * validated the vdev and opened it. 6189 */ 6190void 6191l2arc_add_vdev(spa_t *spa, vdev_t *vd) 6192{ 6193 l2arc_dev_t *adddev; 6194 6195 ASSERT(!l2arc_vdev_present(vd)); 6196 6197 vdev_ashift_optimize(vd); 6198 6199 /* 6200 * Create a new l2arc device entry. 
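	 * The usable region begins past the front vdev label area
	 * (VDEV_LABEL_START_SIZE); the write hand, eviction hand and
	 * first-pass flag are initialized so the device is filled from
	 * the start.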
6201 */ 6202 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 6203 adddev->l2ad_spa = spa; 6204 adddev->l2ad_vdev = vd; 6205 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 6206 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 6207 adddev->l2ad_hand = adddev->l2ad_start; 6208 adddev->l2ad_evict = adddev->l2ad_start; 6209 adddev->l2ad_first = B_TRUE; 6210 adddev->l2ad_writing = B_FALSE; 6211 6212 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 6213 /* 6214 * This is a list of all ARC buffers that are still valid on the 6215 * device. 6216 */ 6217 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 6218 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 6219 6220 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 6221 6222 /* 6223 * Add device to global list 6224 */ 6225 mutex_enter(&l2arc_dev_mtx); 6226 list_insert_head(l2arc_dev_list, adddev); 6227 atomic_inc_64(&l2arc_ndev); 6228 mutex_exit(&l2arc_dev_mtx); 6229} 6230 6231/* 6232 * Remove a vdev from the L2ARC. 6233 */ 6234void 6235l2arc_remove_vdev(vdev_t *vd) 6236{ 6237 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 6238 6239 /* 6240 * Find the device by vdev 6241 */ 6242 mutex_enter(&l2arc_dev_mtx); 6243 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 6244 nextdev = list_next(l2arc_dev_list, dev); 6245 if (vd == dev->l2ad_vdev) { 6246 remdev = dev; 6247 break; 6248 } 6249 } 6250 ASSERT(remdev != NULL); 6251 6252 /* 6253 * Remove device from global list 6254 */ 6255 list_remove(l2arc_dev_list, remdev); 6256 l2arc_dev_last = NULL; /* may have been invalidated */ 6257 atomic_dec_64(&l2arc_ndev); 6258 mutex_exit(&l2arc_dev_mtx); 6259 6260 /* 6261 * Clear all buflists and ARC references. L2ARC device flush. 6262 */ 6263 l2arc_evict(remdev, 0, B_TRUE); 6264 list_destroy(&remdev->l2ad_buflist); 6265 mutex_destroy(&remdev->l2ad_mtx); 6266 kmem_free(remdev, sizeof (l2arc_dev_t)); 6267} 6268 6269void 6270l2arc_init(void) 6271{ 6272 l2arc_thread_exit = 0; 6273 l2arc_ndev = 0; 6274 l2arc_writes_sent = 0; 6275 l2arc_writes_done = 0; 6276 6277 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 6278 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 6279 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 6280 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 6281 6282 l2arc_dev_list = &L2ARC_dev_list; 6283 l2arc_free_on_write = &L2ARC_free_on_write; 6284 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 6285 offsetof(l2arc_dev_t, l2ad_node)); 6286 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 6287 offsetof(l2arc_data_free_t, l2df_list_node)); 6288} 6289 6290void 6291l2arc_fini(void) 6292{ 6293 /* 6294 * This is called from dmu_fini(), which is called from spa_fini(); 6295 * Because of this, we can assume that all l2arc devices have 6296 * already been removed when the pools themselves were removed. 
6297 */ 6298 6299 l2arc_do_free_on_write(); 6300 6301 mutex_destroy(&l2arc_feed_thr_lock); 6302 cv_destroy(&l2arc_feed_thr_cv); 6303 mutex_destroy(&l2arc_dev_mtx); 6304 mutex_destroy(&l2arc_free_on_write_mtx); 6305 6306 list_destroy(l2arc_dev_list); 6307 list_destroy(l2arc_free_on_write); 6308} 6309 6310void 6311l2arc_start(void) 6312{ 6313 if (!(spa_mode_global & FWRITE)) 6314 return; 6315 6316 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 6317 TS_RUN, minclsyspri); 6318} 6319 6320void 6321l2arc_stop(void) 6322{ 6323 if (!(spa_mode_global & FWRITE)) 6324 return; 6325 6326 mutex_enter(&l2arc_feed_thr_lock); 6327 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 6328 l2arc_thread_exit = 1; 6329 while (l2arc_thread_exit != 0) 6330 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 6331 mutex_exit(&l2arc_feed_thr_lock); 6332} 6333