/* arc.c, revision 286574 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory.  This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about.  Our cache is not so simple.  At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them.  Blocks are only evictable
 *    when there are no external references active.  This makes
 *    eviction far more problematic:  we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space.  In these circumstances we are unable to adjust the cache
 *    size.  To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss.  Our model has a variable sized cache.  It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 *    elements of the cache are therefore exactly the same size.  So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict.  In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes).  We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
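/*
 * Illustrative sketch (not part of the ARC code): point 3 above means that
 * satisfying a miss is not "evict one page" but "evict enough variably-sized
 * blocks to approximate the space needed".  The helper below is a simplified,
 * hypothetical rendering of that idea; block_t, evictable_blocks(),
 * next_evictable(), block_size() and evict_block() do not exist in arc.c and
 * stand in for the real list-walking and arc_evict() machinery.
 *
 *	uint64_t
 *	evict_approximately(uint64_t bytes_needed)
 *	{
 *		uint64_t freed = 0;
 *		block_t *b;
 *
 *		// Walk from the "lowest" (least valuable) end of the list,
 *		// skipping blocks that still have external references.
 *		for (b = evictable_blocks(); b != NULL && freed < bytes_needed;
 *		    b = next_evictable(b)) {
 *			freed += block_size(b);
 *			evict_block(b);
 *		}
 *		return (freed);		// may be less than bytes_needed
 *	}
 */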
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <vm/vm_pageout.h>
#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */
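/*
 * Illustrative sketch (not part of the ARC code): the locking rule described
 * in the comment above -- take a hash lock with mutex_tryenter() when an arc
 * list lock is already held -- looks roughly like the following.  The names
 * "list" and "list_lock" are placeholders for one of the per-state buffer
 * lists and its ARCS_LOCK(); the skip handling is what arcstat_mutex_miss
 * counts.
 *
 *	mutex_enter(list_lock);
 *	for (hdr = list_tail(list); hdr != NULL; hdr = list_prev(list, hdr)) {
 *		kmutex_t *hash_lock = HDR_LOCK(hdr);
 *
 *		if (!mutex_tryenter(hash_lock)) {
 *			// Somebody else holds this hash lock; skip the
 *			// buffer rather than block and risk a deadlock.
 *			continue;
 *		}
 *		// ... evict or move the buffer ...
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(list_lock);
 */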
static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/*
 * The number of iterations through arc_evict_*() before we
 * drop & reacquire the lock.
 */
int arc_evict_iterations = 100;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern int zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);

#ifdef _KERNEL
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
    "Maximum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
    "Minimum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}
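/*
 * Illustrative userland sketch (not part of this file): the knobs above are
 * exposed under the vfs.zfs.* sysctl tree, so they can be inspected (and the
 * CTLFLAG_RW ones updated) with sysctlbyname(3) like any other FreeBSD
 * sysctl.  For example:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		uint64_t arc_max;
 *		u_int free_target;
 *		size_t len;
 *
 *		len = sizeof (arc_max);
 *		if (sysctlbyname("vfs.zfs.arc_max", &arc_max, &len,
 *		    NULL, 0) == 0)
 *			printf("arc_max: %ju bytes\n", (uintmax_t)arc_max);
 *
 *		len = sizeof (free_target);
 *		if (sysctlbyname("vfs.zfs.arc_free_target", &free_target,
 *		    &len, NULL, 0) == 0)
 *			printf("arc_free_target: %u pages\n", free_target);
 *		return (0);
 *	}
 */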
/*
 * Must be declared here, before the definition of the corresponding kstat
 * macro: that macro uses the same names and would otherwise confuse the
 * compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are the
 * only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
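 *
 * As a concrete (simplified) example of the lifecycle: a block read for the
 * first time is inserted into arc_mru.  If it is accessed again while still
 * cached it moves to arc_mfu.  If it is instead evicted, its header (but not
 * its data) is kept on arc_mru_ghost; a later hit on that ghost entry is a
 * signal that the block was evicted too eagerly, and the MRU target size
 * (arc_p) is grown.  A dirty buffer starts out anonymous and only joins
 * arc_mru once the write path has assigned it a DVA.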
307 */ 308 309#define ARCS_LOCK_PAD CACHE_LINE_SIZE 310struct arcs_lock { 311 kmutex_t arcs_lock; 312#ifdef _KERNEL 313 unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; 314#endif 315}; 316 317/* 318 * must be power of two for mask use to work 319 * 320 */ 321#define ARC_BUFC_NUMDATALISTS 16 322#define ARC_BUFC_NUMMETADATALISTS 16 323#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) 324 325typedef struct arc_state { 326 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 327 uint64_t arcs_size; /* total amount of data in this state */ 328 list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ 329 struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); 330} arc_state_t; 331 332#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) 333 334/* The 6 states: */ 335static arc_state_t ARC_anon; 336static arc_state_t ARC_mru; 337static arc_state_t ARC_mru_ghost; 338static arc_state_t ARC_mfu; 339static arc_state_t ARC_mfu_ghost; 340static arc_state_t ARC_l2c_only; 341 342typedef struct arc_stats { 343 kstat_named_t arcstat_hits; 344 kstat_named_t arcstat_misses; 345 kstat_named_t arcstat_demand_data_hits; 346 kstat_named_t arcstat_demand_data_misses; 347 kstat_named_t arcstat_demand_metadata_hits; 348 kstat_named_t arcstat_demand_metadata_misses; 349 kstat_named_t arcstat_prefetch_data_hits; 350 kstat_named_t arcstat_prefetch_data_misses; 351 kstat_named_t arcstat_prefetch_metadata_hits; 352 kstat_named_t arcstat_prefetch_metadata_misses; 353 kstat_named_t arcstat_mru_hits; 354 kstat_named_t arcstat_mru_ghost_hits; 355 kstat_named_t arcstat_mfu_hits; 356 kstat_named_t arcstat_mfu_ghost_hits; 357 kstat_named_t arcstat_allocated; 358 kstat_named_t arcstat_deleted; 359 kstat_named_t arcstat_stolen; 360 kstat_named_t arcstat_recycle_miss; 361 /* 362 * Number of buffers that could not be evicted because the hash lock 363 * was held by another thread. The lock may not necessarily be held 364 * by something using the same buffer, since hash locks are shared 365 * by multiple buffers. 366 */ 367 kstat_named_t arcstat_mutex_miss; 368 /* 369 * Number of buffers skipped because they have I/O in progress, are 370 * indrect prefetch buffers that have not lived long enough, or are 371 * not from the spa we're trying to evict from. 372 */ 373 kstat_named_t arcstat_evict_skip; 374 kstat_named_t arcstat_evict_l2_cached; 375 kstat_named_t arcstat_evict_l2_eligible; 376 kstat_named_t arcstat_evict_l2_ineligible; 377 kstat_named_t arcstat_hash_elements; 378 kstat_named_t arcstat_hash_elements_max; 379 kstat_named_t arcstat_hash_collisions; 380 kstat_named_t arcstat_hash_chains; 381 kstat_named_t arcstat_hash_chain_max; 382 kstat_named_t arcstat_p; 383 kstat_named_t arcstat_c; 384 kstat_named_t arcstat_c_min; 385 kstat_named_t arcstat_c_max; 386 kstat_named_t arcstat_size; 387 /* 388 * Number of bytes consumed by internal ARC structures necessary 389 * for tracking purposes; these structures are not actually 390 * backed by ARC buffers. This includes arc_buf_hdr_t structures 391 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 392 * caches), and arc_buf_t structures (allocated via arc_buf_t 393 * cache). 394 */ 395 kstat_named_t arcstat_hdr_size; 396 /* 397 * Number of bytes consumed by ARC buffers of type equal to 398 * ARC_BUFC_DATA. This is generally consumed by buffers backing 399 * on disk user data (e.g. plain file contents). 
400 */ 401 kstat_named_t arcstat_data_size; 402 /* 403 * Number of bytes consumed by ARC buffers of type equal to 404 * ARC_BUFC_METADATA. This is generally consumed by buffers 405 * backing on disk data that is used for internal ZFS 406 * structures (e.g. ZAP, dnode, indirect blocks, etc). 407 */ 408 kstat_named_t arcstat_metadata_size; 409 /* 410 * Number of bytes consumed by various buffers and structures 411 * not actually backed with ARC buffers. This includes bonus 412 * buffers (allocated directly via zio_buf_* functions), 413 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 414 * cache), and dnode_t structures (allocated via dnode_t cache). 415 */ 416 kstat_named_t arcstat_other_size; 417 /* 418 * Total number of bytes consumed by ARC buffers residing in the 419 * arc_anon state. This includes *all* buffers in the arc_anon 420 * state; e.g. data, metadata, evictable, and unevictable buffers 421 * are all included in this value. 422 */ 423 kstat_named_t arcstat_anon_size; 424 /* 425 * Number of bytes consumed by ARC buffers that meet the 426 * following criteria: backing buffers of type ARC_BUFC_DATA, 427 * residing in the arc_anon state, and are eligible for eviction 428 * (e.g. have no outstanding holds on the buffer). 429 */ 430 kstat_named_t arcstat_anon_evictable_data; 431 /* 432 * Number of bytes consumed by ARC buffers that meet the 433 * following criteria: backing buffers of type ARC_BUFC_METADATA, 434 * residing in the arc_anon state, and are eligible for eviction 435 * (e.g. have no outstanding holds on the buffer). 436 */ 437 kstat_named_t arcstat_anon_evictable_metadata; 438 /* 439 * Total number of bytes consumed by ARC buffers residing in the 440 * arc_mru state. This includes *all* buffers in the arc_mru 441 * state; e.g. data, metadata, evictable, and unevictable buffers 442 * are all included in this value. 443 */ 444 kstat_named_t arcstat_mru_size; 445 /* 446 * Number of bytes consumed by ARC buffers that meet the 447 * following criteria: backing buffers of type ARC_BUFC_DATA, 448 * residing in the arc_mru state, and are eligible for eviction 449 * (e.g. have no outstanding holds on the buffer). 450 */ 451 kstat_named_t arcstat_mru_evictable_data; 452 /* 453 * Number of bytes consumed by ARC buffers that meet the 454 * following criteria: backing buffers of type ARC_BUFC_METADATA, 455 * residing in the arc_mru state, and are eligible for eviction 456 * (e.g. have no outstanding holds on the buffer). 457 */ 458 kstat_named_t arcstat_mru_evictable_metadata; 459 /* 460 * Total number of bytes that *would have been* consumed by ARC 461 * buffers in the arc_mru_ghost state. The key thing to note 462 * here, is the fact that this size doesn't actually indicate 463 * RAM consumption. The ghost lists only consist of headers and 464 * don't actually have ARC buffers linked off of these headers. 465 * Thus, *if* the headers had associated ARC buffers, these 466 * buffers *would have* consumed this number of bytes. 467 */ 468 kstat_named_t arcstat_mru_ghost_size; 469 /* 470 * Number of bytes that *would have been* consumed by ARC 471 * buffers that are eligible for eviction, of type 472 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 473 */ 474 kstat_named_t arcstat_mru_ghost_evictable_data; 475 /* 476 * Number of bytes that *would have been* consumed by ARC 477 * buffers that are eligible for eviction, of type 478 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 
479 */ 480 kstat_named_t arcstat_mru_ghost_evictable_metadata; 481 /* 482 * Total number of bytes consumed by ARC buffers residing in the 483 * arc_mfu state. This includes *all* buffers in the arc_mfu 484 * state; e.g. data, metadata, evictable, and unevictable buffers 485 * are all included in this value. 486 */ 487 kstat_named_t arcstat_mfu_size; 488 /* 489 * Number of bytes consumed by ARC buffers that are eligible for 490 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 491 * state. 492 */ 493 kstat_named_t arcstat_mfu_evictable_data; 494 /* 495 * Number of bytes consumed by ARC buffers that are eligible for 496 * eviction, of type ARC_BUFC_METADATA, and reside in the 497 * arc_mfu state. 498 */ 499 kstat_named_t arcstat_mfu_evictable_metadata; 500 /* 501 * Total number of bytes that *would have been* consumed by ARC 502 * buffers in the arc_mfu_ghost state. See the comment above 503 * arcstat_mru_ghost_size for more details. 504 */ 505 kstat_named_t arcstat_mfu_ghost_size; 506 /* 507 * Number of bytes that *would have been* consumed by ARC 508 * buffers that are eligible for eviction, of type 509 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 510 */ 511 kstat_named_t arcstat_mfu_ghost_evictable_data; 512 /* 513 * Number of bytes that *would have been* consumed by ARC 514 * buffers that are eligible for eviction, of type 515 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 516 */ 517 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 518 kstat_named_t arcstat_l2_hits; 519 kstat_named_t arcstat_l2_misses; 520 kstat_named_t arcstat_l2_feeds; 521 kstat_named_t arcstat_l2_rw_clash; 522 kstat_named_t arcstat_l2_read_bytes; 523 kstat_named_t arcstat_l2_write_bytes; 524 kstat_named_t arcstat_l2_writes_sent; 525 kstat_named_t arcstat_l2_writes_done; 526 kstat_named_t arcstat_l2_writes_error; 527 kstat_named_t arcstat_l2_writes_hdr_miss; 528 kstat_named_t arcstat_l2_evict_lock_retry; 529 kstat_named_t arcstat_l2_evict_reading; 530 kstat_named_t arcstat_l2_evict_l1cached; 531 kstat_named_t arcstat_l2_free_on_write; 532 kstat_named_t arcstat_l2_cdata_free_on_write; 533 kstat_named_t arcstat_l2_abort_lowmem; 534 kstat_named_t arcstat_l2_cksum_bad; 535 kstat_named_t arcstat_l2_io_error; 536 kstat_named_t arcstat_l2_size; 537 kstat_named_t arcstat_l2_asize; 538 kstat_named_t arcstat_l2_hdr_size; 539 kstat_named_t arcstat_l2_compress_successes; 540 kstat_named_t arcstat_l2_compress_zeros; 541 kstat_named_t arcstat_l2_compress_failures; 542 kstat_named_t arcstat_l2_write_trylock_fail; 543 kstat_named_t arcstat_l2_write_passed_headroom; 544 kstat_named_t arcstat_l2_write_spa_mismatch; 545 kstat_named_t arcstat_l2_write_in_l2; 546 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 547 kstat_named_t arcstat_l2_write_not_cacheable; 548 kstat_named_t arcstat_l2_write_full; 549 kstat_named_t arcstat_l2_write_buffer_iter; 550 kstat_named_t arcstat_l2_write_pios; 551 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 552 kstat_named_t arcstat_l2_write_buffer_list_iter; 553 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 554 kstat_named_t arcstat_memory_throttle_count; 555 kstat_named_t arcstat_duplicate_buffers; 556 kstat_named_t arcstat_duplicate_buffers_size; 557 kstat_named_t arcstat_duplicate_reads; 558 kstat_named_t arcstat_meta_used; 559 kstat_named_t arcstat_meta_limit; 560 kstat_named_t arcstat_meta_max; 561 kstat_named_t arcstat_meta_min; 562} arc_stats_t; 563 564static arc_stats_t arc_stats = { 565 { "hits", KSTAT_DATA_UINT64 }, 566 { "misses", 
KSTAT_DATA_UINT64 }, 567 { "demand_data_hits", KSTAT_DATA_UINT64 }, 568 { "demand_data_misses", KSTAT_DATA_UINT64 }, 569 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 570 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 571 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 572 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 573 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 574 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 575 { "mru_hits", KSTAT_DATA_UINT64 }, 576 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 577 { "mfu_hits", KSTAT_DATA_UINT64 }, 578 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 579 { "allocated", KSTAT_DATA_UINT64 }, 580 { "deleted", KSTAT_DATA_UINT64 }, 581 { "stolen", KSTAT_DATA_UINT64 }, 582 { "recycle_miss", KSTAT_DATA_UINT64 }, 583 { "mutex_miss", KSTAT_DATA_UINT64 }, 584 { "evict_skip", KSTAT_DATA_UINT64 }, 585 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 586 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 587 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 588 { "hash_elements", KSTAT_DATA_UINT64 }, 589 { "hash_elements_max", KSTAT_DATA_UINT64 }, 590 { "hash_collisions", KSTAT_DATA_UINT64 }, 591 { "hash_chains", KSTAT_DATA_UINT64 }, 592 { "hash_chain_max", KSTAT_DATA_UINT64 }, 593 { "p", KSTAT_DATA_UINT64 }, 594 { "c", KSTAT_DATA_UINT64 }, 595 { "c_min", KSTAT_DATA_UINT64 }, 596 { "c_max", KSTAT_DATA_UINT64 }, 597 { "size", KSTAT_DATA_UINT64 }, 598 { "hdr_size", KSTAT_DATA_UINT64 }, 599 { "data_size", KSTAT_DATA_UINT64 }, 600 { "metadata_size", KSTAT_DATA_UINT64 }, 601 { "other_size", KSTAT_DATA_UINT64 }, 602 { "anon_size", KSTAT_DATA_UINT64 }, 603 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 604 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 605 { "mru_size", KSTAT_DATA_UINT64 }, 606 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 607 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 608 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 609 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 610 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 611 { "mfu_size", KSTAT_DATA_UINT64 }, 612 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 613 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 614 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 615 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 616 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 617 { "l2_hits", KSTAT_DATA_UINT64 }, 618 { "l2_misses", KSTAT_DATA_UINT64 }, 619 { "l2_feeds", KSTAT_DATA_UINT64 }, 620 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 621 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 622 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 623 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 624 { "l2_writes_done", KSTAT_DATA_UINT64 }, 625 { "l2_writes_error", KSTAT_DATA_UINT64 }, 626 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 627 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 628 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 629 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 630 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 631 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 632 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 633 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 634 { "l2_io_error", KSTAT_DATA_UINT64 }, 635 { "l2_size", KSTAT_DATA_UINT64 }, 636 { "l2_asize", KSTAT_DATA_UINT64 }, 637 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 638 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 639 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 640 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 641 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 642 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 643 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 644 { "l2_write_in_l2", 
KSTAT_DATA_UINT64 }, 645 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 646 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 647 { "l2_write_full", KSTAT_DATA_UINT64 }, 648 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 649 { "l2_write_pios", KSTAT_DATA_UINT64 }, 650 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 651 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 652 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 653 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 654 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 655 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 656 { "duplicate_reads", KSTAT_DATA_UINT64 }, 657 { "arc_meta_used", KSTAT_DATA_UINT64 }, 658 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 659 { "arc_meta_max", KSTAT_DATA_UINT64 }, 660 { "arc_meta_min", KSTAT_DATA_UINT64 } 661}; 662 663#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 664 665#define ARCSTAT_INCR(stat, val) \ 666 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 667 668#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 669#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 670 671#define ARCSTAT_MAX(stat, val) { \ 672 uint64_t m; \ 673 while ((val) > (m = arc_stats.stat.value.ui64) && \ 674 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 675 continue; \ 676} 677 678#define ARCSTAT_MAXSTAT(stat) \ 679 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 680 681/* 682 * We define a macro to allow ARC hits/misses to be easily broken down by 683 * two separate conditions, giving a total of four different subtypes for 684 * each of hits and misses (so eight statistics total). 685 */ 686#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 687 if (cond1) { \ 688 if (cond2) { \ 689 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 690 } else { \ 691 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 692 } \ 693 } else { \ 694 if (cond2) { \ 695 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 696 } else { \ 697 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 698 } \ 699 } 700 701kstat_t *arc_ksp; 702static arc_state_t *arc_anon; 703static arc_state_t *arc_mru; 704static arc_state_t *arc_mru_ghost; 705static arc_state_t *arc_mfu; 706static arc_state_t *arc_mfu_ghost; 707static arc_state_t *arc_l2c_only; 708 709/* 710 * There are several ARC variables that are critical to export as kstats -- 711 * but we don't want to have to grovel around in the kstat whenever we wish to 712 * manipulate them. For these variables, we therefore define them to be in 713 * terms of the statistic variable. This assures that we are not introducing 714 * the possibility of inconsistency by having shadow copies of the variables, 715 * while still allowing the code to be readable. 
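 *
 * For example, arc_size below is not a separate variable: it expands to
 * ARCSTAT(arcstat_size), i.e. arc_stats.arcstat_size.value.ui64, so any
 * update to "the variable" is immediately visible through the kstat (and
 * vice versa) without an explicit copy step.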
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define	L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_physdone;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache.  The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately to save a
 * couple words in pointers.  arc_hdr_realloc() is used to switch a header
 * between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
792 */ 793 void *b_thawed; 794#endif 795 796 arc_buf_t *b_buf; 797 uint32_t b_datacnt; 798 /* for waiting on writes to complete */ 799 kcondvar_t b_cv; 800 801 /* protected by arc state mutex */ 802 arc_state_t *b_state; 803 list_node_t b_arc_node; 804 805 /* updated atomically */ 806 clock_t b_arc_access; 807 808 /* self protecting */ 809 refcount_t b_refcnt; 810 811 arc_callback_t *b_acb; 812 /* temporary buffer holder for in-flight compressed data */ 813 void *b_tmp_cdata; 814} l1arc_buf_hdr_t; 815 816typedef struct l2arc_dev l2arc_dev_t; 817 818typedef struct l2arc_buf_hdr { 819 /* protected by arc_buf_hdr mutex */ 820 l2arc_dev_t *b_dev; /* L2ARC device */ 821 uint64_t b_daddr; /* disk address, offset byte */ 822 /* real alloc'd buffer size depending on b_compress applied */ 823 int32_t b_asize; 824 825 list_node_t b_l2node; 826} l2arc_buf_hdr_t; 827 828struct arc_buf_hdr { 829 /* protected by hash lock */ 830 dva_t b_dva; 831 uint64_t b_birth; 832 /* 833 * Even though this checksum is only set/verified when a buffer is in 834 * the L1 cache, it needs to be in the set of common fields because it 835 * must be preserved from the time before a buffer is written out to 836 * L2ARC until after it is read back in. 837 */ 838 zio_cksum_t *b_freeze_cksum; 839 840 arc_buf_hdr_t *b_hash_next; 841 arc_flags_t b_flags; 842 843 /* immutable */ 844 int32_t b_size; 845 uint64_t b_spa; 846 847 /* L2ARC fields. Undefined when not in L2ARC. */ 848 l2arc_buf_hdr_t b_l2hdr; 849 /* L1ARC fields. Undefined when in l2arc_only state */ 850 l1arc_buf_hdr_t b_l1hdr; 851}; 852 853#ifdef _KERNEL 854static int 855sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 856{ 857 uint64_t val; 858 int err; 859 860 val = arc_meta_limit; 861 err = sysctl_handle_64(oidp, &val, 0, req); 862 if (err != 0 || req->newptr == NULL) 863 return (err); 864 865 if (val <= 0 || val > arc_c_max) 866 return (EINVAL); 867 868 arc_meta_limit = val; 869 return (0); 870} 871#endif 872 873static arc_buf_t *arc_eviction_list; 874static kmutex_t arc_eviction_mtx; 875static arc_buf_hdr_t arc_eviction_hdr; 876 877#define GHOST_STATE(state) \ 878 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 879 (state) == arc_l2c_only) 880 881#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 882#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 883#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 884#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 885#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 886#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 887 888#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 889#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 890#define HDR_L2_READING(hdr) \ 891 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 892 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 893#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 894#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 895#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 896 897#define HDR_ISTYPE_METADATA(hdr) \ 898 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 899#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 900 901#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 902#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 903 904/* For storing compression mode in b_flags */ 905#define HDR_COMPRESS_OFFSET 24 906#define HDR_COMPRESS_NBITS 7 907 
908#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ 909 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) 910#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ 911 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) 912 913/* 914 * Other sizes 915 */ 916 917#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 918#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 919 920/* 921 * Hash table routines 922 */ 923 924#define HT_LOCK_PAD CACHE_LINE_SIZE 925 926struct ht_lock { 927 kmutex_t ht_lock; 928#ifdef _KERNEL 929 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 930#endif 931}; 932 933#define BUF_LOCKS 256 934typedef struct buf_hash_table { 935 uint64_t ht_mask; 936 arc_buf_hdr_t **ht_table; 937 struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 938} buf_hash_table_t; 939 940static buf_hash_table_t buf_hash_table; 941 942#define BUF_HASH_INDEX(spa, dva, birth) \ 943 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 944#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 945#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 946#define HDR_LOCK(hdr) \ 947 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 948 949uint64_t zfs_crc64_table[256]; 950 951/* 952 * Level 2 ARC 953 */ 954 955#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 956#define L2ARC_HEADROOM 2 /* num of writes */ 957/* 958 * If we discover during ARC scan any buffers to be compressed, we boost 959 * our headroom for the next scanning cycle by this percentage multiple. 960 */ 961#define L2ARC_HEADROOM_BOOST 200 962#define L2ARC_FEED_SECS 1 /* caching interval secs */ 963#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 964 965#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 966#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 967 968/* L2ARC Performance Tunables */ 969uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 970uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 971uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 972uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 973uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 974uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 975boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 976boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 977boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 978 979SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 980 &l2arc_write_max, 0, "max write size"); 981SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 982 &l2arc_write_boost, 0, "extra write during warmup"); 983SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 984 &l2arc_headroom, 0, "number of dev writes"); 985SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 986 &l2arc_feed_secs, 0, "interval seconds"); 987SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 988 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 989 990SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 991 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 992SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 993 &l2arc_feed_again, 0, "turbo warmup"); 994SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 995 &l2arc_norw, 0, "no reads during writes"); 996 997SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 998 
&ARC_anon.arcs_size, 0, "size of anonymous state"); 999SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 1000 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 1001SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 1002 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 1003 1004SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1005 &ARC_mru.arcs_size, 0, "size of mru state"); 1006SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 1007 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 1008SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 1009 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 1010 1011SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1012 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 1013SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 1014 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1015 "size of metadata in mru ghost state"); 1016SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 1017 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1018 "size of data in mru ghost state"); 1019 1020SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1021 &ARC_mfu.arcs_size, 0, "size of mfu state"); 1022SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 1023 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 1024SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 1025 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 1026 1027SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1028 &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 1029SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 1030 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1031 "size of metadata in mfu ghost state"); 1032SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 1033 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1034 "size of data in mfu ghost state"); 1035 1036SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1037 &ARC_l2c_only.arcs_size, 0, "size of mru state"); 1038 1039/* 1040 * L2ARC Internals 1041 */ 1042struct l2arc_dev { 1043 vdev_t *l2ad_vdev; /* vdev */ 1044 spa_t *l2ad_spa; /* spa */ 1045 uint64_t l2ad_hand; /* next write location */ 1046 uint64_t l2ad_start; /* first addr on device */ 1047 uint64_t l2ad_end; /* last addr on device */ 1048 uint64_t l2ad_evict; /* last addr eviction reached */ 1049 boolean_t l2ad_first; /* first sweep through */ 1050 boolean_t l2ad_writing; /* currently writing */ 1051 kmutex_t l2ad_mtx; /* lock for buffer list */ 1052 list_t l2ad_buflist; /* buffer list */ 1053 list_node_t l2ad_node; /* device list node */ 1054}; 1055 1056static list_t L2ARC_dev_list; /* device list */ 1057static list_t *l2arc_dev_list; /* device list pointer */ 1058static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1059static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1060static list_t L2ARC_free_on_write; /* free after write buf list */ 1061static list_t *l2arc_free_on_write; /* free after write list ptr */ 1062static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1063static uint64_t l2arc_ndev; /* number of devices */ 1064 1065typedef struct l2arc_read_callback { 1066 arc_buf_t *l2rcb_buf; /* read buffer */ 1067 spa_t *l2rcb_spa; /* spa */ 1068 blkptr_t l2rcb_bp; /* original blkptr */ 1069 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1070 int 
l2rcb_flags; /* original flags */ 1071 enum zio_compress l2rcb_compress; /* applied compress */ 1072} l2arc_read_callback_t; 1073 1074typedef struct l2arc_write_callback { 1075 l2arc_dev_t *l2wcb_dev; /* device info */ 1076 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1077} l2arc_write_callback_t; 1078 1079typedef struct l2arc_data_free { 1080 /* protected by l2arc_free_on_write_mtx */ 1081 void *l2df_data; 1082 size_t l2df_size; 1083 void (*l2df_func)(void *, size_t); 1084 list_node_t l2df_list_node; 1085} l2arc_data_free_t; 1086 1087static kmutex_t l2arc_feed_thr_lock; 1088static kcondvar_t l2arc_feed_thr_cv; 1089static uint8_t l2arc_thread_exit; 1090 1091static void arc_get_data_buf(arc_buf_t *); 1092static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1093static int arc_evict_needed(arc_buf_contents_t); 1094static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); 1095static void arc_buf_watch(arc_buf_t *); 1096 1097static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1098static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1099 1100static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1101static void l2arc_read_done(zio_t *); 1102 1103static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 1104static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1105static void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1106 1107static uint64_t 1108buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1109{ 1110 uint8_t *vdva = (uint8_t *)dva; 1111 uint64_t crc = -1ULL; 1112 int i; 1113 1114 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1115 1116 for (i = 0; i < sizeof (dva_t); i++) 1117 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1118 1119 crc ^= (spa>>8) ^ birth; 1120 1121 return (crc); 1122} 1123 1124#define BUF_EMPTY(buf) \ 1125 ((buf)->b_dva.dva_word[0] == 0 && \ 1126 (buf)->b_dva.dva_word[1] == 0) 1127 1128#define BUF_EQUAL(spa, dva, birth, buf) \ 1129 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1130 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1131 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1132 1133static void 1134buf_discard_identity(arc_buf_hdr_t *hdr) 1135{ 1136 hdr->b_dva.dva_word[0] = 0; 1137 hdr->b_dva.dva_word[1] = 0; 1138 hdr->b_birth = 0; 1139} 1140 1141static arc_buf_hdr_t * 1142buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1143{ 1144 const dva_t *dva = BP_IDENTITY(bp); 1145 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1146 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1147 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1148 arc_buf_hdr_t *hdr; 1149 1150 mutex_enter(hash_lock); 1151 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1152 hdr = hdr->b_hash_next) { 1153 if (BUF_EQUAL(spa, dva, birth, hdr)) { 1154 *lockp = hash_lock; 1155 return (hdr); 1156 } 1157 } 1158 mutex_exit(hash_lock); 1159 *lockp = NULL; 1160 return (NULL); 1161} 1162 1163/* 1164 * Insert an entry into the hash table. If there is already an element 1165 * equal to elem in the hash table, then the already existing element 1166 * will be returned and the new element will not be inserted. 1167 * Otherwise returns NULL. 1168 * If lockp == NULL, the caller is assumed to already hold the hash lock. 
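 *
 * Illustrative (simplified) usage on a cache miss, not a verbatim excerpt
 * from this file:
 *
 *	hdr = buf_hash_find(guid, bp, &hash_lock);
 *	if (hdr == NULL) {
 *		hdr = <allocate and fill in a new header>;
 *		exists = buf_hash_insert(hdr, &hash_lock);
 *		if (exists != NULL) {
 *			// Lost the race: another thread inserted the same
 *			// block; use "exists" and discard "hdr".
 *		}
 *	}
 *	// hash_lock is now held in either case.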
1169 */ 1170static arc_buf_hdr_t * 1171buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1172{ 1173 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1174 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1175 arc_buf_hdr_t *fhdr; 1176 uint32_t i; 1177 1178 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1179 ASSERT(hdr->b_birth != 0); 1180 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1181 1182 if (lockp != NULL) { 1183 *lockp = hash_lock; 1184 mutex_enter(hash_lock); 1185 } else { 1186 ASSERT(MUTEX_HELD(hash_lock)); 1187 } 1188 1189 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1190 fhdr = fhdr->b_hash_next, i++) { 1191 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1192 return (fhdr); 1193 } 1194 1195 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1196 buf_hash_table.ht_table[idx] = hdr; 1197 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1198 1199 /* collect some hash table performance data */ 1200 if (i > 0) { 1201 ARCSTAT_BUMP(arcstat_hash_collisions); 1202 if (i == 1) 1203 ARCSTAT_BUMP(arcstat_hash_chains); 1204 1205 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1206 } 1207 1208 ARCSTAT_BUMP(arcstat_hash_elements); 1209 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1210 1211 return (NULL); 1212} 1213 1214static void 1215buf_hash_remove(arc_buf_hdr_t *hdr) 1216{ 1217 arc_buf_hdr_t *fhdr, **hdrp; 1218 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1219 1220 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1221 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1222 1223 hdrp = &buf_hash_table.ht_table[idx]; 1224 while ((fhdr = *hdrp) != hdr) { 1225 ASSERT(fhdr != NULL); 1226 hdrp = &fhdr->b_hash_next; 1227 } 1228 *hdrp = hdr->b_hash_next; 1229 hdr->b_hash_next = NULL; 1230 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1231 1232 /* collect some hash table performance data */ 1233 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1234 1235 if (buf_hash_table.ht_table[idx] && 1236 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1237 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1238} 1239 1240/* 1241 * Global data structures and functions for the buf kmem cache. 1242 */ 1243static kmem_cache_t *hdr_full_cache; 1244static kmem_cache_t *hdr_l2only_cache; 1245static kmem_cache_t *buf_cache; 1246 1247static void 1248buf_fini(void) 1249{ 1250 int i; 1251 1252 kmem_free(buf_hash_table.ht_table, 1253 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1254 for (i = 0; i < BUF_LOCKS; i++) 1255 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1256 kmem_cache_destroy(hdr_full_cache); 1257 kmem_cache_destroy(hdr_l2only_cache); 1258 kmem_cache_destroy(buf_cache); 1259} 1260 1261/* 1262 * Constructor callback - called when the cache is empty 1263 * and a new buf is requested. 
1264 */ 1265/* ARGSUSED */ 1266static int 1267hdr_full_cons(void *vbuf, void *unused, int kmflag) 1268{ 1269 arc_buf_hdr_t *hdr = vbuf; 1270 1271 bzero(hdr, HDR_FULL_SIZE); 1272 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1273 refcount_create(&hdr->b_l1hdr.b_refcnt); 1274 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1275 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1276 1277 return (0); 1278} 1279 1280/* ARGSUSED */ 1281static int 1282hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1283{ 1284 arc_buf_hdr_t *hdr = vbuf; 1285 1286 bzero(hdr, HDR_L2ONLY_SIZE); 1287 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1288 1289 return (0); 1290} 1291 1292/* ARGSUSED */ 1293static int 1294buf_cons(void *vbuf, void *unused, int kmflag) 1295{ 1296 arc_buf_t *buf = vbuf; 1297 1298 bzero(buf, sizeof (arc_buf_t)); 1299 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1300 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1301 1302 return (0); 1303} 1304 1305/* 1306 * Destructor callback - called when a cached buf is 1307 * no longer required. 1308 */ 1309/* ARGSUSED */ 1310static void 1311hdr_full_dest(void *vbuf, void *unused) 1312{ 1313 arc_buf_hdr_t *hdr = vbuf; 1314 1315 ASSERT(BUF_EMPTY(hdr)); 1316 cv_destroy(&hdr->b_l1hdr.b_cv); 1317 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1318 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1319 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1320} 1321 1322/* ARGSUSED */ 1323static void 1324hdr_l2only_dest(void *vbuf, void *unused) 1325{ 1326 arc_buf_hdr_t *hdr = vbuf; 1327 1328 ASSERT(BUF_EMPTY(hdr)); 1329 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1330} 1331 1332/* ARGSUSED */ 1333static void 1334buf_dest(void *vbuf, void *unused) 1335{ 1336 arc_buf_t *buf = vbuf; 1337 1338 mutex_destroy(&buf->b_evict_lock); 1339 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1340} 1341 1342/* 1343 * Reclaim callback -- invoked when memory is low. 1344 */ 1345/* ARGSUSED */ 1346static void 1347hdr_recl(void *unused) 1348{ 1349 dprintf("hdr_recl called\n"); 1350 /* 1351 * umem calls the reclaim func when we destroy the buf cache, 1352 * which is after we do arc_fini(). 1353 */ 1354 if (!arc_dead) 1355 cv_signal(&arc_reclaim_thr_cv); 1356} 1357 1358static void 1359buf_init(void) 1360{ 1361 uint64_t *ct; 1362 uint64_t hsize = 1ULL << 12; 1363 int i, j; 1364 1365 /* 1366 * The hash table is big enough to fill all of physical memory 1367 * with an average block size of zfs_arc_average_blocksize (default 8K). 1368 * By default, the table will take up 1369 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
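 *
 * Worked example (with the defaults): on a machine with 16 GiB of RAM the
 * loop below grows hsize until hsize * 8 KiB >= 16 GiB, i.e. to 2^21
 * (2,097,152) buckets, which at 8 bytes per pointer is a 16 MiB table,
 * matching the "1MB per GB" rule of thumb above.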
1370 */ 1371 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1372 hsize <<= 1; 1373retry: 1374 buf_hash_table.ht_mask = hsize - 1; 1375 buf_hash_table.ht_table = 1376 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1377 if (buf_hash_table.ht_table == NULL) { 1378 ASSERT(hsize > (1ULL << 8)); 1379 hsize >>= 1; 1380 goto retry; 1381 } 1382 1383 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1384 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1385 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1386 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1387 NULL, NULL, 0); 1388 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1389 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1390 1391 for (i = 0; i < 256; i++) 1392 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1393 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1394 1395 for (i = 0; i < BUF_LOCKS; i++) { 1396 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1397 NULL, MUTEX_DEFAULT, NULL); 1398 } 1399} 1400 1401/* 1402 * Transition between the two allocation states for the arc_buf_hdr struct. 1403 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1404 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1405 * version is used when a cache buffer is only in the L2ARC in order to reduce 1406 * memory usage. 1407 */ 1408static arc_buf_hdr_t * 1409arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1410{ 1411 ASSERT(HDR_HAS_L2HDR(hdr)); 1412 1413 arc_buf_hdr_t *nhdr; 1414 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1415 1416 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1417 (old == hdr_l2only_cache && new == hdr_full_cache)); 1418 1419 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1420 1421 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1422 buf_hash_remove(hdr); 1423 1424 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1425 if (new == hdr_full_cache) { 1426 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1427 /* 1428 * arc_access and arc_change_state need to be aware that a 1429 * header has just come out of L2ARC, so we set its state to 1430 * l2c_only even though it's about to change. 1431 */ 1432 nhdr->b_l1hdr.b_state = arc_l2c_only; 1433 } else { 1434 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1435 ASSERT0(hdr->b_l1hdr.b_datacnt); 1436 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1437 /* 1438 * We might be removing the L1hdr of a buffer which was just 1439 * written out to L2ARC. If such a buffer is compressed then we 1440 * need to free its b_tmp_cdata before destroying the header. 1441 */ 1442 if (hdr->b_l1hdr.b_tmp_cdata != NULL && 1443 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 1444 l2arc_release_cdata_buf(hdr); 1445 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1446 } 1447 /* 1448 * The header has been reallocated so we need to re-insert it into any 1449 * lists it was on. 1450 */ 1451 (void) buf_hash_insert(nhdr, NULL); 1452 1453 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1454 1455 mutex_enter(&dev->l2ad_mtx); 1456 1457 /* 1458 * We must place the realloc'ed header back into the list at 1459 * the same spot. Otherwise, if it's placed earlier in the list, 1460 * l2arc_write_buffers() could find it during the function's 1461 * write phase, and try to write it out to the l2arc. 
1462 */ 1463 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1464 list_remove(&dev->l2ad_buflist, hdr); 1465 1466 mutex_exit(&dev->l2ad_mtx); 1467 1468 buf_discard_identity(hdr); 1469 hdr->b_freeze_cksum = NULL; 1470 kmem_cache_free(old, hdr); 1471 1472 return (nhdr); 1473} 1474 1475 1476#define ARC_MINTIME (hz>>4) /* 62 ms */ 1477 1478static void 1479arc_cksum_verify(arc_buf_t *buf) 1480{ 1481 zio_cksum_t zc; 1482 1483 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1484 return; 1485 1486 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1487 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1488 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1489 return; 1490 } 1491 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1492 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1493 panic("buffer modified while frozen!"); 1494 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1495} 1496 1497static int 1498arc_cksum_equal(arc_buf_t *buf) 1499{ 1500 zio_cksum_t zc; 1501 int equal; 1502 1503 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1504 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1505 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1506 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1507 1508 return (equal); 1509} 1510 1511static void 1512arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1513{ 1514 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1515 return; 1516 1517 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1518 if (buf->b_hdr->b_freeze_cksum != NULL) { 1519 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1520 return; 1521 } 1522 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1523 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1524 buf->b_hdr->b_freeze_cksum); 1525 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1526#ifdef illumos 1527 arc_buf_watch(buf); 1528#endif 1529} 1530 1531#ifdef illumos 1532#ifndef _KERNEL 1533typedef struct procctl { 1534 long cmd; 1535 prwatch_t prwatch; 1536} procctl_t; 1537#endif 1538 1539/* ARGSUSED */ 1540static void 1541arc_buf_unwatch(arc_buf_t *buf) 1542{ 1543#ifndef _KERNEL 1544 if (arc_watch) { 1545 int result; 1546 procctl_t ctl; 1547 ctl.cmd = PCWATCH; 1548 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1549 ctl.prwatch.pr_size = 0; 1550 ctl.prwatch.pr_wflags = 0; 1551 result = write(arc_procfd, &ctl, sizeof (ctl)); 1552 ASSERT3U(result, ==, sizeof (ctl)); 1553 } 1554#endif 1555} 1556 1557/* ARGSUSED */ 1558static void 1559arc_buf_watch(arc_buf_t *buf) 1560{ 1561#ifndef _KERNEL 1562 if (arc_watch) { 1563 int result; 1564 procctl_t ctl; 1565 ctl.cmd = PCWATCH; 1566 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1567 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1568 ctl.prwatch.pr_wflags = WA_WRITE; 1569 result = write(arc_procfd, &ctl, sizeof (ctl)); 1570 ASSERT3U(result, ==, sizeof (ctl)); 1571 } 1572#endif 1573} 1574#endif /* illumos */ 1575 1576static arc_buf_contents_t 1577arc_buf_type(arc_buf_hdr_t *hdr) 1578{ 1579 if (HDR_ISTYPE_METADATA(hdr)) { 1580 return (ARC_BUFC_METADATA); 1581 } else { 1582 return (ARC_BUFC_DATA); 1583 } 1584} 1585 1586static uint32_t 1587arc_bufc_to_flags(arc_buf_contents_t type) 1588{ 1589 switch (type) { 1590 case ARC_BUFC_DATA: 1591 /* metadata field is 0 if buffer contains normal data */ 1592 return (0); 1593 case ARC_BUFC_METADATA: 1594 return (ARC_FLAG_BUFC_METADATA); 1595 default: 1596 break; 1597 } 1598 panic("undefined ARC buffer type!"); 1599 return ((uint32_t)-1); 1600} 1601 1602void 1603arc_buf_thaw(arc_buf_t *buf) 1604{ 1605 
if (zfs_flags & ZFS_DEBUG_MODIFY) { 1606 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1607 panic("modifying non-anon buffer!"); 1608 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1609 panic("modifying buffer while i/o in progress!"); 1610 arc_cksum_verify(buf); 1611 } 1612 1613 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1614 if (buf->b_hdr->b_freeze_cksum != NULL) { 1615 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1616 buf->b_hdr->b_freeze_cksum = NULL; 1617 } 1618 1619#ifdef ZFS_DEBUG 1620 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1621 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1622 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1623 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1624 } 1625#endif 1626 1627 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1628 1629#ifdef illumos 1630 arc_buf_unwatch(buf); 1631#endif 1632} 1633 1634void 1635arc_buf_freeze(arc_buf_t *buf) 1636{ 1637 kmutex_t *hash_lock; 1638 1639 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1640 return; 1641 1642 hash_lock = HDR_LOCK(buf->b_hdr); 1643 mutex_enter(hash_lock); 1644 1645 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1646 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1647 arc_cksum_compute(buf, B_FALSE); 1648 mutex_exit(hash_lock); 1649 1650} 1651 1652static void 1653get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock) 1654{ 1655 uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1656 1657 if (arc_buf_type(hdr) == ARC_BUFC_METADATA) 1658 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1659 else { 1660 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1661 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1662 } 1663 1664 *list = &state->arcs_lists[buf_hashid]; 1665 *lock = ARCS_LOCK(state, buf_hashid); 1666} 1667 1668 1669static void 1670add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1671{ 1672 ASSERT(HDR_HAS_L1HDR(hdr)); 1673 ASSERT(MUTEX_HELD(hash_lock)); 1674 arc_state_t *state = hdr->b_l1hdr.b_state; 1675 1676 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1677 (state != arc_anon)) { 1678 /* We don't use the L2-only state list. */ 1679 if (state != arc_l2c_only) { 1680 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1681 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1682 list_t *list; 1683 kmutex_t *lock; 1684 1685 get_buf_info(hdr, state, &list, &lock); 1686 ASSERT(!MUTEX_HELD(lock)); 1687 mutex_enter(lock); 1688 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1689 list_remove(list, hdr); 1690 if (GHOST_STATE(state)) { 1691 ASSERT0(hdr->b_l1hdr.b_datacnt); 1692 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1693 delta = hdr->b_size; 1694 } 1695 ASSERT(delta > 0); 1696 ASSERT3U(*size, >=, delta); 1697 atomic_add_64(size, -delta); 1698 mutex_exit(lock); 1699 } 1700 /* remove the prefetch flag if we get a reference */ 1701 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1702 } 1703} 1704 1705static int 1706remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1707{ 1708 int cnt; 1709 arc_state_t *state = hdr->b_l1hdr.b_state; 1710 1711 ASSERT(HDR_HAS_L1HDR(hdr)); 1712 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1713 ASSERT(!GHOST_STATE(state)); 1714 1715 /* 1716 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1717 * check to prevent usage of the arc_l2c_only list. 
1718 */ 1719 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1720 (state != arc_anon)) { 1721 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1722 list_t *list; 1723 kmutex_t *lock; 1724 1725 get_buf_info(hdr, state, &list, &lock); 1726 ASSERT(!MUTEX_HELD(lock)); 1727 mutex_enter(lock); 1728 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1729 list_insert_head(list, hdr); 1730 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1731 atomic_add_64(size, hdr->b_size * 1732 hdr->b_l1hdr.b_datacnt); 1733 mutex_exit(lock); 1734 } 1735 return (cnt); 1736} 1737 1738/* 1739 * Move the supplied buffer to the indicated state. The mutex 1740 * for the buffer must be held by the caller. 1741 */ 1742static void 1743arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1744 kmutex_t *hash_lock) 1745{ 1746 arc_state_t *old_state; 1747 int64_t refcnt; 1748 uint32_t datacnt; 1749 uint64_t from_delta, to_delta; 1750 arc_buf_contents_t buftype = arc_buf_type(hdr); 1751 list_t *list; 1752 kmutex_t *lock; 1753 1754 /* 1755 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1756 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1757 * L1 hdr doesn't always exist when we change state to arc_anon before 1758 * destroying a header, in which case reallocating to add the L1 hdr is 1759 * pointless. 1760 */ 1761 if (HDR_HAS_L1HDR(hdr)) { 1762 old_state = hdr->b_l1hdr.b_state; 1763 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1764 datacnt = hdr->b_l1hdr.b_datacnt; 1765 } else { 1766 old_state = arc_l2c_only; 1767 refcnt = 0; 1768 datacnt = 0; 1769 } 1770 1771 ASSERT(MUTEX_HELD(hash_lock)); 1772 ASSERT3P(new_state, !=, old_state); 1773 ASSERT(refcnt == 0 || datacnt > 0); 1774 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1775 ASSERT(old_state != arc_anon || datacnt <= 1); 1776 1777 from_delta = to_delta = datacnt * hdr->b_size; 1778 1779 /* 1780 * If this buffer is evictable, transfer it from the 1781 * old state list to the new state list. 1782 */ 1783 if (refcnt == 0) { 1784 if (old_state != arc_anon && old_state != arc_l2c_only) { 1785 int use_mutex; 1786 uint64_t *size = &old_state->arcs_lsize[buftype]; 1787 1788 get_buf_info(hdr, old_state, &list, &lock); 1789 use_mutex = !MUTEX_HELD(lock); 1790 if (use_mutex) 1791 mutex_enter(lock); 1792 1793 ASSERT(HDR_HAS_L1HDR(hdr)); 1794 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1795 list_remove(list, hdr); 1796 1797 /* 1798 * If prefetching out of the ghost cache, 1799 * we will have a non-zero datacnt. 1800 */ 1801 if (GHOST_STATE(old_state) && datacnt == 0) { 1802 /* ghost elements have a ghost size */ 1803 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1804 from_delta = hdr->b_size; 1805 } 1806 ASSERT3U(*size, >=, from_delta); 1807 atomic_add_64(size, -from_delta); 1808 1809 if (use_mutex) 1810 mutex_exit(lock); 1811 } 1812 if (new_state != arc_anon && new_state != arc_l2c_only) { 1813 int use_mutex; 1814 uint64_t *size = &new_state->arcs_lsize[buftype]; 1815 1816 /* 1817 * An L1 header always exists here, since if we're 1818 * moving to some L1-cached state (i.e. not l2c_only or 1819 * anonymous), we realloc the header to add an L1hdr 1820 * beforehand. 
1821 */ 1822 ASSERT(HDR_HAS_L1HDR(hdr)); 1823 get_buf_info(hdr, new_state, &list, &lock); 1824 use_mutex = !MUTEX_HELD(lock); 1825 if (use_mutex) 1826 mutex_enter(lock); 1827 1828 list_insert_head(list, hdr); 1829 1830 /* ghost elements have a ghost size */ 1831 if (GHOST_STATE(new_state)) { 1832 ASSERT(datacnt == 0); 1833 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1834 to_delta = hdr->b_size; 1835 } 1836 atomic_add_64(size, to_delta); 1837 1838 if (use_mutex) 1839 mutex_exit(lock); 1840 } 1841 } 1842 1843 ASSERT(!BUF_EMPTY(hdr)); 1844 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1845 buf_hash_remove(hdr); 1846 1847 /* adjust state sizes (ignore arc_l2c_only) */ 1848 if (to_delta && new_state != arc_l2c_only) 1849 atomic_add_64(&new_state->arcs_size, to_delta); 1850 if (from_delta && old_state != arc_l2c_only) { 1851 ASSERT3U(old_state->arcs_size, >=, from_delta); 1852 atomic_add_64(&old_state->arcs_size, -from_delta); 1853 } 1854 if (HDR_HAS_L1HDR(hdr)) 1855 hdr->b_l1hdr.b_state = new_state; 1856 1857 /* 1858 * L2 headers should never be on the L2 state list since they don't 1859 * have L1 headers allocated. 1860 */ 1861 ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1862 list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1863} 1864 1865void 1866arc_space_consume(uint64_t space, arc_space_type_t type) 1867{ 1868 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1869 1870 switch (type) { 1871 case ARC_SPACE_DATA: 1872 ARCSTAT_INCR(arcstat_data_size, space); 1873 break; 1874 case ARC_SPACE_META: 1875 ARCSTAT_INCR(arcstat_metadata_size, space); 1876 break; 1877 case ARC_SPACE_OTHER: 1878 ARCSTAT_INCR(arcstat_other_size, space); 1879 break; 1880 case ARC_SPACE_HDRS: 1881 ARCSTAT_INCR(arcstat_hdr_size, space); 1882 break; 1883 case ARC_SPACE_L2HDRS: 1884 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1885 break; 1886 } 1887 1888 if (type != ARC_SPACE_DATA) 1889 ARCSTAT_INCR(arcstat_meta_used, space); 1890 1891 atomic_add_64(&arc_size, space); 1892} 1893 1894void 1895arc_space_return(uint64_t space, arc_space_type_t type) 1896{ 1897 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1898 1899 switch (type) { 1900 case ARC_SPACE_DATA: 1901 ARCSTAT_INCR(arcstat_data_size, -space); 1902 break; 1903 case ARC_SPACE_META: 1904 ARCSTAT_INCR(arcstat_metadata_size, -space); 1905 break; 1906 case ARC_SPACE_OTHER: 1907 ARCSTAT_INCR(arcstat_other_size, -space); 1908 break; 1909 case ARC_SPACE_HDRS: 1910 ARCSTAT_INCR(arcstat_hdr_size, -space); 1911 break; 1912 case ARC_SPACE_L2HDRS: 1913 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1914 break; 1915 } 1916 1917 if (type != ARC_SPACE_DATA) { 1918 ASSERT(arc_meta_used >= space); 1919 if (arc_meta_max < arc_meta_used) 1920 arc_meta_max = arc_meta_used; 1921 ARCSTAT_INCR(arcstat_meta_used, -space); 1922 } 1923 1924 ASSERT(arc_size >= space); 1925 atomic_add_64(&arc_size, -space); 1926} 1927 1928arc_buf_t * 1929arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1930{ 1931 arc_buf_hdr_t *hdr; 1932 arc_buf_t *buf; 1933 1934 ASSERT3U(size, >, 0); 1935 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1936 ASSERT(BUF_EMPTY(hdr)); 1937 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1938 hdr->b_size = size; 1939 hdr->b_spa = spa_load_guid(spa); 1940 1941 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1942 buf->b_hdr = hdr; 1943 buf->b_data = NULL; 1944 buf->b_efunc = NULL; 1945 buf->b_private = NULL; 1946 buf->b_next = NULL; 1947 1948 hdr->b_flags = arc_bufc_to_flags(type); 1949 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1950 1951 
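	/*
	 * Initialize the L1-resident state: a freshly allocated buffer always
	 * starts out anonymous, with no recorded access time and exactly one
	 * data buffer attached.
	 */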
hdr->b_l1hdr.b_buf = buf; 1952 hdr->b_l1hdr.b_state = arc_anon; 1953 hdr->b_l1hdr.b_arc_access = 0; 1954 hdr->b_l1hdr.b_datacnt = 1; 1955 1956 arc_get_data_buf(buf); 1957 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1958 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1959 1960 return (buf); 1961} 1962 1963static char *arc_onloan_tag = "onloan"; 1964 1965/* 1966 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1967 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1968 * buffers must be returned to the arc before they can be used by the DMU or 1969 * freed. 1970 */ 1971arc_buf_t * 1972arc_loan_buf(spa_t *spa, int size) 1973{ 1974 arc_buf_t *buf; 1975 1976 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1977 1978 atomic_add_64(&arc_loaned_bytes, size); 1979 return (buf); 1980} 1981 1982/* 1983 * Return a loaned arc buffer to the arc. 1984 */ 1985void 1986arc_return_buf(arc_buf_t *buf, void *tag) 1987{ 1988 arc_buf_hdr_t *hdr = buf->b_hdr; 1989 1990 ASSERT(buf->b_data != NULL); 1991 ASSERT(HDR_HAS_L1HDR(hdr)); 1992 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1993 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1994 1995 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1996} 1997 1998/* Detach an arc_buf from a dbuf (tag) */ 1999void 2000arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2001{ 2002 arc_buf_hdr_t *hdr = buf->b_hdr; 2003 2004 ASSERT(buf->b_data != NULL); 2005 ASSERT(HDR_HAS_L1HDR(hdr)); 2006 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2007 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2008 buf->b_efunc = NULL; 2009 buf->b_private = NULL; 2010 2011 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2012} 2013 2014static arc_buf_t * 2015arc_buf_clone(arc_buf_t *from) 2016{ 2017 arc_buf_t *buf; 2018 arc_buf_hdr_t *hdr = from->b_hdr; 2019 uint64_t size = hdr->b_size; 2020 2021 ASSERT(HDR_HAS_L1HDR(hdr)); 2022 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2023 2024 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2025 buf->b_hdr = hdr; 2026 buf->b_data = NULL; 2027 buf->b_efunc = NULL; 2028 buf->b_private = NULL; 2029 buf->b_next = hdr->b_l1hdr.b_buf; 2030 hdr->b_l1hdr.b_buf = buf; 2031 arc_get_data_buf(buf); 2032 bcopy(from->b_data, buf->b_data, size); 2033 2034 /* 2035 * This buffer already exists in the arc so create a duplicate 2036 * copy for the caller. If the buffer is associated with user data 2037 * then track the size and number of duplicates. These stats will be 2038 * updated as duplicate buffers are created and destroyed. 2039 */ 2040 if (HDR_ISTYPE_DATA(hdr)) { 2041 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2042 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2043 } 2044 hdr->b_l1hdr.b_datacnt += 1; 2045 return (buf); 2046} 2047 2048void 2049arc_buf_add_ref(arc_buf_t *buf, void* tag) 2050{ 2051 arc_buf_hdr_t *hdr; 2052 kmutex_t *hash_lock; 2053 2054 /* 2055 * Check to see if this buffer is evicted. Callers 2056 * must verify b_data != NULL to know if the add_ref 2057 * was successful. 
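 *
 * A caller-side sketch (hypothetical caller; the pattern mirrors how the
 * dbuf layer uses this interface):
 *
 *	arc_buf_add_ref(buf, tag);
 *	if (buf->b_data == NULL) {
 *		(buffer was evicted; fall back to arc_read())
 *	}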
2058 */ 2059 mutex_enter(&buf->b_evict_lock); 2060 if (buf->b_data == NULL) { 2061 mutex_exit(&buf->b_evict_lock); 2062 return; 2063 } 2064 hash_lock = HDR_LOCK(buf->b_hdr); 2065 mutex_enter(hash_lock); 2066 hdr = buf->b_hdr; 2067 ASSERT(HDR_HAS_L1HDR(hdr)); 2068 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2069 mutex_exit(&buf->b_evict_lock); 2070 2071 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2072 hdr->b_l1hdr.b_state == arc_mfu); 2073 2074 add_reference(hdr, hash_lock, tag); 2075 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2076 arc_access(hdr, hash_lock); 2077 mutex_exit(hash_lock); 2078 ARCSTAT_BUMP(arcstat_hits); 2079 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2080 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2081 data, metadata, hits); 2082} 2083 2084static void 2085arc_buf_free_on_write(void *data, size_t size, 2086 void (*free_func)(void *, size_t)) 2087{ 2088 l2arc_data_free_t *df; 2089 2090 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 2091 df->l2df_data = data; 2092 df->l2df_size = size; 2093 df->l2df_func = free_func; 2094 mutex_enter(&l2arc_free_on_write_mtx); 2095 list_insert_head(l2arc_free_on_write, df); 2096 mutex_exit(&l2arc_free_on_write_mtx); 2097} 2098 2099/* 2100 * Free the arc data buffer. If it is an l2arc write in progress, 2101 * the buffer is placed on l2arc_free_on_write to be freed later. 2102 */ 2103static void 2104arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2105{ 2106 arc_buf_hdr_t *hdr = buf->b_hdr; 2107 2108 if (HDR_L2_WRITING(hdr)) { 2109 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2110 ARCSTAT_BUMP(arcstat_l2_free_on_write); 2111 } else { 2112 free_func(buf->b_data, hdr->b_size); 2113 } 2114} 2115 2116/* 2117 * Free up buf->b_data and if 'remove' is set, then pull the 2118 * arc_buf_t off of the arc_buf_hdr_t's list and free it. 2119 */ 2120static void 2121arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2122{ 2123 ASSERT(HDR_HAS_L2HDR(hdr)); 2124 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2125 2126 /* 2127 * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2128 * that doesn't exist, the header is in the arc_l2c_only state, 2129 * and there isn't anything to free (it's already been freed).
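 * Otherwise, any b_tmp_cdata still attached was allocated by
 * l2arc_write_buffers(); because the L2ARC write is still in flight
 * (HDR_L2_WRITING), it is handed to the free-on-write list below rather
 * than freed immediately.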
2130 */ 2131 if (!HDR_HAS_L1HDR(hdr)) 2132 return; 2133 2134 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2135 return; 2136 2137 ASSERT(HDR_L2_WRITING(hdr)); 2138 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, 2139 zio_data_buf_free); 2140 2141 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2142 hdr->b_l1hdr.b_tmp_cdata = NULL; 2143} 2144 2145static void 2146arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 2147{ 2148 arc_buf_t **bufp; 2149 2150 /* free up data associated with the buf */ 2151 if (buf->b_data != NULL) { 2152 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2153 uint64_t size = buf->b_hdr->b_size; 2154 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2155 2156 arc_cksum_verify(buf); 2157#ifdef illumos 2158 arc_buf_unwatch(buf); 2159#endif 2160 2161 if (!recycle) { 2162 if (type == ARC_BUFC_METADATA) { 2163 arc_buf_data_free(buf, zio_buf_free); 2164 arc_space_return(size, ARC_SPACE_META); 2165 } else { 2166 ASSERT(type == ARC_BUFC_DATA); 2167 arc_buf_data_free(buf, zio_data_buf_free); 2168 arc_space_return(size, ARC_SPACE_DATA); 2169 } 2170 } 2171 if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2172 uint64_t *cnt = &state->arcs_lsize[type]; 2173 2174 ASSERT(refcount_is_zero( 2175 &buf->b_hdr->b_l1hdr.b_refcnt)); 2176 ASSERT(state != arc_anon && state != arc_l2c_only); 2177 2178 ASSERT3U(*cnt, >=, size); 2179 atomic_add_64(cnt, -size); 2180 } 2181 ASSERT3U(state->arcs_size, >=, size); 2182 atomic_add_64(&state->arcs_size, -size); 2183 buf->b_data = NULL; 2184 2185 /* 2186 * If we're destroying a duplicate buffer make sure 2187 * that the appropriate statistics are updated. 2188 */ 2189 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2190 HDR_ISTYPE_DATA(buf->b_hdr)) { 2191 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2192 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2193 } 2194 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2195 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2196 } 2197 2198 /* only remove the buf if requested */ 2199 if (!remove) 2200 return; 2201 2202 /* remove the buf from the hdr list */ 2203 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2204 bufp = &(*bufp)->b_next) 2205 continue; 2206 *bufp = buf->b_next; 2207 buf->b_next = NULL; 2208 2209 ASSERT(buf->b_efunc == NULL); 2210 2211 /* clean up the buf */ 2212 buf->b_hdr = NULL; 2213 kmem_cache_free(buf_cache, buf); 2214} 2215 2216static void 2217arc_hdr_destroy(arc_buf_hdr_t *hdr) 2218{ 2219 if (HDR_HAS_L1HDR(hdr)) { 2220 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2221 hdr->b_l1hdr.b_datacnt > 0); 2222 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2223 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2224 } 2225 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2226 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2227 2228 if (HDR_HAS_L2HDR(hdr)) { 2229 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2230 boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); 2231 2232 if (!buflist_held) { 2233 mutex_enter(&l2hdr->b_dev->l2ad_mtx); 2234 l2hdr = &hdr->b_l2hdr; 2235 } 2236 2237 trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, 2238 l2hdr->b_asize, 0); 2239 list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); 2240 2241 /* 2242 * We don't want to leak the b_tmp_cdata buffer that was 2243 * allocated in l2arc_write_buffers() 2244 */ 2245 arc_buf_l2_cdata_free(hdr); 2246 2247 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2248 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2249 2250 if (!buflist_held) 2251 mutex_exit(&l2hdr->b_dev->l2ad_mtx); 2252 2253 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2254 } 2255 2256 if 
(!BUF_EMPTY(hdr)) 2257 buf_discard_identity(hdr); 2258 if (hdr->b_freeze_cksum != NULL) { 2259 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2260 hdr->b_freeze_cksum = NULL; 2261 } 2262 2263 if (HDR_HAS_L1HDR(hdr)) { 2264 while (hdr->b_l1hdr.b_buf) { 2265 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2266 2267 if (buf->b_efunc != NULL) { 2268 mutex_enter(&arc_eviction_mtx); 2269 mutex_enter(&buf->b_evict_lock); 2270 ASSERT(buf->b_hdr != NULL); 2271 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2272 FALSE); 2273 hdr->b_l1hdr.b_buf = buf->b_next; 2274 buf->b_hdr = &arc_eviction_hdr; 2275 buf->b_next = arc_eviction_list; 2276 arc_eviction_list = buf; 2277 mutex_exit(&buf->b_evict_lock); 2278 mutex_exit(&arc_eviction_mtx); 2279 } else { 2280 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2281 TRUE); 2282 } 2283 } 2284#ifdef ZFS_DEBUG 2285 if (hdr->b_l1hdr.b_thawed != NULL) { 2286 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2287 hdr->b_l1hdr.b_thawed = NULL; 2288 } 2289#endif 2290 } 2291 2292 ASSERT3P(hdr->b_hash_next, ==, NULL); 2293 if (HDR_HAS_L1HDR(hdr)) { 2294 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 2295 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2296 kmem_cache_free(hdr_full_cache, hdr); 2297 } else { 2298 kmem_cache_free(hdr_l2only_cache, hdr); 2299 } 2300} 2301 2302void 2303arc_buf_free(arc_buf_t *buf, void *tag) 2304{ 2305 arc_buf_hdr_t *hdr = buf->b_hdr; 2306 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2307 2308 ASSERT(buf->b_efunc == NULL); 2309 ASSERT(buf->b_data != NULL); 2310 2311 if (hashed) { 2312 kmutex_t *hash_lock = HDR_LOCK(hdr); 2313 2314 mutex_enter(hash_lock); 2315 hdr = buf->b_hdr; 2316 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2317 2318 (void) remove_reference(hdr, hash_lock, tag); 2319 if (hdr->b_l1hdr.b_datacnt > 1) { 2320 arc_buf_destroy(buf, FALSE, TRUE); 2321 } else { 2322 ASSERT(buf == hdr->b_l1hdr.b_buf); 2323 ASSERT(buf->b_efunc == NULL); 2324 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2325 } 2326 mutex_exit(hash_lock); 2327 } else if (HDR_IO_IN_PROGRESS(hdr)) { 2328 int destroy_hdr; 2329 /* 2330 * We are in the middle of an async write. Don't destroy 2331 * this buffer unless the write completes before we finish 2332 * decrementing the reference count. 
2333 */ 2334 mutex_enter(&arc_eviction_mtx); 2335 (void) remove_reference(hdr, NULL, tag); 2336 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2337 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2338 mutex_exit(&arc_eviction_mtx); 2339 if (destroy_hdr) 2340 arc_hdr_destroy(hdr); 2341 } else { 2342 if (remove_reference(hdr, NULL, tag) > 0) 2343 arc_buf_destroy(buf, FALSE, TRUE); 2344 else 2345 arc_hdr_destroy(hdr); 2346 } 2347} 2348 2349boolean_t 2350arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2351{ 2352 arc_buf_hdr_t *hdr = buf->b_hdr; 2353 kmutex_t *hash_lock = HDR_LOCK(hdr); 2354 boolean_t no_callback = (buf->b_efunc == NULL); 2355 2356 if (hdr->b_l1hdr.b_state == arc_anon) { 2357 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2358 arc_buf_free(buf, tag); 2359 return (no_callback); 2360 } 2361 2362 mutex_enter(hash_lock); 2363 hdr = buf->b_hdr; 2364 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2365 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2366 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2367 ASSERT(buf->b_data != NULL); 2368 2369 (void) remove_reference(hdr, hash_lock, tag); 2370 if (hdr->b_l1hdr.b_datacnt > 1) { 2371 if (no_callback) 2372 arc_buf_destroy(buf, FALSE, TRUE); 2373 } else if (no_callback) { 2374 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2375 ASSERT(buf->b_efunc == NULL); 2376 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2377 } 2378 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2379 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2380 mutex_exit(hash_lock); 2381 return (no_callback); 2382} 2383 2384int32_t 2385arc_buf_size(arc_buf_t *buf) 2386{ 2387 return (buf->b_hdr->b_size); 2388} 2389 2390/* 2391 * Called from the DMU to determine if the current buffer should be 2392 * evicted. In order to ensure proper locking, the eviction must be initiated 2393 * from the DMU. Return true if the buffer is associated with user data and 2394 * duplicate buffers still exist. 2395 */ 2396boolean_t 2397arc_buf_eviction_needed(arc_buf_t *buf) 2398{ 2399 arc_buf_hdr_t *hdr; 2400 boolean_t evict_needed = B_FALSE; 2401 2402 if (zfs_disable_dup_eviction) 2403 return (B_FALSE); 2404 2405 mutex_enter(&buf->b_evict_lock); 2406 hdr = buf->b_hdr; 2407 if (hdr == NULL) { 2408 /* 2409 * We are in arc_do_user_evicts(); let that function 2410 * perform the eviction. 2411 */ 2412 ASSERT(buf->b_data == NULL); 2413 mutex_exit(&buf->b_evict_lock); 2414 return (B_FALSE); 2415 } else if (buf->b_data == NULL) { 2416 /* 2417 * We have already been added to the arc eviction list; 2418 * recommend eviction. 2419 */ 2420 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2421 mutex_exit(&buf->b_evict_lock); 2422 return (B_TRUE); 2423 } 2424 2425 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2426 evict_needed = B_TRUE; 2427 2428 mutex_exit(&buf->b_evict_lock); 2429 return (evict_needed); 2430} 2431 2432/* 2433 * Evict buffers from list until we've removed the specified number of 2434 * bytes. Move the removed buffers to the appropriate evict state. 2435 * If the recycle flag is set, then attempt to "recycle" a buffer: 2436 * - look for a buffer to evict that is `bytes' long. 2437 * - return the data block from this buffer rather than freeing it. 2438 * This flag is used by callers that are trying to make space for a 2439 * new buffer in a full arc cache. 2440 * 2441 * This function makes a "best effort". It skips over any buffers 2442 * it can't get a hash_lock on, and so may not catch all candidates. 2443 * It may also return without evicting as much space as requested. 
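 *
 * For example, arc_get_data_buf() calls this with 'recycle' set and
 * 'bytes' equal to the block size it needs, in the hope of reusing an
 * evicted buffer's data block directly instead of freeing and
 * reallocating it.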
2444 */ 2445static void * 2446arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 2447 arc_buf_contents_t type) 2448{ 2449 arc_state_t *evicted_state; 2450 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 2451 int64_t bytes_remaining; 2452 arc_buf_hdr_t *hdr, *hdr_prev = NULL; 2453 list_t *evicted_list, *list, *evicted_list_start, *list_start; 2454 kmutex_t *lock, *evicted_lock; 2455 kmutex_t *hash_lock; 2456 boolean_t have_lock; 2457 void *stolen = NULL; 2458 arc_buf_hdr_t marker = { 0 }; 2459 int count = 0; 2460 static int evict_metadata_offset, evict_data_offset; 2461 int i, idx, offset, list_count, lists; 2462 2463 ASSERT(state == arc_mru || state == arc_mfu); 2464 2465 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2466 2467 /* 2468 * Decide which "type" (data vs metadata) to recycle from. 2469 * 2470 * If we are over the metadata limit, recycle from metadata. 2471 * If we are under the metadata minimum, recycle from data. 2472 * Otherwise, recycle from whichever type has the oldest (least 2473 * recently accessed) header. This is not yet implemented. 2474 */ 2475 if (recycle) { 2476 arc_buf_contents_t realtype; 2477 if (state->arcs_lsize[ARC_BUFC_DATA] == 0) { 2478 realtype = ARC_BUFC_METADATA; 2479 } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) { 2480 realtype = ARC_BUFC_DATA; 2481 } else if (arc_meta_used >= arc_meta_limit) { 2482 realtype = ARC_BUFC_METADATA; 2483 } else if (arc_meta_used <= arc_meta_min) { 2484 realtype = ARC_BUFC_DATA; 2485#ifdef illumos 2486 } else if (HDR_HAS_L1HDR(data_hdr) && 2487 HDR_HAS_L1HDR(metadata_hdr) && 2488 data_hdr->b_l1hdr.b_arc_access < 2489 metadata_hdr->b_l1hdr.b_arc_access) { 2490 realtype = ARC_BUFC_DATA; 2491 } else { 2492 realtype = ARC_BUFC_METADATA; 2493#else 2494 } else { 2495 /* TODO */ 2496 realtype = type; 2497#endif 2498 } 2499 if (realtype != type) { 2500 /* 2501 * If we want to evict from a different list, 2502 * we can not recycle, because DATA vs METADATA 2503 * buffers are segregated into different kmem 2504 * caches (and vmem arenas). 2505 */ 2506 type = realtype; 2507 recycle = B_FALSE; 2508 } 2509 } 2510 2511 if (type == ARC_BUFC_METADATA) { 2512 offset = 0; 2513 list_count = ARC_BUFC_NUMMETADATALISTS; 2514 list_start = &state->arcs_lists[0]; 2515 evicted_list_start = &evicted_state->arcs_lists[0]; 2516 idx = evict_metadata_offset; 2517 } else { 2518 offset = ARC_BUFC_NUMMETADATALISTS; 2519 list_start = &state->arcs_lists[offset]; 2520 evicted_list_start = &evicted_state->arcs_lists[offset]; 2521 list_count = ARC_BUFC_NUMDATALISTS; 2522 idx = evict_data_offset; 2523 } 2524 bytes_remaining = evicted_state->arcs_lsize[type]; 2525 lists = 0; 2526 2527evict_start: 2528 list = &list_start[idx]; 2529 evicted_list = &evicted_list_start[idx]; 2530 lock = ARCS_LOCK(state, (offset + idx)); 2531 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 2532 2533 /* 2534 * The ghost list lock must be acquired first in order to prevent 2535 * a 3 party deadlock: 2536 * 2537 * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by 2538 * l2ad_mtx in arc_hdr_realloc 2539 * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx 2540 * - arc_evict acquires arc_*->arcs_mtx, followed by 2541 * arc_*_ghost->arcs_mtx and forms a deadlock cycle. 2542 * 2543 * This situation is avoided by acquiring the ghost list lock first.
2544 */ 2545 mutex_enter(evicted_lock); 2546 mutex_enter(lock); 2547 2548 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2549 hdr_prev = list_prev(list, hdr); 2550 if (HDR_HAS_L1HDR(hdr)) { 2551 bytes_remaining -= 2552 (hdr->b_size * hdr->b_l1hdr.b_datacnt); 2553 } 2554 /* prefetch buffers have a minimum lifespan */ 2555 if (HDR_IO_IN_PROGRESS(hdr) || 2556 (spa && hdr->b_spa != spa) || 2557 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2558 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2559 arc_min_prefetch_lifespan)) { 2560 skipped++; 2561 continue; 2562 } 2563 /* "lookahead" for better eviction candidate */ 2564 if (recycle && hdr->b_size != bytes && 2565 hdr_prev && hdr_prev->b_size == bytes) 2566 continue; 2567 2568 /* ignore markers */ 2569 if (hdr->b_spa == 0) 2570 continue; 2571 2572 /* 2573 * It may take a long time to evict all the bufs requested. 2574 * To avoid blocking all arc activity, periodically drop 2575 * the arcs_mtx and give other threads a chance to run 2576 * before reacquiring the lock. 2577 * 2578 * If we are looking for a buffer to recycle, we are in 2579 * the hot code path, so don't sleep. 2580 */ 2581 if (!recycle && count++ > arc_evict_iterations) { 2582 list_insert_after(list, hdr, &marker); 2583 mutex_exit(lock); 2584 mutex_exit(evicted_lock); 2585 kpreempt(KPREEMPT_SYNC); 2586 mutex_enter(evicted_lock); 2587 mutex_enter(lock); 2588 hdr_prev = list_prev(list, &marker); 2589 list_remove(list, &marker); 2590 count = 0; 2591 continue; 2592 } 2593 2594 hash_lock = HDR_LOCK(hdr); 2595 have_lock = MUTEX_HELD(hash_lock); 2596 if (have_lock || mutex_tryenter(hash_lock)) { 2597 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2598 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2599 while (hdr->b_l1hdr.b_buf) { 2600 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2601 if (!mutex_tryenter(&buf->b_evict_lock)) { 2602 missed += 1; 2603 break; 2604 } 2605 if (buf->b_data != NULL) { 2606 bytes_evicted += hdr->b_size; 2607 if (recycle && 2608 arc_buf_type(hdr) == type && 2609 hdr->b_size == bytes && 2610 !HDR_L2_WRITING(hdr)) { 2611 stolen = buf->b_data; 2612 recycle = FALSE; 2613 } 2614 } 2615 if (buf->b_efunc != NULL) { 2616 mutex_enter(&arc_eviction_mtx); 2617 arc_buf_destroy(buf, 2618 buf->b_data == stolen, FALSE); 2619 hdr->b_l1hdr.b_buf = buf->b_next; 2620 buf->b_hdr = &arc_eviction_hdr; 2621 buf->b_next = arc_eviction_list; 2622 arc_eviction_list = buf; 2623 mutex_exit(&arc_eviction_mtx); 2624 mutex_exit(&buf->b_evict_lock); 2625 } else { 2626 mutex_exit(&buf->b_evict_lock); 2627 arc_buf_destroy(buf, 2628 buf->b_data == stolen, TRUE); 2629 } 2630 } 2631 2632 if (HDR_HAS_L2HDR(hdr)) { 2633 ARCSTAT_INCR(arcstat_evict_l2_cached, 2634 hdr->b_size); 2635 } else { 2636 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 2637 ARCSTAT_INCR(arcstat_evict_l2_eligible, 2638 hdr->b_size); 2639 } else { 2640 ARCSTAT_INCR( 2641 arcstat_evict_l2_ineligible, 2642 hdr->b_size); 2643 } 2644 } 2645 2646 if (hdr->b_l1hdr.b_datacnt == 0) { 2647 arc_change_state(evicted_state, hdr, hash_lock); 2648 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2649 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2650 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2651 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2652 } 2653 if (!have_lock) 2654 mutex_exit(hash_lock); 2655 if (bytes >= 0 && bytes_evicted >= bytes) 2656 break; 2657 if (bytes_remaining > 0) { 2658 mutex_exit(evicted_lock); 2659 mutex_exit(lock); 2660 idx = ((idx + 1) & (list_count - 1)); 2661 lists++; 2662 goto evict_start; 2663 } 2664 } else { 2665 missed += 1; 2666 } 
2667 } 2668 2669 mutex_exit(lock); 2670 mutex_exit(evicted_lock); 2671 2672 idx = ((idx + 1) & (list_count - 1)); 2673 lists++; 2674 2675 if (bytes_evicted < bytes) { 2676 if (lists < list_count) 2677 goto evict_start; 2678 else 2679 dprintf("only evicted %lld bytes from %x", 2680 (longlong_t)bytes_evicted, state); 2681 } 2682 if (type == ARC_BUFC_METADATA) 2683 evict_metadata_offset = idx; 2684 else 2685 evict_data_offset = idx; 2686 2687 if (skipped) 2688 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2689 2690 if (missed) 2691 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2692 2693 /* 2694 * Note: we have just evicted some data into the ghost state, 2695 * potentially putting the ghost size over the desired size. Rather 2696 * than evicting from the ghost list in this hot code path, leave 2697 * this chore to the arc_reclaim_thread(). 2698 */ 2699 2700 if (stolen) 2701 ARCSTAT_BUMP(arcstat_stolen); 2702 return (stolen); 2703} 2704 2705/* 2706 * Remove buffers from list until we've removed the specified number of 2707 * bytes. Destroy the buffers that are removed. 2708 */ 2709static void 2710arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2711{ 2712 arc_buf_hdr_t *hdr, *hdr_prev; 2713 arc_buf_hdr_t marker = { 0 }; 2714 list_t *list, *list_start; 2715 kmutex_t *hash_lock, *lock; 2716 uint64_t bytes_deleted = 0; 2717 uint64_t bufs_skipped = 0; 2718 int count = 0; 2719 static int evict_offset; 2720 int list_count, idx = evict_offset; 2721 int offset, lists = 0; 2722 2723 ASSERT(GHOST_STATE(state)); 2724 2725 /* 2726 * data lists come after metadata lists 2727 */ 2728 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2729 list_count = ARC_BUFC_NUMDATALISTS; 2730 offset = ARC_BUFC_NUMMETADATALISTS; 2731 2732evict_start: 2733 list = &list_start[idx]; 2734 lock = ARCS_LOCK(state, idx + offset); 2735 2736 mutex_enter(lock); 2737 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2738 hdr_prev = list_prev(list, hdr); 2739 if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) 2740 panic("invalid hdr=%p", (void *)hdr); 2741 if (spa && hdr->b_spa != spa) 2742 continue; 2743 2744 /* ignore markers */ 2745 if (hdr->b_spa == 0) 2746 continue; 2747 2748 hash_lock = HDR_LOCK(hdr); 2749 /* caller may be trying to modify this buffer, skip it */ 2750 if (MUTEX_HELD(hash_lock)) 2751 continue; 2752 2753 /* 2754 * It may take a long time to evict all the bufs requested. 2755 * To avoid blocking all arc activity, periodically drop 2756 * the arcs_mtx and give other threads a chance to run 2757 * before reacquiring the lock. 2758 */ 2759 if (count++ > arc_evict_iterations) { 2760 list_insert_after(list, hdr, &marker); 2761 mutex_exit(lock); 2762 kpreempt(KPREEMPT_SYNC); 2763 mutex_enter(lock); 2764 hdr_prev = list_prev(list, &marker); 2765 list_remove(list, &marker); 2766 count = 0; 2767 continue; 2768 } 2769 if (mutex_tryenter(hash_lock)) { 2770 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2771 ASSERT(!HDR_HAS_L1HDR(hdr) || 2772 hdr->b_l1hdr.b_buf == NULL); 2773 ARCSTAT_BUMP(arcstat_deleted); 2774 bytes_deleted += hdr->b_size; 2775 2776 if (HDR_HAS_L2HDR(hdr)) { 2777 /* 2778 * This buffer is cached on the 2nd Level ARC; 2779 * don't destroy the header. 2780 */ 2781 arc_change_state(arc_l2c_only, hdr, hash_lock); 2782 /* 2783 * dropping from L1+L2 cached to L2-only, 2784 * realloc to remove the L1 header.
2785 */ 2786 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2787 hdr_l2only_cache); 2788 mutex_exit(hash_lock); 2789 } else { 2790 arc_change_state(arc_anon, hdr, hash_lock); 2791 mutex_exit(hash_lock); 2792 arc_hdr_destroy(hdr); 2793 } 2794 2795 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2796 if (bytes >= 0 && bytes_deleted >= bytes) 2797 break; 2798 } else if (bytes < 0) { 2799 /* 2800 * Insert a list marker and then wait for the 2801 * hash lock to become available. Once its 2802 * available, restart from where we left off. 2803 */ 2804 list_insert_after(list, hdr, &marker); 2805 mutex_exit(lock); 2806 mutex_enter(hash_lock); 2807 mutex_exit(hash_lock); 2808 mutex_enter(lock); 2809 hdr_prev = list_prev(list, &marker); 2810 list_remove(list, &marker); 2811 } else { 2812 bufs_skipped += 1; 2813 } 2814 2815 } 2816 mutex_exit(lock); 2817 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2818 lists++; 2819 2820 if (lists < list_count) 2821 goto evict_start; 2822 2823 evict_offset = idx; 2824 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2825 (bytes < 0 || bytes_deleted < bytes)) { 2826 list_start = &state->arcs_lists[0]; 2827 list_count = ARC_BUFC_NUMMETADATALISTS; 2828 offset = lists = 0; 2829 goto evict_start; 2830 } 2831 2832 if (bufs_skipped) { 2833 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2834 ASSERT(bytes >= 0); 2835 } 2836 2837 if (bytes_deleted < bytes) 2838 dprintf("only deleted %lld bytes from %p", 2839 (longlong_t)bytes_deleted, state); 2840} 2841 2842static void 2843arc_adjust(void) 2844{ 2845 int64_t adjustment, delta; 2846 2847 /* 2848 * Adjust MRU size 2849 */ 2850 2851 adjustment = MIN((int64_t)(arc_size - arc_c), 2852 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2853 arc_p)); 2854 2855 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2856 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2857 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2858 adjustment -= delta; 2859 } 2860 2861 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2862 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2863 (void) arc_evict(arc_mru, 0, delta, FALSE, 2864 ARC_BUFC_METADATA); 2865 } 2866 2867 /* 2868 * Adjust MFU size 2869 */ 2870 2871 adjustment = arc_size - arc_c; 2872 2873 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2874 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2875 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2876 adjustment -= delta; 2877 } 2878 2879 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2880 int64_t delta = MIN(adjustment, 2881 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2882 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2883 ARC_BUFC_METADATA); 2884 } 2885 2886 /* 2887 * Adjust ghost lists 2888 */ 2889 2890 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2891 2892 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2893 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2894 arc_evict_ghost(arc_mru_ghost, 0, delta); 2895 } 2896 2897 adjustment = 2898 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2899 2900 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2901 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2902 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2903 } 2904} 2905 2906static void 2907arc_do_user_evicts(void) 2908{ 2909 static arc_buf_t *tmp_arc_eviction_list; 2910 2911 /* 2912 * Move list over to avoid LOR 2913 */ 
2914restart: 2915 mutex_enter(&arc_eviction_mtx); 2916 tmp_arc_eviction_list = arc_eviction_list; 2917 arc_eviction_list = NULL; 2918 mutex_exit(&arc_eviction_mtx); 2919 2920 while (tmp_arc_eviction_list != NULL) { 2921 arc_buf_t *buf = tmp_arc_eviction_list; 2922 tmp_arc_eviction_list = buf->b_next; 2923 mutex_enter(&buf->b_evict_lock); 2924 buf->b_hdr = NULL; 2925 mutex_exit(&buf->b_evict_lock); 2926 2927 if (buf->b_efunc != NULL) 2928 VERIFY0(buf->b_efunc(buf->b_private)); 2929 2930 buf->b_efunc = NULL; 2931 buf->b_private = NULL; 2932 kmem_cache_free(buf_cache, buf); 2933 } 2934 2935 if (arc_eviction_list != NULL) 2936 goto restart; 2937} 2938 2939/* 2940 * Flush all *evictable* data from the cache for the given spa. 2941 * NOTE: this will not touch "active" (i.e. referenced) data. 2942 */ 2943void 2944arc_flush(spa_t *spa) 2945{ 2946 uint64_t guid = 0; 2947 2948 if (spa != NULL) 2949 guid = spa_load_guid(spa); 2950 2951 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2952 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2953 if (spa != NULL) 2954 break; 2955 } 2956 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2957 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2958 if (spa != NULL) 2959 break; 2960 } 2961 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2962 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2963 if (spa != NULL) 2964 break; 2965 } 2966 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2967 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2968 if (spa != NULL) 2969 break; 2970 } 2971 2972 arc_evict_ghost(arc_mru_ghost, guid, -1); 2973 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2974 2975 mutex_enter(&arc_reclaim_thr_lock); 2976 arc_do_user_evicts(); 2977 mutex_exit(&arc_reclaim_thr_lock); 2978 ASSERT(spa || arc_eviction_list == NULL); 2979} 2980 2981void 2982arc_shrink(void) 2983{ 2984 2985 if (arc_c > arc_c_min) { 2986 uint64_t to_free; 2987 2988 to_free = arc_c >> arc_shrink_shift; 2989 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 2990 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 2991 if (arc_c > arc_c_min + to_free) 2992 atomic_add_64(&arc_c, -to_free); 2993 else 2994 arc_c = arc_c_min; 2995 2996 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2997 if (arc_c > arc_size) 2998 arc_c = MAX(arc_size, arc_c_min); 2999 if (arc_p > arc_c) 3000 arc_p = (arc_c >> 1); 3001 3002 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3003 arc_p); 3004 3005 ASSERT(arc_c >= arc_c_min); 3006 ASSERT((int64_t)arc_p >= 0); 3007 } 3008 3009 if (arc_size > arc_c) { 3010 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3011 uint64_t, arc_c); 3012 arc_adjust(); 3013 } 3014} 3015 3016static int needfree = 0; 3017 3018static int 3019arc_reclaim_needed(void) 3020{ 3021 3022#ifdef _KERNEL 3023 3024 if (needfree) { 3025 DTRACE_PROBE(arc__reclaim_needfree); 3026 return (1); 3027 } 3028 3029 /* 3030 * Cooperate with pagedaemon when it's time for it to scan 3031 * and reclaim some pages. 3032 */ 3033 if (freemem < zfs_arc_free_target) { 3034 DTRACE_PROBE2(arc__reclaim_freemem, uint64_t, 3035 freemem, uint64_t, zfs_arc_free_target); 3036 return (1); 3037 } 3038 3039#ifdef illumos 3040 /* 3041 * take 'desfree' extra pages, so we reclaim sooner, rather than later 3042 */ 3043 extra = desfree; 3044 3045 /* 3046 * check that we're out of range of the pageout scanner. It starts to 3047 * schedule paging if freemem is less than lotsfree and needfree. 
3048 * lotsfree is the high-water mark for pageout, and needfree is the 3049 * number of needed free pages. We add extra pages here to make sure 3050 * the scanner doesn't start up while we're freeing memory. 3051 */ 3052 if (freemem < lotsfree + needfree + extra) 3053 return (1); 3054 3055 /* 3056 * check to make sure that swapfs has enough space so that anon 3057 * reservations can still succeed. anon_resvmem() checks that the 3058 * availrmem is greater than swapfs_minfree, and the number of reserved 3059 * swap pages. We also add a bit of extra here just to prevent 3060 * circumstances from getting really dire. 3061 */ 3062 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 3063 return (1); 3064 3065 /* 3066 * Check that we have enough availrmem that memory locking (e.g., via 3067 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3068 * stores the number of pages that cannot be locked; when availrmem 3069 * drops below pages_pp_maximum, page locking mechanisms such as 3070 * page_pp_lock() will fail.) 3071 */ 3072 if (availrmem <= pages_pp_maximum) 3073 return (1); 3074 3075#endif /* illumos */ 3076#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3077 /* 3078 * If we're on an i386 platform, it's possible that we'll exhaust the 3079 * kernel heap space before we ever run out of available physical 3080 * memory. Most checks of the size of the heap_area compare against 3081 * tune.t_minarmem, which is the minimum available real memory that we 3082 * can have in the system. However, this is generally fixed at 25 pages 3083 * which is so low that it's useless. In this comparison, we seek to 3084 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3085 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3086 * free) 3087 */ 3088 if (vmem_size(heap_arena, VMEM_FREE) < 3089 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) { 3090 DTRACE_PROBE2(arc__reclaim_used, uint64_t, 3091 vmem_size(heap_arena, VMEM_FREE), uint64_t, 3092 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2); 3093 return (1); 3094 } 3095#define zio_arena NULL 3096#else 3097#define zio_arena heap_arena 3098#endif 3099 3100 /* 3101 * If zio data pages are being allocated out of a separate heap segment, 3102 * then enforce that the size of available vmem for this arena remains 3103 * above about 1/16th free. 3104 * 3105 * Note: The 1/16th arena free requirement was put in place 3106 * to aggressively evict memory from the arc in order to avoid 3107 * memory fragmentation issues. 3108 */ 3109 if (zio_arena != NULL && 3110 vmem_size(zio_arena, VMEM_FREE) < 3111 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 3112 return (1); 3113 3114 /* 3115 * Above limits know nothing about real level of KVA fragmentation. 3116 * Start aggressive reclamation if too little sequential KVA left. 
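 * "Sequential KVA" here means the largest contiguous free chunk in the
 * kernel heap arena, as reported by vmem_size(heap_arena, VMEM_MAXFREE)
 * in the check below.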
3117 */ 3118 if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) { 3119 DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t, 3120 vmem_size(heap_arena, VMEM_MAXFREE), 3121 uint64_t, zfs_max_recordsize); 3122 return (1); 3123 } 3124 3125#else /* _KERNEL */ 3126 if (spa_get_random(100) == 0) 3127 return (1); 3128#endif /* _KERNEL */ 3129 DTRACE_PROBE(arc__reclaim_no); 3130 3131 return (0); 3132} 3133 3134extern kmem_cache_t *zio_buf_cache[]; 3135extern kmem_cache_t *zio_data_buf_cache[]; 3136extern kmem_cache_t *range_seg_cache; 3137 3138static __noinline void 3139arc_kmem_reap_now(arc_reclaim_strategy_t strat) 3140{ 3141 size_t i; 3142 kmem_cache_t *prev_cache = NULL; 3143 kmem_cache_t *prev_data_cache = NULL; 3144 3145 DTRACE_PROBE(arc__kmem_reap_start); 3146#ifdef _KERNEL 3147 if (arc_meta_used >= arc_meta_limit) { 3148 /* 3149 * We are exceeding our meta-data cache limit. 3150 * Purge some DNLC entries to release holds on meta-data. 3151 */ 3152 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3153 } 3154#if defined(__i386) 3155 /* 3156 * Reclaim unused memory from all kmem caches. 3157 */ 3158 kmem_reap(); 3159#endif 3160#endif 3161 3162 /* 3163 * An aggressive reclamation will shrink the cache size as well as 3164 * reap free buffers from the arc kmem caches. 3165 */ 3166 if (strat == ARC_RECLAIM_AGGR) 3167 arc_shrink(); 3168 3169 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3170 if (zio_buf_cache[i] != prev_cache) { 3171 prev_cache = zio_buf_cache[i]; 3172 kmem_cache_reap_now(zio_buf_cache[i]); 3173 } 3174 if (zio_data_buf_cache[i] != prev_data_cache) { 3175 prev_data_cache = zio_data_buf_cache[i]; 3176 kmem_cache_reap_now(zio_data_buf_cache[i]); 3177 } 3178 } 3179 kmem_cache_reap_now(buf_cache); 3180 kmem_cache_reap_now(hdr_full_cache); 3181 kmem_cache_reap_now(hdr_l2only_cache); 3182 kmem_cache_reap_now(range_seg_cache); 3183 3184#ifdef illumos 3185 /* 3186 * Ask the vmem arena to reclaim unused memory from its 3187 * quantum caches. 3188 */ 3189 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 3190 vmem_qcache_reap(zio_arena); 3191#endif 3192 DTRACE_PROBE(arc__kmem_reap_end); 3193} 3194 3195static void 3196arc_reclaim_thread(void *dummy __unused) 3197{ 3198 clock_t growtime = 0; 3199 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 3200 callb_cpr_t cpr; 3201 3202 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 3203 3204 mutex_enter(&arc_reclaim_thr_lock); 3205 while (arc_thread_exit == 0) { 3206 if (arc_reclaim_needed()) { 3207 3208 if (arc_no_grow) { 3209 if (last_reclaim == ARC_RECLAIM_CONS) { 3210 DTRACE_PROBE(arc__reclaim_aggr_no_grow); 3211 last_reclaim = ARC_RECLAIM_AGGR; 3212 } else { 3213 last_reclaim = ARC_RECLAIM_CONS; 3214 } 3215 } else { 3216 arc_no_grow = TRUE; 3217 last_reclaim = ARC_RECLAIM_AGGR; 3218 DTRACE_PROBE(arc__reclaim_aggr); 3219 membar_producer(); 3220 } 3221 3222 /* reset the growth delay for every reclaim */ 3223 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 3224 3225 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 3226 /* 3227 * If needfree is TRUE our vm_lowmem hook 3228 * was called and in that case we must free some 3229 * memory, so switch to aggressive mode. 
3230 */ 3231 arc_no_grow = TRUE; 3232 last_reclaim = ARC_RECLAIM_AGGR; 3233 } 3234 arc_kmem_reap_now(last_reclaim); 3235 arc_warm = B_TRUE; 3236 3237 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 3238 arc_no_grow = FALSE; 3239 } 3240 3241 arc_adjust(); 3242 3243 if (arc_eviction_list != NULL) 3244 arc_do_user_evicts(); 3245 3246#ifdef _KERNEL 3247 if (needfree) { 3248 needfree = 0; 3249 wakeup(&needfree); 3250 } 3251#endif 3252 3253 /* 3254 * This is necessary in order for the mdb ::arc dcmd to 3255 * show up to date information. Since the ::arc command 3256 * does not call the kstat's update function, without 3257 * this call, the command may show stale stats for the 3258 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3259 * with this change, the data might be up to 1 second 3260 * out of date; but that should suffice. The arc_state_t 3261 * structures can be queried directly if more accurate 3262 * information is needed. 3263 */ 3264 if (arc_ksp != NULL) 3265 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3266 3267 /* block until needed, or one second, whichever is shorter */ 3268 CALLB_CPR_SAFE_BEGIN(&cpr); 3269 (void) cv_timedwait(&arc_reclaim_thr_cv, 3270 &arc_reclaim_thr_lock, hz); 3271 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 3272 } 3273 3274 arc_thread_exit = 0; 3275 cv_broadcast(&arc_reclaim_thr_cv); 3276 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 3277 thread_exit(); 3278} 3279 3280/* 3281 * Adapt arc info given the number of bytes we are trying to add and 3282 * the state that we are coming from. This function is only called 3283 * when we are adding new content to the cache. 3284 */ 3285static void 3286arc_adapt(int bytes, arc_state_t *state) 3287{ 3288 int mult; 3289 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3290 3291 if (state == arc_l2c_only) 3292 return; 3293 3294 ASSERT(bytes > 0); 3295 /* 3296 * Adapt the target size of the MRU list: 3297 * - if we just hit in the MRU ghost list, then increase 3298 * the target size of the MRU list. 3299 * - if we just hit in the MFU ghost list, then increase 3300 * the target size of the MFU list by decreasing the 3301 * target size of the MRU list. 3302 */ 3303 if (state == arc_mru_ghost) { 3304 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 3305 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 3306 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3307 3308 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3309 } else if (state == arc_mfu_ghost) { 3310 uint64_t delta; 3311 3312 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3313 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 3314 mult = MIN(mult, 10); 3315 3316 delta = MIN(bytes * mult, arc_p); 3317 arc_p = MAX(arc_p_min, arc_p - delta); 3318 } 3319 ASSERT((int64_t)arc_p >= 0); 3320 3321 if (arc_reclaim_needed()) { 3322 cv_signal(&arc_reclaim_thr_cv); 3323 return; 3324 } 3325 3326 if (arc_no_grow) 3327 return; 3328 3329 if (arc_c >= arc_c_max) 3330 return; 3331 3332 /* 3333 * If we're within (2 * maxblocksize) bytes of the target 3334 * cache size, increment the target cache size 3335 */ 3336 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3337 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3338 atomic_add_64(&arc_c, (int64_t)bytes); 3339 if (arc_c > arc_c_max) 3340 arc_c = arc_c_max; 3341 else if (state == arc_anon) 3342 atomic_add_64(&arc_p, (int64_t)bytes); 3343 if (arc_p > arc_c) 3344 arc_p = arc_c; 3345 } 3346 ASSERT((int64_t)arc_p >= 0); 3347} 3348 3349/* 3350 * Check if the cache has reached its limits and eviction is required 3351 * prior to insert. 3352 */ 3353static int 3354arc_evict_needed(arc_buf_contents_t type) 3355{ 3356 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 3357 return (1); 3358 3359 if (arc_reclaim_needed()) 3360 return (1); 3361 3362 return (arc_size > arc_c); 3363} 3364 3365/* 3366 * The buffer, supplied as the first argument, needs a data block. 3367 * So, if we are at cache max, determine which cache should be victimized. 3368 * We have the following cases: 3369 * 3370 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 3371 * In this situation if we're out of space, but the resident size of the MFU is 3372 * under the limit, victimize the MFU cache to satisfy this insertion request. 3373 * 3374 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 3375 * Here, we've used up all of the available space for the MRU, so we need to 3376 * evict from our own cache instead. Evict from the set of resident MRU 3377 * entries. 3378 * 3379 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 3380 * c minus p represents the MFU space in the cache, since p is the size of the 3381 * cache that is dedicated to the MRU. In this situation there's still space on 3382 * the MFU side, so the MRU side needs to be victimized. 3383 * 3384 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 3385 * MFU's resident set is consuming more space than it has been allotted. In 3386 * this situation, we must victimize our own cache, the MFU, for this insertion. 3387 */ 3388static void 3389arc_get_data_buf(arc_buf_t *buf) 3390{ 3391 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3392 uint64_t size = buf->b_hdr->b_size; 3393 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3394 3395 arc_adapt(size, state); 3396 3397 /* 3398 * We have not yet reached cache maximum size, 3399 * just allocate a new buffer. 3400 */ 3401 if (!arc_evict_needed(type)) { 3402 if (type == ARC_BUFC_METADATA) { 3403 buf->b_data = zio_buf_alloc(size); 3404 arc_space_consume(size, ARC_SPACE_META); 3405 } else { 3406 ASSERT(type == ARC_BUFC_DATA); 3407 buf->b_data = zio_data_buf_alloc(size); 3408 arc_space_consume(size, ARC_SPACE_DATA); 3409 } 3410 goto out; 3411 } 3412 3413 /* 3414 * If we are prefetching from the mfu ghost list, this buffer 3415 * will end up on the mru list; so steal space from there. 3416 */ 3417 if (state == arc_mfu_ghost) 3418 state = HDR_PREFETCH(buf->b_hdr) ? 
arc_mru : arc_mfu; 3419 else if (state == arc_mru_ghost) 3420 state = arc_mru; 3421 3422 if (state == arc_mru || state == arc_anon) { 3423 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 3424 state = (arc_mfu->arcs_lsize[type] >= size && 3425 arc_p > mru_used) ? arc_mfu : arc_mru; 3426 } else { 3427 /* MFU cases */ 3428 uint64_t mfu_space = arc_c - arc_p; 3429 state = (arc_mru->arcs_lsize[type] >= size && 3430 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 3431 } 3432 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 3433 if (type == ARC_BUFC_METADATA) { 3434 buf->b_data = zio_buf_alloc(size); 3435 arc_space_consume(size, ARC_SPACE_META); 3436 } else { 3437 ASSERT(type == ARC_BUFC_DATA); 3438 buf->b_data = zio_data_buf_alloc(size); 3439 arc_space_consume(size, ARC_SPACE_DATA); 3440 } 3441 ARCSTAT_BUMP(arcstat_recycle_miss); 3442 } 3443 ASSERT(buf->b_data != NULL); 3444out: 3445 /* 3446 * Update the state size. Note that ghost states have a 3447 * "ghost size" and so don't need to be updated. 3448 */ 3449 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3450 arc_buf_hdr_t *hdr = buf->b_hdr; 3451 3452 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); 3453 if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { 3454 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3455 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3456 size); 3457 } 3458 /* 3459 * If we are growing the cache, and we are adding anonymous 3460 * data, and we have outgrown arc_p, update arc_p 3461 */ 3462 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3463 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 3464 arc_p = MIN(arc_c, arc_p + size); 3465 } 3466 ARCSTAT_BUMP(arcstat_allocated); 3467} 3468 3469/* 3470 * This routine is called whenever a buffer is accessed. 3471 * NOTE: the hash lock is dropped in this function. 3472 */ 3473static void 3474arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3475{ 3476 clock_t now; 3477 3478 ASSERT(MUTEX_HELD(hash_lock)); 3479 ASSERT(HDR_HAS_L1HDR(hdr)); 3480 3481 if (hdr->b_l1hdr.b_state == arc_anon) { 3482 /* 3483 * This buffer is not in the cache, and does not 3484 * appear in our "ghost" list. Add the new buffer 3485 * to the MRU state. 3486 */ 3487 3488 ASSERT0(hdr->b_l1hdr.b_arc_access); 3489 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3490 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3491 arc_change_state(arc_mru, hdr, hash_lock); 3492 3493 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3494 now = ddi_get_lbolt(); 3495 3496 /* 3497 * If this buffer is here because of a prefetch, then either: 3498 * - clear the flag if this is a "referencing" read 3499 * (any subsequent access will bump this into the MFU state). 3500 * or 3501 * - move the buffer to the head of the list if this is 3502 * another prefetch (to make it less likely to be evicted). 3503 */ 3504 if (HDR_PREFETCH(hdr)) { 3505 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3506 ASSERT(list_link_active( 3507 &hdr->b_l1hdr.b_arc_node)); 3508 } else { 3509 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3510 ARCSTAT_BUMP(arcstat_mru_hits); 3511 } 3512 hdr->b_l1hdr.b_arc_access = now; 3513 return; 3514 } 3515 3516 /* 3517 * This buffer has been "accessed" only once so far, 3518 * but it is still in the cache. Move it to the MFU 3519 * state. 3520 */ 3521 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3522 /* 3523 * More than 125ms have passed since we 3524 * instantiated this buffer. Move it to the 3525 * most frequently used state. 
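 * ("More than 125ms" really means more than ARC_MINTIME ticks, which is
 * currently hz>>4, i.e. roughly 62ms.)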
3526 */ 3527 hdr->b_l1hdr.b_arc_access = now; 3528 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3529 arc_change_state(arc_mfu, hdr, hash_lock); 3530 } 3531 ARCSTAT_BUMP(arcstat_mru_hits); 3532 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3533 arc_state_t *new_state; 3534 /* 3535 * This buffer has been "accessed" recently, but 3536 * was evicted from the cache. Move it to the 3537 * MFU state. 3538 */ 3539 3540 if (HDR_PREFETCH(hdr)) { 3541 new_state = arc_mru; 3542 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3543 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3544 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3545 } else { 3546 new_state = arc_mfu; 3547 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3548 } 3549 3550 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3551 arc_change_state(new_state, hdr, hash_lock); 3552 3553 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3554 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3555 /* 3556 * This buffer has been accessed more than once and is 3557 * still in the cache. Keep it in the MFU state. 3558 * 3559 * NOTE: an add_reference() that occurred when we did 3560 * the arc_read() will have kicked this off the list. 3561 * If it was a prefetch, we will explicitly move it to 3562 * the head of the list now. 3563 */ 3564 if ((HDR_PREFETCH(hdr)) != 0) { 3565 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3566 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 3567 } 3568 ARCSTAT_BUMP(arcstat_mfu_hits); 3569 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3570 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3571 arc_state_t *new_state = arc_mfu; 3572 /* 3573 * This buffer has been accessed more than once but has 3574 * been evicted from the cache. Move it back to the 3575 * MFU state. 3576 */ 3577 3578 if (HDR_PREFETCH(hdr)) { 3579 /* 3580 * This is a prefetch access... 3581 * move this block back to the MRU state. 3582 */ 3583 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3584 new_state = arc_mru; 3585 } 3586 3587 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3588 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3589 arc_change_state(new_state, hdr, hash_lock); 3590 3591 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3592 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3593 /* 3594 * This buffer is on the 2nd Level ARC. 
3595 */ 3596 3597 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3598 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3599 arc_change_state(arc_mfu, hdr, hash_lock); 3600 } else { 3601 ASSERT(!"invalid arc state"); 3602 } 3603} 3604 3605/* a generic arc_done_func_t which you can use */ 3606/* ARGSUSED */ 3607void 3608arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3609{ 3610 if (zio == NULL || zio->io_error == 0) 3611 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3612 VERIFY(arc_buf_remove_ref(buf, arg)); 3613} 3614 3615/* a generic arc_done_func_t */ 3616void 3617arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3618{ 3619 arc_buf_t **bufp = arg; 3620 if (zio && zio->io_error) { 3621 VERIFY(arc_buf_remove_ref(buf, arg)); 3622 *bufp = NULL; 3623 } else { 3624 *bufp = buf; 3625 ASSERT(buf->b_data); 3626 } 3627} 3628 3629static void 3630arc_read_done(zio_t *zio) 3631{ 3632 arc_buf_hdr_t *hdr; 3633 arc_buf_t *buf; 3634 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3635 kmutex_t *hash_lock = NULL; 3636 arc_callback_t *callback_list, *acb; 3637 int freeable = FALSE; 3638 3639 buf = zio->io_private; 3640 hdr = buf->b_hdr; 3641 3642 /* 3643 * The hdr was inserted into hash-table and removed from lists 3644 * prior to starting I/O. We should find this header, since 3645 * it's in the hash table, and it should be legit since it's 3646 * not possible to evict it during the I/O. The only possible 3647 * reason for it not to be found is if we were freed during the 3648 * read. 3649 */ 3650 if (HDR_IN_HASH_TABLE(hdr)) { 3651 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3652 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3653 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3654 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3655 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3656 3657 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3658 &hash_lock); 3659 3660 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3661 hash_lock == NULL) || 3662 (found == hdr && 3663 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3664 (found == hdr && HDR_L2_READING(hdr))); 3665 } 3666 3667 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3668 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3669 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3670 3671 /* byteswap if necessary */ 3672 callback_list = hdr->b_l1hdr.b_acb; 3673 ASSERT(callback_list != NULL); 3674 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3675 dmu_object_byteswap_t bswap = 3676 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3677 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3678 byteswap_uint64_array : 3679 dmu_ot_byteswap[bswap].ob_func; 3680 func(buf->b_data, hdr->b_size); 3681 } 3682 3683 arc_cksum_compute(buf, B_FALSE); 3684#ifdef illumos 3685 arc_buf_watch(buf); 3686#endif 3687 3688 if (hash_lock && zio->io_error == 0 && 3689 hdr->b_l1hdr.b_state == arc_anon) { 3690 /* 3691 * Only call arc_access on anonymous buffers. This is because 3692 * if we've issued an I/O for an evicted buffer, we've already 3693 * called arc_access (to prevent any simultaneous readers from 3694 * getting confused). 
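 * Buffers that were found in a ghost state already had arc_access()
 * called from arc_read() before this I/O was issued, so calling it again
 * here would promote them twice for a single access.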
3695 */ 3696 arc_access(hdr, hash_lock); 3697 } 3698 3699 /* create copies of the data buffer for the callers */ 3700 abuf = buf; 3701 for (acb = callback_list; acb; acb = acb->acb_next) { 3702 if (acb->acb_done) { 3703 if (abuf == NULL) { 3704 ARCSTAT_BUMP(arcstat_duplicate_reads); 3705 abuf = arc_buf_clone(buf); 3706 } 3707 acb->acb_buf = abuf; 3708 abuf = NULL; 3709 } 3710 } 3711 hdr->b_l1hdr.b_acb = NULL; 3712 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 3713 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3714 if (abuf == buf) { 3715 ASSERT(buf->b_efunc == NULL); 3716 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 3717 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 3718 } 3719 3720 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 3721 callback_list != NULL); 3722 3723 if (zio->io_error != 0) { 3724 hdr->b_flags |= ARC_FLAG_IO_ERROR; 3725 if (hdr->b_l1hdr.b_state != arc_anon) 3726 arc_change_state(arc_anon, hdr, hash_lock); 3727 if (HDR_IN_HASH_TABLE(hdr)) 3728 buf_hash_remove(hdr); 3729 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3730 } 3731 3732 /* 3733 * Broadcast before we drop the hash_lock to avoid the possibility 3734 * that the hdr (and hence the cv) might be freed before we get to 3735 * the cv_broadcast(). 3736 */ 3737 cv_broadcast(&hdr->b_l1hdr.b_cv); 3738 3739 if (hash_lock != NULL) { 3740 mutex_exit(hash_lock); 3741 } else { 3742 /* 3743 * This block was freed while we waited for the read to 3744 * complete. It has been removed from the hash table and 3745 * moved to the anonymous state (so that it won't show up 3746 * in the cache). 3747 */ 3748 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3749 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3750 } 3751 3752 /* execute each callback and free its structure */ 3753 while ((acb = callback_list) != NULL) { 3754 if (acb->acb_done) 3755 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3756 3757 if (acb->acb_zio_dummy != NULL) { 3758 acb->acb_zio_dummy->io_error = zio->io_error; 3759 zio_nowait(acb->acb_zio_dummy); 3760 } 3761 3762 callback_list = acb->acb_next; 3763 kmem_free(acb, sizeof (arc_callback_t)); 3764 } 3765 3766 if (freeable) 3767 arc_hdr_destroy(hdr); 3768} 3769 3770/* 3771 * "Read" the block at the specified DVA (in bp) via the 3772 * cache. If the block is found in the cache, invoke the provided 3773 * callback immediately and return. Note that the `zio' parameter 3774 * in the callback will be NULL in this case, since no IO was 3775 * required. If the block is not in the cache, pass the read request 3776 * on to the spa with a substitute callback function, so that the 3777 * requested block will be added to the cache. 3778 * 3779 * If a read request arrives for a block that has a read in-progress, 3780 * either wait for the in-progress read to complete (and return the 3781 * results); or, if this is a read with a "done" func, add a record 3782 * to the read to invoke the "done" func when the read completes, 3783 * and return; or just return. 3784 * 3785 * arc_read_done() will invoke all the requested "done" functions 3786 * for readers of this block.
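 *
 * As an illustrative sketch (not a verbatim caller), a simple blocking
 * read into a caller-owned arc_buf_t pointer might look like:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *buf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * where arc_getbuf_func() above stores the buffer on success and NULLs
 * it out on error.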
3787 */ 3788int 3789arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3790 void *private, zio_priority_t priority, int zio_flags, 3791 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 3792{ 3793 arc_buf_hdr_t *hdr = NULL; 3794 arc_buf_t *buf = NULL; 3795 kmutex_t *hash_lock = NULL; 3796 zio_t *rzio; 3797 uint64_t guid = spa_load_guid(spa); 3798 3799 ASSERT(!BP_IS_EMBEDDED(bp) || 3800 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 3801 3802top: 3803 if (!BP_IS_EMBEDDED(bp)) { 3804 /* 3805 * Embedded BP's have no DVA and require no I/O to "read". 3806 * Create an anonymous arc buf to back it. 3807 */ 3808 hdr = buf_hash_find(guid, bp, &hash_lock); 3809 } 3810 3811 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 3812 3813 *arc_flags |= ARC_FLAG_CACHED; 3814 3815 if (HDR_IO_IN_PROGRESS(hdr)) { 3816 3817 if (*arc_flags & ARC_FLAG_WAIT) { 3818 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 3819 mutex_exit(hash_lock); 3820 goto top; 3821 } 3822 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 3823 3824 if (done) { 3825 arc_callback_t *acb = NULL; 3826 3827 acb = kmem_zalloc(sizeof (arc_callback_t), 3828 KM_SLEEP); 3829 acb->acb_done = done; 3830 acb->acb_private = private; 3831 if (pio != NULL) 3832 acb->acb_zio_dummy = zio_null(pio, 3833 spa, NULL, NULL, NULL, zio_flags); 3834 3835 ASSERT(acb->acb_done != NULL); 3836 acb->acb_next = hdr->b_l1hdr.b_acb; 3837 hdr->b_l1hdr.b_acb = acb; 3838 add_reference(hdr, hash_lock, private); 3839 mutex_exit(hash_lock); 3840 return (0); 3841 } 3842 mutex_exit(hash_lock); 3843 return (0); 3844 } 3845 3846 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 3847 hdr->b_l1hdr.b_state == arc_mfu); 3848 3849 if (done) { 3850 add_reference(hdr, hash_lock, private); 3851 /* 3852 * If this block is already in use, create a new 3853 * copy of the data so that we will be guaranteed 3854 * that arc_release() will always succeed. 
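 * (If the header's buffer is still flagged ARC_FLAG_BUF_AVAILABLE it is
 * handed out directly; otherwise arc_buf_clone() below creates the copy.)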
3855 */ 3856 buf = hdr->b_l1hdr.b_buf; 3857 ASSERT(buf); 3858 ASSERT(buf->b_data); 3859 if (HDR_BUF_AVAILABLE(hdr)) { 3860 ASSERT(buf->b_efunc == NULL); 3861 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3862 } else { 3863 buf = arc_buf_clone(buf); 3864 } 3865 3866 } else if (*arc_flags & ARC_FLAG_PREFETCH && 3867 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3868 hdr->b_flags |= ARC_FLAG_PREFETCH; 3869 } 3870 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3871 arc_access(hdr, hash_lock); 3872 if (*arc_flags & ARC_FLAG_L2CACHE) 3873 hdr->b_flags |= ARC_FLAG_L2CACHE; 3874 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3875 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3876 mutex_exit(hash_lock); 3877 ARCSTAT_BUMP(arcstat_hits); 3878 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3879 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3880 data, metadata, hits); 3881 3882 if (done) 3883 done(NULL, buf, private); 3884 } else { 3885 uint64_t size = BP_GET_LSIZE(bp); 3886 arc_callback_t *acb; 3887 vdev_t *vd = NULL; 3888 uint64_t addr = 0; 3889 boolean_t devw = B_FALSE; 3890 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3891 int32_t b_asize = 0; 3892 3893 if (hdr == NULL) { 3894 /* this block is not in the cache */ 3895 arc_buf_hdr_t *exists = NULL; 3896 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3897 buf = arc_buf_alloc(spa, size, private, type); 3898 hdr = buf->b_hdr; 3899 if (!BP_IS_EMBEDDED(bp)) { 3900 hdr->b_dva = *BP_IDENTITY(bp); 3901 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3902 exists = buf_hash_insert(hdr, &hash_lock); 3903 } 3904 if (exists != NULL) { 3905 /* somebody beat us to the hash insert */ 3906 mutex_exit(hash_lock); 3907 buf_discard_identity(hdr); 3908 (void) arc_buf_remove_ref(buf, private); 3909 goto top; /* restart the IO request */ 3910 } 3911 3912 /* if this is a prefetch, we don't have a reference */ 3913 if (*arc_flags & ARC_FLAG_PREFETCH) { 3914 (void) remove_reference(hdr, hash_lock, 3915 private); 3916 hdr->b_flags |= ARC_FLAG_PREFETCH; 3917 } 3918 if (*arc_flags & ARC_FLAG_L2CACHE) 3919 hdr->b_flags |= ARC_FLAG_L2CACHE; 3920 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3921 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3922 if (BP_GET_LEVEL(bp) > 0) 3923 hdr->b_flags |= ARC_FLAG_INDIRECT; 3924 } else { 3925 /* 3926 * This block is in the ghost cache. If it was L2-only 3927 * (and thus didn't have an L1 hdr), we realloc the 3928 * header to add an L1 hdr. 
3929 */ 3930 if (!HDR_HAS_L1HDR(hdr)) { 3931 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 3932 hdr_full_cache); 3933 } 3934 3935 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 3936 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3937 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3938 ASSERT(hdr->b_l1hdr.b_buf == NULL); 3939 3940 /* if this is a prefetch, we don't have a reference */ 3941 if (*arc_flags & ARC_FLAG_PREFETCH) 3942 hdr->b_flags |= ARC_FLAG_PREFETCH; 3943 else 3944 add_reference(hdr, hash_lock, private); 3945 if (*arc_flags & ARC_FLAG_L2CACHE) 3946 hdr->b_flags |= ARC_FLAG_L2CACHE; 3947 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3948 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3949 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3950 buf->b_hdr = hdr; 3951 buf->b_data = NULL; 3952 buf->b_efunc = NULL; 3953 buf->b_private = NULL; 3954 buf->b_next = NULL; 3955 hdr->b_l1hdr.b_buf = buf; 3956 ASSERT0(hdr->b_l1hdr.b_datacnt); 3957 hdr->b_l1hdr.b_datacnt = 1; 3958 arc_get_data_buf(buf); 3959 arc_access(hdr, hash_lock); 3960 } 3961 3962 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 3963 3964 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3965 acb->acb_done = done; 3966 acb->acb_private = private; 3967 3968 ASSERT(hdr->b_l1hdr.b_acb == NULL); 3969 hdr->b_l1hdr.b_acb = acb; 3970 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 3971 3972 if (HDR_HAS_L2HDR(hdr) && 3973 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 3974 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 3975 addr = hdr->b_l2hdr.b_daddr; 3976 b_compress = HDR_GET_COMPRESS(hdr); 3977 b_asize = hdr->b_l2hdr.b_asize; 3978 /* 3979 * Lock out device removal. 3980 */ 3981 if (vdev_is_dead(vd) || 3982 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3983 vd = NULL; 3984 } 3985 3986 if (hash_lock != NULL) 3987 mutex_exit(hash_lock); 3988 3989 /* 3990 * At this point, we have a level 1 cache miss. Try again in 3991 * L2ARC if possible. 3992 */ 3993 ASSERT3U(hdr->b_size, ==, size); 3994 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 3995 uint64_t, size, zbookmark_phys_t *, zb); 3996 ARCSTAT_BUMP(arcstat_misses); 3997 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3998 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3999 data, metadata, misses); 4000#ifdef _KERNEL 4001 curthread->td_ru.ru_inblock++; 4002#endif 4003 4004 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4005 /* 4006 * Read from the L2ARC if the following are true: 4007 * 1. The L2ARC vdev was previously cached. 4008 * 2. This buffer still has L2ARC metadata. 4009 * 3. This buffer isn't currently writing to the L2ARC. 4010 * 4. The L2ARC entry wasn't evicted, which may 4011 * also have invalidated the vdev. 4012 * 5. This isn't prefetch and l2arc_noprefetch is set. 4013 */ 4014 if (HDR_HAS_L2HDR(hdr) && 4015 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4016 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4017 l2arc_read_callback_t *cb; 4018 4019 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4020 ARCSTAT_BUMP(arcstat_l2_hits); 4021 4022 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4023 KM_SLEEP); 4024 cb->l2rcb_buf = buf; 4025 cb->l2rcb_spa = spa; 4026 cb->l2rcb_bp = *bp; 4027 cb->l2rcb_zb = *zb; 4028 cb->l2rcb_flags = zio_flags; 4029 cb->l2rcb_compress = b_compress; 4030 4031 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4032 addr + size < vd->vdev_psize - 4033 VDEV_LABEL_END_SIZE); 4034 4035 /* 4036 * l2arc read. The SCL_L2ARC lock will be 4037 * released by l2arc_read_done(). 4038 * Issue a null zio if the underlying buffer 4039 * was squashed to zero size by compression. 
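 * (A ZIO_COMPRESS_EMPTY buffer carries no on-device data; it is
 * reconstructed, zero-filled, by the decompression step in
 * l2arc_read_done(), so no device read is required.)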
4040 */ 4041 if (b_compress == ZIO_COMPRESS_EMPTY) { 4042 rzio = zio_null(pio, spa, vd, 4043 l2arc_read_done, cb, 4044 zio_flags | ZIO_FLAG_DONT_CACHE | 4045 ZIO_FLAG_CANFAIL | 4046 ZIO_FLAG_DONT_PROPAGATE | 4047 ZIO_FLAG_DONT_RETRY); 4048 } else { 4049 rzio = zio_read_phys(pio, vd, addr, 4050 b_asize, buf->b_data, 4051 ZIO_CHECKSUM_OFF, 4052 l2arc_read_done, cb, priority, 4053 zio_flags | ZIO_FLAG_DONT_CACHE | 4054 ZIO_FLAG_CANFAIL | 4055 ZIO_FLAG_DONT_PROPAGATE | 4056 ZIO_FLAG_DONT_RETRY, B_FALSE); 4057 } 4058 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4059 zio_t *, rzio); 4060 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4061 4062 if (*arc_flags & ARC_FLAG_NOWAIT) { 4063 zio_nowait(rzio); 4064 return (0); 4065 } 4066 4067 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4068 if (zio_wait(rzio) == 0) 4069 return (0); 4070 4071 /* l2arc read error; goto zio_read() */ 4072 } else { 4073 DTRACE_PROBE1(l2arc__miss, 4074 arc_buf_hdr_t *, hdr); 4075 ARCSTAT_BUMP(arcstat_l2_misses); 4076 if (HDR_L2_WRITING(hdr)) 4077 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4078 spa_config_exit(spa, SCL_L2ARC, vd); 4079 } 4080 } else { 4081 if (vd != NULL) 4082 spa_config_exit(spa, SCL_L2ARC, vd); 4083 if (l2arc_ndev != 0) { 4084 DTRACE_PROBE1(l2arc__miss, 4085 arc_buf_hdr_t *, hdr); 4086 ARCSTAT_BUMP(arcstat_l2_misses); 4087 } 4088 } 4089 4090 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4091 arc_read_done, buf, priority, zio_flags, zb); 4092 4093 if (*arc_flags & ARC_FLAG_WAIT) 4094 return (zio_wait(rzio)); 4095 4096 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4097 zio_nowait(rzio); 4098 } 4099 return (0); 4100} 4101 4102void 4103arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4104{ 4105 ASSERT(buf->b_hdr != NULL); 4106 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4107 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4108 func == NULL); 4109 ASSERT(buf->b_efunc == NULL); 4110 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4111 4112 buf->b_efunc = func; 4113 buf->b_private = private; 4114} 4115 4116/* 4117 * Notify the arc that a block was freed, and thus will never be used again. 4118 */ 4119void 4120arc_freed(spa_t *spa, const blkptr_t *bp) 4121{ 4122 arc_buf_hdr_t *hdr; 4123 kmutex_t *hash_lock; 4124 uint64_t guid = spa_load_guid(spa); 4125 4126 ASSERT(!BP_IS_EMBEDDED(bp)); 4127 4128 hdr = buf_hash_find(guid, bp, &hash_lock); 4129 if (hdr == NULL) 4130 return; 4131 if (HDR_BUF_AVAILABLE(hdr)) { 4132 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4133 add_reference(hdr, hash_lock, FTAG); 4134 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4135 mutex_exit(hash_lock); 4136 4137 arc_release(buf, FTAG); 4138 (void) arc_buf_remove_ref(buf, FTAG); 4139 } else { 4140 mutex_exit(hash_lock); 4141 } 4142 4143} 4144 4145/* 4146 * Clear the user eviction callback set by arc_set_callback(), first calling 4147 * it if it exists. Because the presence of a callback keeps an arc_buf cached 4148 * clearing the callback may result in the arc_buf being destroyed. However, 4149 * it will not result in the *last* arc_buf being destroyed, hence the data 4150 * will remain cached in the ARC. We make a copy of the arc buffer here so 4151 * that we can process the callback without holding any locks. 4152 * 4153 * It's possible that the callback is already in the process of being cleared 4154 * by another thread. In this case we can not clear the callback. 4155 * 4156 * Returns B_TRUE if the callback was successfully called and cleared. 
4157 */ 4158boolean_t 4159arc_clear_callback(arc_buf_t *buf) 4160{ 4161 arc_buf_hdr_t *hdr; 4162 kmutex_t *hash_lock; 4163 arc_evict_func_t *efunc = buf->b_efunc; 4164 void *private = buf->b_private; 4165 list_t *list, *evicted_list; 4166 kmutex_t *lock, *evicted_lock; 4167 4168 mutex_enter(&buf->b_evict_lock); 4169 hdr = buf->b_hdr; 4170 if (hdr == NULL) { 4171 /* 4172 * We are in arc_do_user_evicts(). 4173 */ 4174 ASSERT(buf->b_data == NULL); 4175 mutex_exit(&buf->b_evict_lock); 4176 return (B_FALSE); 4177 } else if (buf->b_data == NULL) { 4178 /* 4179 * We are on the eviction list; process this buffer now 4180 * but let arc_do_user_evicts() do the reaping. 4181 */ 4182 buf->b_efunc = NULL; 4183 mutex_exit(&buf->b_evict_lock); 4184 VERIFY0(efunc(private)); 4185 return (B_TRUE); 4186 } 4187 hash_lock = HDR_LOCK(hdr); 4188 mutex_enter(hash_lock); 4189 hdr = buf->b_hdr; 4190 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4191 4192 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4193 hdr->b_l1hdr.b_datacnt); 4194 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4195 hdr->b_l1hdr.b_state == arc_mfu); 4196 4197 buf->b_efunc = NULL; 4198 buf->b_private = NULL; 4199 4200 if (hdr->b_l1hdr.b_datacnt > 1) { 4201 mutex_exit(&buf->b_evict_lock); 4202 arc_buf_destroy(buf, FALSE, TRUE); 4203 } else { 4204 ASSERT(buf == hdr->b_l1hdr.b_buf); 4205 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4206 mutex_exit(&buf->b_evict_lock); 4207 } 4208 4209 mutex_exit(hash_lock); 4210 VERIFY0(efunc(private)); 4211 return (B_TRUE); 4212} 4213 4214/* 4215 * Release this buffer from the cache, making it an anonymous buffer. This 4216 * must be done after a read and prior to modifying the buffer contents. 4217 * If the buffer has more than one reference, we must make 4218 * a new hdr for the buffer. 4219 */ 4220void 4221arc_release(arc_buf_t *buf, void *tag) 4222{ 4223 arc_buf_hdr_t *hdr = buf->b_hdr; 4224 4225 /* 4226 * It would be nice to assert that if it's DMU metadata (level > 4227 * 0 || it's the dnode file), then it must be syncing context. 4228 * But we don't know that information at this level. 4229 */ 4230 4231 mutex_enter(&buf->b_evict_lock); 4232 /* 4233 * We don't grab the hash lock prior to this check, because if 4234 * the buffer's header is in the arc_anon state, it won't be 4235 * linked into the hash table. 4236 */ 4237 if (hdr->b_l1hdr.b_state == arc_anon) { 4238 mutex_exit(&buf->b_evict_lock); 4239 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4240 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4241 ASSERT(!HDR_HAS_L2HDR(hdr)); 4242 ASSERT(BUF_EMPTY(hdr)); 4243 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4244 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4245 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4246 4247 ASSERT3P(buf->b_efunc, ==, NULL); 4248 ASSERT3P(buf->b_private, ==, NULL); 4249 4250 hdr->b_l1hdr.b_arc_access = 0; 4251 arc_buf_thaw(buf); 4252 4253 return; 4254 } 4255 4256 kmutex_t *hash_lock = HDR_LOCK(hdr); 4257 mutex_enter(hash_lock); 4258 4259 /* 4260 * This assignment is only valid as long as the hash_lock is 4261 * held, we must be careful not to reference state or the 4262 * b_state field after dropping the lock. 
4263 */ 4264 arc_state_t *state = hdr->b_l1hdr.b_state; 4265 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4266 ASSERT3P(state, !=, arc_anon); 4267 4268 /* this buffer is not on any list */ 4269 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4270 4271 if (HDR_HAS_L2HDR(hdr)) { 4272 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 4273 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 4274 4275 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4276 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4277 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 4278 list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); 4279 4280 /* 4281 * We don't want to leak the b_tmp_cdata buffer that was 4282 * allocated in l2arc_write_buffers() 4283 */ 4284 arc_buf_l2_cdata_free(hdr); 4285 4286 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4287 4288 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 4289 } 4290 4291 /* 4292 * Do we have more than one buf? 4293 */ 4294 if (hdr->b_l1hdr.b_datacnt > 1) { 4295 arc_buf_hdr_t *nhdr; 4296 arc_buf_t **bufp; 4297 uint64_t blksz = hdr->b_size; 4298 uint64_t spa = hdr->b_spa; 4299 arc_buf_contents_t type = arc_buf_type(hdr); 4300 uint32_t flags = hdr->b_flags; 4301 4302 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4303 /* 4304 * Pull the data off of this hdr and attach it to 4305 * a new anonymous hdr. 4306 */ 4307 (void) remove_reference(hdr, hash_lock, tag); 4308 bufp = &hdr->b_l1hdr.b_buf; 4309 while (*bufp != buf) 4310 bufp = &(*bufp)->b_next; 4311 *bufp = buf->b_next; 4312 buf->b_next = NULL; 4313 4314 ASSERT3P(state, !=, arc_l2c_only); 4315 ASSERT3U(state->arcs_size, >=, hdr->b_size); 4316 atomic_add_64(&state->arcs_size, -hdr->b_size); 4317 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4318 ASSERT3P(state, !=, arc_l2c_only); 4319 uint64_t *size = &state->arcs_lsize[type]; 4320 ASSERT3U(*size, >=, hdr->b_size); 4321 atomic_add_64(size, -hdr->b_size); 4322 } 4323 4324 /* 4325 * We're releasing a duplicate user data buffer, update 4326 * our statistics accordingly. 
4327 */ 4328 if (HDR_ISTYPE_DATA(hdr)) { 4329 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4330 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4331 -hdr->b_size); 4332 } 4333 hdr->b_l1hdr.b_datacnt -= 1; 4334 arc_cksum_verify(buf); 4335#ifdef illumos 4336 arc_buf_unwatch(buf); 4337#endif 4338 4339 mutex_exit(hash_lock); 4340 4341 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4342 nhdr->b_size = blksz; 4343 nhdr->b_spa = spa; 4344 4345 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4346 nhdr->b_flags |= arc_bufc_to_flags(type); 4347 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4348 4349 nhdr->b_l1hdr.b_buf = buf; 4350 nhdr->b_l1hdr.b_datacnt = 1; 4351 nhdr->b_l1hdr.b_state = arc_anon; 4352 nhdr->b_l1hdr.b_arc_access = 0; 4353 nhdr->b_freeze_cksum = NULL; 4354 4355 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4356 buf->b_hdr = nhdr; 4357 mutex_exit(&buf->b_evict_lock); 4358 atomic_add_64(&arc_anon->arcs_size, blksz); 4359 } else { 4360 mutex_exit(&buf->b_evict_lock); 4361 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4362 /* protected by hash lock */ 4363 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4364 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4365 arc_change_state(arc_anon, hdr, hash_lock); 4366 hdr->b_l1hdr.b_arc_access = 0; 4367 mutex_exit(hash_lock); 4368 4369 buf_discard_identity(hdr); 4370 arc_buf_thaw(buf); 4371 } 4372 buf->b_efunc = NULL; 4373 buf->b_private = NULL; 4374} 4375 4376int 4377arc_released(arc_buf_t *buf) 4378{ 4379 int released; 4380 4381 mutex_enter(&buf->b_evict_lock); 4382 released = (buf->b_data != NULL && 4383 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4384 mutex_exit(&buf->b_evict_lock); 4385 return (released); 4386} 4387 4388#ifdef ZFS_DEBUG 4389int 4390arc_referenced(arc_buf_t *buf) 4391{ 4392 int referenced; 4393 4394 mutex_enter(&buf->b_evict_lock); 4395 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4396 mutex_exit(&buf->b_evict_lock); 4397 return (referenced); 4398} 4399#endif 4400 4401static void 4402arc_write_ready(zio_t *zio) 4403{ 4404 arc_write_callback_t *callback = zio->io_private; 4405 arc_buf_t *buf = callback->awcb_buf; 4406 arc_buf_hdr_t *hdr = buf->b_hdr; 4407 4408 ASSERT(HDR_HAS_L1HDR(hdr)); 4409 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4410 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4411 callback->awcb_ready(zio, buf, callback->awcb_private); 4412 4413 /* 4414 * If the IO is already in progress, then this is a re-write 4415 * attempt, so we need to thaw and re-compute the cksum. 4416 * It is the responsibility of the callback to handle the 4417 * accounting for any re-write attempt. 4418 */ 4419 if (HDR_IO_IN_PROGRESS(hdr)) { 4420 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4421 if (hdr->b_freeze_cksum != NULL) { 4422 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4423 hdr->b_freeze_cksum = NULL; 4424 } 4425 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4426 } 4427 arc_cksum_compute(buf, B_FALSE); 4428 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4429} 4430 4431/* 4432 * The SPA calls this callback for each physical write that happens on behalf 4433 * of a logical write. See the comment in dbuf_write_physdone() for details. 
4434 */ 4435static void 4436arc_write_physdone(zio_t *zio) 4437{ 4438 arc_write_callback_t *cb = zio->io_private; 4439 if (cb->awcb_physdone != NULL) 4440 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4441} 4442 4443static void 4444arc_write_done(zio_t *zio) 4445{ 4446 arc_write_callback_t *callback = zio->io_private; 4447 arc_buf_t *buf = callback->awcb_buf; 4448 arc_buf_hdr_t *hdr = buf->b_hdr; 4449 4450 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4451 4452 if (zio->io_error == 0) { 4453 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4454 buf_discard_identity(hdr); 4455 } else { 4456 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4457 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4458 } 4459 } else { 4460 ASSERT(BUF_EMPTY(hdr)); 4461 } 4462 4463 /* 4464 * If the block to be written was all-zero or compressed enough to be 4465 * embedded in the BP, no write was performed so there will be no 4466 * dva/birth/checksum. The buffer must therefore remain anonymous 4467 * (and uncached). 4468 */ 4469 if (!BUF_EMPTY(hdr)) { 4470 arc_buf_hdr_t *exists; 4471 kmutex_t *hash_lock; 4472 4473 ASSERT(zio->io_error == 0); 4474 4475 arc_cksum_verify(buf); 4476 4477 exists = buf_hash_insert(hdr, &hash_lock); 4478 if (exists != NULL) { 4479 /* 4480 * This can only happen if we overwrite for 4481 * sync-to-convergence, because we remove 4482 * buffers from the hash table when we arc_free(). 4483 */ 4484 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4485 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4486 panic("bad overwrite, hdr=%p exists=%p", 4487 (void *)hdr, (void *)exists); 4488 ASSERT(refcount_is_zero( 4489 &exists->b_l1hdr.b_refcnt)); 4490 arc_change_state(arc_anon, exists, hash_lock); 4491 mutex_exit(hash_lock); 4492 arc_hdr_destroy(exists); 4493 exists = buf_hash_insert(hdr, &hash_lock); 4494 ASSERT3P(exists, ==, NULL); 4495 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4496 /* nopwrite */ 4497 ASSERT(zio->io_prop.zp_nopwrite); 4498 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4499 panic("bad nopwrite, hdr=%p exists=%p", 4500 (void *)hdr, (void *)exists); 4501 } else { 4502 /* Dedup */ 4503 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4504 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4505 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4506 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4507 } 4508 } 4509 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4510 /* if it's not anon, we are doing a scrub */ 4511 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4512 arc_access(hdr, hash_lock); 4513 mutex_exit(hash_lock); 4514 } else { 4515 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4516 } 4517 4518 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4519 callback->awcb_done(zio, buf, callback->awcb_private); 4520 4521 kmem_free(callback, sizeof (arc_write_callback_t)); 4522} 4523 4524zio_t * 4525arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4526 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4527 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4528 arc_done_func_t *done, void *private, zio_priority_t priority, 4529 int zio_flags, const zbookmark_phys_t *zb) 4530{ 4531 arc_buf_hdr_t *hdr = buf->b_hdr; 4532 arc_write_callback_t *callback; 4533 zio_t *zio; 4534 4535 ASSERT(ready != NULL); 4536 ASSERT(done != NULL); 4537 ASSERT(!HDR_IO_ERROR(hdr)); 4538 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4539 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4540 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4541 if (l2arc) 4542 hdr->b_flags |= ARC_FLAG_L2CACHE; 4543 if (l2arc_compress) 4544 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 
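	/*
	 * The callbacks captured below are driven by the ZIO pipeline:
	 * awcb_ready fires from arc_write_ready() once the data is ready
	 * to be written, awcb_physdone after each physical write issued on
	 * behalf of this logical write, and awcb_done from arc_write_done()
	 * when the logical write completes.
	 */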
4545 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4546 callback->awcb_ready = ready; 4547 callback->awcb_physdone = physdone; 4548 callback->awcb_done = done; 4549 callback->awcb_private = private; 4550 callback->awcb_buf = buf; 4551 4552 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4553 arc_write_ready, arc_write_physdone, arc_write_done, callback, 4554 priority, zio_flags, zb); 4555 4556 return (zio); 4557} 4558 4559static int 4560arc_memory_throttle(uint64_t reserve, uint64_t txg) 4561{ 4562#ifdef _KERNEL 4563 uint64_t available_memory = ptob(freemem); 4564 static uint64_t page_load = 0; 4565 static uint64_t last_txg = 0; 4566 4567#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4568 available_memory = 4569 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4570#endif 4571 4572 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 4573 return (0); 4574 4575 if (txg > last_txg) { 4576 last_txg = txg; 4577 page_load = 0; 4578 } 4579 /* 4580 * If we are in pageout, we know that memory is already tight, 4581 * the arc is already going to be evicting, so we just want to 4582 * continue to let page writes occur as quickly as possible. 4583 */ 4584 if (curproc == pageproc) { 4585 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4586 return (SET_ERROR(ERESTART)); 4587 /* Note: reserve is inflated, so we deflate */ 4588 page_load += reserve / 8; 4589 return (0); 4590 } else if (page_load > 0 && arc_reclaim_needed()) { 4591 /* memory is low, delay before restarting */ 4592 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4593 return (SET_ERROR(EAGAIN)); 4594 } 4595 page_load = 0; 4596#endif 4597 return (0); 4598} 4599 4600static void 4601arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4602 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4603{ 4604 size->value.ui64 = state->arcs_size; 4605 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4606 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 4607} 4608 4609static int 4610arc_kstat_update(kstat_t *ksp, int rw) 4611{ 4612 arc_stats_t *as = ksp->ks_data; 4613 4614 if (rw == KSTAT_WRITE) { 4615 return (EACCES); 4616 } else { 4617 arc_kstat_update_state(arc_anon, 4618 &as->arcstat_anon_size, 4619 &as->arcstat_anon_evictable_data, 4620 &as->arcstat_anon_evictable_metadata); 4621 arc_kstat_update_state(arc_mru, 4622 &as->arcstat_mru_size, 4623 &as->arcstat_mru_evictable_data, 4624 &as->arcstat_mru_evictable_metadata); 4625 arc_kstat_update_state(arc_mru_ghost, 4626 &as->arcstat_mru_ghost_size, 4627 &as->arcstat_mru_ghost_evictable_data, 4628 &as->arcstat_mru_ghost_evictable_metadata); 4629 arc_kstat_update_state(arc_mfu, 4630 &as->arcstat_mfu_size, 4631 &as->arcstat_mfu_evictable_data, 4632 &as->arcstat_mfu_evictable_metadata); 4633 arc_kstat_update_state(arc_mfu_ghost, 4634 &as->arcstat_mfu_ghost_size, 4635 &as->arcstat_mfu_ghost_evictable_data, 4636 &as->arcstat_mfu_ghost_evictable_metadata); 4637 } 4638 4639 return (0); 4640} 4641 4642void 4643arc_tempreserve_clear(uint64_t reserve) 4644{ 4645 atomic_add_64(&arc_tempreserve, -reserve); 4646 ASSERT((int64_t)arc_tempreserve >= 0); 4647} 4648 4649int 4650arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4651{ 4652 int error; 4653 uint64_t anon_size; 4654 4655 if (reserve > arc_c/4 && !arc_no_grow) { 4656 arc_c = MIN(arc_c_max, reserve * 4); 4657 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4658 } 4659 if (reserve > arc_c) 4660 return (SET_ERROR(ENOMEM)); 4661 4662 /* 4663 * 
Don't count loaned bufs as in flight dirty data to prevent long 4664 * network delays from blocking transactions that are ready to be 4665 * assigned to a txg. 4666 */ 4667 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 4668 4669 /* 4670 * Writes will, almost always, require additional memory allocations 4671 * in order to compress/encrypt/etc the data. We therefore need to 4672 * make sure that there is sufficient available memory for this. 4673 */ 4674 error = arc_memory_throttle(reserve, txg); 4675 if (error != 0) 4676 return (error); 4677 4678 /* 4679 * Throttle writes when the amount of dirty data in the cache 4680 * gets too large. We try to keep the cache less than half full 4681 * of dirty blocks so that our sync times don't grow too large. 4682 * Note: if two requests come in concurrently, we might let them 4683 * both succeed, when one of them should fail. Not a huge deal. 4684 */ 4685 4686 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 4687 anon_size > arc_c / 4) { 4688 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 4689 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 4690 arc_tempreserve>>10, 4691 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 4692 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 4693 reserve>>10, arc_c>>10); 4694 return (SET_ERROR(ERESTART)); 4695 } 4696 atomic_add_64(&arc_tempreserve, reserve); 4697 return (0); 4698} 4699 4700static kmutex_t arc_lowmem_lock; 4701#ifdef _KERNEL 4702static eventhandler_tag arc_event_lowmem = NULL; 4703 4704static void 4705arc_lowmem(void *arg __unused, int howto __unused) 4706{ 4707 4708 /* Serialize access via arc_lowmem_lock. */ 4709 mutex_enter(&arc_lowmem_lock); 4710 mutex_enter(&arc_reclaim_thr_lock); 4711 needfree = 1; 4712 DTRACE_PROBE(arc__needfree); 4713 cv_signal(&arc_reclaim_thr_cv); 4714 4715 /* 4716 * It is unsafe to block here in arbitrary threads, because we can come 4717 * here from ARC itself and may hold ARC locks and thus risk a deadlock 4718 * with ARC reclaim thread. 4719 */ 4720 if (curproc == pageproc) { 4721 while (needfree) 4722 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 4723 } 4724 mutex_exit(&arc_reclaim_thr_lock); 4725 mutex_exit(&arc_lowmem_lock); 4726} 4727#endif 4728 4729void 4730arc_init(void) 4731{ 4732 int i, prefetch_tunable_set = 0; 4733 4734 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4735 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 4736 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 4737 4738 /* Convert seconds to clock ticks */ 4739 arc_min_prefetch_lifespan = 1 * hz; 4740 4741 /* Start out with 1/8 of all memory */ 4742 arc_c = kmem_size() / 8; 4743 4744#ifdef illumos 4745#ifdef _KERNEL 4746 /* 4747 * On architectures where the physical memory can be larger 4748 * than the addressable space (intel in 32-bit mode), we may 4749 * need to limit the cache to 1/8 of VM size. 4750 */ 4751 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 4752#endif 4753#endif /* illumos */ 4754 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 4755 arc_c_min = MAX(arc_c / 4, 16 << 20); 4756 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 4757 if (arc_c * 8 >= 1 << 30) 4758 arc_c_max = (arc_c * 8) - (1 << 30); 4759 else 4760 arc_c_max = arc_c_min; 4761 arc_c_max = MAX(arc_c * 5, arc_c_max); 4762 4763#ifdef _KERNEL 4764 /* 4765 * Allow the tunables to override our calculations if they are 4766 * reasonable (ie. 
over 16MB) 4767 */ 4768 if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) 4769 arc_c_max = zfs_arc_max; 4770 if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) 4771 arc_c_min = zfs_arc_min; 4772#endif 4773 4774 arc_c = arc_c_max; 4775 arc_p = (arc_c >> 1); 4776 4777 /* limit meta-data to 1/4 of the arc capacity */ 4778 arc_meta_limit = arc_c_max / 4; 4779 4780 /* Allow the tunable to override if it is reasonable */ 4781 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4782 arc_meta_limit = zfs_arc_meta_limit; 4783 4784 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4785 arc_c_min = arc_meta_limit / 2; 4786 4787 if (zfs_arc_meta_min > 0) { 4788 arc_meta_min = zfs_arc_meta_min; 4789 } else { 4790 arc_meta_min = arc_c_min / 2; 4791 } 4792 4793 if (zfs_arc_grow_retry > 0) 4794 arc_grow_retry = zfs_arc_grow_retry; 4795 4796 if (zfs_arc_shrink_shift > 0) 4797 arc_shrink_shift = zfs_arc_shrink_shift; 4798 4799 if (zfs_arc_p_min_shift > 0) 4800 arc_p_min_shift = zfs_arc_p_min_shift; 4801 4802 /* if kmem_flags are set, lets try to use less memory */ 4803 if (kmem_debugging()) 4804 arc_c = arc_c / 2; 4805 if (arc_c < arc_c_min) 4806 arc_c = arc_c_min; 4807 4808 zfs_arc_min = arc_c_min; 4809 zfs_arc_max = arc_c_max; 4810 4811 arc_anon = &ARC_anon; 4812 arc_mru = &ARC_mru; 4813 arc_mru_ghost = &ARC_mru_ghost; 4814 arc_mfu = &ARC_mfu; 4815 arc_mfu_ghost = &ARC_mfu_ghost; 4816 arc_l2c_only = &ARC_l2c_only; 4817 arc_size = 0; 4818 4819 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4820 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 4821 NULL, MUTEX_DEFAULT, NULL); 4822 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 4823 NULL, MUTEX_DEFAULT, NULL); 4824 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 4825 NULL, MUTEX_DEFAULT, NULL); 4826 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 4827 NULL, MUTEX_DEFAULT, NULL); 4828 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 4829 NULL, MUTEX_DEFAULT, NULL); 4830 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 4831 NULL, MUTEX_DEFAULT, NULL); 4832 4833 list_create(&arc_mru->arcs_lists[i], 4834 sizeof (arc_buf_hdr_t), 4835 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4836 list_create(&arc_mru_ghost->arcs_lists[i], 4837 sizeof (arc_buf_hdr_t), 4838 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4839 list_create(&arc_mfu->arcs_lists[i], 4840 sizeof (arc_buf_hdr_t), 4841 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4842 list_create(&arc_mfu_ghost->arcs_lists[i], 4843 sizeof (arc_buf_hdr_t), 4844 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4845 list_create(&arc_mfu_ghost->arcs_lists[i], 4846 sizeof (arc_buf_hdr_t), 4847 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4848 list_create(&arc_l2c_only->arcs_lists[i], 4849 sizeof (arc_buf_hdr_t), 4850 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4851 } 4852 4853 buf_init(); 4854 4855 arc_thread_exit = 0; 4856 arc_eviction_list = NULL; 4857 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4858 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4859 4860 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4861 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4862 4863 if (arc_ksp != NULL) { 4864 arc_ksp->ks_data = &arc_stats; 4865 arc_ksp->ks_update = arc_kstat_update; 4866 kstat_install(arc_ksp); 4867 } 4868 4869 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4870 TS_RUN, minclsyspri); 4871 4872#ifdef _KERNEL 4873 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 4874 EVENTHANDLER_PRI_FIRST); 4875#endif 
4876 4877 arc_dead = FALSE; 4878 arc_warm = B_FALSE; 4879 4880 /* 4881 * Calculate maximum amount of dirty data per pool. 4882 * 4883 * If it has been set by /etc/system, take that. 4884 * Otherwise, use a percentage of physical memory defined by 4885 * zfs_dirty_data_max_percent (default 10%) with a cap at 4886 * zfs_dirty_data_max_max (default 4GB). 4887 */ 4888 if (zfs_dirty_data_max == 0) { 4889 zfs_dirty_data_max = ptob(physmem) * 4890 zfs_dirty_data_max_percent / 100; 4891 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4892 zfs_dirty_data_max_max); 4893 } 4894 4895#ifdef _KERNEL 4896 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4897 prefetch_tunable_set = 1; 4898 4899#ifdef __i386__ 4900 if (prefetch_tunable_set == 0) { 4901 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4902 "-- to enable,\n"); 4903 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4904 "to /boot/loader.conf.\n"); 4905 zfs_prefetch_disable = 1; 4906 } 4907#else 4908 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4909 prefetch_tunable_set == 0) { 4910 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4911 "than 4GB of RAM is present;\n" 4912 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4913 "to /boot/loader.conf.\n"); 4914 zfs_prefetch_disable = 1; 4915 } 4916#endif 4917 /* Warn about ZFS memory and address space requirements. */ 4918 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4919 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4920 "expect unstable behavior.\n"); 4921 } 4922 if (kmem_size() < 512 * (1 << 20)) { 4923 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4924 "expect unstable behavior.\n"); 4925 printf(" Consider tuning vm.kmem_size and " 4926 "vm.kmem_size_max\n"); 4927 printf(" in /boot/loader.conf.\n"); 4928 } 4929#endif 4930} 4931 4932void 4933arc_fini(void) 4934{ 4935 int i; 4936 4937 mutex_enter(&arc_reclaim_thr_lock); 4938 arc_thread_exit = 1; 4939 cv_signal(&arc_reclaim_thr_cv); 4940 while (arc_thread_exit != 0) 4941 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4942 mutex_exit(&arc_reclaim_thr_lock); 4943 4944 arc_flush(NULL); 4945 4946 arc_dead = TRUE; 4947 4948 if (arc_ksp != NULL) { 4949 kstat_delete(arc_ksp); 4950 arc_ksp = NULL; 4951 } 4952 4953 mutex_destroy(&arc_eviction_mtx); 4954 mutex_destroy(&arc_reclaim_thr_lock); 4955 cv_destroy(&arc_reclaim_thr_cv); 4956 4957 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4958 list_destroy(&arc_mru->arcs_lists[i]); 4959 list_destroy(&arc_mru_ghost->arcs_lists[i]); 4960 list_destroy(&arc_mfu->arcs_lists[i]); 4961 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 4962 list_destroy(&arc_l2c_only->arcs_lists[i]); 4963 4964 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 4965 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 4966 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 4967 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 4968 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 4969 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 4970 } 4971 4972 buf_fini(); 4973 4974 ASSERT0(arc_loaned_bytes); 4975 4976 mutex_destroy(&arc_lowmem_lock); 4977#ifdef _KERNEL 4978 if (arc_event_lowmem != NULL) 4979 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4980#endif 4981} 4982 4983/* 4984 * Level 2 ARC 4985 * 4986 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4987 * It uses dedicated storage devices to hold cached data, which are populated 4988 * using large infrequent writes. 
The main role of this cache is to boost 4989 * the performance of random read workloads. The intended L2ARC devices 4990 * include short-stroked disks, solid state disks, and other media with 4991 * substantially faster read latency than disk. 4992 * 4993 * +-----------------------+ 4994 * | ARC | 4995 * +-----------------------+ 4996 * | ^ ^ 4997 * | | | 4998 * l2arc_feed_thread() arc_read() 4999 * | | | 5000 * | l2arc read | 5001 * V | | 5002 * +---------------+ | 5003 * | L2ARC | | 5004 * +---------------+ | 5005 * | ^ | 5006 * l2arc_write() | | 5007 * | | | 5008 * V | | 5009 * +-------+ +-------+ 5010 * | vdev | | vdev | 5011 * | cache | | cache | 5012 * +-------+ +-------+ 5013 * +=========+ .-----. 5014 * : L2ARC : |-_____-| 5015 * : devices : | Disks | 5016 * +=========+ `-_____-' 5017 * 5018 * Read requests are satisfied from the following sources, in order: 5019 * 5020 * 1) ARC 5021 * 2) vdev cache of L2ARC devices 5022 * 3) L2ARC devices 5023 * 4) vdev cache of disks 5024 * 5) disks 5025 * 5026 * Some L2ARC device types exhibit extremely slow write performance. 5027 * To accommodate for this there are some significant differences between 5028 * the L2ARC and traditional cache design: 5029 * 5030 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5031 * the ARC behave as usual, freeing buffers and placing headers on ghost 5032 * lists. The ARC does not send buffers to the L2ARC during eviction as 5033 * this would add inflated write latencies for all ARC memory pressure. 5034 * 5035 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5036 * It does this by periodically scanning buffers from the eviction-end of 5037 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5038 * not already there. It scans until a headroom of buffers is satisfied, 5039 * which itself is a buffer for ARC eviction. If a compressible buffer is 5040 * found during scanning and selected for writing to an L2ARC device, we 5041 * temporarily boost scanning headroom during the next scan cycle to make 5042 * sure we adapt to compression effects (which might significantly reduce 5043 * the data volume we write to L2ARC). The thread that does this is 5044 * l2arc_feed_thread(), illustrated below; example sizes are included to 5045 * provide a better sense of ratio than this diagram: 5046 * 5047 * head --> tail 5048 * +---------------------+----------+ 5049 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5050 * +---------------------+----------+ | o L2ARC eligible 5051 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5052 * +---------------------+----------+ | 5053 * 15.9 Gbytes ^ 32 Mbytes | 5054 * headroom | 5055 * l2arc_feed_thread() 5056 * | 5057 * l2arc write hand <--[oooo]--' 5058 * | 8 Mbyte 5059 * | write max 5060 * V 5061 * +==============================+ 5062 * L2ARC dev |####|#|###|###| |####| ... | 5063 * +==============================+ 5064 * 32 Gbytes 5065 * 5066 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5067 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5068 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5069 * safe to say that this is an uncommon case, since buffers at the end of 5070 * the ARC lists have moved there due to inactivity. 5071 * 5072 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5073 * then the L2ARC simply misses copying some buffers. 
This serves as a 5074 * pressure valve to prevent heavy read workloads from both stalling the ARC 5075 * with waits and clogging the L2ARC with writes. This also helps prevent 5076 * the potential for the L2ARC to churn if it attempts to cache content too 5077 * quickly, such as during backups of the entire pool. 5078 * 5079 * 5. After system boot and before the ARC has filled main memory, there are 5080 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5081 * lists can remain mostly static. Instead of searching from tail of these 5082 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5083 * for eligible buffers, greatly increasing its chance of finding them. 5084 * 5085 * The L2ARC device write speed is also boosted during this time so that 5086 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5087 * there are no L2ARC reads, and no fear of degrading read performance 5088 * through increased writes. 5089 * 5090 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5091 * the vdev queue can aggregate them into larger and fewer writes. Each 5092 * device is written to in a rotor fashion, sweeping writes through 5093 * available space then repeating. 5094 * 5095 * 7. The L2ARC does not store dirty content. It never needs to flush 5096 * write buffers back to disk based storage. 5097 * 5098 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5099 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5100 * 5101 * The performance of the L2ARC can be tweaked by a number of tunables, which 5102 * may be necessary for different workloads: 5103 * 5104 * l2arc_write_max max write bytes per interval 5105 * l2arc_write_boost extra write bytes during device warmup 5106 * l2arc_noprefetch skip caching prefetched buffers 5107 * l2arc_headroom number of max device writes to precache 5108 * l2arc_headroom_boost when we find compressed buffers during ARC 5109 * scanning, we multiply headroom by this 5110 * percentage factor for the next scan cycle, 5111 * since more compressed buffers are likely to 5112 * be present 5113 * l2arc_feed_secs seconds between L2ARC writing 5114 * 5115 * Tunables may be removed or added as future performance improvements are 5116 * integrated, and also may become zpool properties. 5117 * 5118 * There are three key functions that control how the L2ARC warms up: 5119 * 5120 * l2arc_write_eligible() check if a buffer is eligible to cache 5121 * l2arc_write_size() calculate how much to write 5122 * l2arc_write_interval() calculate sleep delay between writes 5123 * 5124 * These three functions determine what to write, how much, and how quickly 5125 * to send writes. 5126 */ 5127 5128static boolean_t 5129l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5130{ 5131 /* 5132 * A buffer is *not* eligible for the L2ARC if it: 5133 * 1. belongs to a different spa. 5134 * 2. is already cached on the L2ARC. 5135 * 3. has an I/O in progress (it may be an incomplete read). 5136 * 4. is flagged not eligible (zfs property). 
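 *
 * Each rejection reason below bumps a matching arcstat (spa mismatch,
 * already in L2ARC, I/O in progress, not cacheable), so the relative
 * frequencies are visible through the arcstats kstat.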
5137 */ 5138 if (hdr->b_spa != spa_guid) { 5139 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5140 return (B_FALSE); 5141 } 5142 if (HDR_HAS_L2HDR(hdr)) { 5143 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5144 return (B_FALSE); 5145 } 5146 if (HDR_IO_IN_PROGRESS(hdr)) { 5147 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5148 return (B_FALSE); 5149 } 5150 if (!HDR_L2CACHE(hdr)) { 5151 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5152 return (B_FALSE); 5153 } 5154 5155 return (B_TRUE); 5156} 5157 5158static uint64_t 5159l2arc_write_size(void) 5160{ 5161 uint64_t size; 5162 5163 /* 5164 * Make sure our globals have meaningful values in case the user 5165 * altered them. 5166 */ 5167 size = l2arc_write_max; 5168 if (size == 0) { 5169 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5170 "be greater than zero, resetting it to the default (%d)", 5171 L2ARC_WRITE_SIZE); 5172 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5173 } 5174 5175 if (arc_warm == B_FALSE) 5176 size += l2arc_write_boost; 5177 5178 return (size); 5179 5180} 5181 5182static clock_t 5183l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5184{ 5185 clock_t interval, next, now; 5186 5187 /* 5188 * If the ARC lists are busy, increase our write rate; if the 5189 * lists are stale, idle back. This is achieved by checking 5190 * how much we previously wrote - if it was more than half of 5191 * what we wanted, schedule the next write much sooner. 5192 */ 5193 if (l2arc_feed_again && wrote > (wanted / 2)) 5194 interval = (hz * l2arc_feed_min_ms) / 1000; 5195 else 5196 interval = hz * l2arc_feed_secs; 5197 5198 now = ddi_get_lbolt(); 5199 next = MAX(now, MIN(now + interval, began + interval)); 5200 5201 return (next); 5202} 5203 5204/* 5205 * Cycle through L2ARC devices. This is how L2ARC load balances. 5206 * If a device is returned, this also returns holding the spa config lock. 5207 */ 5208static l2arc_dev_t * 5209l2arc_dev_get_next(void) 5210{ 5211 l2arc_dev_t *first, *next = NULL; 5212 5213 /* 5214 * Lock out the removal of spas (spa_namespace_lock), then removal 5215 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5216 * both locks will be dropped and a spa config lock held instead. 5217 */ 5218 mutex_enter(&spa_namespace_lock); 5219 mutex_enter(&l2arc_dev_mtx); 5220 5221 /* if there are no vdevs, there is nothing to do */ 5222 if (l2arc_ndev == 0) 5223 goto out; 5224 5225 first = NULL; 5226 next = l2arc_dev_last; 5227 do { 5228 /* loop around the list looking for a non-faulted vdev */ 5229 if (next == NULL) { 5230 next = list_head(l2arc_dev_list); 5231 } else { 5232 next = list_next(l2arc_dev_list, next); 5233 if (next == NULL) 5234 next = list_head(l2arc_dev_list); 5235 } 5236 5237 /* if we have come back to the start, bail out */ 5238 if (first == NULL) 5239 first = next; 5240 else if (next == first) 5241 break; 5242 5243 } while (vdev_is_dead(next->l2ad_vdev)); 5244 5245 /* if we were unable to find any usable vdevs, return NULL */ 5246 if (vdev_is_dead(next->l2ad_vdev)) 5247 next = NULL; 5248 5249 l2arc_dev_last = next; 5250 5251out: 5252 mutex_exit(&l2arc_dev_mtx); 5253 5254 /* 5255 * Grab the config lock to prevent the 'next' device from being 5256 * removed while we are writing to it. 5257 */ 5258 if (next != NULL) 5259 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5260 mutex_exit(&spa_namespace_lock); 5261 5262 return (next); 5263} 5264 5265/* 5266 * Free buffers that were tagged for destruction. 
5267 */ 5268static void 5269l2arc_do_free_on_write() 5270{ 5271 list_t *buflist; 5272 l2arc_data_free_t *df, *df_prev; 5273 5274 mutex_enter(&l2arc_free_on_write_mtx); 5275 buflist = l2arc_free_on_write; 5276 5277 for (df = list_tail(buflist); df; df = df_prev) { 5278 df_prev = list_prev(buflist, df); 5279 ASSERT(df->l2df_data != NULL); 5280 ASSERT(df->l2df_func != NULL); 5281 df->l2df_func(df->l2df_data, df->l2df_size); 5282 list_remove(buflist, df); 5283 kmem_free(df, sizeof (l2arc_data_free_t)); 5284 } 5285 5286 mutex_exit(&l2arc_free_on_write_mtx); 5287} 5288 5289/* 5290 * A write to a cache device has completed. Update all headers to allow 5291 * reads from these buffers to begin. 5292 */ 5293static void 5294l2arc_write_done(zio_t *zio) 5295{ 5296 l2arc_write_callback_t *cb; 5297 l2arc_dev_t *dev; 5298 list_t *buflist; 5299 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5300 kmutex_t *hash_lock; 5301 int64_t bytes_dropped = 0; 5302 5303 cb = zio->io_private; 5304 ASSERT(cb != NULL); 5305 dev = cb->l2wcb_dev; 5306 ASSERT(dev != NULL); 5307 head = cb->l2wcb_head; 5308 ASSERT(head != NULL); 5309 buflist = &dev->l2ad_buflist; 5310 ASSERT(buflist != NULL); 5311 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5312 l2arc_write_callback_t *, cb); 5313 5314 if (zio->io_error != 0) 5315 ARCSTAT_BUMP(arcstat_l2_writes_error); 5316 5317 mutex_enter(&dev->l2ad_mtx); 5318 5319 /* 5320 * All writes completed, or an error was hit. 5321 */ 5322 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5323 hdr_prev = list_prev(buflist, hdr); 5324 5325 hash_lock = HDR_LOCK(hdr); 5326 if (!mutex_tryenter(hash_lock)) { 5327 /* 5328 * This buffer misses out. It may be in a stage 5329 * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5330 * left set, denying reads to this buffer. 5331 */ 5332 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5333 continue; 5334 } 5335 5336 /* 5337 * It's possible that this buffer got evicted from the L1 cache 5338 * before we grabbed the vdev + hash locks, in which case 5339 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5340 * Only free the buffer if we still have an L1 hdr. 5341 */ 5342 if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5343 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5344 l2arc_release_cdata_buf(hdr); 5345 5346 if (zio->io_error != 0) { 5347 /* 5348 * Error - drop L2ARC entry. 5349 */ 5350 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5351 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5352 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5353 5354 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5355 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5356 } 5357 5358 /* 5359 * Allow ARC to begin reads to this L2ARC entry. 5360 */ 5361 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5362 5363 mutex_exit(hash_lock); 5364 } 5365 5366 atomic_inc_64(&l2arc_writes_done); 5367 list_remove(buflist, head); 5368 ASSERT(!HDR_HAS_L1HDR(head)); 5369 kmem_cache_free(hdr_l2only_cache, head); 5370 mutex_exit(&dev->l2ad_mtx); 5371 5372 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5373 5374 l2arc_do_free_on_write(); 5375 5376 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5377} 5378 5379/* 5380 * A read to a cache device completed. Validate buffer contents before 5381 * handing over to the regular ARC routines. 
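 * If the checksum kept in the ARC header does not match, or the device
 * read failed, the read is re-issued against the original pool storage
 * below.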
5382 */ 5383static void 5384l2arc_read_done(zio_t *zio) 5385{ 5386 l2arc_read_callback_t *cb; 5387 arc_buf_hdr_t *hdr; 5388 arc_buf_t *buf; 5389 kmutex_t *hash_lock; 5390 int equal; 5391 5392 ASSERT(zio->io_vd != NULL); 5393 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 5394 5395 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 5396 5397 cb = zio->io_private; 5398 ASSERT(cb != NULL); 5399 buf = cb->l2rcb_buf; 5400 ASSERT(buf != NULL); 5401 5402 hash_lock = HDR_LOCK(buf->b_hdr); 5403 mutex_enter(hash_lock); 5404 hdr = buf->b_hdr; 5405 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5406 5407 /* 5408 * If the buffer was compressed, decompress it first. 5409 */ 5410 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 5411 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 5412 ASSERT(zio->io_data != NULL); 5413 5414 /* 5415 * Check this survived the L2ARC journey. 5416 */ 5417 equal = arc_cksum_equal(buf); 5418 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 5419 mutex_exit(hash_lock); 5420 zio->io_private = buf; 5421 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 5422 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 5423 arc_read_done(zio); 5424 } else { 5425 mutex_exit(hash_lock); 5426 /* 5427 * Buffer didn't survive caching. Increment stats and 5428 * reissue to the original storage device. 5429 */ 5430 if (zio->io_error != 0) { 5431 ARCSTAT_BUMP(arcstat_l2_io_error); 5432 } else { 5433 zio->io_error = SET_ERROR(EIO); 5434 } 5435 if (!equal) 5436 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 5437 5438 /* 5439 * If there's no waiter, issue an async i/o to the primary 5440 * storage now. If there *is* a waiter, the caller must 5441 * issue the i/o in a context where it's OK to block. 5442 */ 5443 if (zio->io_waiter == NULL) { 5444 zio_t *pio = zio_unique_parent(zio); 5445 5446 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 5447 5448 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 5449 buf->b_data, zio->io_size, arc_read_done, buf, 5450 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 5451 } 5452 } 5453 5454 kmem_free(cb, sizeof (l2arc_read_callback_t)); 5455} 5456 5457/* 5458 * This is the list priority from which the L2ARC will search for pages to 5459 * cache. This is used within loops (0..3) to cycle through lists in the 5460 * desired order. This order can have a significant effect on cache 5461 * performance. 5462 * 5463 * Currently the metadata lists are hit first, MFU then MRU, followed by 5464 * the data lists. This function returns a locked list, and also returns 5465 * the lock pointer. 
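 * With the current list layout, increasing list_num selects, in order:
 * the MFU metadata lists, the MRU metadata lists, the MFU data lists,
 * and finally the MRU data lists.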
5466 */ 5467static list_t * 5468l2arc_list_locked(int list_num, kmutex_t **lock) 5469{ 5470 list_t *list = NULL; 5471 int idx; 5472 5473 ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); 5474 5475 if (list_num < ARC_BUFC_NUMMETADATALISTS) { 5476 idx = list_num; 5477 list = &arc_mfu->arcs_lists[idx]; 5478 *lock = ARCS_LOCK(arc_mfu, idx); 5479 } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { 5480 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 5481 list = &arc_mru->arcs_lists[idx]; 5482 *lock = ARCS_LOCK(arc_mru, idx); 5483 } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + 5484 ARC_BUFC_NUMDATALISTS)) { 5485 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 5486 list = &arc_mfu->arcs_lists[idx]; 5487 *lock = ARCS_LOCK(arc_mfu, idx); 5488 } else { 5489 idx = list_num - ARC_BUFC_NUMLISTS; 5490 list = &arc_mru->arcs_lists[idx]; 5491 *lock = ARCS_LOCK(arc_mru, idx); 5492 } 5493 5494 ASSERT(!(MUTEX_HELD(*lock))); 5495 mutex_enter(*lock); 5496 return (list); 5497} 5498 5499/* 5500 * Evict buffers from the device write hand to the distance specified in 5501 * bytes. This distance may span populated buffers, it may span nothing. 5502 * This is clearing a region on the L2ARC device ready for writing. 5503 * If the 'all' boolean is set, every buffer is evicted. 5504 */ 5505static void 5506l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 5507{ 5508 list_t *buflist; 5509 arc_buf_hdr_t *hdr, *hdr_prev; 5510 kmutex_t *hash_lock; 5511 uint64_t taddr; 5512 int64_t bytes_evicted = 0; 5513 5514 buflist = &dev->l2ad_buflist; 5515 5516 if (!all && dev->l2ad_first) { 5517 /* 5518 * This is the first sweep through the device. There is 5519 * nothing to evict. 5520 */ 5521 return; 5522 } 5523 5524 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 5525 /* 5526 * When nearing the end of the device, evict to the end 5527 * before the device write hand jumps to the start. 5528 */ 5529 taddr = dev->l2ad_end; 5530 } else { 5531 taddr = dev->l2ad_hand + distance; 5532 } 5533 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 5534 uint64_t, taddr, boolean_t, all); 5535 5536top: 5537 mutex_enter(&dev->l2ad_mtx); 5538 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 5539 hdr_prev = list_prev(buflist, hdr); 5540 5541 hash_lock = HDR_LOCK(hdr); 5542 if (!mutex_tryenter(hash_lock)) { 5543 /* 5544 * Missed the hash lock. Retry. 5545 */ 5546 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 5547 mutex_exit(&dev->l2ad_mtx); 5548 mutex_enter(hash_lock); 5549 mutex_exit(hash_lock); 5550 goto top; 5551 } 5552 5553 if (HDR_L2_WRITE_HEAD(hdr)) { 5554 /* 5555 * We hit a write head node. Leave it for 5556 * l2arc_write_done(). 5557 */ 5558 list_remove(buflist, hdr); 5559 mutex_exit(hash_lock); 5560 continue; 5561 } 5562 5563 if (!all && HDR_HAS_L2HDR(hdr) && 5564 (hdr->b_l2hdr.b_daddr > taddr || 5565 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 5566 /* 5567 * We've evicted to the target address, 5568 * or the end of the device. 5569 */ 5570 mutex_exit(hash_lock); 5571 break; 5572 } 5573 5574 ASSERT(HDR_HAS_L2HDR(hdr)); 5575 if (!HDR_HAS_L1HDR(hdr)) { 5576 ASSERT(!HDR_L2_READING(hdr)); 5577 /* 5578 * This doesn't exist in the ARC. Destroy. 5579 * arc_hdr_destroy() will call list_remove() 5580 * and decrement arcstat_l2_size. 
5581 */ 5582 arc_change_state(arc_anon, hdr, hash_lock); 5583 arc_hdr_destroy(hdr); 5584 } else { 5585 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 5586 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 5587 /* 5588 * Invalidate issued or about to be issued 5589 * reads, since we may be about to write 5590 * over this location. 5591 */ 5592 if (HDR_L2_READING(hdr)) { 5593 ARCSTAT_BUMP(arcstat_l2_evict_reading); 5594 hdr->b_flags |= ARC_FLAG_L2_EVICTED; 5595 } 5596 5597 /* Tell ARC this no longer exists in L2ARC. */ 5598 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5599 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5600 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5601 list_remove(buflist, hdr); 5602 5603 /* This may have been leftover after a failed write. */ 5604 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5605 } 5606 mutex_exit(hash_lock); 5607 } 5608 mutex_exit(&dev->l2ad_mtx); 5609 5610 vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); 5611 dev->l2ad_evict = taddr; 5612} 5613 5614/* 5615 * Find and write ARC buffers to the L2ARC device. 5616 * 5617 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 5618 * for reading until they have completed writing. 5619 * The headroom_boost is an in-out parameter used to maintain headroom boost 5620 * state between calls to this function. 5621 * 5622 * Returns the number of bytes actually written (which may be smaller than 5623 * the delta by which the device hand has changed due to alignment). 5624 */ 5625static uint64_t 5626l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 5627 boolean_t *headroom_boost) 5628{ 5629 arc_buf_hdr_t *hdr, *hdr_prev, *head; 5630 list_t *list; 5631 uint64_t write_asize, write_psize, write_sz, headroom, 5632 buf_compress_minsz; 5633 void *buf_data; 5634 kmutex_t *list_lock; 5635 boolean_t full; 5636 l2arc_write_callback_t *cb; 5637 zio_t *pio, *wzio; 5638 uint64_t guid = spa_load_guid(spa); 5639 const boolean_t do_headroom_boost = *headroom_boost; 5640 int try; 5641 5642 ASSERT(dev->l2ad_vdev != NULL); 5643 5644 /* Lower the flag now, we might want to raise it again later. */ 5645 *headroom_boost = B_FALSE; 5646 5647 pio = NULL; 5648 write_sz = write_asize = write_psize = 0; 5649 full = B_FALSE; 5650 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 5651 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 5652 head->b_flags |= ARC_FLAG_HAS_L2HDR; 5653 5654 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 5655 /* 5656 * We will want to try to compress buffers that are at least 2x the 5657 * device sector size. 5658 */ 5659 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 5660 5661 /* 5662 * Copy buffers for L2ARC writing. 5663 */ 5664 mutex_enter(&dev->l2ad_mtx); 5665 for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { 5666 uint64_t passed_sz = 0; 5667 5668 list = l2arc_list_locked(try, &list_lock); 5669 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 5670 5671 /* 5672 * L2ARC fast warmup. 5673 * 5674 * Until the ARC is warm and starts to evict, read from the 5675 * head of the ARC lists rather than the tail. 
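		 * Buffers are inserted at the head of these lists, so scanning
		 * from the head picks up the most recently cached data while
		 * the cache is still filling.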
5676		 */
5677		if (arc_warm == B_FALSE)
5678			hdr = list_head(list);
5679		else
5680			hdr = list_tail(list);
5681		if (hdr == NULL)
5682			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5683
5684		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5685		if (do_headroom_boost)
5686			headroom = (headroom * l2arc_headroom_boost) / 100;
5687
5688		for (; hdr; hdr = hdr_prev) {
5689			kmutex_t *hash_lock;
5690			uint64_t buf_sz;
5691
5692			if (arc_warm == B_FALSE)
5693				hdr_prev = list_next(list, hdr);
5694			else
5695				hdr_prev = list_prev(list, hdr);
5696			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
5697
5698			hash_lock = HDR_LOCK(hdr);
5699			if (!mutex_tryenter(hash_lock)) {
5700				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5701				/*
5702				 * Skip this buffer rather than waiting.
5703				 */
5704				continue;
5705			}
5706
5707			passed_sz += hdr->b_size;
5708			if (passed_sz > headroom) {
5709				/*
5710				 * Searched too far.
5711				 */
5712				mutex_exit(hash_lock);
5713				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5714				break;
5715			}
5716
5717			if (!l2arc_write_eligible(guid, hdr)) {
5718				mutex_exit(hash_lock);
5719				continue;
5720			}
5721
5722			if ((write_sz + hdr->b_size) > target_sz) {
5723				full = B_TRUE;
5724				mutex_exit(hash_lock);
5725				ARCSTAT_BUMP(arcstat_l2_write_full);
5726				break;
5727			}
5728
5729			if (pio == NULL) {
5730				/*
5731				 * Insert a dummy header on the buflist so
5732				 * l2arc_write_done() can find where the
5733				 * write buffers begin without searching.
5734				 */
5735				list_insert_head(&dev->l2ad_buflist, head);
5736
5737				cb = kmem_alloc(
5738				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5739				cb->l2wcb_dev = dev;
5740				cb->l2wcb_head = head;
5741				pio = zio_root(spa, l2arc_write_done, cb,
5742				    ZIO_FLAG_CANFAIL);
5743				ARCSTAT_BUMP(arcstat_l2_write_pios);
5744			}
5745
5746			/*
5747			 * Create and add a new L2ARC header.
5748			 */
5749			hdr->b_l2hdr.b_dev = dev;
5750			hdr->b_flags |= ARC_FLAG_L2_WRITING;
5751			/*
5752			 * Temporarily stash the data buffer in b_tmp_cdata.
5753			 * The subsequent write step will pick it up from
5754			 * there. This is because we can't access b_l1hdr.b_buf
5755			 * without holding the hash_lock, which we in turn
5756			 * can't access without holding the ARC list locks
5757			 * (which we want to avoid during compression/writing).
5758			 */
5759			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
5760			hdr->b_l2hdr.b_asize = hdr->b_size;
5761			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5762
5763			buf_sz = hdr->b_size;
5764			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5765
5766			list_insert_head(&dev->l2ad_buflist, hdr);
5767
5768			/*
5769			 * Compute and store the buffer cksum before
5770			 * writing. On debug builds the cksum is verified first.
5771			 */
5772			arc_cksum_verify(hdr->b_l1hdr.b_buf);
5773			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
5774
5775			mutex_exit(hash_lock);
5776
5777			write_sz += buf_sz;
5778		}
5779
5780		mutex_exit(list_lock);
5781
5782		if (full == B_TRUE)
5783			break;
5784	}
5785
5786	/* No buffers selected for writing? */
5787	if (pio == NULL) {
5788		ASSERT0(write_sz);
5789		mutex_exit(&dev->l2ad_mtx);
5790		ASSERT(!HDR_HAS_L1HDR(head));
5791		kmem_cache_free(hdr_l2only_cache, head);
5792		return (0);
5793	}
5794
5795	/*
5796	 * Now start writing the buffers. We start at the write head
5797	 * and work backwards, retracing the course of the buffer selector
5798	 * loop above.
5799 */ 5800 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; 5801 hdr = list_prev(&dev->l2ad_buflist, hdr)) { 5802 uint64_t buf_sz; 5803 5804 /* 5805 * We shouldn't need to lock the buffer here, since we flagged 5806 * it as ARC_FLAG_L2_WRITING in the previous step, but we must 5807 * take care to only access its L2 cache parameters. In 5808 * particular, hdr->l1hdr.b_buf may be invalid by now due to 5809 * ARC eviction. 5810 */ 5811 hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 5812 5813 if ((HDR_L2COMPRESS(hdr)) && 5814 hdr->b_l2hdr.b_asize >= buf_compress_minsz) { 5815 if (l2arc_compress_buf(hdr)) { 5816 /* 5817 * If compression succeeded, enable headroom 5818 * boost on the next scan cycle. 5819 */ 5820 *headroom_boost = B_TRUE; 5821 } 5822 } 5823 5824 /* 5825 * Pick up the buffer data we had previously stashed away 5826 * (and now potentially also compressed). 5827 */ 5828 buf_data = hdr->b_l1hdr.b_tmp_cdata; 5829 buf_sz = hdr->b_l2hdr.b_asize; 5830 5831 /* 5832 * If the data has not been compressed, then clear b_tmp_cdata 5833 * to make sure that it points only to a temporary compression 5834 * buffer. 5835 */ 5836 if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))) 5837 hdr->b_l1hdr.b_tmp_cdata = NULL; 5838 5839 /* Compression may have squashed the buffer to zero length. */ 5840 if (buf_sz != 0) { 5841 uint64_t buf_p_sz; 5842 5843 wzio = zio_write_phys(pio, dev->l2ad_vdev, 5844 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 5845 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 5846 ZIO_FLAG_CANFAIL, B_FALSE); 5847 5848 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 5849 zio_t *, wzio); 5850 (void) zio_nowait(wzio); 5851 5852 write_asize += buf_sz; 5853 /* 5854 * Keep the clock hand suitably device-aligned. 5855 */ 5856 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 5857 write_psize += buf_p_sz; 5858 dev->l2ad_hand += buf_p_sz; 5859 } 5860 } 5861 5862 mutex_exit(&dev->l2ad_mtx); 5863 5864 ASSERT3U(write_asize, <=, target_sz); 5865 ARCSTAT_BUMP(arcstat_l2_writes_sent); 5866 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 5867 ARCSTAT_INCR(arcstat_l2_size, write_sz); 5868 ARCSTAT_INCR(arcstat_l2_asize, write_asize); 5869 vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); 5870 5871 /* 5872 * Bump device hand to the device start if it is approaching the end. 5873 * l2arc_evict() will already have evicted ahead for this case. 5874 */ 5875 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5876 dev->l2ad_hand = dev->l2ad_start; 5877 dev->l2ad_evict = dev->l2ad_start; 5878 dev->l2ad_first = B_FALSE; 5879 } 5880 5881 dev->l2ad_writing = B_TRUE; 5882 (void) zio_wait(pio); 5883 dev->l2ad_writing = B_FALSE; 5884 5885 return (write_asize); 5886} 5887 5888/* 5889 * Compresses an L2ARC buffer. 5890 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its 5891 * size in l2hdr->b_asize. This routine tries to compress the data and 5892 * depending on the compression result there are three possible outcomes: 5893 * *) The buffer was incompressible. The original l2hdr contents were left 5894 * untouched and are ready for writing to an L2 device. 5895 * *) The buffer was all-zeros, so there is no need to write it to an L2 5896 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 5897 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 5898 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 5899 * data buffer which holds the compressed data to be written, and b_asize 5900 * tells us how much data there is. 
b_compress is set to the appropriate
5901 * compression algorithm. Once writing is done, invoke
5902 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5903 *
5904 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5905 * buffer was incompressible).
5906 */
5907	static boolean_t
5908	l2arc_compress_buf(arc_buf_hdr_t *hdr)
5909	{
5910		void *cdata;
5911		size_t csize, len, rounded;
5912		ASSERT(HDR_HAS_L2HDR(hdr));
5913		l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
5914
5915		ASSERT(HDR_HAS_L1HDR(hdr));
5916		ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
5917		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
5918
5919		len = l2hdr->b_asize;
5920		cdata = zio_data_buf_alloc(len);
5921		ASSERT3P(cdata, !=, NULL);
5922		csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
5923		    cdata, l2hdr->b_asize);
5924
5925		if (csize == 0) {
5926			/* zero block, indicate that there's nothing to write */
5927			zio_data_buf_free(cdata, len);
5928			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
5929			l2hdr->b_asize = 0;
5930			hdr->b_l1hdr.b_tmp_cdata = NULL;
5931			ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5932			return (B_TRUE);
5933		}
5934
5935		rounded = P2ROUNDUP(csize,
5936		    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
5937		if (rounded < len) {
5938			/*
5939			 * Compression succeeded, we'll keep the cdata around for
5940			 * writing and release it afterwards.
5941			 */
5942			if (rounded > csize) {
5943				bzero((char *)cdata + csize, rounded - csize);
5944				csize = rounded;
5945			}
5946			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
5947			l2hdr->b_asize = csize;
5948			hdr->b_l1hdr.b_tmp_cdata = cdata;
5949			ARCSTAT_BUMP(arcstat_l2_compress_successes);
5950			return (B_TRUE);
5951		} else {
5952			/*
5953			 * Compression failed, release the compressed buffer.
5954			 * l2hdr will be left unmodified.
5955			 */
5956			zio_data_buf_free(cdata, len);
5957			ARCSTAT_BUMP(arcstat_l2_compress_failures);
5958			return (B_FALSE);
5959		}
5960	}
5961
5962 /*
5963 * Decompresses a zio read back from an l2arc device. On success, the
5964 * underlying zio's io_data buffer is overwritten by the uncompressed
5965 * version. On decompression error (corrupt compressed stream), the
5966 * zio->io_error value is set to signal an I/O error.
5967 *
5968 * Note that the compressed data stream is not checksummed, so if the
5969 * underlying device is experiencing data corruption we may feed corrupt
5970 * data to the decompressor; the decompressor therefore needs to be
5971 * able to handle this situation (LZ4 does).
5972 */
5973	static void
5974	l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5975	{
5976		ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5977
5978		if (zio->io_error != 0) {
5979			/*
5980			 * An I/O error has occurred, just restore the original io
5981			 * size in preparation for a main pool read.
5982			 */
5983			zio->io_orig_size = zio->io_size = hdr->b_size;
5984			return;
5985		}
5986
5987		if (c == ZIO_COMPRESS_EMPTY) {
5988			/*
5989			 * An empty buffer results in a null zio, which means we
5990			 * need to fill its io_data after we're done restoring the
5991			 * buffer's contents.
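			 * (ZIO_COMPRESS_EMPTY marks an all-zero buffer that was
			 * never written to the device, so restoring it amounts
			 * to a bzero of the ARC buffer.)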
5992 */ 5993 ASSERT(hdr->b_l1hdr.b_buf != NULL); 5994 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); 5995 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; 5996 } else { 5997 ASSERT(zio->io_data != NULL); 5998 /* 5999 * We copy the compressed data from the start of the arc buffer 6000 * (the zio_read will have pulled in only what we need, the 6001 * rest is garbage which we will overwrite at decompression) 6002 * and then decompress back to the ARC data buffer. This way we 6003 * can minimize copying by simply decompressing back over the 6004 * original compressed data (rather than decompressing to an 6005 * aux buffer and then copying back the uncompressed buffer, 6006 * which is likely to be much larger). 6007 */ 6008 uint64_t csize; 6009 void *cdata; 6010 6011 csize = zio->io_size; 6012 cdata = zio_data_buf_alloc(csize); 6013 bcopy(zio->io_data, cdata, csize); 6014 if (zio_decompress_data(c, cdata, zio->io_data, csize, 6015 hdr->b_size) != 0) 6016 zio->io_error = EIO; 6017 zio_data_buf_free(cdata, csize); 6018 } 6019 6020 /* Restore the expected uncompressed IO size. */ 6021 zio->io_orig_size = zio->io_size = hdr->b_size; 6022} 6023 6024/* 6025 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 6026 * This buffer serves as a temporary holder of compressed data while 6027 * the buffer entry is being written to an l2arc device. Once that is 6028 * done, we can dispose of it. 6029 */ 6030static void 6031l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) 6032{ 6033 ASSERT(HDR_HAS_L1HDR(hdr)); 6034 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { 6035 /* 6036 * If the data was compressed, then we've allocated a 6037 * temporary buffer for it, so now we need to release it. 6038 */ 6039 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6040 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, 6041 hdr->b_size); 6042 hdr->b_l1hdr.b_tmp_cdata = NULL; 6043 } else { 6044 ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL); 6045 } 6046} 6047 6048/* 6049 * This thread feeds the L2ARC at regular intervals. This is the beating 6050 * heart of the L2ARC. 6051 */ 6052static void 6053l2arc_feed_thread(void *dummy __unused) 6054{ 6055 callb_cpr_t cpr; 6056 l2arc_dev_t *dev; 6057 spa_t *spa; 6058 uint64_t size, wrote; 6059 clock_t begin, next = ddi_get_lbolt(); 6060 boolean_t headroom_boost = B_FALSE; 6061 6062 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 6063 6064 mutex_enter(&l2arc_feed_thr_lock); 6065 6066 while (l2arc_thread_exit == 0) { 6067 CALLB_CPR_SAFE_BEGIN(&cpr); 6068 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 6069 next - ddi_get_lbolt()); 6070 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 6071 next = ddi_get_lbolt() + hz; 6072 6073 /* 6074 * Quick check for L2ARC devices. 6075 */ 6076 mutex_enter(&l2arc_dev_mtx); 6077 if (l2arc_ndev == 0) { 6078 mutex_exit(&l2arc_dev_mtx); 6079 continue; 6080 } 6081 mutex_exit(&l2arc_dev_mtx); 6082 begin = ddi_get_lbolt(); 6083 6084 /* 6085 * This selects the next l2arc device to write to, and in 6086 * doing so the next spa to feed from: dev->l2ad_spa. This 6087 * will return NULL if there are now no l2arc devices or if 6088 * they are all faulted. 6089 * 6090 * If a device is returned, its spa's config lock is also 6091 * held to prevent device removal. l2arc_dev_get_next() 6092 * will grab and release l2arc_dev_mtx. 
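 * The config lock is dropped again via spa_config_exit() at the bottom
 * of each pass through this loop.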
6093 */ 6094 if ((dev = l2arc_dev_get_next()) == NULL) 6095 continue; 6096 6097 spa = dev->l2ad_spa; 6098 ASSERT(spa != NULL); 6099 6100 /* 6101 * If the pool is read-only then force the feed thread to 6102 * sleep a little longer. 6103 */ 6104 if (!spa_writeable(spa)) { 6105 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 6106 spa_config_exit(spa, SCL_L2ARC, dev); 6107 continue; 6108 } 6109 6110 /* 6111 * Avoid contributing to memory pressure. 6112 */ 6113 if (arc_reclaim_needed()) { 6114 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 6115 spa_config_exit(spa, SCL_L2ARC, dev); 6116 continue; 6117 } 6118 6119 ARCSTAT_BUMP(arcstat_l2_feeds); 6120 6121 size = l2arc_write_size(); 6122 6123 /* 6124 * Evict L2ARC buffers that will be overwritten. 6125 */ 6126 l2arc_evict(dev, size, B_FALSE); 6127 6128 /* 6129 * Write ARC buffers. 6130 */ 6131 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 6132 6133 /* 6134 * Calculate interval between writes. 6135 */ 6136 next = l2arc_write_interval(begin, size, wrote); 6137 spa_config_exit(spa, SCL_L2ARC, dev); 6138 } 6139 6140 l2arc_thread_exit = 0; 6141 cv_broadcast(&l2arc_feed_thr_cv); 6142 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 6143 thread_exit(); 6144} 6145 6146boolean_t 6147l2arc_vdev_present(vdev_t *vd) 6148{ 6149 l2arc_dev_t *dev; 6150 6151 mutex_enter(&l2arc_dev_mtx); 6152 for (dev = list_head(l2arc_dev_list); dev != NULL; 6153 dev = list_next(l2arc_dev_list, dev)) { 6154 if (dev->l2ad_vdev == vd) 6155 break; 6156 } 6157 mutex_exit(&l2arc_dev_mtx); 6158 6159 return (dev != NULL); 6160} 6161 6162/* 6163 * Add a vdev for use by the L2ARC. By this point the spa has already 6164 * validated the vdev and opened it. 6165 */ 6166void 6167l2arc_add_vdev(spa_t *spa, vdev_t *vd) 6168{ 6169 l2arc_dev_t *adddev; 6170 6171 ASSERT(!l2arc_vdev_present(vd)); 6172 6173 vdev_ashift_optimize(vd); 6174 6175 /* 6176 * Create a new l2arc device entry. 6177 */ 6178 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 6179 adddev->l2ad_spa = spa; 6180 adddev->l2ad_vdev = vd; 6181 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 6182 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 6183 adddev->l2ad_hand = adddev->l2ad_start; 6184 adddev->l2ad_evict = adddev->l2ad_start; 6185 adddev->l2ad_first = B_TRUE; 6186 adddev->l2ad_writing = B_FALSE; 6187 6188 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 6189 /* 6190 * This is a list of all ARC buffers that are still valid on the 6191 * device. 6192 */ 6193 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 6194 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 6195 6196 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 6197 6198 /* 6199 * Add device to global list 6200 */ 6201 mutex_enter(&l2arc_dev_mtx); 6202 list_insert_head(l2arc_dev_list, adddev); 6203 atomic_inc_64(&l2arc_ndev); 6204 mutex_exit(&l2arc_dev_mtx); 6205} 6206 6207/* 6208 * Remove a vdev from the L2ARC. 
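 * All buffers still cached on the device are evicted first, then the
 * per-device state is torn down and freed.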
6209 */ 6210void 6211l2arc_remove_vdev(vdev_t *vd) 6212{ 6213 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 6214 6215 /* 6216 * Find the device by vdev 6217 */ 6218 mutex_enter(&l2arc_dev_mtx); 6219 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 6220 nextdev = list_next(l2arc_dev_list, dev); 6221 if (vd == dev->l2ad_vdev) { 6222 remdev = dev; 6223 break; 6224 } 6225 } 6226 ASSERT(remdev != NULL); 6227 6228 /* 6229 * Remove device from global list 6230 */ 6231 list_remove(l2arc_dev_list, remdev); 6232 l2arc_dev_last = NULL; /* may have been invalidated */ 6233 atomic_dec_64(&l2arc_ndev); 6234 mutex_exit(&l2arc_dev_mtx); 6235 6236 /* 6237 * Clear all buflists and ARC references. L2ARC device flush. 6238 */ 6239 l2arc_evict(remdev, 0, B_TRUE); 6240 list_destroy(&remdev->l2ad_buflist); 6241 mutex_destroy(&remdev->l2ad_mtx); 6242 kmem_free(remdev, sizeof (l2arc_dev_t)); 6243} 6244 6245void 6246l2arc_init(void) 6247{ 6248 l2arc_thread_exit = 0; 6249 l2arc_ndev = 0; 6250 l2arc_writes_sent = 0; 6251 l2arc_writes_done = 0; 6252 6253 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 6254 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 6255 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 6256 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 6257 6258 l2arc_dev_list = &L2ARC_dev_list; 6259 l2arc_free_on_write = &L2ARC_free_on_write; 6260 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 6261 offsetof(l2arc_dev_t, l2ad_node)); 6262 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 6263 offsetof(l2arc_data_free_t, l2df_list_node)); 6264} 6265 6266void 6267l2arc_fini(void) 6268{ 6269 /* 6270 * This is called from dmu_fini(), which is called from spa_fini(); 6271 * Because of this, we can assume that all l2arc devices have 6272 * already been removed when the pools themselves were removed. 6273 */ 6274 6275 l2arc_do_free_on_write(); 6276 6277 mutex_destroy(&l2arc_feed_thr_lock); 6278 cv_destroy(&l2arc_feed_thr_cv); 6279 mutex_destroy(&l2arc_dev_mtx); 6280 mutex_destroy(&l2arc_free_on_write_mtx); 6281 6282 list_destroy(l2arc_dev_list); 6283 list_destroy(l2arc_free_on_write); 6284} 6285 6286void 6287l2arc_start(void) 6288{ 6289 if (!(spa_mode_global & FWRITE)) 6290 return; 6291 6292 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 6293 TS_RUN, minclsyspri); 6294} 6295 6296void 6297l2arc_stop(void) 6298{ 6299 if (!(spa_mode_global & FWRITE)) 6300 return; 6301 6302 mutex_enter(&l2arc_feed_thr_lock); 6303 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 6304 l2arc_thread_exit = 1; 6305 while (l2arc_thread_exit != 0) 6306 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 6307 mutex_exit(&l2arc_feed_thr_lock); 6308} 6309