arc.c revision 304138
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()). Note however that the data associated
 * with the buffer may be evicted prior to the callback. The callback
 * must be made with *no locks held* (to prevent deadlock). Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
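/*
 * Illustration only: a minimal sketch of the lookup pattern implied by
 * the locking model above (hypothetical caller; the real arc_read() path
 * carries considerably more logic). buf_hash_find() is defined later in
 * this file; guid and bp stand in for the caller's spa load guid and
 * block pointer.
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, bp, &hash_lock);
 *	if (hdr != NULL) {
 *		... hdr fields are stable while hash_lock is held ...
 *		mutex_exit(hash_lock);
 *	}
 */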
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_lock;
static kcondvar_t	arc_reclaim_thread_cv;
static boolean_t	arc_reclaim_thread_exit;
static kcondvar_t	arc_reclaim_waiters_cv;

static kmutex_t		arc_user_evicts_lock;
static kcondvar_t	arc_user_evicts_cv;
static boolean_t	arc_user_evicts_thread_exit;

static kmutex_t		arc_dnlc_evicts_lock;
static kcondvar_t	arc_dnlc_evicts_cv;
static boolean_t	arc_dnlc_evicts_thread_exit;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/*
 * The number of sublists used for each of the arc state lists. If this
 * is not set to a suitable value by the user, it will be configured to
 * the number of CPUs on the system in arc_init().
 */
int zfs_arc_num_sublists_per_state = 0;

/* number of seconds before growing cache again */
static int arc_grow_retry = 60;

/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
int zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. if there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int arc_no_grow_shift = 5;
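/*
 * A worked example of the two shifts above (illustrative numbers only):
 * with arc_c = 4 GB and arc_no_grow_shift = 5, the ARC is allowed to grow
 * only while at least 4 GB >> 5 = 128 MB of memory is free. A single
 * shrink step reclaims arc_c >> arc_shrink_shift = 4 GB >> 7 = 32 MB,
 * which on its own cannot lift free memory back above the 128 MB no-grow
 * threshold; this is why arc_no_grow_shift must be less than
 * arc_shrink_shift.
 */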

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern boolean_t zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);

#if defined(__FreeBSD__) && defined(_KERNEL)
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}

/*
 * Must be declared here, before the definition of the corresponding
 * kstat macro: that macro uses the same name and would otherwise
 * confuse the compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they
 * are linked onto a list in one of these arc states. These are
 * the only buffers that can be evicted or deleted. Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA. These are buffers that hold dirty block copies
 * before they are written to stable storage. By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed. Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists. The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places. The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	refcount_t arcs_size;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread. The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
	 */
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA. This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by various buffers and structures
	 * not actually backed with ARC buffers. This includes bonus
	 * buffers (allocated directly via zio_buf_* functions),
	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
	 * cache), and dnode_t structures (allocated via dnode_t cache).
	 */
	kstat_named_t arcstat_other_size;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state. This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state. This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state. The key thing to note
	 * here, is the fact that this size doesn't actually indicate
	 * RAM consumption. The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state. This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state. See the comment above
	 * arcstat_mru_ghost_size for more details.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_cdata_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_asize;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_compress_successes;
	kstat_named_t arcstat_l2_compress_zeros;
	kstat_named_t arcstat_l2_compress_failures;
	kstat_named_t arcstat_l2_padding_needed;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_duplicate_buffers;
	kstat_named_t arcstat_duplicate_buffers_size;
	kstat_named_t arcstat_duplicate_reads;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_sync_wait_for_async;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "demand_data_hits", KSTAT_DATA_UINT64 },
	{ "demand_data_misses", KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
	{ "mru_hits", KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
	{ "mfu_hits", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
	{ "allocated", KSTAT_DATA_UINT64 },
	{ "deleted", KSTAT_DATA_UINT64 },
	{ "mutex_miss", KSTAT_DATA_UINT64 },
	{ "evict_skip", KSTAT_DATA_UINT64 },
	{ "evict_not_enough", KSTAT_DATA_UINT64 },
	{ "evict_l2_cached", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_skip", KSTAT_DATA_UINT64 },
	{ "hash_elements", KSTAT_DATA_UINT64 },
	{ "hash_elements_max", KSTAT_DATA_UINT64 },
	{ "hash_collisions", KSTAT_DATA_UINT64 },
	{ "hash_chains", KSTAT_DATA_UINT64 },
	{ "hash_chain_max", KSTAT_DATA_UINT64 },
	{ "p", KSTAT_DATA_UINT64 },
	{ "c", KSTAT_DATA_UINT64 },
	{ "c_min", KSTAT_DATA_UINT64 },
	{ "c_max", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
	{ "hdr_size", KSTAT_DATA_UINT64 },
	{ "data_size", KSTAT_DATA_UINT64 },
	{ "metadata_size", KSTAT_DATA_UINT64 },
	{ "other_size", KSTAT_DATA_UINT64 },
	{ "anon_size", KSTAT_DATA_UINT64 },
	{ "anon_evictable_data", KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_size", KSTAT_DATA_UINT64 },
	{ "mru_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_ghost_size", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits", KSTAT_DATA_UINT64 },
	{ "l2_misses", KSTAT_DATA_UINT64 },
	{ "l2_feeds", KSTAT_DATA_UINT64 },
	{ "l2_rw_clash", KSTAT_DATA_UINT64 },
	{ "l2_read_bytes", KSTAT_DATA_UINT64 },
	{ "l2_write_bytes", KSTAT_DATA_UINT64 },
	{ "l2_writes_sent", KSTAT_DATA_UINT64 },
	{ "l2_writes_done", KSTAT_DATA_UINT64 },
	{ "l2_writes_error", KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_reading", KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
	{ "l2_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
	{ "l2_io_error", KSTAT_DATA_UINT64 },
	{ "l2_size", KSTAT_DATA_UINT64 },
	{ "l2_asize", KSTAT_DATA_UINT64 },
	{ "l2_hdr_size", KSTAT_DATA_UINT64 },
	{ "l2_compress_successes", KSTAT_DATA_UINT64 },
	{ "l2_compress_zeros", KSTAT_DATA_UINT64 },
	{ "l2_compress_failures", KSTAT_DATA_UINT64 },
	{ "l2_padding_needed", KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2", KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
	{ "l2_write_full", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
	{ "l2_write_pios", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count", KSTAT_DATA_UINT64 },
	{ "duplicate_buffers", KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size", KSTAT_DATA_UINT64 },
	{ "duplicate_reads", KSTAT_DATA_UINT64 },
	{ "arc_meta_used", KSTAT_DATA_UINT64 },
	{ "arc_meta_limit", KSTAT_DATA_UINT64 },
	{ "arc_meta_max", KSTAT_DATA_UINT64 },
	{ "arc_meta_min", KSTAT_DATA_UINT64 },
	{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {				\
	uint64_t m;						\
	while ((val) > (m = arc_stats.stat.value.ui64) &&	\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;					\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {						\
		if (cond2) {					\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {					\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}						\
	} else {						\
		if (cond2) {					\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {					\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}						\
	}
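/*
 * For example (a sketch matching how the ARC hit path uses this macro;
 * hdr is a hypothetical arc_buf_hdr_t pointer):
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * A demand hit on a data buffer thus expands to
 * ARCSTAT_BUMP(arcstat_demand_data_hits), and the other three
 * combinations select the matching demand/prefetch x data/metadata stat.
 */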
kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define	L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void			*awcb_private;
	arc_done_func_t		*awcb_ready;
	arc_done_func_t		*awcb_children_ready;
	arc_done_func_t		*awcb_physdone;
	arc_done_func_t		*awcb_done;
	arc_buf_t		*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_datacnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	/* temporary buffer holder for in-flight compressed or padded data */
	void			*b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	/* real alloc'd buffer size depending on b_compress applied */
	int32_t			b_asize;
	uint8_t			b_compress;

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	/*
	 * Even though this checksum is only set/verified when a buffer is in
	 * the L1 cache, it needs to be in the set of common fields because it
	 * must be preserved from the time before a buffer is written out to
	 * L2ARC until after it is read back in.
	 */
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/* immutable */
	int32_t			b_size;
	uint64_t		b_spa;

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
};
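/*
 * Illustration only (hypothetical call site; see arc_hdr_realloc() later
 * in this file): a header gains its L1 fields by moving from the l2only
 * kmem cache to the full cache, e.g. when a block that lived only on the
 * L2ARC device is pulled back into memory:
 *
 *	hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache);
 *
 * and sheds them again, with the cache arguments swapped, once the header
 * is demoted to the l2c_only state.
 */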
#if defined(__FreeBSD__) && defined(_KERNEL)
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val <= 0 || val > arc_c_max)
		return (EINVAL);

	arc_meta_limit = val;
	return (0);
}

static int
sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_max;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (zfs_arc_max == 0) {
		/* Loader tunable so blindly set */
		zfs_arc_max = val;
		return (0);
	}

	if (val < arc_abs_min || val > kmem_size())
		return (EINVAL);
	if (val < arc_c_min)
		return (EINVAL);
	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
		return (EINVAL);

	arc_c_max = val;

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	if (zfs_arc_meta_limit == 0) {
		/* limit meta-data to 1/4 of the arc capacity */
		arc_meta_limit = arc_c_max / 4;
	}

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;

	zfs_arc_max = arc_c;

	return (0);
}

static int
sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_min;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (zfs_arc_min == 0) {
		/* Loader tunable so blindly set */
		zfs_arc_min = val;
		return (0);
	}

	if (val < arc_abs_min || val > arc_c_max)
		return (EINVAL);

	arc_c_min = val;

	if (zfs_arc_meta_min == 0)
		arc_meta_min = arc_c_min / 2;

	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;

	return (0);
}
#endif

static arc_buf_t *arc_eviction_list;
static arc_buf_hdr_t arc_eviction_hdr;

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)

#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define	HDR_L2_READING(hdr)	\
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)

#define	HDR_ISTYPE_METADATA(hdr)	\
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/*
 * Other sizes
 */

#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to an l2arc disk
 * address. This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out, which happens in the second stage of
 * l2arc_write_buffers().
 */
#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in anonymous state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mru ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mfu ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");

/*
 * L2ARC Internals
 */
struct l2arc_dev {
	vdev_t		*l2ad_vdev;	/* vdev */
	spa_t		*l2ad_spa;	/* spa */
	uint64_t	l2ad_hand;	/* next write location */
	uint64_t	l2ad_start;	/* first addr on device */
	uint64_t	l2ad_end;	/* last addr on device */
	boolean_t	l2ad_first;	/* first sweep through */
	boolean_t	l2ad_writing;	/* currently writing */
	kmutex_t	l2ad_mtx;	/* lock for buffer list */
	list_t		l2ad_buflist;	/* buffer list */
	list_node_t	l2ad_node;	/* device list node */
	refcount_t	l2ad_alloc;	/* allocated bytes */
};

static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_t		*l2rcb_buf;		/* read buffer */
	spa_t			*l2rcb_spa;		/* spa */
	blkptr_t		l2rcb_bp;		/* original blkptr */
	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
	int			l2rcb_flags;		/* original flags */
	enum zio_compress	l2rcb_compress;		/* applied compress */
	void			*l2rcb_data;		/* temporary buffer */
} l2arc_read_callback_t;

typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;	/* device info */
	arc_buf_hdr_t	*l2wcb_head;	/* head of write buflist */
} l2arc_write_callback_t;

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	void		*l2df_data;
	size_t		l2df_size;
	void		(*l2df_func)(void *, size_t);
	list_node_t	l2df_list_node;
} l2arc_data_free_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);

static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);

static boolean_t l2arc_transform_buf(arc_buf_hdr_t *, boolean_t);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);

static void
l2arc_trim(const arc_buf_hdr_t *hdr)
{
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));

	if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET)
		return;
	if (hdr->b_l2hdr.b_asize != 0) {
		trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
		    hdr->b_l2hdr.b_asize, 0);
	} else {
		ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY);
	}
}

static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spa>>8) ^ birth;

	return (crc);
}

#define	BUF_EMPTY(buf)	\
	((buf)->b_dva.dva_word[0] == 0 &&	\
	(buf)->b_dva.dva_word[1] == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)	\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
	hdr->b_dva.dva_word[0] = 0;
	hdr->b_dva.dva_word[1] = 0;
	hdr->b_birth = 0;
}

static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *hdr;

	mutex_enter(hash_lock);
	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
	    hdr = hdr->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, hdr)) {
			*lockp = hash_lock;
			return (hdr);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table. If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 * If lockp == NULL, the caller is assumed to already hold the hash lock.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fhdr;
	uint32_t i;

	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
	ASSERT(hdr->b_birth != 0);
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (lockp != NULL) {
		*lockp = hash_lock;
		mutex_enter(hash_lock);
	} else {
		ASSERT(MUTEX_HELD(hash_lock));
	}

	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
	    fhdr = fhdr->b_hash_next, i++) {
		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
			return (fhdr);
	}

	hdr->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = hdr;
	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}
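/*
 * Illustration only: the usual calling pattern for buf_hash_insert()
 * (a hypothetical sketch; the return value doubles as a collision check):
 *
 *	kmutex_t *hash_lock = NULL;
 *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {
 *		... another thread already inserted an equal header ...
 *	}
 *	... hash_lock is held on return and must eventually be dropped ...
 */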
static void
buf_hash_remove(arc_buf_hdr_t *hdr)
{
	arc_buf_hdr_t *fhdr, **hdrp;
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(hdr));

	hdrp = &buf_hash_table.ht_table[idx];
	while ((fhdr = *hdrp) != hdr) {
		ASSERT(fhdr != NULL);
		hdrp = &fhdr->b_hash_next;
	}
	*hdrp = hdr->b_hash_next;
	hdr->b_hash_next = NULL;
	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_full_cache);
	kmem_cache_destroy(hdr_l2only_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_FULL_SIZE);
	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
	refcount_create(&hdr->b_l1hdr.b_refcnt);
	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_L2ONLY_SIZE);
	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);

	return (0);
}

/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_full_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	cv_destroy(&hdr->b_l1hdr.b_cv);
	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}

/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thread_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
	 */
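	/*
	 * Worked example (illustrative numbers): with 16 GB of physical
	 * memory and the default 8K average block size, the loop below
	 * stops at hsize = 2^21 buckets; at 8 bytes per bucket pointer
	 * the table then occupies 16 MB, i.e. 1 MB per GB of RAM.
	 */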
	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
	    NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

/*
 * Transition between the two allocation states for the arc_buf_hdr struct.
 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 * version is used when a cache buffer is only in the L2ARC in order to reduce
 * memory usage.
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;

		/* Verify previous threads set to NULL before freeing */
		ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
	} else {
		ASSERT(hdr->b_l1hdr.b_buf == NULL);
		ASSERT0(hdr->b_l1hdr.b_datacnt);

		/*
		 * If we've reached here, we must have been called from
		 * arc_evict_hdr(); as such, we should have already been
		 * removed from any ghost list we were previously on
		 * (which protects us from racing with arc_evict_state),
		 * thus no locking is needed during this check.
		 */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		/*
		 * A buffer must not be moved into the arc_l2c_only
		 * state if it's not finished being written out to the
		 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
		 * might be accessed later, even though it was removed.
		 */
		VERIFY(!HDR_L2_WRITING(hdr));
		VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);

#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif

		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) refcount_remove_many(&dev->l2ad_alloc,
	    hdr->b_l2hdr.b_asize, hdr);

	(void) refcount_add_many(&dev->l2ad_alloc,
	    nhdr->b_l2hdr.b_asize, nhdr);

	buf_discard_identity(hdr);
	hdr->b_freeze_cksum = NULL;
	kmem_cache_free(old, hdr);

	return (nhdr);
}


#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
}

static int
arc_cksum_equal(arc_buf_t *buf)
{
	zio_cksum_t zc;
	int equal;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);

	return (equal);
}

static void
arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    NULL, buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
	arc_buf_watch(buf);
#endif
}

#ifdef illumos
#ifndef _KERNEL
typedef struct procctl {
	long cmd;
	prwatch_t prwatch;
} procctl_t;
#endif

/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int result;
		procctl_t ctl;
		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = 0;
		ctl.prwatch.pr_wflags = 0;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	}
#endif
}

/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int result;
		procctl_t ctl;
		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = buf->b_hdr->b_size;
		ctl.prwatch.pr_wflags = WA_WRITE;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	}
#endif
}
#endif /* illumos */

static arc_buf_contents_t
arc_buf_type(arc_buf_hdr_t *hdr)
{
	if (HDR_ISTYPE_METADATA(hdr)) {
		return (ARC_BUFC_METADATA);
	} else {
		return (ARC_BUFC_DATA);
	}
}

static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)
{
	switch (type) {
	case ARC_BUFC_DATA:
		/* metadata field is 0 if buffer contains normal data */
		return (0);
	case ARC_BUFC_METADATA:
		return (ARC_FLAG_BUFC_METADATA);
	default:
		break;
	}
	panic("undefined ARC buffer type!");
	return ((uint32_t)-1);
}

void
arc_buf_thaw(arc_buf_t *buf)
{
	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
			panic("modifying non-anon buffer!");
		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
			panic("modifying buffer while i/o in progress!");
		arc_cksum_verify(buf);
	}

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}

#ifdef ZFS_DEBUG
	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
			kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
		buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
	}
#endif

	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);

#ifdef illumos
	arc_buf_unwatch(buf);
#endif
}

void
arc_buf_freeze(arc_buf_t *buf)
{
	kmutex_t *hash_lock;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
	arc_cksum_compute(buf, B_FALSE);
	mutex_exit(hash_lock);

}

static void
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(MUTEX_HELD(hash_lock));
	arc_state_t *state = hdr->b_l1hdr.b_state;

	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
	    (state != arc_anon)) {
		/* We don't use the L2-only state list.
*/ 1833 if (state != arc_l2c_only) { 1834 arc_buf_contents_t type = arc_buf_type(hdr); 1835 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1836 multilist_t *list = &state->arcs_list[type]; 1837 uint64_t *size = &state->arcs_lsize[type]; 1838 1839 multilist_remove(list, hdr); 1840 1841 if (GHOST_STATE(state)) { 1842 ASSERT0(hdr->b_l1hdr.b_datacnt); 1843 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1844 delta = hdr->b_size; 1845 } 1846 ASSERT(delta > 0); 1847 ASSERT3U(*size, >=, delta); 1848 atomic_add_64(size, -delta); 1849 } 1850 /* remove the prefetch flag if we get a reference */ 1851 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1852 } 1853} 1854 1855static int 1856remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1857{ 1858 int cnt; 1859 arc_state_t *state = hdr->b_l1hdr.b_state; 1860 1861 ASSERT(HDR_HAS_L1HDR(hdr)); 1862 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1863 ASSERT(!GHOST_STATE(state)); 1864 1865 /* 1866 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1867 * check to prevent usage of the arc_l2c_only list. 1868 */ 1869 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1870 (state != arc_anon)) { 1871 arc_buf_contents_t type = arc_buf_type(hdr); 1872 multilist_t *list = &state->arcs_list[type]; 1873 uint64_t *size = &state->arcs_lsize[type]; 1874 1875 multilist_insert(list, hdr); 1876 1877 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1878 atomic_add_64(size, hdr->b_size * 1879 hdr->b_l1hdr.b_datacnt); 1880 } 1881 return (cnt); 1882} 1883 1884/* 1885 * Move the supplied buffer to the indicated state. The hash lock 1886 * for the buffer must be held by the caller. 1887 */ 1888static void 1889arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1890 kmutex_t *hash_lock) 1891{ 1892 arc_state_t *old_state; 1893 int64_t refcnt; 1894 uint32_t datacnt; 1895 uint64_t from_delta, to_delta; 1896 arc_buf_contents_t buftype = arc_buf_type(hdr); 1897 1898 /* 1899 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1900 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1901 * L1 hdr doesn't always exist when we change state to arc_anon before 1902 * destroying a header, in which case reallocating to add the L1 hdr is 1903 * pointless. 1904 */ 1905 if (HDR_HAS_L1HDR(hdr)) { 1906 old_state = hdr->b_l1hdr.b_state; 1907 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1908 datacnt = hdr->b_l1hdr.b_datacnt; 1909 } else { 1910 old_state = arc_l2c_only; 1911 refcnt = 0; 1912 datacnt = 0; 1913 } 1914 1915 ASSERT(MUTEX_HELD(hash_lock)); 1916 ASSERT3P(new_state, !=, old_state); 1917 ASSERT(refcnt == 0 || datacnt > 0); 1918 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1919 ASSERT(old_state != arc_anon || datacnt <= 1); 1920 1921 from_delta = to_delta = datacnt * hdr->b_size; 1922 1923 /* 1924 * If this buffer is evictable, transfer it from the 1925 * old state list to the new state list. 1926 */ 1927 if (refcnt == 0) { 1928 if (old_state != arc_anon && old_state != arc_l2c_only) { 1929 uint64_t *size = &old_state->arcs_lsize[buftype]; 1930 1931 ASSERT(HDR_HAS_L1HDR(hdr)); 1932 multilist_remove(&old_state->arcs_list[buftype], hdr); 1933 1934 /* 1935 * If prefetching out of the ghost cache, 1936 * we will have a non-zero datacnt. 
1937 */ 1938 if (GHOST_STATE(old_state) && datacnt == 0) { 1939 /* ghost elements have a ghost size */ 1940 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1941 from_delta = hdr->b_size; 1942 } 1943 ASSERT3U(*size, >=, from_delta); 1944 atomic_add_64(size, -from_delta); 1945 } 1946 if (new_state != arc_anon && new_state != arc_l2c_only) { 1947 uint64_t *size = &new_state->arcs_lsize[buftype]; 1948 1949 /* 1950 * An L1 header always exists here, since if we're 1951 * moving to some L1-cached state (i.e. not l2c_only or 1952 * anonymous), we realloc the header to add an L1hdr 1953 * beforehand. 1954 */ 1955 ASSERT(HDR_HAS_L1HDR(hdr)); 1956 multilist_insert(&new_state->arcs_list[buftype], hdr); 1957 1958 /* ghost elements have a ghost size */ 1959 if (GHOST_STATE(new_state)) { 1960 ASSERT0(datacnt); 1961 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1962 to_delta = hdr->b_size; 1963 } 1964 atomic_add_64(size, to_delta); 1965 } 1966 } 1967 1968 ASSERT(!BUF_EMPTY(hdr)); 1969 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1970 buf_hash_remove(hdr); 1971 1972 /* adjust state sizes (ignore arc_l2c_only) */ 1973 1974 if (to_delta && new_state != arc_l2c_only) { 1975 ASSERT(HDR_HAS_L1HDR(hdr)); 1976 if (GHOST_STATE(new_state)) { 1977 ASSERT0(datacnt); 1978 1979 /* 1980 * When moving a header to a ghost state, we first 1981 * remove all arc buffers. Thus, we'll have a 1982 * datacnt of zero, and no arc buffer to use for 1983 * the reference. As a result, we use the arc 1984 * header pointer for the reference. 1985 */ 1986 (void) refcount_add_many(&new_state->arcs_size, 1987 hdr->b_size, hdr); 1988 } else { 1989 ASSERT3U(datacnt, !=, 0); 1990 1991 /* 1992 * Each individual buffer holds a unique reference, 1993 * thus we must add each of these references one 1994 * at a time. 1995 */ 1996 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 1997 buf = buf->b_next) { 1998 (void) refcount_add_many(&new_state->arcs_size, 1999 hdr->b_size, buf); 2000 } 2001 } 2002 } 2003 2004 if (from_delta && old_state != arc_l2c_only) { 2005 ASSERT(HDR_HAS_L1HDR(hdr)); 2006 if (GHOST_STATE(old_state)) { 2007 /* 2008 * When moving a header off of a ghost state, 2009 * there's the possibility for datacnt to be 2010 * non-zero. This is because we first add the 2011 * arc buffer to the header prior to changing 2012 * the header's state. Since we used the header 2013 * for the reference when putting the header on 2014 * the ghost state, we must balance that and use 2015 * the header when removing it from the ghost state 2016 * (even though datacnt is non-zero). 2017 */ 2018 2019 IMPLY(datacnt == 0, new_state == arc_anon || 2020 new_state == arc_l2c_only); 2021 2022 (void) refcount_remove_many(&old_state->arcs_size, 2023 hdr->b_size, hdr); 2024 } else { 2025 ASSERT3P(datacnt, !=, 0); 2026 2027 /* 2028 * Each individual buffer holds a unique reference, 2029 * thus we must remove each of these references one 2030 * at a time. 2031 */ 2032 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2033 buf = buf->b_next) { 2034 (void) refcount_remove_many( 2035 &old_state->arcs_size, hdr->b_size, buf); 2036 } 2037 } 2038 } 2039 2040 if (HDR_HAS_L1HDR(hdr)) 2041 hdr->b_l1hdr.b_state = new_state; 2042 2043 /* 2044 * L2-only headers should never be on the L2 state lists since they don't 2045 * have L1 headers allocated.
2046 */ 2047 ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2048 multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2049} 2050 2051void 2052arc_space_consume(uint64_t space, arc_space_type_t type) 2053{ 2054 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2055 2056 switch (type) { 2057 case ARC_SPACE_DATA: 2058 ARCSTAT_INCR(arcstat_data_size, space); 2059 break; 2060 case ARC_SPACE_META: 2061 ARCSTAT_INCR(arcstat_metadata_size, space); 2062 break; 2063 case ARC_SPACE_OTHER: 2064 ARCSTAT_INCR(arcstat_other_size, space); 2065 break; 2066 case ARC_SPACE_HDRS: 2067 ARCSTAT_INCR(arcstat_hdr_size, space); 2068 break; 2069 case ARC_SPACE_L2HDRS: 2070 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 2071 break; 2072 } 2073 2074 if (type != ARC_SPACE_DATA) 2075 ARCSTAT_INCR(arcstat_meta_used, space); 2076 2077 atomic_add_64(&arc_size, space); 2078} 2079 2080void 2081arc_space_return(uint64_t space, arc_space_type_t type) 2082{ 2083 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2084 2085 switch (type) { 2086 case ARC_SPACE_DATA: 2087 ARCSTAT_INCR(arcstat_data_size, -space); 2088 break; 2089 case ARC_SPACE_META: 2090 ARCSTAT_INCR(arcstat_metadata_size, -space); 2091 break; 2092 case ARC_SPACE_OTHER: 2093 ARCSTAT_INCR(arcstat_other_size, -space); 2094 break; 2095 case ARC_SPACE_HDRS: 2096 ARCSTAT_INCR(arcstat_hdr_size, -space); 2097 break; 2098 case ARC_SPACE_L2HDRS: 2099 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 2100 break; 2101 } 2102 2103 if (type != ARC_SPACE_DATA) { 2104 ASSERT(arc_meta_used >= space); 2105 if (arc_meta_max < arc_meta_used) 2106 arc_meta_max = arc_meta_used; 2107 ARCSTAT_INCR(arcstat_meta_used, -space); 2108 } 2109 2110 ASSERT(arc_size >= space); 2111 atomic_add_64(&arc_size, -space); 2112} 2113 2114arc_buf_t * 2115arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 2116{ 2117 arc_buf_hdr_t *hdr; 2118 arc_buf_t *buf; 2119 2120 ASSERT3U(size, >, 0); 2121 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 2122 ASSERT(BUF_EMPTY(hdr)); 2123 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 2124 hdr->b_size = size; 2125 hdr->b_spa = spa_load_guid(spa); 2126 2127 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2128 buf->b_hdr = hdr; 2129 buf->b_data = NULL; 2130 buf->b_efunc = NULL; 2131 buf->b_private = NULL; 2132 buf->b_next = NULL; 2133 2134 hdr->b_flags = arc_bufc_to_flags(type); 2135 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 2136 2137 hdr->b_l1hdr.b_buf = buf; 2138 hdr->b_l1hdr.b_state = arc_anon; 2139 hdr->b_l1hdr.b_arc_access = 0; 2140 hdr->b_l1hdr.b_datacnt = 1; 2141 hdr->b_l1hdr.b_tmp_cdata = NULL; 2142 2143 arc_get_data_buf(buf); 2144 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2145 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2146 2147 return (buf); 2148} 2149 2150static char *arc_onloan_tag = "onloan"; 2151 2152/* 2153 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2154 * flight data by arc_tempreserve_space() until they are "returned". Loaned 2155 * buffers must be returned to the arc before they can be used by the DMU or 2156 * freed. 2157 */ 2158arc_buf_t * 2159arc_loan_buf(spa_t *spa, int size) 2160{ 2161 arc_buf_t *buf; 2162 2163 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 2164 2165 atomic_add_64(&arc_loaned_bytes, size); 2166 return (buf); 2167} 2168 2169/* 2170 * Return a loaned arc buffer to the arc. 
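 *
 * A minimal sketch of the loan cycle (hypothetical caller, not part
 * of this file; FTAG is the usual caller-identity tag):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, size);
 *	bcopy(src, abuf->b_data, size);
 *	arc_return_buf(abuf, FTAG);
 *
 * While on loan, the buffer's reference is held under arc_onloan_tag
 * and its space is tracked in arc_loaned_bytes.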
2171 */ 2172void 2173arc_return_buf(arc_buf_t *buf, void *tag) 2174{ 2175 arc_buf_hdr_t *hdr = buf->b_hdr; 2176 2177 ASSERT(buf->b_data != NULL); 2178 ASSERT(HDR_HAS_L1HDR(hdr)); 2179 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2180 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2181 2182 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 2183} 2184 2185/* Detach an arc_buf from a dbuf (tag) */ 2186void 2187arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2188{ 2189 arc_buf_hdr_t *hdr = buf->b_hdr; 2190 2191 ASSERT(buf->b_data != NULL); 2192 ASSERT(HDR_HAS_L1HDR(hdr)); 2193 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2194 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2195 buf->b_efunc = NULL; 2196 buf->b_private = NULL; 2197 2198 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2199} 2200 2201static arc_buf_t * 2202arc_buf_clone(arc_buf_t *from) 2203{ 2204 arc_buf_t *buf; 2205 arc_buf_hdr_t *hdr = from->b_hdr; 2206 uint64_t size = hdr->b_size; 2207 2208 ASSERT(HDR_HAS_L1HDR(hdr)); 2209 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2210 2211 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2212 buf->b_hdr = hdr; 2213 buf->b_data = NULL; 2214 buf->b_efunc = NULL; 2215 buf->b_private = NULL; 2216 buf->b_next = hdr->b_l1hdr.b_buf; 2217 hdr->b_l1hdr.b_buf = buf; 2218 arc_get_data_buf(buf); 2219 bcopy(from->b_data, buf->b_data, size); 2220 2221 /* 2222 * This buffer already exists in the arc so create a duplicate 2223 * copy for the caller. If the buffer is associated with user data 2224 * then track the size and number of duplicates. These stats will be 2225 * updated as duplicate buffers are created and destroyed. 2226 */ 2227 if (HDR_ISTYPE_DATA(hdr)) { 2228 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2229 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2230 } 2231 hdr->b_l1hdr.b_datacnt += 1; 2232 return (buf); 2233} 2234 2235void 2236arc_buf_add_ref(arc_buf_t *buf, void* tag) 2237{ 2238 arc_buf_hdr_t *hdr; 2239 kmutex_t *hash_lock; 2240 2241 /* 2242 * Check to see if this buffer is evicted. Callers 2243 * must verify b_data != NULL to know if the add_ref 2244 * was successful. 2245 */ 2246 mutex_enter(&buf->b_evict_lock); 2247 if (buf->b_data == NULL) { 2248 mutex_exit(&buf->b_evict_lock); 2249 return; 2250 } 2251 hash_lock = HDR_LOCK(buf->b_hdr); 2252 mutex_enter(hash_lock); 2253 hdr = buf->b_hdr; 2254 ASSERT(HDR_HAS_L1HDR(hdr)); 2255 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2256 mutex_exit(&buf->b_evict_lock); 2257 2258 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2259 hdr->b_l1hdr.b_state == arc_mfu); 2260 2261 add_reference(hdr, hash_lock, tag); 2262 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2263 arc_access(hdr, hash_lock); 2264 mutex_exit(hash_lock); 2265 ARCSTAT_BUMP(arcstat_hits); 2266 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2267 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2268 data, metadata, hits); 2269} 2270 2271static void 2272arc_buf_free_on_write(void *data, size_t size, 2273 void (*free_func)(void *, size_t)) 2274{ 2275 l2arc_data_free_t *df; 2276 2277 df = kmem_alloc(sizeof (*df), KM_SLEEP); 2278 df->l2df_data = data; 2279 df->l2df_size = size; 2280 df->l2df_func = free_func; 2281 mutex_enter(&l2arc_free_on_write_mtx); 2282 list_insert_head(l2arc_free_on_write, df); 2283 mutex_exit(&l2arc_free_on_write_mtx); 2284} 2285 2286/* 2287 * Free the arc data buffer. If it is an l2arc write in progress, 2288 * the buffer is placed on l2arc_free_on_write to be freed later. 
2289 */ 2290static void 2291arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2292{ 2293 arc_buf_hdr_t *hdr = buf->b_hdr; 2294 2295 if (HDR_L2_WRITING(hdr)) { 2296 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2297 ARCSTAT_BUMP(arcstat_l2_free_on_write); 2298 } else { 2299 free_func(buf->b_data, hdr->b_size); 2300 } 2301} 2302 2303static void 2304arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2305{ 2306 size_t align, asize, len; 2307 2308 ASSERT(HDR_HAS_L2HDR(hdr)); 2309 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2310 2311 /* 2312 * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2313 * that doesn't exist, the header is in the arc_l2c_only state, 2314 * and there isn't anything to free (it's already been freed). 2315 */ 2316 if (!HDR_HAS_L1HDR(hdr)) 2317 return; 2318 2319 /* 2320 * The header isn't being written to the l2arc device, thus it 2321 * shouldn't have a b_tmp_cdata to free. 2322 */ 2323 if (!HDR_L2_WRITING(hdr)) { 2324 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 2325 return; 2326 } 2327 2328 /* 2329 * The buffer has been chosen for writing to L2ARC, but it's 2330 * not being written just yet. In other words, 2331 * b_tmp_cdata still points to exactly the same buffer as b_data, 2332 * because l2arc_transform_buf() hasn't been called yet. 2333 */ 2334 if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) { 2335 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, 2336 hdr->b_l1hdr.b_buf->b_data); 2337 ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_OFF); 2338 hdr->b_l1hdr.b_tmp_cdata = NULL; 2339 return; 2340 } 2341 2342 /* 2343 * There's nothing to free since the buffer was all zeros and 2344 * compressed to a zero-length buffer. 2345 */ 2346 if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { 2347 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 2348 return; 2349 } 2350 2351 /* 2352 * Nothing to do if the temporary buffer was not required. 2353 */ 2354 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2355 return; 2356 2357 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2358 len = hdr->b_size; 2359 align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift; 2360 asize = P2ROUNDUP(len, align); 2361 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, asize, 2362 zio_data_buf_free); 2363 hdr->b_l1hdr.b_tmp_cdata = NULL; 2364} 2365 2366/* 2367 * Free up buf->b_data and if 'remove' is set, then pull the 2368 * arc_buf_t off of the arc_buf_hdr_t's list and free it.
2369 */ 2370static void 2371arc_buf_destroy(arc_buf_t *buf, boolean_t remove) 2372{ 2373 arc_buf_t **bufp; 2374 2375 /* free up data associated with the buf */ 2376 if (buf->b_data != NULL) { 2377 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2378 uint64_t size = buf->b_hdr->b_size; 2379 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2380 2381 arc_cksum_verify(buf); 2382#ifdef illumos 2383 arc_buf_unwatch(buf); 2384#endif 2385 2386 if (type == ARC_BUFC_METADATA) { 2387 arc_buf_data_free(buf, zio_buf_free); 2388 arc_space_return(size, ARC_SPACE_META); 2389 } else { 2390 ASSERT(type == ARC_BUFC_DATA); 2391 arc_buf_data_free(buf, zio_data_buf_free); 2392 arc_space_return(size, ARC_SPACE_DATA); 2393 } 2394 2395 /* protected by hash lock, if in the hash table */ 2396 if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2397 uint64_t *cnt = &state->arcs_lsize[type]; 2398 2399 ASSERT(refcount_is_zero( 2400 &buf->b_hdr->b_l1hdr.b_refcnt)); 2401 ASSERT(state != arc_anon && state != arc_l2c_only); 2402 2403 ASSERT3U(*cnt, >=, size); 2404 atomic_add_64(cnt, -size); 2405 } 2406 2407 (void) refcount_remove_many(&state->arcs_size, size, buf); 2408 buf->b_data = NULL; 2409 2410 /* 2411 * If we're destroying a duplicate buffer make sure 2412 * that the appropriate statistics are updated. 2413 */ 2414 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2415 HDR_ISTYPE_DATA(buf->b_hdr)) { 2416 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2417 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2418 } 2419 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2420 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2421 } 2422 2423 /* only remove the buf if requested */ 2424 if (!remove) 2425 return; 2426 2427 /* remove the buf from the hdr list */ 2428 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2429 bufp = &(*bufp)->b_next) 2430 continue; 2431 *bufp = buf->b_next; 2432 buf->b_next = NULL; 2433 2434 ASSERT(buf->b_efunc == NULL); 2435 2436 /* clean up the buf */ 2437 buf->b_hdr = NULL; 2438 kmem_cache_free(buf_cache, buf); 2439} 2440 2441static void 2442arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 2443{ 2444 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2445 l2arc_dev_t *dev = l2hdr->b_dev; 2446 2447 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 2448 ASSERT(HDR_HAS_L2HDR(hdr)); 2449 2450 list_remove(&dev->l2ad_buflist, hdr); 2451 2452 /* 2453 * We don't want to leak the b_tmp_cdata buffer that was 2454 * allocated in l2arc_write_buffers() 2455 */ 2456 arc_buf_l2_cdata_free(hdr); 2457 2458 /* 2459 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then 2460 * this header is being processed by l2arc_write_buffers() (i.e. 2461 * it's in the first stage of l2arc_write_buffers()). 2462 * Re-affirming that truth here, just to serve as a reminder. If 2463 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or 2464 * may not have its HDR_L2_WRITING flag set. (the write may have 2465 * completed, in which case HDR_L2_WRITING will be false and the 2466 * b_daddr field will point to the address of the buffer on disk). 2467 */ 2468 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); 2469 2470 /* 2471 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with 2472 * l2arc_write_buffers(). Since we've just removed this header 2473 * from the l2arc buffer list, this header will never reach the 2474 * second stage of l2arc_write_buffers(), which increments the 2475 * accounting stats for this header. Thus, we must be careful 2476 * not to decrement them for this header either. 
2477 */ 2478 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { 2479 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2480 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2481 2482 vdev_space_update(dev->l2ad_vdev, 2483 -l2hdr->b_asize, 0, 0); 2484 2485 (void) refcount_remove_many(&dev->l2ad_alloc, 2486 l2hdr->b_asize, hdr); 2487 } 2488 2489 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2490} 2491 2492static void 2493arc_hdr_destroy(arc_buf_hdr_t *hdr) 2494{ 2495 if (HDR_HAS_L1HDR(hdr)) { 2496 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2497 hdr->b_l1hdr.b_datacnt > 0); 2498 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2499 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2500 } 2501 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2502 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2503 2504 if (HDR_HAS_L2HDR(hdr)) { 2505 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 2506 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 2507 2508 if (!buflist_held) 2509 mutex_enter(&dev->l2ad_mtx); 2510 2511 /* 2512 * Even though we checked this conditional above, we 2513 * need to check this again now that we have the 2514 * l2ad_mtx. This is because we could be racing with 2515 * another thread calling l2arc_evict() which might have 2516 * destroyed this header's L2 portion as we were waiting 2517 * to acquire the l2ad_mtx. If that happens, we don't 2518 * want to re-destroy the header's L2 portion. 2519 */ 2520 if (HDR_HAS_L2HDR(hdr)) { 2521 l2arc_trim(hdr); 2522 arc_hdr_l2hdr_destroy(hdr); 2523 } 2524 2525 if (!buflist_held) 2526 mutex_exit(&dev->l2ad_mtx); 2527 } 2528 2529 if (!BUF_EMPTY(hdr)) 2530 buf_discard_identity(hdr); 2531 2532 if (hdr->b_freeze_cksum != NULL) { 2533 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2534 hdr->b_freeze_cksum = NULL; 2535 } 2536 2537 if (HDR_HAS_L1HDR(hdr)) { 2538 while (hdr->b_l1hdr.b_buf) { 2539 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2540 2541 if (buf->b_efunc != NULL) { 2542 mutex_enter(&arc_user_evicts_lock); 2543 mutex_enter(&buf->b_evict_lock); 2544 ASSERT(buf->b_hdr != NULL); 2545 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); 2546 hdr->b_l1hdr.b_buf = buf->b_next; 2547 buf->b_hdr = &arc_eviction_hdr; 2548 buf->b_next = arc_eviction_list; 2549 arc_eviction_list = buf; 2550 mutex_exit(&buf->b_evict_lock); 2551 cv_signal(&arc_user_evicts_cv); 2552 mutex_exit(&arc_user_evicts_lock); 2553 } else { 2554 arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); 2555 } 2556 } 2557#ifdef ZFS_DEBUG 2558 if (hdr->b_l1hdr.b_thawed != NULL) { 2559 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2560 hdr->b_l1hdr.b_thawed = NULL; 2561 } 2562#endif 2563 } 2564 2565 ASSERT3P(hdr->b_hash_next, ==, NULL); 2566 if (HDR_HAS_L1HDR(hdr)) { 2567 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 2568 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2569 kmem_cache_free(hdr_full_cache, hdr); 2570 } else { 2571 kmem_cache_free(hdr_l2only_cache, hdr); 2572 } 2573} 2574 2575void 2576arc_buf_free(arc_buf_t *buf, void *tag) 2577{ 2578 arc_buf_hdr_t *hdr = buf->b_hdr; 2579 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2580 2581 ASSERT(buf->b_efunc == NULL); 2582 ASSERT(buf->b_data != NULL); 2583 2584 if (hashed) { 2585 kmutex_t *hash_lock = HDR_LOCK(hdr); 2586 2587 mutex_enter(hash_lock); 2588 hdr = buf->b_hdr; 2589 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2590 2591 (void) remove_reference(hdr, hash_lock, tag); 2592 if (hdr->b_l1hdr.b_datacnt > 1) { 2593 arc_buf_destroy(buf, TRUE); 2594 } else { 2595 ASSERT(buf == hdr->b_l1hdr.b_buf); 2596 ASSERT(buf->b_efunc == NULL); 2597 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2598 } 2599 mutex_exit(hash_lock); 2600 } else if 
(HDR_IO_IN_PROGRESS(hdr)) { 2601 int destroy_hdr; 2602 /* 2603 * We are in the middle of an async write. Don't destroy 2604 * this buffer unless the write completes before we finish 2605 * decrementing the reference count. 2606 */ 2607 mutex_enter(&arc_user_evicts_lock); 2608 (void) remove_reference(hdr, NULL, tag); 2609 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2610 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2611 mutex_exit(&arc_user_evicts_lock); 2612 if (destroy_hdr) 2613 arc_hdr_destroy(hdr); 2614 } else { 2615 if (remove_reference(hdr, NULL, tag) > 0) 2616 arc_buf_destroy(buf, TRUE); 2617 else 2618 arc_hdr_destroy(hdr); 2619 } 2620} 2621 2622boolean_t 2623arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2624{ 2625 arc_buf_hdr_t *hdr = buf->b_hdr; 2626 kmutex_t *hash_lock = HDR_LOCK(hdr); 2627 boolean_t no_callback = (buf->b_efunc == NULL); 2628 2629 if (hdr->b_l1hdr.b_state == arc_anon) { 2630 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2631 arc_buf_free(buf, tag); 2632 return (no_callback); 2633 } 2634 2635 mutex_enter(hash_lock); 2636 hdr = buf->b_hdr; 2637 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2638 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2639 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2640 ASSERT(buf->b_data != NULL); 2641 2642 (void) remove_reference(hdr, hash_lock, tag); 2643 if (hdr->b_l1hdr.b_datacnt > 1) { 2644 if (no_callback) 2645 arc_buf_destroy(buf, TRUE); 2646 } else if (no_callback) { 2647 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2648 ASSERT(buf->b_efunc == NULL); 2649 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2650 } 2651 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2652 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2653 mutex_exit(hash_lock); 2654 return (no_callback); 2655} 2656 2657int32_t 2658arc_buf_size(arc_buf_t *buf) 2659{ 2660 return (buf->b_hdr->b_size); 2661} 2662 2663/* 2664 * Called from the DMU to determine if the current buffer should be 2665 * evicted. In order to ensure proper locking, the eviction must be initiated 2666 * from the DMU. Return true if the buffer is associated with user data and 2667 * duplicate buffers still exist. 2668 */ 2669boolean_t 2670arc_buf_eviction_needed(arc_buf_t *buf) 2671{ 2672 arc_buf_hdr_t *hdr; 2673 boolean_t evict_needed = B_FALSE; 2674 2675 if (zfs_disable_dup_eviction) 2676 return (B_FALSE); 2677 2678 mutex_enter(&buf->b_evict_lock); 2679 hdr = buf->b_hdr; 2680 if (hdr == NULL) { 2681 /* 2682 * We are in arc_do_user_evicts(); let that function 2683 * perform the eviction. 2684 */ 2685 ASSERT(buf->b_data == NULL); 2686 mutex_exit(&buf->b_evict_lock); 2687 return (B_FALSE); 2688 } else if (buf->b_data == NULL) { 2689 /* 2690 * We have already been added to the arc eviction list; 2691 * recommend eviction. 2692 */ 2693 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2694 mutex_exit(&buf->b_evict_lock); 2695 return (B_TRUE); 2696 } 2697 2698 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2699 evict_needed = B_TRUE; 2700 2701 mutex_exit(&buf->b_evict_lock); 2702 return (evict_needed); 2703} 2704 2705/* 2706 * Evict the arc_buf_hdr that is provided as a parameter. The resultant 2707 * state of the header is dependent on its state prior to entering this 2708 * function.
The following transitions are possible: 2709 * 2710 * - arc_mru -> arc_mru_ghost 2711 * - arc_mfu -> arc_mfu_ghost 2712 * - arc_mru_ghost -> arc_l2c_only 2713 * - arc_mru_ghost -> deleted 2714 * - arc_mfu_ghost -> arc_l2c_only 2715 * - arc_mfu_ghost -> deleted 2716 */ 2717static int64_t 2718arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 2719{ 2720 arc_state_t *evicted_state, *state; 2721 int64_t bytes_evicted = 0; 2722 2723 ASSERT(MUTEX_HELD(hash_lock)); 2724 ASSERT(HDR_HAS_L1HDR(hdr)); 2725 2726 state = hdr->b_l1hdr.b_state; 2727 if (GHOST_STATE(state)) { 2728 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2729 ASSERT(hdr->b_l1hdr.b_buf == NULL); 2730 2731 /* 2732 * l2arc_write_buffers() relies on a header's L1 portion 2733 * (i.e. its b_tmp_cdata field) during its write phase. 2734 * Thus, we cannot push a header onto the arc_l2c_only 2735 * state (removing its L1 piece) until the header is 2736 * done being written to the l2arc. 2737 */ 2738 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { 2739 ARCSTAT_BUMP(arcstat_evict_l2_skip); 2740 return (bytes_evicted); 2741 } 2742 2743 ARCSTAT_BUMP(arcstat_deleted); 2744 bytes_evicted += hdr->b_size; 2745 2746 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2747 2748 if (HDR_HAS_L2HDR(hdr)) { 2749 /* 2750 * This buffer is cached on the 2nd Level ARC; 2751 * don't destroy the header. 2752 */ 2753 arc_change_state(arc_l2c_only, hdr, hash_lock); 2754 /* 2755 * dropping from L1+L2 cached to L2-only, 2756 * realloc to remove the L1 header. 2757 */ 2758 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2759 hdr_l2only_cache); 2760 } else { 2761 arc_change_state(arc_anon, hdr, hash_lock); 2762 arc_hdr_destroy(hdr); 2763 } 2764 return (bytes_evicted); 2765 } 2766 2767 ASSERT(state == arc_mru || state == arc_mfu); 2768 evicted_state = (state == arc_mru) ?
arc_mru_ghost : arc_mfu_ghost; 2769 2770 /* prefetch buffers have a minimum lifespan */ 2771 if (HDR_IO_IN_PROGRESS(hdr) || 2772 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2773 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2774 arc_min_prefetch_lifespan)) { 2775 ARCSTAT_BUMP(arcstat_evict_skip); 2776 return (bytes_evicted); 2777 } 2778 2779 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2780 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2781 while (hdr->b_l1hdr.b_buf) { 2782 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2783 if (!mutex_tryenter(&buf->b_evict_lock)) { 2784 ARCSTAT_BUMP(arcstat_mutex_miss); 2785 break; 2786 } 2787 if (buf->b_data != NULL) 2788 bytes_evicted += hdr->b_size; 2789 if (buf->b_efunc != NULL) { 2790 mutex_enter(&arc_user_evicts_lock); 2791 arc_buf_destroy(buf, FALSE); 2792 hdr->b_l1hdr.b_buf = buf->b_next; 2793 buf->b_hdr = &arc_eviction_hdr; 2794 buf->b_next = arc_eviction_list; 2795 arc_eviction_list = buf; 2796 cv_signal(&arc_user_evicts_cv); 2797 mutex_exit(&arc_user_evicts_lock); 2798 mutex_exit(&buf->b_evict_lock); 2799 } else { 2800 mutex_exit(&buf->b_evict_lock); 2801 arc_buf_destroy(buf, TRUE); 2802 } 2803 } 2804 2805 if (HDR_HAS_L2HDR(hdr)) { 2806 ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); 2807 } else { 2808 if (l2arc_write_eligible(hdr->b_spa, hdr)) 2809 ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); 2810 else 2811 ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); 2812 } 2813 2814 if (hdr->b_l1hdr.b_datacnt == 0) { 2815 arc_change_state(evicted_state, hdr, hash_lock); 2816 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2817 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2818 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2819 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2820 } 2821 2822 return (bytes_evicted); 2823} 2824 2825static uint64_t 2826arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 2827 uint64_t spa, int64_t bytes) 2828{ 2829 multilist_sublist_t *mls; 2830 uint64_t bytes_evicted = 0; 2831 arc_buf_hdr_t *hdr; 2832 kmutex_t *hash_lock; 2833 int evict_count = 0; 2834 2835 ASSERT3P(marker, !=, NULL); 2836 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 2837 2838 mls = multilist_sublist_lock(ml, idx); 2839 2840 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 2841 hdr = multilist_sublist_prev(mls, marker)) { 2842 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 2843 (evict_count >= zfs_arc_evict_batch_limit)) 2844 break; 2845 2846 /* 2847 * To keep our iteration location, move the marker 2848 * forward. Since we're not holding hdr's hash lock, we 2849 * must be very careful and not remove 'hdr' from the 2850 * sublist. Otherwise, other consumers might mistake the 2851 * 'hdr' as not being on a sublist when they call the 2852 * multilist_link_active() function (they all rely on 2853 * the hash lock protecting concurrent insertions and 2854 * removals). multilist_sublist_move_forward() was 2855 * specifically implemented to ensure this is the case 2856 * (only 'marker' will be removed and re-inserted). 2857 */ 2858 multilist_sublist_move_forward(mls, marker); 2859 2860 /* 2861 * The only case where the b_spa field should ever be 2862 * zero, is the marker headers inserted by 2863 * arc_evict_state(). It's possible for multiple threads 2864 * to be calling arc_evict_state() concurrently (e.g. 2865 * dsl_pool_close() and zio_inject_fault()), so we must 2866 * skip any markers we see from these other threads. 
2867 */ 2868 if (hdr->b_spa == 0) 2869 continue; 2870 2871 /* we're only interested in evicting buffers of a certain spa */ 2872 if (spa != 0 && hdr->b_spa != spa) { 2873 ARCSTAT_BUMP(arcstat_evict_skip); 2874 continue; 2875 } 2876 2877 hash_lock = HDR_LOCK(hdr); 2878 2879 /* 2880 * We aren't calling this function from any code path 2881 * that would already be holding a hash lock, so we're 2882 * asserting on this assumption to be defensive in case 2883 * this ever changes. Without this check, it would be 2884 * possible to incorrectly increment arcstat_mutex_miss 2885 * below (e.g. if the code changed such that we called 2886 * this function with a hash lock held). 2887 */ 2888 ASSERT(!MUTEX_HELD(hash_lock)); 2889 2890 if (mutex_tryenter(hash_lock)) { 2891 uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 2892 mutex_exit(hash_lock); 2893 2894 bytes_evicted += evicted; 2895 2896 /* 2897 * If evicted is zero, arc_evict_hdr() must have 2898 * decided to skip this header, don't increment 2899 * evict_count in this case. 2900 */ 2901 if (evicted != 0) 2902 evict_count++; 2903 2904 /* 2905 * If arc_size isn't overflowing, signal any 2906 * threads that might happen to be waiting. 2907 * 2908 * For each header evicted, we wake up a single 2909 * thread. If we used cv_broadcast, we could 2910 * wake up "too many" threads causing arc_size 2911 * to significantly overflow arc_c; since 2912 * arc_get_data_buf() doesn't check for overflow 2913 * when it's woken up (it doesn't because it's 2914 * possible for the ARC to be overflowing while 2915 * full of un-evictable buffers, and the 2916 * function should proceed in this case). 2917 * 2918 * If threads are left sleeping, due to not 2919 * using cv_broadcast, they will be woken up 2920 * just before arc_reclaim_thread() sleeps. 2921 */ 2922 mutex_enter(&arc_reclaim_lock); 2923 if (!arc_is_overflowing()) 2924 cv_signal(&arc_reclaim_waiters_cv); 2925 mutex_exit(&arc_reclaim_lock); 2926 } else { 2927 ARCSTAT_BUMP(arcstat_mutex_miss); 2928 } 2929 } 2930 2931 multilist_sublist_unlock(mls); 2932 2933 return (bytes_evicted); 2934} 2935 2936/* 2937 * Evict buffers from the given arc state, until we've removed the 2938 * specified number of bytes. Move the removed buffers to the 2939 * appropriate evict state. 2940 * 2941 * This function makes a "best effort". It skips over any buffers 2942 * it can't get a hash_lock on, and so, may not catch all candidates. 2943 * It may also return without evicting as much space as requested. 2944 * 2945 * If bytes is specified using the special value ARC_EVICT_ALL, this 2946 * will evict all available (i.e. unlocked and evictable) buffers from 2947 * the given arc state; which is used by arc_flush(). 2948 */ 2949static uint64_t 2950arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 2951 arc_buf_contents_t type) 2952{ 2953 uint64_t total_evicted = 0; 2954 multilist_t *ml = &state->arcs_list[type]; 2955 int num_sublists; 2956 arc_buf_hdr_t **markers; 2957 2958 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 2959 2960 num_sublists = multilist_get_num_sublists(ml); 2961 2962 /* 2963 * If we've tried to evict from each sublist, made some 2964 * progress, but still have not hit the target number of bytes 2965 * to evict, we want to keep trying. The markers allow us to 2966 * pick up where we left off for each individual sublist, rather 2967 * than starting from the tail each time. 
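 *
 * For example (illustrative numbers): with 4 sublists and a 1 MB
 * eviction target, the first pass might evict only 600 KB before
 * running into hash-lock misses; the next pass then resumes at each
 * sublist's marker instead of rescanning from the tails, so headers
 * that were already examined are not walked a second time.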
2968 */ 2969 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 2970 for (int i = 0; i < num_sublists; i++) { 2971 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 2972 2973 /* 2974 * A b_spa of 0 is used to indicate that this header is 2975 * a marker. This fact is used in arc_adjust_type() and 2976 * arc_evict_state_impl(). 2977 */ 2978 markers[i]->b_spa = 0; 2979 2980 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 2981 multilist_sublist_insert_tail(mls, markers[i]); 2982 multilist_sublist_unlock(mls); 2983 } 2984 2985 /* 2986 * While we haven't hit our target number of bytes to evict, or 2987 * we're evicting all available buffers. 2988 */ 2989 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 2990 /* 2991 * Start eviction using a randomly selected sublist, 2992 * this is to try and evenly balance eviction across all 2993 * sublists. Always starting at the same sublist 2994 * (e.g. index 0) would cause evictions to favor certain 2995 * sublists over others. 2996 */ 2997 int sublist_idx = multilist_get_random_index(ml); 2998 uint64_t scan_evicted = 0; 2999 3000 for (int i = 0; i < num_sublists; i++) { 3001 uint64_t bytes_remaining; 3002 uint64_t bytes_evicted; 3003 3004 if (bytes == ARC_EVICT_ALL) 3005 bytes_remaining = ARC_EVICT_ALL; 3006 else if (total_evicted < bytes) 3007 bytes_remaining = bytes - total_evicted; 3008 else 3009 break; 3010 3011 bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 3012 markers[sublist_idx], spa, bytes_remaining); 3013 3014 scan_evicted += bytes_evicted; 3015 total_evicted += bytes_evicted; 3016 3017 /* we've reached the end, wrap to the beginning */ 3018 if (++sublist_idx >= num_sublists) 3019 sublist_idx = 0; 3020 } 3021 3022 /* 3023 * If we didn't evict anything during this scan, we have 3024 * no reason to believe we'll evict more during another 3025 * scan, so break the loop. 3026 */ 3027 if (scan_evicted == 0) { 3028 /* This isn't possible, let's make that obvious */ 3029 ASSERT3S(bytes, !=, 0); 3030 3031 /* 3032 * When bytes is ARC_EVICT_ALL, the only way to 3033 * break the loop is when scan_evicted is zero. 3034 * In that case, we actually have evicted enough, 3035 * so we don't want to increment the kstat. 3036 */ 3037 if (bytes != ARC_EVICT_ALL) { 3038 ASSERT3S(total_evicted, <, bytes); 3039 ARCSTAT_BUMP(arcstat_evict_not_enough); 3040 } 3041 3042 break; 3043 } 3044 } 3045 3046 for (int i = 0; i < num_sublists; i++) { 3047 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3048 multilist_sublist_remove(mls, markers[i]); 3049 multilist_sublist_unlock(mls); 3050 3051 kmem_cache_free(hdr_full_cache, markers[i]); 3052 } 3053 kmem_free(markers, sizeof (*markers) * num_sublists); 3054 3055 return (total_evicted); 3056} 3057 3058/* 3059 * Flush all "evictable" data of the given type from the arc state 3060 * specified. This will not evict any "active" buffers (i.e. referenced). 3061 * 3062 * When 'retry' is set to FALSE, the function will make a single pass 3063 * over the state and evict any buffers that it can. Since it doesn't 3064 * continually retry the eviction, it might end up leaving some buffers 3065 * in the ARC due to lock misses. 3066 * 3067 * When 'retry' is set to TRUE, the function will continually retry the 3068 * eviction until *all* evictable buffers have been removed from the 3069 * state. As a result, if concurrent insertions into the state are 3070 * allowed (e.g. 
if the ARC isn't shutting down), this function might 3071 * wind up in an infinite loop, continually trying to evict buffers. 3072 */ 3073static uint64_t 3074arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 3075 boolean_t retry) 3076{ 3077 uint64_t evicted = 0; 3078 3079 while (state->arcs_lsize[type] != 0) { 3080 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 3081 3082 if (!retry) 3083 break; 3084 } 3085 3086 return (evicted); 3087} 3088 3089/* 3090 * Evict the specified number of bytes from the state specified, 3091 * restricting eviction to the spa and type given. This function 3092 * prevents us from trying to evict more from a state's list than 3093 * is "evictable", and skips evicting altogether when passed a 3094 * negative value for "bytes". In contrast, arc_evict_state() will 3095 * evict everything it can when passed a negative value for "bytes". 3096 */ 3097static uint64_t 3098arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 3099 arc_buf_contents_t type) 3100{ 3101 int64_t delta; 3102 3103 if (bytes > 0 && state->arcs_lsize[type] > 0) { 3104 delta = MIN(state->arcs_lsize[type], bytes); 3105 return (arc_evict_state(state, spa, delta, type)); 3106 } 3107 3108 return (0); 3109} 3110 3111/* 3112 * Evict metadata buffers from the cache, such that arc_meta_used is 3113 * capped by the arc_meta_limit tunable. 3114 */ 3115static uint64_t 3116arc_adjust_meta(void) 3117{ 3118 uint64_t total_evicted = 0; 3119 int64_t target; 3120 3121 /* 3122 * If we're over the meta limit, we want to evict enough 3123 * metadata to get back under the meta limit. We don't want to 3124 * evict so much that we drop the MRU below arc_p, though. If 3125 * we're over the meta limit more than we're over arc_p, we 3126 * evict some from the MRU here, and some from the MFU below. 3127 */ 3128 target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3129 (int64_t)(refcount_count(&arc_anon->arcs_size) + 3130 refcount_count(&arc_mru->arcs_size) - arc_p)); 3131 3132 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3133 3134 /* 3135 * Similar to the above, we want to evict enough bytes to get us 3136 * below the meta limit, but not so much as to drop us below the 3137 * space allotted to the MFU (which is defined as arc_c - arc_p). 3138 */ 3139 target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3140 (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); 3141 3142 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3143 3144 return (total_evicted); 3145} 3146 3147/* 3148 * Return the type of the oldest buffer in the given arc state 3149 * 3150 * This function will select a random sublist of type ARC_BUFC_DATA and 3151 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 3152 * is compared, and the type which contains the "older" buffer will be 3153 * returned. 3154 */ 3155static arc_buf_contents_t 3156arc_adjust_type(arc_state_t *state) 3157{ 3158 multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; 3159 multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; 3160 int data_idx = multilist_get_random_index(data_ml); 3161 int meta_idx = multilist_get_random_index(meta_ml); 3162 multilist_sublist_t *data_mls; 3163 multilist_sublist_t *meta_mls; 3164 arc_buf_contents_t type; 3165 arc_buf_hdr_t *data_hdr; 3166 arc_buf_hdr_t *meta_hdr; 3167 3168 /* 3169 * We keep the sublist lock until we're finished, to prevent 3170 * the headers from being destroyed via arc_evict_state().
3171 */ 3172 data_mls = multilist_sublist_lock(data_ml, data_idx); 3173 meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 3174 3175 /* 3176 * These two loops are to ensure we skip any markers that 3177 * might be at the tail of the lists due to arc_evict_state(). 3178 */ 3179 3180 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 3181 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 3182 if (data_hdr->b_spa != 0) 3183 break; 3184 } 3185 3186 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 3187 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 3188 if (meta_hdr->b_spa != 0) 3189 break; 3190 } 3191 3192 if (data_hdr == NULL && meta_hdr == NULL) { 3193 type = ARC_BUFC_DATA; 3194 } else if (data_hdr == NULL) { 3195 ASSERT3P(meta_hdr, !=, NULL); 3196 type = ARC_BUFC_METADATA; 3197 } else if (meta_hdr == NULL) { 3198 ASSERT3P(data_hdr, !=, NULL); 3199 type = ARC_BUFC_DATA; 3200 } else { 3201 ASSERT3P(data_hdr, !=, NULL); 3202 ASSERT3P(meta_hdr, !=, NULL); 3203 3204 /* The headers can't be on the sublist without an L1 header */ 3205 ASSERT(HDR_HAS_L1HDR(data_hdr)); 3206 ASSERT(HDR_HAS_L1HDR(meta_hdr)); 3207 3208 if (data_hdr->b_l1hdr.b_arc_access < 3209 meta_hdr->b_l1hdr.b_arc_access) { 3210 type = ARC_BUFC_DATA; 3211 } else { 3212 type = ARC_BUFC_METADATA; 3213 } 3214 } 3215 3216 multilist_sublist_unlock(meta_mls); 3217 multilist_sublist_unlock(data_mls); 3218 3219 return (type); 3220} 3221 3222/* 3223 * Evict buffers from the cache, such that arc_size is capped by arc_c. 3224 */ 3225static uint64_t 3226arc_adjust(void) 3227{ 3228 uint64_t total_evicted = 0; 3229 uint64_t bytes; 3230 int64_t target; 3231 3232 /* 3233 * If we're over arc_meta_limit, we want to correct that before 3234 * potentially evicting data buffers below. 3235 */ 3236 total_evicted += arc_adjust_meta(); 3237 3238 /* 3239 * Adjust MRU size 3240 * 3241 * If we're over the target cache size, we want to evict enough 3242 * from the list to get back to our target size. We don't want 3243 * to evict too much from the MRU, such that it drops below 3244 * arc_p. So, if we're over our target cache size more than 3245 * the MRU is over arc_p, we'll evict enough to get back to 3246 * arc_p here, and then evict more from the MFU below. 3247 */ 3248 target = MIN((int64_t)(arc_size - arc_c), 3249 (int64_t)(refcount_count(&arc_anon->arcs_size) + 3250 refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); 3251 3252 /* 3253 * If we're below arc_meta_min, always prefer to evict data. 3254 * Otherwise, try to satisfy the requested number of bytes to 3255 * evict from the type which contains older buffers; in an 3256 * effort to keep newer buffers in the cache regardless of their 3257 * type. If we cannot satisfy the number of bytes from this 3258 * type, spill over into the next type. 3259 */ 3260 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 3261 arc_meta_used > arc_meta_min) { 3262 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3263 total_evicted += bytes; 3264 3265 /* 3266 * If we couldn't evict our target number of bytes from 3267 * metadata, we try to get the rest from data. 3268 */ 3269 target -= bytes; 3270 3271 total_evicted += 3272 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 3273 } else { 3274 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 3275 total_evicted += bytes; 3276 3277 /* 3278 * If we couldn't evict our target number of bytes from 3279 * data, we try to get the rest from metadata. 
*/ 3281 target -= bytes; 3282 3283 total_evicted += 3284 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3285 } 3286 3287 /* 3288 * Adjust MFU size 3289 * 3290 * Now that we've tried to evict enough from the MRU to get its 3291 * size back to arc_p, if we're still above the target cache 3292 * size, we evict the rest from the MFU. 3293 */ 3294 target = arc_size - arc_c; 3295 3296 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 3297 arc_meta_used > arc_meta_min) { 3298 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3299 total_evicted += bytes; 3300 3301 /* 3302 * If we couldn't evict our target number of bytes from 3303 * metadata, we try to get the rest from data. 3304 */ 3305 target -= bytes; 3306 3307 total_evicted += 3308 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 3309 } else { 3310 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 3311 total_evicted += bytes; 3312 3313 /* 3314 * If we couldn't evict our target number of bytes from 3315 * data, we try to get the rest from metadata. 3316 */ 3317 target -= bytes; 3318 3319 total_evicted += 3320 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3321 } 3322 3323 /* 3324 * Adjust ghost lists 3325 * 3326 * In addition to the above, the ARC also defines target values 3327 * for the ghost lists. The sum of the mru list and mru ghost 3328 * list should never exceed the target size of the cache, and 3329 * the sum of the mru list, mfu list, mru ghost list, and mfu 3330 * ghost list should never exceed twice the target size of the 3331 * cache. The following logic enforces these limits on the ghost 3332 * caches, and evicts from them as needed. 3333 */ 3334 target = refcount_count(&arc_mru->arcs_size) + 3335 refcount_count(&arc_mru_ghost->arcs_size) - arc_c; 3336 3337 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); 3338 total_evicted += bytes; 3339 3340 target -= bytes; 3341 3342 total_evicted += 3343 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); 3344 3345 /* 3346 * We assume the sum of the mru list and mfu list is less than 3347 * or equal to arc_c (we enforced this above), which means we 3348 * can use the simpler of the two equations below: 3349 * 3350 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c 3351 * mru ghost + mfu ghost <= arc_c 3352 */ 3353 target = refcount_count(&arc_mru_ghost->arcs_size) + 3354 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; 3355 3356 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); 3357 total_evicted += bytes; 3358 3359 target -= bytes; 3360 3361 total_evicted += 3362 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); 3363 3364 return (total_evicted); 3365} 3366 3367static void 3368arc_do_user_evicts(void) 3369{ 3370 mutex_enter(&arc_user_evicts_lock); 3371 while (arc_eviction_list != NULL) { 3372 arc_buf_t *buf = arc_eviction_list; 3373 arc_eviction_list = buf->b_next; 3374 mutex_enter(&buf->b_evict_lock); 3375 buf->b_hdr = NULL; 3376 mutex_exit(&buf->b_evict_lock); 3377 mutex_exit(&arc_user_evicts_lock); 3378 3379 if (buf->b_efunc != NULL) 3380 VERIFY0(buf->b_efunc(buf->b_private)); 3381 3382 buf->b_efunc = NULL; 3383 buf->b_private = NULL; 3384 kmem_cache_free(buf_cache, buf); 3385 mutex_enter(&arc_user_evicts_lock); 3386 } 3387 mutex_exit(&arc_user_evicts_lock); 3388} 3389 3390void 3391arc_flush(spa_t *spa, boolean_t retry) 3392{ 3393 uint64_t guid = 0; 3394 3395 /* 3396 * If retry is TRUE, a spa must not be specified since we have 3397 * no good way to determine if all of a spa's buffers
have been 3398 * evicted from an arc state. 3399 */ 3400 ASSERT(!retry || spa == 0); 3401 3402 if (spa != NULL) 3403 guid = spa_load_guid(spa); 3404 3405 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 3406 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 3407 3408 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 3409 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 3410 3411 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 3412 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 3413 3414 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 3415 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 3416 3417 arc_do_user_evicts(); 3418 ASSERT(spa || arc_eviction_list == NULL); 3419} 3420 3421void 3422arc_shrink(int64_t to_free) 3423{ 3424 if (arc_c > arc_c_min) { 3425 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 3426 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 3427 if (arc_c > arc_c_min + to_free) 3428 atomic_add_64(&arc_c, -to_free); 3429 else 3430 arc_c = arc_c_min; 3431 3432 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 3433 if (arc_c > arc_size) 3434 arc_c = MAX(arc_size, arc_c_min); 3435 if (arc_p > arc_c) 3436 arc_p = (arc_c >> 1); 3437 3438 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3439 arc_p); 3440 3441 ASSERT(arc_c >= arc_c_min); 3442 ASSERT((int64_t)arc_p >= 0); 3443 } 3444 3445 if (arc_size > arc_c) { 3446 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3447 uint64_t, arc_c); 3448 (void) arc_adjust(); 3449 } 3450} 3451 3452static long needfree = 0; 3453 3454typedef enum free_memory_reason_t { 3455 FMR_UNKNOWN, 3456 FMR_NEEDFREE, 3457 FMR_LOTSFREE, 3458 FMR_SWAPFS_MINFREE, 3459 FMR_PAGES_PP_MAXIMUM, 3460 FMR_HEAP_ARENA, 3461 FMR_ZIO_ARENA, 3462 FMR_ZIO_FRAG, 3463} free_memory_reason_t; 3464 3465int64_t last_free_memory; 3466free_memory_reason_t last_free_reason; 3467 3468/* 3469 * Additional reserve of pages for pp_reserve. 3470 */ 3471int64_t arc_pages_pp_reserve = 64; 3472 3473/* 3474 * Additional reserve of pages for swapfs. 3475 */ 3476int64_t arc_swapfs_reserve = 64; 3477 3478/* 3479 * Return the amount of memory that can be consumed before reclaim will be 3480 * needed. Positive if there is sufficient free memory, negative indicates 3481 * the amount of memory that needs to be freed up. 3482 */ 3483static int64_t 3484arc_available_memory(void) 3485{ 3486 int64_t lowest = INT64_MAX; 3487 int64_t n; 3488 free_memory_reason_t r = FMR_UNKNOWN; 3489 3490#ifdef _KERNEL 3491 if (needfree > 0) { 3492 n = PAGESIZE * (-needfree); 3493 if (n < lowest) { 3494 lowest = n; 3495 r = FMR_NEEDFREE; 3496 } 3497 } 3498 3499 /* 3500 * Cooperate with pagedaemon when it's time for it to scan 3501 * and reclaim some pages. 3502 */ 3503 n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 3504 if (n < lowest) { 3505 lowest = n; 3506 r = FMR_LOTSFREE; 3507 } 3508 3509#ifdef illumos 3510 /* 3511 * check that we're out of range of the pageout scanner. It starts to 3512 * schedule paging if freemem is less than lotsfree and needfree. 3513 * lotsfree is the high-water mark for pageout, and needfree is the 3514 * number of needed free pages. We add extra pages here to make sure 3515 * the scanner doesn't start up while we're freeing memory. 
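 *
 * As a worked example (illustrative values only): with 4 KB pages,
 * freemem = 100000, lotsfree = 20000, needfree = 0 and
 * desfree = 10000, the expression below yields
 * n = 4096 * 70000 bytes, roughly 273 MB of headroom before this
 * check would report memory pressure.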
3516 */ 3517 n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 3518 if (n < lowest) { 3519 lowest = n; 3520 r = FMR_LOTSFREE; 3521 } 3522 3523 /* 3524 * check to make sure that swapfs has enough space so that anon 3525 * reservations can still succeed. anon_resvmem() checks that the 3526 * availrmem is greater than swapfs_minfree, and the number of reserved 3527 * swap pages. We also add a bit of extra here just to prevent 3528 * circumstances from getting really dire. 3529 */ 3530 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 3531 desfree - arc_swapfs_reserve); 3532 if (n < lowest) { 3533 lowest = n; 3534 r = FMR_SWAPFS_MINFREE; 3535 } 3536 3537 3538 /* 3539 * Check that we have enough availrmem that memory locking (e.g., via 3540 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3541 * stores the number of pages that cannot be locked; when availrmem 3542 * drops below pages_pp_maximum, page locking mechanisms such as 3543 * page_pp_lock() will fail.) 3544 */ 3545 n = PAGESIZE * (availrmem - pages_pp_maximum - 3546 arc_pages_pp_reserve); 3547 if (n < lowest) { 3548 lowest = n; 3549 r = FMR_PAGES_PP_MAXIMUM; 3550 } 3551 3552#endif /* illumos */ 3553#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3554 /* 3555 * If we're on an i386 platform, it's possible that we'll exhaust the 3556 * kernel heap space before we ever run out of available physical 3557 * memory. Most checks of the size of the heap_area compare against 3558 * tune.t_minarmem, which is the minimum available real memory that we 3559 * can have in the system. However, this is generally fixed at 25 pages 3560 * which is so low that it's useless. In this comparison, we seek to 3561 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3562 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3563 * free) 3564 */ 3565 n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 3566 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 3567 if (n < lowest) { 3568 lowest = n; 3569 r = FMR_HEAP_ARENA; 3570 } 3571#define zio_arena NULL 3572#else 3573#define zio_arena heap_arena 3574#endif 3575 3576 /* 3577 * If zio data pages are being allocated out of a separate heap segment, 3578 * then enforce that the size of available vmem for this arena remains 3579 * above about 1/16th free. 3580 * 3581 * Note: The 1/16th arena free requirement was put in place 3582 * to aggressively evict memory from the arc in order to avoid 3583 * memory fragmentation issues. 3584 */ 3585 if (zio_arena != NULL) { 3586 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 3587 (vmem_size(zio_arena, VMEM_ALLOC) >> 4); 3588 if (n < lowest) { 3589 lowest = n; 3590 r = FMR_ZIO_ARENA; 3591 } 3592 } 3593 3594 /* 3595 * Above limits know nothing about real level of KVA fragmentation. 3596 * Start aggressive reclamation if too little sequential KVA left. 3597 */ 3598 if (lowest > 0) { 3599 n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ? 
3600 -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 3601 INT64_MAX; 3602 if (n < lowest) { 3603 lowest = n; 3604 r = FMR_ZIO_FRAG; 3605 } 3606 } 3607 3608#else /* _KERNEL */ 3609 /* Every 100 calls, free a small amount */ 3610 if (spa_get_random(100) == 0) 3611 lowest = -1024; 3612#endif /* _KERNEL */ 3613 3614 last_free_memory = lowest; 3615 last_free_reason = r; 3616 DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 3617 return (lowest); 3618} 3619 3620 3621/* 3622 * Determine if the system is under memory pressure and is asking 3623 * to reclaim memory. A return value of TRUE indicates that the system 3624 * is under memory pressure and that the arc should adjust accordingly. 3625 */ 3626static boolean_t 3627arc_reclaim_needed(void) 3628{ 3629 return (arc_available_memory() < 0); 3630} 3631 3632extern kmem_cache_t *zio_buf_cache[]; 3633extern kmem_cache_t *zio_data_buf_cache[]; 3634extern kmem_cache_t *range_seg_cache; 3635 3636static __noinline void 3637arc_kmem_reap_now(void) 3638{ 3639 size_t i; 3640 kmem_cache_t *prev_cache = NULL; 3641 kmem_cache_t *prev_data_cache = NULL; 3642 3643 DTRACE_PROBE(arc__kmem_reap_start); 3644#ifdef _KERNEL 3645 if (arc_meta_used >= arc_meta_limit) { 3646 /* 3647 * We are exceeding our meta-data cache limit. 3648 * Purge some DNLC entries to release holds on meta-data. 3649 */ 3650 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3651 } 3652#if defined(__i386) 3653 /* 3654 * Reclaim unused memory from all kmem caches. 3655 */ 3656 kmem_reap(); 3657#endif 3658#endif 3659 3660 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3661 if (zio_buf_cache[i] != prev_cache) { 3662 prev_cache = zio_buf_cache[i]; 3663 kmem_cache_reap_now(zio_buf_cache[i]); 3664 } 3665 if (zio_data_buf_cache[i] != prev_data_cache) { 3666 prev_data_cache = zio_data_buf_cache[i]; 3667 kmem_cache_reap_now(zio_data_buf_cache[i]); 3668 } 3669 } 3670 kmem_cache_reap_now(buf_cache); 3671 kmem_cache_reap_now(hdr_full_cache); 3672 kmem_cache_reap_now(hdr_l2only_cache); 3673 kmem_cache_reap_now(range_seg_cache); 3674 3675#ifdef illumos 3676 if (zio_arena != NULL) { 3677 /* 3678 * Ask the vmem arena to reclaim unused memory from its 3679 * quantum caches. 3680 */ 3681 vmem_qcache_reap(zio_arena); 3682 } 3683#endif 3684 DTRACE_PROBE(arc__kmem_reap_end); 3685} 3686 3687/* 3688 * Threads can block in arc_get_data_buf() waiting for this thread to evict 3689 * enough data and signal them to proceed. When this happens, the threads in 3690 * arc_get_data_buf() are sleeping while holding the hash lock for their 3691 * particular arc header. Thus, we must be careful to never sleep on a 3692 * hash lock in this thread. This is to prevent the following deadlock: 3693 * 3694 * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", 3695 * waiting for the reclaim thread to signal it. 3696 * 3697 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 3698 * fails, and goes to sleep forever. 3699 * 3700 * This possible deadlock is avoided by always acquiring a hash lock 3701 * using mutex_tryenter() from arc_reclaim_thread(). 
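 *
 * A minimal sketch of the rule (illustrative, not an exact call site):
 *
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;	(skip this header rather than block)
 *	}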
3702 */ 3703static void 3704arc_reclaim_thread(void *dummy __unused) 3705{ 3706 hrtime_t growtime = 0; 3707 callb_cpr_t cpr; 3708 3709 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 3710 3711 mutex_enter(&arc_reclaim_lock); 3712 while (!arc_reclaim_thread_exit) { 3713 int64_t free_memory = arc_available_memory(); 3714 uint64_t evicted = 0; 3715 3716 mutex_exit(&arc_reclaim_lock); 3717 3718 if (free_memory < 0) { 3719 3720 arc_no_grow = B_TRUE; 3721 arc_warm = B_TRUE; 3722 3723 /* 3724 * Wait at least zfs_grow_retry (default 60) seconds 3725 * before considering growing. 3726 */ 3727 growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 3728 3729 arc_kmem_reap_now(); 3730 3731 /* 3732 * If we are still low on memory, shrink the ARC 3733 * so that we have arc_shrink_min free space. 3734 */ 3735 free_memory = arc_available_memory(); 3736 3737 int64_t to_free = 3738 (arc_c >> arc_shrink_shift) - free_memory; 3739 if (to_free > 0) { 3740#ifdef _KERNEL 3741 to_free = MAX(to_free, ptob(needfree)); 3742#endif 3743 arc_shrink(to_free); 3744 } 3745 } else if (free_memory < arc_c >> arc_no_grow_shift) { 3746 arc_no_grow = B_TRUE; 3747 } else if (gethrtime() >= growtime) { 3748 arc_no_grow = B_FALSE; 3749 } 3750 3751 evicted = arc_adjust(); 3752 3753 mutex_enter(&arc_reclaim_lock); 3754 3755 /* 3756 * If evicted is zero, we couldn't evict anything via 3757 * arc_adjust(). This could be due to hash lock 3758 * collisions, but more likely due to the majority of 3759 * arc buffers being unevictable. Therefore, even if 3760 * arc_size is above arc_c, another pass is unlikely to 3761 * be helpful and could potentially cause us to enter an 3762 * infinite loop. 3763 */ 3764 if (arc_size <= arc_c || evicted == 0) { 3765#ifdef _KERNEL 3766 needfree = 0; 3767#endif 3768 /* 3769 * We're either no longer overflowing, or we 3770 * can't evict anything more, so we should wake 3771 * up any threads before we go to sleep. 3772 */ 3773 cv_broadcast(&arc_reclaim_waiters_cv); 3774 3775 /* 3776 * Block until signaled, or after one second (we 3777 * might need to perform arc_kmem_reap_now() 3778 * even if we aren't being signalled) 3779 */ 3780 CALLB_CPR_SAFE_BEGIN(&cpr); 3781 (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 3782 &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 3783 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 3784 } 3785 } 3786 3787 arc_reclaim_thread_exit = FALSE; 3788 cv_broadcast(&arc_reclaim_thread_cv); 3789 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 3790 thread_exit(); 3791} 3792 3793static void 3794arc_user_evicts_thread(void *dummy __unused) 3795{ 3796 callb_cpr_t cpr; 3797 3798 CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); 3799 3800 mutex_enter(&arc_user_evicts_lock); 3801 while (!arc_user_evicts_thread_exit) { 3802 mutex_exit(&arc_user_evicts_lock); 3803 3804 arc_do_user_evicts(); 3805 3806 /* 3807 * This is necessary in order for the mdb ::arc dcmd to 3808 * show up to date information. Since the ::arc command 3809 * does not call the kstat's update function, without 3810 * this call, the command may show stale stats for the 3811 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3812 * with this change, the data might be up to 1 second 3813 * out of date; but that should suffice. The arc_state_t 3814 * structures can be queried directly if more accurate 3815 * information is needed. 
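 *
 * For example, the same counters this keeps fresh can be read with
 * "kstat -n arcstats" or via "::arc" in mdb -k (illumos command
 * names, shown here only for illustration).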
3816 */ 3817 if (arc_ksp != NULL) 3818 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3819 3820 mutex_enter(&arc_user_evicts_lock); 3821 3822 /* 3823 * Block until signaled, or after one second (we need to 3824 * call the arc's kstat update function regularly). 3825 */ 3826 CALLB_CPR_SAFE_BEGIN(&cpr); 3827 (void) cv_timedwait(&arc_user_evicts_cv, 3828 &arc_user_evicts_lock, hz); 3829 CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); 3830 } 3831 3832 arc_user_evicts_thread_exit = FALSE; 3833 cv_broadcast(&arc_user_evicts_cv); 3834 CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ 3835 thread_exit(); 3836} 3837 3838static u_int arc_dnlc_evicts_arg; 3839extern struct vfsops zfs_vfsops; 3840 3841static void 3842arc_dnlc_evicts_thread(void *dummy __unused) 3843{ 3844 callb_cpr_t cpr; 3845 u_int percent; 3846 3847 CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); 3848 3849 mutex_enter(&arc_dnlc_evicts_lock); 3850 while (!arc_dnlc_evicts_thread_exit) { 3851 CALLB_CPR_SAFE_BEGIN(&cpr); 3852 (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 3853 CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); 3854 if (arc_dnlc_evicts_arg != 0) { 3855 percent = arc_dnlc_evicts_arg; 3856 mutex_exit(&arc_dnlc_evicts_lock); 3857#ifdef _KERNEL 3858 vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); 3859#endif 3860 mutex_enter(&arc_dnlc_evicts_lock); 3861 /* 3862 * Clear our token only after vnlru_free() 3863 * pass is done, to avoid false queueing of 3864 * the requests. 3865 */ 3866 arc_dnlc_evicts_arg = 0; 3867 } 3868 } 3869 arc_dnlc_evicts_thread_exit = FALSE; 3870 cv_broadcast(&arc_dnlc_evicts_cv); 3871 CALLB_CPR_EXIT(&cpr); 3872 thread_exit(); 3873} 3874 3875void 3876dnlc_reduce_cache(void *arg) 3877{ 3878 u_int percent; 3879 3880 percent = (u_int)(uintptr_t)arg; 3881 mutex_enter(&arc_dnlc_evicts_lock); 3882 if (arc_dnlc_evicts_arg == 0) { 3883 arc_dnlc_evicts_arg = percent; 3884 cv_broadcast(&arc_dnlc_evicts_cv); 3885 } 3886 mutex_exit(&arc_dnlc_evicts_lock); 3887} 3888 3889/* 3890 * Adapt arc info given the number of bytes we are trying to add and 3891 * the state that we are coming from. This function is only called 3892 * when we are adding new content to the cache. 3893 */ 3894static void 3895arc_adapt(int bytes, arc_state_t *state) 3896{ 3897 int mult; 3898 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3899 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 3900 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 3901 3902 if (state == arc_l2c_only) 3903 return; 3904 3905 ASSERT(bytes > 0); 3906 /* 3907 * Adapt the target size of the MRU list: 3908 * - if we just hit in the MRU ghost list, then increase 3909 * the target size of the MRU list. 3910 * - if we just hit in the MFU ghost list, then increase 3911 * the target size of the MFU list by decreasing the 3912 * target size of the MRU list. 3913 */ 3914 if (state == arc_mru_ghost) { 3915 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 3916 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3917 3918 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3919 } else if (state == arc_mfu_ghost) { 3920 uint64_t delta; 3921 3922 mult = (mfug_size >= mrug_size) ?
1 : (mrug_size / mfug_size); 3923 mult = MIN(mult, 10); 3924 3925 delta = MIN(bytes * mult, arc_p); 3926 arc_p = MAX(arc_p_min, arc_p - delta); 3927 } 3928 ASSERT((int64_t)arc_p >= 0); 3929 3930 if (arc_reclaim_needed()) { 3931 cv_signal(&arc_reclaim_thread_cv); 3932 return; 3933 } 3934 3935 if (arc_no_grow) 3936 return; 3937 3938 if (arc_c >= arc_c_max) 3939 return; 3940 3941 /* 3942 * If we're within (2 * maxblocksize) bytes of the target 3943 * cache size, increment the target cache size 3944 */ 3945 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3946 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3947 atomic_add_64(&arc_c, (int64_t)bytes); 3948 if (arc_c > arc_c_max) 3949 arc_c = arc_c_max; 3950 else if (state == arc_anon) 3951 atomic_add_64(&arc_p, (int64_t)bytes); 3952 if (arc_p > arc_c) 3953 arc_p = arc_c; 3954 } 3955 ASSERT((int64_t)arc_p >= 0); 3956} 3957 3958/* 3959 * Check if arc_size has grown past our upper threshold, determined by 3960 * zfs_arc_overflow_shift. 3961 */ 3962static boolean_t 3963arc_is_overflowing(void) 3964{ 3965 /* Always allow at least one block of overflow */ 3966 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 3967 arc_c >> zfs_arc_overflow_shift); 3968 3969 return (arc_size >= arc_c + overflow); 3970} 3971 3972/* 3973 * The buffer, supplied as the first argument, needs a data block. If we 3974 * are hitting the hard limit for the cache size, we must sleep, waiting 3975 * for the eviction thread to catch up. If we're past the target size 3976 * but below the hard limit, we'll only signal the reclaim thread and 3977 * continue on. 3978 */ 3979static void 3980arc_get_data_buf(arc_buf_t *buf) 3981{ 3982 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3983 uint64_t size = buf->b_hdr->b_size; 3984 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3985 3986 arc_adapt(size, state); 3987 3988 /* 3989 * If arc_size is currently overflowing, and has grown past our 3990 * upper limit, we must be adding data faster than the evict 3991 * thread can evict. Thus, to ensure we don't compound the 3992 * problem by adding more data and forcing arc_size to grow even 3993 * further past its target size, we halt and wait for the 3994 * eviction thread to catch up. 3995 * 3996 * It's also possible that the reclaim thread is unable to evict 3997 * enough buffers to get arc_size below the overflow limit (e.g. 3998 * due to buffers being un-evictable, or hash lock collisions). 3999 * In this case, we want to proceed regardless of whether we're 4000 * overflowing; thus we don't use a while loop here. 4001 */ 4002 if (arc_is_overflowing()) { 4003 mutex_enter(&arc_reclaim_lock); 4004 4005 /* 4006 * Now that we've acquired the lock, we may no longer be 4007 * over the overflow limit, let's check. 4008 * 4009 * We're ignoring the case of spurious wake ups. If that 4010 * were to happen, it'd let this thread consume an ARC 4011 * buffer before it should have (i.e. before we're under 4012 * the overflow limit and were signalled by the reclaim 4013 * thread). As long as that is a rare occurrence, it 4014 * shouldn't cause any harm.
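 *
 * For a sense of scale (assuming the default zfs_arc_overflow_shift
 * of 8): with arc_c = 8GB, arc_is_overflowing() trips once arc_size
 * reaches arc_c + MAX(SPA_MAXBLOCKSIZE, 8GB >> 8), i.e. roughly
 * 8GB + 32MB.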
4015 */ 4016 if (arc_is_overflowing()) { 4017 cv_signal(&arc_reclaim_thread_cv); 4018 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 4019 } 4020 4021 mutex_exit(&arc_reclaim_lock); 4022 } 4023 4024 if (type == ARC_BUFC_METADATA) { 4025 buf->b_data = zio_buf_alloc(size); 4026 arc_space_consume(size, ARC_SPACE_META); 4027 } else { 4028 ASSERT(type == ARC_BUFC_DATA); 4029 buf->b_data = zio_data_buf_alloc(size); 4030 arc_space_consume(size, ARC_SPACE_DATA); 4031 } 4032 4033 /* 4034 * Update the state size. Note that ghost states have a 4035 * "ghost size" and so don't need to be updated. 4036 */ 4037 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 4038 arc_buf_hdr_t *hdr = buf->b_hdr; 4039 arc_state_t *state = hdr->b_l1hdr.b_state; 4040 4041 (void) refcount_add_many(&state->arcs_size, size, buf); 4042 4043 /* 4044 * If this is reached via arc_read, the link is 4045 * protected by the hash lock. If reached via 4046 * arc_buf_alloc, the header should not be accessed by 4047 * any other thread. And, if reached via arc_read_done, 4048 * the hash lock will protect it if it's found in the 4049 * hash table; otherwise no other thread should be 4050 * trying to [add|remove]_reference it. 4051 */ 4052 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4053 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4054 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 4055 size); 4056 } 4057 /* 4058 * If we are growing the cache, and we are adding anonymous 4059 * data, and we have outgrown arc_p, update arc_p 4060 */ 4061 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 4062 (refcount_count(&arc_anon->arcs_size) + 4063 refcount_count(&arc_mru->arcs_size) > arc_p)) 4064 arc_p = MIN(arc_c, arc_p + size); 4065 } 4066 ARCSTAT_BUMP(arcstat_allocated); 4067} 4068 4069/* 4070 * This routine is called whenever a buffer is accessed. 4071 * NOTE: the hash lock is dropped in this function. 4072 */ 4073static void 4074arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4075{ 4076 clock_t now; 4077 4078 ASSERT(MUTEX_HELD(hash_lock)); 4079 ASSERT(HDR_HAS_L1HDR(hdr)); 4080 4081 if (hdr->b_l1hdr.b_state == arc_anon) { 4082 /* 4083 * This buffer is not in the cache, and does not 4084 * appear in our "ghost" list. Add the new buffer 4085 * to the MRU state. 4086 */ 4087 4088 ASSERT0(hdr->b_l1hdr.b_arc_access); 4089 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4090 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4091 arc_change_state(arc_mru, hdr, hash_lock); 4092 4093 } else if (hdr->b_l1hdr.b_state == arc_mru) { 4094 now = ddi_get_lbolt(); 4095 4096 /* 4097 * If this buffer is here because of a prefetch, then either: 4098 * - clear the flag if this is a "referencing" read 4099 * (any subsequent access will bump this into the MFU state). 4100 * or 4101 * - move the buffer to the head of the list if this is 4102 * another prefetch (to make it less likely to be evicted). 4103 */ 4104 if (HDR_PREFETCH(hdr)) { 4105 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4106 /* link protected by hash lock */ 4107 ASSERT(multilist_link_active( 4108 &hdr->b_l1hdr.b_arc_node)); 4109 } else { 4110 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 4111 ARCSTAT_BUMP(arcstat_mru_hits); 4112 } 4113 hdr->b_l1hdr.b_arc_access = now; 4114 return; 4115 } 4116 4117 /* 4118 * This buffer has been "accessed" only once so far, 4119 * but it is still in the cache. Move it to the MFU 4120 * state. 
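 *
 * (The promotion below is gated by ARC_MINTIME, so a burst of
 * accesses within that window still counts as a single "use".
 * Overall, arc_access() drives these transitions: anon -> mru,
 * mru -> mfu, mru_ghost -> mfu, mfu_ghost -> mfu, l2c_only -> mfu,
 * with prefetch hits on the ghost lists going back to mru instead.)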
4121 */ 4122 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 4123 /* 4124 * More than 125ms have passed since we 4125 * instantiated this buffer. Move it to the 4126 * most frequently used state. 4127 */ 4128 hdr->b_l1hdr.b_arc_access = now; 4129 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4130 arc_change_state(arc_mfu, hdr, hash_lock); 4131 } 4132 ARCSTAT_BUMP(arcstat_mru_hits); 4133 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 4134 arc_state_t *new_state; 4135 /* 4136 * This buffer has been "accessed" recently, but 4137 * was evicted from the cache. Move it to the 4138 * MFU state. 4139 */ 4140 4141 if (HDR_PREFETCH(hdr)) { 4142 new_state = arc_mru; 4143 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 4144 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 4145 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4146 } else { 4147 new_state = arc_mfu; 4148 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4149 } 4150 4151 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4152 arc_change_state(new_state, hdr, hash_lock); 4153 4154 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 4155 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 4156 /* 4157 * This buffer has been accessed more than once and is 4158 * still in the cache. Keep it in the MFU state. 4159 * 4160 * NOTE: an add_reference() that occurred when we did 4161 * the arc_read() will have kicked this off the list. 4162 * If it was a prefetch, we will explicitly move it to 4163 * the head of the list now. 4164 */ 4165 if ((HDR_PREFETCH(hdr)) != 0) { 4166 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4167 /* link protected by hash_lock */ 4168 ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4169 } 4170 ARCSTAT_BUMP(arcstat_mfu_hits); 4171 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4172 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 4173 arc_state_t *new_state = arc_mfu; 4174 /* 4175 * This buffer has been accessed more than once but has 4176 * been evicted from the cache. Move it back to the 4177 * MFU state. 4178 */ 4179 4180 if (HDR_PREFETCH(hdr)) { 4181 /* 4182 * This is a prefetch access... 4183 * move this block back to the MRU state. 4184 */ 4185 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 4186 new_state = arc_mru; 4187 } 4188 4189 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4190 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4191 arc_change_state(new_state, hdr, hash_lock); 4192 4193 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 4194 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 4195 /* 4196 * This buffer is on the 2nd Level ARC. 
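 * Its data is resident only on the L2 device, so this access
 * simply promotes it to the MFU state, as with an MFU ghost hit.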
4197 */ 4198 4199 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4200 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4201 arc_change_state(arc_mfu, hdr, hash_lock); 4202 } else { 4203 ASSERT(!"invalid arc state"); 4204 } 4205} 4206 4207/* a generic arc_done_func_t which you can use */ 4208/* ARGSUSED */ 4209void 4210arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 4211{ 4212 if (zio == NULL || zio->io_error == 0) 4213 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 4214 VERIFY(arc_buf_remove_ref(buf, arg)); 4215} 4216 4217/* a generic arc_done_func_t */ 4218void 4219arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 4220{ 4221 arc_buf_t **bufp = arg; 4222 if (zio && zio->io_error) { 4223 VERIFY(arc_buf_remove_ref(buf, arg)); 4224 *bufp = NULL; 4225 } else { 4226 *bufp = buf; 4227 ASSERT(buf->b_data); 4228 } 4229} 4230 4231static void 4232arc_read_done(zio_t *zio) 4233{ 4234 arc_buf_hdr_t *hdr; 4235 arc_buf_t *buf; 4236 arc_buf_t *abuf; /* buffer we're assigning to callback */ 4237 kmutex_t *hash_lock = NULL; 4238 arc_callback_t *callback_list, *acb; 4239 int freeable = FALSE; 4240 4241 buf = zio->io_private; 4242 hdr = buf->b_hdr; 4243 4244 /* 4245 * The hdr was inserted into hash-table and removed from lists 4246 * prior to starting I/O. We should find this header, since 4247 * it's in the hash table, and it should be legit since it's 4248 * not possible to evict it during the I/O. The only possible 4249 * reason for it not to be found is if we were freed during the 4250 * read. 4251 */ 4252 if (HDR_IN_HASH_TABLE(hdr)) { 4253 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 4254 ASSERT3U(hdr->b_dva.dva_word[0], ==, 4255 BP_IDENTITY(zio->io_bp)->dva_word[0]); 4256 ASSERT3U(hdr->b_dva.dva_word[1], ==, 4257 BP_IDENTITY(zio->io_bp)->dva_word[1]); 4258 4259 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 4260 &hash_lock); 4261 4262 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 4263 hash_lock == NULL) || 4264 (found == hdr && 4265 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 4266 (found == hdr && HDR_L2_READING(hdr))); 4267 } 4268 4269 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 4270 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 4271 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 4272 4273 /* byteswap if necessary */ 4274 callback_list = hdr->b_l1hdr.b_acb; 4275 ASSERT(callback_list != NULL); 4276 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 4277 dmu_object_byteswap_t bswap = 4278 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 4279 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 4280 byteswap_uint64_array : 4281 dmu_ot_byteswap[bswap].ob_func; 4282 func(buf->b_data, hdr->b_size); 4283 } 4284 4285 arc_cksum_compute(buf, B_FALSE); 4286#ifdef illumos 4287 arc_buf_watch(buf); 4288#endif 4289 4290 if (hash_lock && zio->io_error == 0 && 4291 hdr->b_l1hdr.b_state == arc_anon) { 4292 /* 4293 * Only call arc_access on anonymous buffers. This is because 4294 * if we've issued an I/O for an evicted buffer, we've already 4295 * called arc_access (to prevent any simultaneous readers from 4296 * getting confused). 
4297 */ 4298 arc_access(hdr, hash_lock); 4299 } 4300 4301 /* create copies of the data buffer for the callers */ 4302 abuf = buf; 4303 for (acb = callback_list; acb; acb = acb->acb_next) { 4304 if (acb->acb_done) { 4305 if (abuf == NULL) { 4306 ARCSTAT_BUMP(arcstat_duplicate_reads); 4307 abuf = arc_buf_clone(buf); 4308 } 4309 acb->acb_buf = abuf; 4310 abuf = NULL; 4311 } 4312 } 4313 hdr->b_l1hdr.b_acb = NULL; 4314 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4315 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 4316 if (abuf == buf) { 4317 ASSERT(buf->b_efunc == NULL); 4318 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4319 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4320 } 4321 4322 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 4323 callback_list != NULL); 4324 4325 if (zio->io_error != 0) { 4326 hdr->b_flags |= ARC_FLAG_IO_ERROR; 4327 if (hdr->b_l1hdr.b_state != arc_anon) 4328 arc_change_state(arc_anon, hdr, hash_lock); 4329 if (HDR_IN_HASH_TABLE(hdr)) 4330 buf_hash_remove(hdr); 4331 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 4332 } 4333 4334 /* 4335 * Broadcast before we drop the hash_lock to avoid the possibility 4336 * that the hdr (and hence the cv) might be freed before we get to 4337 * the cv_broadcast(). 4338 */ 4339 cv_broadcast(&hdr->b_l1hdr.b_cv); 4340 4341 if (hash_lock != NULL) { 4342 mutex_exit(hash_lock); 4343 } else { 4344 /* 4345 * This block was freed while we waited for the read to 4346 * complete. It has been removed from the hash table and 4347 * moved to the anonymous state (so that it won't show up 4348 * in the cache). 4349 */ 4350 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 4351 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 4352 } 4353 4354 /* execute each callback and free its structure */ 4355 while ((acb = callback_list) != NULL) { 4356 if (acb->acb_done) 4357 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 4358 4359 if (acb->acb_zio_dummy != NULL) { 4360 acb->acb_zio_dummy->io_error = zio->io_error; 4361 zio_nowait(acb->acb_zio_dummy); 4362 } 4363 4364 callback_list = acb->acb_next; 4365 kmem_free(acb, sizeof (arc_callback_t)); 4366 } 4367 4368 if (freeable) 4369 arc_hdr_destroy(hdr); 4370} 4371 4372/* 4373 * "Read" the block at the specified DVA (in bp) via the 4374 * cache. If the block is found in the cache, invoke the provided 4375 * callback immediately and return. Note that the `zio' parameter 4376 * in the callback will be NULL in this case, since no IO was 4377 * required. If the block is not in the cache pass the read request 4378 * on to the spa with a substitute callback function, so that the 4379 * requested block will be added to the cache. 4380 * 4381 * If a read request arrives for a block that has a read in-progress, 4382 * either wait for the in-progress read to complete (and return the 4383 * results); or, if this is a read with a "done" func, add a record 4384 * to the read to invoke the "done" func when the read completes, 4385 * and return; or just return. 4386 * 4387 * arc_read_done() will invoke all the requested "done" functions 4388 * for readers of this block. 
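 *
 * A minimal synchronous caller might look like this (a sketch only,
 * with spa, bp and zb supplied by the caller):
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * Passing a "done" callback with ARC_FLAG_NOWAIT instead makes the
 * read asynchronous; either way the caller eventually drops its
 * hold with arc_buf_remove_ref().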
4389 */ 4390int 4391arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 4392 void *private, zio_priority_t priority, int zio_flags, 4393 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 4394{ 4395 arc_buf_hdr_t *hdr = NULL; 4396 arc_buf_t *buf = NULL; 4397 kmutex_t *hash_lock = NULL; 4398 zio_t *rzio; 4399 uint64_t guid = spa_load_guid(spa); 4400 4401 ASSERT(!BP_IS_EMBEDDED(bp) || 4402 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 4403 4404top: 4405 if (!BP_IS_EMBEDDED(bp)) { 4406 /* 4407 * Embedded BP's have no DVA and require no I/O to "read". 4408 * Create an anonymous arc buf to back it. 4409 */ 4410 hdr = buf_hash_find(guid, bp, &hash_lock); 4411 } 4412 4413 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 4414 4415 *arc_flags |= ARC_FLAG_CACHED; 4416 4417 if (HDR_IO_IN_PROGRESS(hdr)) { 4418 4419 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 4420 priority == ZIO_PRIORITY_SYNC_READ) { 4421 /* 4422 * This sync read must wait for an 4423 * in-progress async read (e.g. a predictive 4424 * prefetch). Async reads are queued 4425 * separately at the vdev_queue layer, so 4426 * this is a form of priority inversion. 4427 * Ideally, we would "inherit" the demand 4428 * i/o's priority by moving the i/o from 4429 * the async queue to the synchronous queue, 4430 * but there is currently no mechanism to do 4431 * so. Track this so that we can evaluate 4432 * the magnitude of this potential performance 4433 * problem. 4434 * 4435 * Note that if the prefetch i/o is already 4436 * active (has been issued to the device), 4437 * the prefetch improved performance, because 4438 * we issued it sooner than we would have 4439 * without the prefetch. 4440 */ 4441 DTRACE_PROBE1(arc__sync__wait__for__async, 4442 arc_buf_hdr_t *, hdr); 4443 ARCSTAT_BUMP(arcstat_sync_wait_for_async); 4444 } 4445 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 4446 hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; 4447 } 4448 4449 if (*arc_flags & ARC_FLAG_WAIT) { 4450 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 4451 mutex_exit(hash_lock); 4452 goto top; 4453 } 4454 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4455 4456 if (done) { 4457 arc_callback_t *acb = NULL; 4458 4459 acb = kmem_zalloc(sizeof (arc_callback_t), 4460 KM_SLEEP); 4461 acb->acb_done = done; 4462 acb->acb_private = private; 4463 if (pio != NULL) 4464 acb->acb_zio_dummy = zio_null(pio, 4465 spa, NULL, NULL, NULL, zio_flags); 4466 4467 ASSERT(acb->acb_done != NULL); 4468 acb->acb_next = hdr->b_l1hdr.b_acb; 4469 hdr->b_l1hdr.b_acb = acb; 4470 add_reference(hdr, hash_lock, private); 4471 mutex_exit(hash_lock); 4472 return (0); 4473 } 4474 mutex_exit(hash_lock); 4475 return (0); 4476 } 4477 4478 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4479 hdr->b_l1hdr.b_state == arc_mfu); 4480 4481 if (done) { 4482 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 4483 /* 4484 * This is a demand read which does not have to 4485 * wait for i/o because we did a predictive 4486 * prefetch i/o for it, which has completed. 4487 */ 4488 DTRACE_PROBE1( 4489 arc__demand__hit__predictive__prefetch, 4490 arc_buf_hdr_t *, hdr); 4491 ARCSTAT_BUMP( 4492 arcstat_demand_hit_predictive_prefetch); 4493 hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; 4494 } 4495 add_reference(hdr, hash_lock, private); 4496 /* 4497 * If this block is already in use, create a new 4498 * copy of the data so that we will be guaranteed 4499 * that arc_release() will always succeed. 
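 * ("Already in use" means HDR_BUF_AVAILABLE is clear: another
 * holder owns the existing buf, so we clone it below instead of
 * handing out the same one twice.)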
4500 */ 4501 buf = hdr->b_l1hdr.b_buf; 4502 ASSERT(buf); 4503 ASSERT(buf->b_data); 4504 if (HDR_BUF_AVAILABLE(hdr)) { 4505 ASSERT(buf->b_efunc == NULL); 4506 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4507 } else { 4508 buf = arc_buf_clone(buf); 4509 } 4510 4511 } else if (*arc_flags & ARC_FLAG_PREFETCH && 4512 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4513 hdr->b_flags |= ARC_FLAG_PREFETCH; 4514 } 4515 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 4516 arc_access(hdr, hash_lock); 4517 if (*arc_flags & ARC_FLAG_L2CACHE) 4518 hdr->b_flags |= ARC_FLAG_L2CACHE; 4519 if (*arc_flags & ARC_FLAG_L2COMPRESS) 4520 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4521 mutex_exit(hash_lock); 4522 ARCSTAT_BUMP(arcstat_hits); 4523 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4524 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4525 data, metadata, hits); 4526 4527 if (done) 4528 done(NULL, buf, private); 4529 } else { 4530 uint64_t size = BP_GET_LSIZE(bp); 4531 arc_callback_t *acb; 4532 vdev_t *vd = NULL; 4533 uint64_t addr = 0; 4534 boolean_t devw = B_FALSE; 4535 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 4536 int32_t b_asize = 0; 4537 4538 if (hdr == NULL) { 4539 /* this block is not in the cache */ 4540 arc_buf_hdr_t *exists = NULL; 4541 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 4542 buf = arc_buf_alloc(spa, size, private, type); 4543 hdr = buf->b_hdr; 4544 if (!BP_IS_EMBEDDED(bp)) { 4545 hdr->b_dva = *BP_IDENTITY(bp); 4546 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 4547 exists = buf_hash_insert(hdr, &hash_lock); 4548 } 4549 if (exists != NULL) { 4550 /* somebody beat us to the hash insert */ 4551 mutex_exit(hash_lock); 4552 buf_discard_identity(hdr); 4553 (void) arc_buf_remove_ref(buf, private); 4554 goto top; /* restart the IO request */ 4555 } 4556 4557 /* 4558 * If there is a callback, we pass our reference to 4559 * it; otherwise we remove our reference. 4560 */ 4561 if (done == NULL) { 4562 (void) remove_reference(hdr, hash_lock, 4563 private); 4564 } 4565 if (*arc_flags & ARC_FLAG_PREFETCH) 4566 hdr->b_flags |= ARC_FLAG_PREFETCH; 4567 if (*arc_flags & ARC_FLAG_L2CACHE) 4568 hdr->b_flags |= ARC_FLAG_L2CACHE; 4569 if (*arc_flags & ARC_FLAG_L2COMPRESS) 4570 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4571 if (BP_GET_LEVEL(bp) > 0) 4572 hdr->b_flags |= ARC_FLAG_INDIRECT; 4573 } else { 4574 /* 4575 * This block is in the ghost cache. If it was L2-only 4576 * (and thus didn't have an L1 hdr), we realloc the 4577 * header to add an L1 hdr. 4578 */ 4579 if (!HDR_HAS_L1HDR(hdr)) { 4580 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 4581 hdr_full_cache); 4582 } 4583 4584 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 4585 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4586 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4587 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 4588 4589 /* 4590 * If there is a callback, we pass a reference to it. 
4591 */ 4592 if (done != NULL) 4593 add_reference(hdr, hash_lock, private); 4594 if (*arc_flags & ARC_FLAG_PREFETCH) 4595 hdr->b_flags |= ARC_FLAG_PREFETCH; 4596 if (*arc_flags & ARC_FLAG_L2CACHE) 4597 hdr->b_flags |= ARC_FLAG_L2CACHE; 4598 if (*arc_flags & ARC_FLAG_L2COMPRESS) 4599 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4600 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 4601 buf->b_hdr = hdr; 4602 buf->b_data = NULL; 4603 buf->b_efunc = NULL; 4604 buf->b_private = NULL; 4605 buf->b_next = NULL; 4606 hdr->b_l1hdr.b_buf = buf; 4607 ASSERT0(hdr->b_l1hdr.b_datacnt); 4608 hdr->b_l1hdr.b_datacnt = 1; 4609 arc_get_data_buf(buf); 4610 arc_access(hdr, hash_lock); 4611 } 4612 4613 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 4614 hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; 4615 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 4616 4617 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 4618 acb->acb_done = done; 4619 acb->acb_private = private; 4620 4621 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4622 hdr->b_l1hdr.b_acb = acb; 4623 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4624 4625 if (HDR_HAS_L2HDR(hdr) && 4626 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 4627 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 4628 addr = hdr->b_l2hdr.b_daddr; 4629 b_compress = hdr->b_l2hdr.b_compress; 4630 b_asize = hdr->b_l2hdr.b_asize; 4631 /* 4632 * Lock out device removal. 4633 */ 4634 if (vdev_is_dead(vd) || 4635 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 4636 vd = NULL; 4637 } 4638 4639 if (hash_lock != NULL) 4640 mutex_exit(hash_lock); 4641 4642 /* 4643 * At this point, we have a level 1 cache miss. Try again in 4644 * L2ARC if possible. 4645 */ 4646 ASSERT3U(hdr->b_size, ==, size); 4647 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 4648 uint64_t, size, zbookmark_phys_t *, zb); 4649 ARCSTAT_BUMP(arcstat_misses); 4650 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4651 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4652 data, metadata, misses); 4653#ifdef _KERNEL 4654#ifdef RACCT 4655 if (racct_enable) { 4656 PROC_LOCK(curproc); 4657 racct_add_force(curproc, RACCT_READBPS, size); 4658 racct_add_force(curproc, RACCT_READIOPS, 1); 4659 PROC_UNLOCK(curproc); 4660 } 4661#endif /* RACCT */ 4662 curthread->td_ru.ru_inblock++; 4663#endif 4664 4665 if (priority == ZIO_PRIORITY_ASYNC_READ) 4666 hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; 4667 else 4668 hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; 4669 4670 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4671 /* 4672 * Read from the L2ARC if the following are true: 4673 * 1. The L2ARC vdev was previously cached. 4674 * 2. This buffer still has L2ARC metadata. 4675 * 3. This buffer isn't currently writing to the L2ARC. 4676 * 4. The L2ARC entry wasn't evicted, which may 4677 * also have invalidated the vdev. 4678 * 5. This isn't prefetch and l2arc_noprefetch is set. 
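 * (Each numbered condition above maps onto a check below;
 * condition 5, for example, is the
 * !(l2arc_noprefetch && HDR_PREFETCH(hdr)) term.)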
4679 */ 4680 if (HDR_HAS_L2HDR(hdr) && 4681 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4682 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4683 l2arc_read_callback_t *cb; 4684 void* b_data; 4685 4686 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4687 ARCSTAT_BUMP(arcstat_l2_hits); 4688 4689 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4690 KM_SLEEP); 4691 cb->l2rcb_buf = buf; 4692 cb->l2rcb_spa = spa; 4693 cb->l2rcb_bp = *bp; 4694 cb->l2rcb_zb = *zb; 4695 cb->l2rcb_flags = zio_flags; 4696 cb->l2rcb_compress = b_compress; 4697 if (b_asize > hdr->b_size) { 4698 ASSERT3U(b_compress, ==, 4699 ZIO_COMPRESS_OFF); 4700 b_data = zio_data_buf_alloc(b_asize); 4701 cb->l2rcb_data = b_data; 4702 } else { 4703 b_data = buf->b_data; 4704 } 4705 4706 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4707 addr + size < vd->vdev_psize - 4708 VDEV_LABEL_END_SIZE); 4709 4710 /* 4711 * l2arc read. The SCL_L2ARC lock will be 4712 * released by l2arc_read_done(). 4713 * Issue a null zio if the underlying buffer 4714 * was squashed to zero size by compression. 4715 */ 4716 if (b_compress == ZIO_COMPRESS_EMPTY) { 4717 ASSERT3U(b_asize, ==, 0); 4718 rzio = zio_null(pio, spa, vd, 4719 l2arc_read_done, cb, 4720 zio_flags | ZIO_FLAG_DONT_CACHE | 4721 ZIO_FLAG_CANFAIL | 4722 ZIO_FLAG_DONT_PROPAGATE | 4723 ZIO_FLAG_DONT_RETRY); 4724 } else { 4725 rzio = zio_read_phys(pio, vd, addr, 4726 b_asize, b_data, 4727 ZIO_CHECKSUM_OFF, 4728 l2arc_read_done, cb, priority, 4729 zio_flags | ZIO_FLAG_DONT_CACHE | 4730 ZIO_FLAG_CANFAIL | 4731 ZIO_FLAG_DONT_PROPAGATE | 4732 ZIO_FLAG_DONT_RETRY, B_FALSE); 4733 } 4734 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4735 zio_t *, rzio); 4736 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4737 4738 if (*arc_flags & ARC_FLAG_NOWAIT) { 4739 zio_nowait(rzio); 4740 return (0); 4741 } 4742 4743 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4744 if (zio_wait(rzio) == 0) 4745 return (0); 4746 4747 /* l2arc read error; goto zio_read() */ 4748 } else { 4749 DTRACE_PROBE1(l2arc__miss, 4750 arc_buf_hdr_t *, hdr); 4751 ARCSTAT_BUMP(arcstat_l2_misses); 4752 if (HDR_L2_WRITING(hdr)) 4753 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4754 spa_config_exit(spa, SCL_L2ARC, vd); 4755 } 4756 } else { 4757 if (vd != NULL) 4758 spa_config_exit(spa, SCL_L2ARC, vd); 4759 if (l2arc_ndev != 0) { 4760 DTRACE_PROBE1(l2arc__miss, 4761 arc_buf_hdr_t *, hdr); 4762 ARCSTAT_BUMP(arcstat_l2_misses); 4763 } 4764 } 4765 4766 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4767 arc_read_done, buf, priority, zio_flags, zb); 4768 4769 if (*arc_flags & ARC_FLAG_WAIT) 4770 return (zio_wait(rzio)); 4771 4772 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4773 zio_nowait(rzio); 4774 } 4775 return (0); 4776} 4777 4778void 4779arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4780{ 4781 ASSERT(buf->b_hdr != NULL); 4782 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4783 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4784 func == NULL); 4785 ASSERT(buf->b_efunc == NULL); 4786 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4787 4788 buf->b_efunc = func; 4789 buf->b_private = private; 4790} 4791 4792/* 4793 * Notify the arc that a block was freed, and thus will never be used again. 
4794 */ 4795void 4796arc_freed(spa_t *spa, const blkptr_t *bp) 4797{ 4798 arc_buf_hdr_t *hdr; 4799 kmutex_t *hash_lock; 4800 uint64_t guid = spa_load_guid(spa); 4801 4802 ASSERT(!BP_IS_EMBEDDED(bp)); 4803 4804 hdr = buf_hash_find(guid, bp, &hash_lock); 4805 if (hdr == NULL) 4806 return; 4807 if (HDR_BUF_AVAILABLE(hdr)) { 4808 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4809 add_reference(hdr, hash_lock, FTAG); 4810 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4811 mutex_exit(hash_lock); 4812 4813 arc_release(buf, FTAG); 4814 (void) arc_buf_remove_ref(buf, FTAG); 4815 } else { 4816 mutex_exit(hash_lock); 4817 } 4818 4819} 4820 4821/* 4822 * Clear the user eviction callback set by arc_set_callback(), first calling 4823 * it if it exists. Because the presence of a callback keeps an arc_buf cached, 4824 * clearing the callback may result in the arc_buf being destroyed. However, 4825 * it will not result in the *last* arc_buf being destroyed, hence the data 4826 * will remain cached in the ARC. We make a copy of the arc buffer here so 4827 * that we can process the callback without holding any locks. 4828 * 4829 * It's possible that the callback is already in the process of being cleared 4830 * by another thread. In this case we can not clear the callback. 4831 * 4832 * Returns B_TRUE if the callback was successfully called and cleared. 4833 */ 4834boolean_t 4835arc_clear_callback(arc_buf_t *buf) 4836{ 4837 arc_buf_hdr_t *hdr; 4838 kmutex_t *hash_lock; 4839 arc_evict_func_t *efunc = buf->b_efunc; 4840 void *private = buf->b_private; 4841 4842 mutex_enter(&buf->b_evict_lock); 4843 hdr = buf->b_hdr; 4844 if (hdr == NULL) { 4845 /* 4846 * We are in arc_do_user_evicts(). 4847 */ 4848 ASSERT(buf->b_data == NULL); 4849 mutex_exit(&buf->b_evict_lock); 4850 return (B_FALSE); 4851 } else if (buf->b_data == NULL) { 4852 /* 4853 * We are on the eviction list; process this buffer now 4854 * but let arc_do_user_evicts() do the reaping. 4855 */ 4856 buf->b_efunc = NULL; 4857 mutex_exit(&buf->b_evict_lock); 4858 VERIFY0(efunc(private)); 4859 return (B_TRUE); 4860 } 4861 hash_lock = HDR_LOCK(hdr); 4862 mutex_enter(hash_lock); 4863 hdr = buf->b_hdr; 4864 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4865 4866 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4867 hdr->b_l1hdr.b_datacnt); 4868 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4869 hdr->b_l1hdr.b_state == arc_mfu); 4870 4871 buf->b_efunc = NULL; 4872 buf->b_private = NULL; 4873 4874 if (hdr->b_l1hdr.b_datacnt > 1) { 4875 mutex_exit(&buf->b_evict_lock); 4876 arc_buf_destroy(buf, TRUE); 4877 } else { 4878 ASSERT(buf == hdr->b_l1hdr.b_buf); 4879 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4880 mutex_exit(&buf->b_evict_lock); 4881 } 4882 4883 mutex_exit(hash_lock); 4884 VERIFY0(efunc(private)); 4885 return (B_TRUE); 4886} 4887 4888/* 4889 * Release this buffer from the cache, making it an anonymous buffer. This 4890 * must be done after a read and prior to modifying the buffer contents. 4891 * If the buffer has more than one reference, we must make 4892 * a new hdr for the buffer. 4893 */ 4894void 4895arc_release(arc_buf_t *buf, void *tag) 4896{ 4897 arc_buf_hdr_t *hdr = buf->b_hdr; 4898 4899 /* 4900 * It would be nice to assert that if it's DMU metadata (level > 4901 * 0 || it's the dnode file), then it must be syncing context. 4902 * But we don't know that information at this level.
4903 */ 4904 4905 mutex_enter(&buf->b_evict_lock); 4906 4907 ASSERT(HDR_HAS_L1HDR(hdr)); 4908 4909 /* 4910 * We don't grab the hash lock prior to this check, because if 4911 * the buffer's header is in the arc_anon state, it won't be 4912 * linked into the hash table. 4913 */ 4914 if (hdr->b_l1hdr.b_state == arc_anon) { 4915 mutex_exit(&buf->b_evict_lock); 4916 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4917 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4918 ASSERT(!HDR_HAS_L2HDR(hdr)); 4919 ASSERT(BUF_EMPTY(hdr)); 4920 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4921 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4922 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4923 4924 ASSERT3P(buf->b_efunc, ==, NULL); 4925 ASSERT3P(buf->b_private, ==, NULL); 4926 4927 hdr->b_l1hdr.b_arc_access = 0; 4928 arc_buf_thaw(buf); 4929 4930 return; 4931 } 4932 4933 kmutex_t *hash_lock = HDR_LOCK(hdr); 4934 mutex_enter(hash_lock); 4935 4936 /* 4937 * This assignment is only valid as long as the hash_lock is 4938 * held, we must be careful not to reference state or the 4939 * b_state field after dropping the lock. 4940 */ 4941 arc_state_t *state = hdr->b_l1hdr.b_state; 4942 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4943 ASSERT3P(state, !=, arc_anon); 4944 4945 /* this buffer is not on any list */ 4946 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4947 4948 if (HDR_HAS_L2HDR(hdr)) { 4949 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4950 4951 /* 4952 * We have to recheck this conditional again now that 4953 * we're holding the l2ad_mtx to prevent a race with 4954 * another thread which might be concurrently calling 4955 * l2arc_evict(). In that case, l2arc_evict() might have 4956 * destroyed the header's L2 portion as we were waiting 4957 * to acquire the l2ad_mtx. 4958 */ 4959 if (HDR_HAS_L2HDR(hdr)) { 4960 l2arc_trim(hdr); 4961 arc_hdr_l2hdr_destroy(hdr); 4962 } 4963 4964 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4965 } 4966 4967 /* 4968 * Do we have more than one buf? 4969 */ 4970 if (hdr->b_l1hdr.b_datacnt > 1) { 4971 arc_buf_hdr_t *nhdr; 4972 arc_buf_t **bufp; 4973 uint64_t blksz = hdr->b_size; 4974 uint64_t spa = hdr->b_spa; 4975 arc_buf_contents_t type = arc_buf_type(hdr); 4976 uint32_t flags = hdr->b_flags; 4977 4978 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4979 /* 4980 * Pull the data off of this hdr and attach it to 4981 * a new anonymous hdr. 4982 */ 4983 (void) remove_reference(hdr, hash_lock, tag); 4984 bufp = &hdr->b_l1hdr.b_buf; 4985 while (*bufp != buf) 4986 bufp = &(*bufp)->b_next; 4987 *bufp = buf->b_next; 4988 buf->b_next = NULL; 4989 4990 ASSERT3P(state, !=, arc_l2c_only); 4991 4992 (void) refcount_remove_many( 4993 &state->arcs_size, hdr->b_size, buf); 4994 4995 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4996 ASSERT3P(state, !=, arc_l2c_only); 4997 uint64_t *size = &state->arcs_lsize[type]; 4998 ASSERT3U(*size, >=, hdr->b_size); 4999 atomic_add_64(size, -hdr->b_size); 5000 } 5001 5002 /* 5003 * We're releasing a duplicate user data buffer, update 5004 * our statistics accordingly. 
5005 */ 5006 if (HDR_ISTYPE_DATA(hdr)) { 5007 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 5008 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 5009 -hdr->b_size); 5010 } 5011 hdr->b_l1hdr.b_datacnt -= 1; 5012 arc_cksum_verify(buf); 5013#ifdef illumos 5014 arc_buf_unwatch(buf); 5015#endif 5016 5017 mutex_exit(hash_lock); 5018 5019 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 5020 nhdr->b_size = blksz; 5021 nhdr->b_spa = spa; 5022 5023 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 5024 nhdr->b_flags |= arc_bufc_to_flags(type); 5025 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 5026 5027 nhdr->b_l1hdr.b_buf = buf; 5028 nhdr->b_l1hdr.b_datacnt = 1; 5029 nhdr->b_l1hdr.b_state = arc_anon; 5030 nhdr->b_l1hdr.b_arc_access = 0; 5031 nhdr->b_l1hdr.b_tmp_cdata = NULL; 5032 nhdr->b_freeze_cksum = NULL; 5033 5034 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 5035 buf->b_hdr = nhdr; 5036 mutex_exit(&buf->b_evict_lock); 5037 (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); 5038 } else { 5039 mutex_exit(&buf->b_evict_lock); 5040 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 5041 /* protected by hash lock, or hdr is on arc_anon */ 5042 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 5043 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5044 arc_change_state(arc_anon, hdr, hash_lock); 5045 hdr->b_l1hdr.b_arc_access = 0; 5046 mutex_exit(hash_lock); 5047 5048 buf_discard_identity(hdr); 5049 arc_buf_thaw(buf); 5050 } 5051 buf->b_efunc = NULL; 5052 buf->b_private = NULL; 5053} 5054 5055int 5056arc_released(arc_buf_t *buf) 5057{ 5058 int released; 5059 5060 mutex_enter(&buf->b_evict_lock); 5061 released = (buf->b_data != NULL && 5062 buf->b_hdr->b_l1hdr.b_state == arc_anon); 5063 mutex_exit(&buf->b_evict_lock); 5064 return (released); 5065} 5066 5067#ifdef ZFS_DEBUG 5068int 5069arc_referenced(arc_buf_t *buf) 5070{ 5071 int referenced; 5072 5073 mutex_enter(&buf->b_evict_lock); 5074 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 5075 mutex_exit(&buf->b_evict_lock); 5076 return (referenced); 5077} 5078#endif 5079 5080static void 5081arc_write_ready(zio_t *zio) 5082{ 5083 arc_write_callback_t *callback = zio->io_private; 5084 arc_buf_t *buf = callback->awcb_buf; 5085 arc_buf_hdr_t *hdr = buf->b_hdr; 5086 5087 ASSERT(HDR_HAS_L1HDR(hdr)); 5088 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 5089 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 5090 callback->awcb_ready(zio, buf, callback->awcb_private); 5091 5092 /* 5093 * If the IO is already in progress, then this is a re-write 5094 * attempt, so we need to thaw and re-compute the cksum. 5095 * It is the responsibility of the callback to handle the 5096 * accounting for any re-write attempt. 5097 */ 5098 if (HDR_IO_IN_PROGRESS(hdr)) { 5099 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 5100 if (hdr->b_freeze_cksum != NULL) { 5101 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 5102 hdr->b_freeze_cksum = NULL; 5103 } 5104 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 5105 } 5106 arc_cksum_compute(buf, B_FALSE); 5107 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 5108} 5109 5110static void 5111arc_write_children_ready(zio_t *zio) 5112{ 5113 arc_write_callback_t *callback = zio->io_private; 5114 arc_buf_t *buf = callback->awcb_buf; 5115 5116 callback->awcb_children_ready(zio, buf, callback->awcb_private); 5117} 5118 5119/* 5120 * The SPA calls this callback for each physical write that happens on behalf 5121 * of a logical write. See the comment in dbuf_write_physdone() for details. 
5122 */ 5123static void 5124arc_write_physdone(zio_t *zio) 5125{ 5126 arc_write_callback_t *cb = zio->io_private; 5127 if (cb->awcb_physdone != NULL) 5128 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 5129} 5130 5131static void 5132arc_write_done(zio_t *zio) 5133{ 5134 arc_write_callback_t *callback = zio->io_private; 5135 arc_buf_t *buf = callback->awcb_buf; 5136 arc_buf_hdr_t *hdr = buf->b_hdr; 5137 5138 ASSERT(hdr->b_l1hdr.b_acb == NULL); 5139 5140 if (zio->io_error == 0) { 5141 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5142 buf_discard_identity(hdr); 5143 } else { 5144 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 5145 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 5146 } 5147 } else { 5148 ASSERT(BUF_EMPTY(hdr)); 5149 } 5150 5151 /* 5152 * If the block to be written was all-zero or compressed enough to be 5153 * embedded in the BP, no write was performed so there will be no 5154 * dva/birth/checksum. The buffer must therefore remain anonymous 5155 * (and uncached). 5156 */ 5157 if (!BUF_EMPTY(hdr)) { 5158 arc_buf_hdr_t *exists; 5159 kmutex_t *hash_lock; 5160 5161 ASSERT(zio->io_error == 0); 5162 5163 arc_cksum_verify(buf); 5164 5165 exists = buf_hash_insert(hdr, &hash_lock); 5166 if (exists != NULL) { 5167 /* 5168 * This can only happen if we overwrite for 5169 * sync-to-convergence, because we remove 5170 * buffers from the hash table when we arc_free(). 5171 */ 5172 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 5173 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 5174 panic("bad overwrite, hdr=%p exists=%p", 5175 (void *)hdr, (void *)exists); 5176 ASSERT(refcount_is_zero( 5177 &exists->b_l1hdr.b_refcnt)); 5178 arc_change_state(arc_anon, exists, hash_lock); 5179 mutex_exit(hash_lock); 5180 arc_hdr_destroy(exists); 5181 exists = buf_hash_insert(hdr, &hash_lock); 5182 ASSERT3P(exists, ==, NULL); 5183 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 5184 /* nopwrite */ 5185 ASSERT(zio->io_prop.zp_nopwrite); 5186 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 5187 panic("bad nopwrite, hdr=%p exists=%p", 5188 (void *)hdr, (void *)exists); 5189 } else { 5190 /* Dedup */ 5191 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 5192 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 5193 ASSERT(BP_GET_DEDUP(zio->io_bp)); 5194 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 5195 } 5196 } 5197 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 5198 /* if it's not anon, we are doing a scrub */ 5199 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 5200 arc_access(hdr, hash_lock); 5201 mutex_exit(hash_lock); 5202 } else { 5203 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 5204 } 5205 5206 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5207 callback->awcb_done(zio, buf, callback->awcb_private); 5208 5209 kmem_free(callback, sizeof (arc_write_callback_t)); 5210} 5211 5212zio_t * 5213arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 5214 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 5215 const zio_prop_t *zp, arc_done_func_t *ready, 5216 arc_done_func_t *children_ready, arc_done_func_t *physdone, 5217 arc_done_func_t *done, void *private, zio_priority_t priority, 5218 int zio_flags, const zbookmark_phys_t *zb) 5219{ 5220 arc_buf_hdr_t *hdr = buf->b_hdr; 5221 arc_write_callback_t *callback; 5222 zio_t *zio; 5223 5224 ASSERT(ready != NULL); 5225 ASSERT(done != NULL); 5226 ASSERT(!HDR_IO_ERROR(hdr)); 5227 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5228 ASSERT(hdr->b_l1hdr.b_acb == NULL); 5229 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 5230 if (l2arc) 5231 hdr->b_flags |= ARC_FLAG_L2CACHE; 5232 if (l2arc_compress) 
5233 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 5234 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 5235 callback->awcb_ready = ready; 5236 callback->awcb_children_ready = children_ready; 5237 callback->awcb_physdone = physdone; 5238 callback->awcb_done = done; 5239 callback->awcb_private = private; 5240 callback->awcb_buf = buf; 5241 5242 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 5243 arc_write_ready, 5244 (children_ready != NULL) ? arc_write_children_ready : NULL, 5245 arc_write_physdone, arc_write_done, callback, 5246 priority, zio_flags, zb); 5247 5248 return (zio); 5249} 5250 5251static int 5252arc_memory_throttle(uint64_t reserve, uint64_t txg) 5253{ 5254#ifdef _KERNEL 5255 uint64_t available_memory = ptob(freemem); 5256 static uint64_t page_load = 0; 5257 static uint64_t last_txg = 0; 5258 5259#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 5260 available_memory = 5261 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 5262#endif 5263 5264 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 5265 return (0); 5266 5267 if (txg > last_txg) { 5268 last_txg = txg; 5269 page_load = 0; 5270 } 5271 /* 5272 * If we are in pageout, we know that memory is already tight, 5273 * the arc is already going to be evicting, so we just want to 5274 * continue to let page writes occur as quickly as possible. 5275 */ 5276 if (curproc == pageproc) { 5277 if (page_load > MAX(ptob(minfree), available_memory) / 4) 5278 return (SET_ERROR(ERESTART)); 5279 /* Note: reserve is inflated, so we deflate */ 5280 page_load += reserve / 8; 5281 return (0); 5282 } else if (page_load > 0 && arc_reclaim_needed()) { 5283 /* memory is low, delay before restarting */ 5284 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 5285 return (SET_ERROR(EAGAIN)); 5286 } 5287 page_load = 0; 5288#endif 5289 return (0); 5290} 5291 5292void 5293arc_tempreserve_clear(uint64_t reserve) 5294{ 5295 atomic_add_64(&arc_tempreserve, -reserve); 5296 ASSERT((int64_t)arc_tempreserve >= 0); 5297} 5298 5299int 5300arc_tempreserve_space(uint64_t reserve, uint64_t txg) 5301{ 5302 int error; 5303 uint64_t anon_size; 5304 5305 if (reserve > arc_c/4 && !arc_no_grow) { 5306 arc_c = MIN(arc_c_max, reserve * 4); 5307 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 5308 } 5309 if (reserve > arc_c) 5310 return (SET_ERROR(ENOMEM)); 5311 5312 /* 5313 * Don't count loaned bufs as in flight dirty data to prevent long 5314 * network delays from blocking transactions that are ready to be 5315 * assigned to a txg. 5316 */ 5317 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 5318 arc_loaned_bytes), 0); 5319 5320 /* 5321 * Writes will, almost always, require additional memory allocations 5322 * in order to compress/encrypt/etc the data. We therefore need to 5323 * make sure that there is sufficient available memory for this. 5324 */ 5325 error = arc_memory_throttle(reserve, txg); 5326 if (error != 0) 5327 return (error); 5328 5329 /* 5330 * Throttle writes when the amount of dirty data in the cache 5331 * gets too large. We try to keep the cache less than half full 5332 * of dirty blocks so that our sync times don't grow too large. 5333 * Note: if two requests come in concurrently, we might let them 5334 * both succeed, when one of them should fail. Not a huge deal. 
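 *
 * Worked example with hypothetical sizes: for arc_c = 8GB, the
 * check below returns ERESTART only once in-flight dirty data
 * exceeds 4GB (arc_c / 2) and more than 2GB (arc_c / 4) of it is
 * anonymous.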

int
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
	int error;
	uint64_t anon_size;

	if (reserve > arc_c/4 && !arc_no_grow) {
		arc_c = MIN(arc_c_max, reserve * 4);
		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
	}
	if (reserve > arc_c)
		return (SET_ERROR(ENOMEM));

	/*
	 * Don't count loaned bufs as in flight dirty data to prevent long
	 * network delays from blocking transactions that are ready to be
	 * assigned to a txg.
	 */
	anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
	    arc_loaned_bytes), 0);

	/*
	 * Writes will, almost always, require additional memory allocations
	 * in order to compress/encrypt/etc the data.  We therefore need to
	 * make sure that there is sufficient available memory for this.
	 */
	error = arc_memory_throttle(reserve, txg);
	if (error != 0)
		return (error);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large.  We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail.  Not a huge deal.
	 */

	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
		    arc_tempreserve>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
		    reserve>>10, arc_c>>10);
		return (SET_ERROR(ERESTART));
	}
	atomic_add_64(&arc_tempreserve, reserve);
	return (0);
}
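
/*
 * Worked example of the dirty-data throttle above (a sketch; the helper
 * is hypothetical and the numbers are illustrative).  With arc_c = 1 GB
 * the limits are arc_c/2 = 512 MB and arc_c/4 = 256 MB: a reserve is
 * refused with ERESTART only when both conditions hold, i.e. the
 * proposed total (reserve + arc_tempreserve + anon_size) would exceed
 * 512 MB *and* anonymous (dirty) data alone already exceeds 256 MB.
 */
#ifdef ZFS_ARC_EXAMPLES
static boolean_t
example_would_throttle(uint64_t reserve, uint64_t tempreserve,
    uint64_t anon_size, uint64_t c)
{
	/* Mirrors the check in arc_tempreserve_space(). */
	return (reserve + tempreserve + anon_size > c / 2 &&
	    anon_size > c / 4);
}
#endif	/* ZFS_ARC_EXAMPLES */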

static void
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
{
	size->value.ui64 = refcount_count(&state->arcs_size);
	evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
	evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
}

static int
arc_kstat_update(kstat_t *ksp, int rw)
{
	arc_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE) {
		return (EACCES);
	} else {
		arc_kstat_update_state(arc_anon,
		    &as->arcstat_anon_size,
		    &as->arcstat_anon_evictable_data,
		    &as->arcstat_anon_evictable_metadata);
		arc_kstat_update_state(arc_mru,
		    &as->arcstat_mru_size,
		    &as->arcstat_mru_evictable_data,
		    &as->arcstat_mru_evictable_metadata);
		arc_kstat_update_state(arc_mru_ghost,
		    &as->arcstat_mru_ghost_size,
		    &as->arcstat_mru_ghost_evictable_data,
		    &as->arcstat_mru_ghost_evictable_metadata);
		arc_kstat_update_state(arc_mfu,
		    &as->arcstat_mfu_size,
		    &as->arcstat_mfu_evictable_data,
		    &as->arcstat_mfu_evictable_metadata);
		arc_kstat_update_state(arc_mfu_ghost,
		    &as->arcstat_mfu_ghost_size,
		    &as->arcstat_mfu_ghost_evictable_data,
		    &as->arcstat_mfu_ghost_evictable_metadata);
	}

	return (0);
}

/*
 * This function *must* return indices evenly distributed between all
 * sublists of the multilist. This is needed due to how the ARC eviction
 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
 * distributed between all sublists and uses this assumption when
 * deciding which sublist to evict from and how much to evict from it.
 */
unsigned int
arc_state_multilist_index_func(multilist_t *ml, void *obj)
{
	arc_buf_hdr_t *hdr = obj;

	/*
	 * We rely on b_dva to generate evenly distributed index
	 * numbers using buf_hash below. So, as an added precaution,
	 * let's make sure we never add empty buffers to the arc lists.
	 */
	ASSERT(!BUF_EMPTY(hdr));

	/*
	 * The assumption here is that the hash value for a given
	 * arc_buf_hdr_t will remain constant throughout its lifetime
	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
	 * Thus, we don't need to store the header's sublist index
	 * on insertion, as this index can be recalculated on removal.
	 *
	 * Also, the low order bits of the hash value are thought to be
	 * distributed evenly. Otherwise, in the case that the multilist
	 * has a power of two number of sublists, each sublist's usage
	 * would not be evenly distributed.
	 */
	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
	    multilist_get_num_sublists(ml));
}
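
/*
 * Illustrative sketch of the property relied on above: because the
 * sublist index is a pure function of immutable header fields,
 * recomputing it at removal time always lands on the sublist that was
 * used at insertion time.  example_same_sublist() is hypothetical and
 * not used by the ARC.
 */
#ifdef ZFS_ARC_EXAMPLES
static boolean_t
example_same_sublist(multilist_t *ml, arc_buf_hdr_t *hdr)
{
	unsigned int at_insert, at_remove;

	at_insert = arc_state_multilist_index_func(ml, hdr);
	/* ... b_spa, b_dva and b_birth are never modified in between ... */
	at_remove = arc_state_multilist_index_func(ml, hdr);

	return (at_insert == at_remove);	/* always B_TRUE */
}
#endif	/* ZFS_ARC_EXAMPLES */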

#ifdef _KERNEL
static eventhandler_tag arc_event_lowmem = NULL;

static void
arc_lowmem(void *arg __unused, int howto __unused)
{

	mutex_enter(&arc_reclaim_lock);
	/* XXX: Memory deficit should be passed as argument. */
	needfree = btoc(arc_c >> arc_shrink_shift);
	DTRACE_PROBE(arc__needfree);
	cv_signal(&arc_reclaim_thread_cv);

	/*
	 * It is unsafe to block here in arbitrary threads, because we can come
	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
	 * with ARC reclaim thread.
	 */
	if (curproc == pageproc)
		(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
	mutex_exit(&arc_reclaim_lock);
}
#endif

void
arc_init(void)
{
	int i, prefetch_tunable_set = 0;

	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	arc_min_prefetch_lifespan = 1 * hz;

	/* Start out with 1/8 of all memory */
	arc_c = kmem_size() / 8;

#ifdef illumos
#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
	 */
	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif
#endif	/* illumos */
	/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
	arc_c_min = MAX(arc_c / 4, arc_abs_min);
	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
	if (arc_c * 8 >= 1 << 30)
		arc_c_max = (arc_c * 8) - (1 << 30);
	else
		arc_c_max = arc_c_min;
	arc_c_max = MAX(arc_c * 5, arc_c_max);

	/*
	 * In userland, there's only the memory pressure that we artificially
	 * create (see arc_available_memory()).  Don't let arc_c get too
	 * small, because it can cause transactions to be larger than
	 * arc_c, causing arc_tempreserve_space() to fail.
	 */
#ifndef _KERNEL
	arc_c_min = arc_c_max / 2;
#endif

#ifdef _KERNEL
	/*
	 * Allow the tunables to override our calculations if they are
	 * reasonable.
	 */
	if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size())
		arc_c_max = zfs_arc_max;
	if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
		arc_c_min = zfs_arc_min;
#endif

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	/* limit meta-data to 1/4 of the arc capacity */
	arc_meta_limit = arc_c_max / 4;

	/* Allow the tunable to override if it is reasonable */
	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
		arc_meta_limit = zfs_arc_meta_limit;

	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
		arc_c_min = arc_meta_limit / 2;

	if (zfs_arc_meta_min > 0) {
		arc_meta_min = zfs_arc_meta_min;
	} else {
		arc_meta_min = arc_c_min / 2;
	}

	if (zfs_arc_grow_retry > 0)
		arc_grow_retry = zfs_arc_grow_retry;

	if (zfs_arc_shrink_shift > 0)
		arc_shrink_shift = zfs_arc_shrink_shift;

	/*
	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
	 */
	if (arc_no_grow_shift >= arc_shrink_shift)
		arc_no_grow_shift = arc_shrink_shift - 1;

	if (zfs_arc_p_min_shift > 0)
		arc_p_min_shift = zfs_arc_p_min_shift;

	if (zfs_arc_num_sublists_per_state < 1)
		zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1);

	/* if kmem_flags are set, lets try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;
	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;
	zfs_arc_max = arc_c_max;

	arc_anon = &ARC_anon;
	arc_mru = &ARC_mru;
	arc_mru_ghost = &ARC_mru_ghost;
	arc_mfu = &ARC_mfu;
	arc_mfu_ghost = &ARC_mfu_ghost;
	arc_l2c_only = &ARC_l2c_only;
	arc_size = 0;

	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);

	refcount_create(&arc_anon->arcs_size);
	refcount_create(&arc_mru->arcs_size);
	refcount_create(&arc_mru_ghost->arcs_size);
	refcount_create(&arc_mfu->arcs_size);
	refcount_create(&arc_mfu_ghost->arcs_size);
	refcount_create(&arc_l2c_only->arcs_size);

	buf_init();

	arc_reclaim_thread_exit = FALSE;
	arc_user_evicts_thread_exit = FALSE;
	arc_dnlc_evicts_thread_exit = FALSE;
	arc_eviction_list = NULL;
	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));

	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (arc_ksp != NULL) {
		arc_ksp->ks_data = &arc_stats;
		arc_ksp->ks_update = arc_kstat_update;
		kstat_install(arc_ksp);
	}

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

#ifdef _KERNEL
	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
	    EVENTHANDLER_PRI_FIRST);
#endif

	(void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

	(void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

	arc_dead = FALSE;
	arc_warm = B_FALSE;

	/*
	 * Calculate maximum amount of dirty data per pool.
	 *
	 * If it has been set by /etc/system, take that.
	 * Otherwise, use a percentage of physical memory defined by
	 * zfs_dirty_data_max_percent (default 10%) with a cap at
	 * zfs_dirty_data_max_max (default 4GB).
	 */
	if (zfs_dirty_data_max == 0) {
		zfs_dirty_data_max = ptob(physmem) *
		    zfs_dirty_data_max_percent / 100;
		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
		    zfs_dirty_data_max_max);
	}

#ifdef _KERNEL
	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
		prefetch_tunable_set = 1;

#ifdef __i386__
	if (prefetch_tunable_set == 0) {
		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
		    "-- to enable,\n");
		printf("            add \"vfs.zfs.prefetch_disable=0\" "
		    "to /boot/loader.conf.\n");
		zfs_prefetch_disable = 1;
	}
#else
	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
	    prefetch_tunable_set == 0) {
		printf("ZFS NOTICE: Prefetch is disabled by default if less "
		    "than 4GB of RAM is present;\n"
		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
		    "to /boot/loader.conf.\n");
		zfs_prefetch_disable = 1;
	}
#endif
	/* Warn about ZFS memory and address space requirements. */
	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
		    "expect unstable behavior.\n");
	}
	if (kmem_size() < 512 * (1 << 20)) {
		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
		    "expect unstable behavior.\n");
		printf("             Consider tuning vm.kmem_size and "
		    "vm.kmem_size_max\n");
		printf("             in /boot/loader.conf.\n");
	}
#endif
}
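
/*
 * Worked example of the default sizing above (a sketch; the constants
 * mirror arc_init() but example_default_arc_c_max() itself is
 * hypothetical).  With kmem_size() == 16 GB: arc_c starts at 2 GB,
 * arc_c_min at MAX(512 MB, arc_abs_min), and arc_c_max at
 * MAX(10 GB, 16 GB - 1 GB) = 15 GB, before any zfs_arc_* tunables
 * are applied.
 */
#ifdef ZFS_ARC_EXAMPLES
static uint64_t
example_default_arc_c_max(uint64_t mem)
{
	uint64_t c = mem / 8;		/* initial arc_c */
	uint64_t c_max;

	if (c * 8 >= 1 << 30)		/* all but 1GB ... */
		c_max = (c * 8) - (1 << 30);
	else
		c_max = MAX(c / 4, arc_abs_min);	/* ... or the min */
	return (MAX(c * 5, c_max));	/* but at least 5/8 of memory */
}
#endif	/* ZFS_ARC_EXAMPLES */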

void
arc_fini(void)
{
	mutex_enter(&arc_reclaim_lock);
	arc_reclaim_thread_exit = TRUE;
	/*
	 * The reclaim thread will set arc_reclaim_thread_exit back to
	 * FALSE when it is finished exiting; we're waiting for that.
	 */
	while (arc_reclaim_thread_exit) {
		cv_signal(&arc_reclaim_thread_cv);
		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
	}
	mutex_exit(&arc_reclaim_lock);

	mutex_enter(&arc_user_evicts_lock);
	arc_user_evicts_thread_exit = TRUE;
	/*
	 * The user evicts thread will set arc_user_evicts_thread_exit
	 * to FALSE when it is finished exiting; we're waiting for that.
	 */
	while (arc_user_evicts_thread_exit) {
		cv_signal(&arc_user_evicts_cv);
		cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
	}
	mutex_exit(&arc_user_evicts_lock);

	mutex_enter(&arc_dnlc_evicts_lock);
	arc_dnlc_evicts_thread_exit = TRUE;
	/*
	 * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
	 * to FALSE when it is finished exiting; we're waiting for that.
	 */
	while (arc_dnlc_evicts_thread_exit) {
		cv_signal(&arc_dnlc_evicts_cv);
		cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
	}
	mutex_exit(&arc_dnlc_evicts_lock);

	/* Use TRUE to ensure *all* buffers are evicted */
	arc_flush(NULL, TRUE);

	arc_dead = TRUE;

	if (arc_ksp != NULL) {
		kstat_delete(arc_ksp);
		arc_ksp = NULL;
	}

	mutex_destroy(&arc_reclaim_lock);
	cv_destroy(&arc_reclaim_thread_cv);
	cv_destroy(&arc_reclaim_waiters_cv);

	mutex_destroy(&arc_user_evicts_lock);
	cv_destroy(&arc_user_evicts_cv);

	mutex_destroy(&arc_dnlc_evicts_lock);
	cv_destroy(&arc_dnlc_evicts_cv);

	refcount_destroy(&arc_anon->arcs_size);
	refcount_destroy(&arc_mru->arcs_size);
	refcount_destroy(&arc_mru_ghost->arcs_size);
	refcount_destroy(&arc_mfu->arcs_size);
	refcount_destroy(&arc_mfu_ghost->arcs_size);
	refcount_destroy(&arc_l2c_only->arcs_size);

	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);

	buf_fini();

	ASSERT0(arc_loaned_bytes);

#ifdef _KERNEL
	if (arc_event_lowmem != NULL)
		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif
}

/*
 * Level 2 ARC
 *
 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
 * It uses dedicated storage devices to hold cached data, which are populated
 * using large infrequent writes.  The main role of this cache is to boost
 * the performance of random read workloads.  The intended L2ARC devices
 * include short-stroked disks, solid state disks, and other media with
 * substantially faster read latency than disk.
 *
 *                 +-----------------------+
 *                 |         ARC           |
 *                 +-----------------------+
 *                    |         ^     ^
 *                    |         |     |
 *      l2arc_feed_thread()    arc_read()
 *                    |         |     |
 *                    |  l2arc read   |
 *                    V         |     |
 *               +---------------+    |
 *               |     L2ARC     |    |
 *               +---------------+    |
 *                   |    ^           |
 *          l2arc_write() |           |
 *                   |    |           |
 *                   V    |           |
 *                 +-------+      +-------+
 *                 | vdev  |      | vdev  |
 *                 | cache |      | cache |
 *                 +-------+      +-------+
 *                 +=========+     .-----.
 *                 :  L2ARC  :    |-_____-|
 *                 : devices :    | Disks |
 *                 +=========+    `-_____-'
 *
 * Read requests are satisfied from the following sources, in order:
 *
 *	1) ARC
 *	2) vdev cache of L2ARC devices
 *	3) L2ARC devices
 *	4) vdev cache of disks
 *	5) disks
 *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this there are some significant differences between
 * the L2ARC and traditional cache design:
 *
 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
 * the ARC behave as usual, freeing buffers and placing headers on ghost
 * lists.  The ARC does not send buffers to the L2ARC during eviction as
 * this would add inflated write latencies for all ARC memory pressure.
 *
 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
 * It does this by periodically scanning buffers from the eviction-end of
 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
 * not already there.  It scans until a headroom of buffers is satisfied,
 * which itself is a buffer for ARC eviction.  If a compressible buffer is
 * found during scanning and selected for writing to an L2ARC device, we
 * temporarily boost scanning headroom during the next scan cycle to make
 * sure we adapt to compression effects (which might significantly reduce
 * the data volume we write to L2ARC).  The thread that does this is
 * l2arc_feed_thread(), illustrated below; example sizes are included to
 * provide a better sense of ratio than this diagram:
 *
 *	       head -->                        tail
 *	        +---------------------+----------+
 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
 *	        +---------------------+----------+   |   o L2ARC eligible
 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
 *	        +---------------------+----------+   |
 *	             15.9 Gbytes      ^ 32 Mbytes    |
 *	                           headroom          |
 *	                                     l2arc_feed_thread()
 *	                                             |
 *	                 l2arc write hand <--[oooo]--'
 *	                         |           8 Mbyte
 *	                         |          write max
 *	                         V
 *		  +==============================+
 *	L2ARC dev |####|#|###|###|    |####| ... |
 *	          +==============================+
 *	                   32 Gbytes
 *
 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
 * evicted, then the L2ARC has cached a buffer much sooner than it probably
 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
 * safe to say that this is an uncommon case, since buffers at the end of
 * the ARC lists have moved there due to inactivity.
 *
 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
 * then the L2ARC simply misses copying some buffers.  This serves as a
 * pressure valve to prevent heavy read workloads from both stalling the ARC
 * with waits and clogging the L2ARC with writes.  This also helps prevent
 * the potential for the L2ARC to churn if it attempts to cache content too
 * quickly, such as during backups of the entire pool.
 *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from tail of these
 * lists as pictured, the l2arc_feed_thread() will search from the list heads
 * for eligible buffers, greatly increasing its chance of finding them.
 *
 * The L2ARC device write speed is also boosted during this time so that
 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
 * there are no L2ARC reads, and no fear of degrading read performance
 * through increased writes.
 *
 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
 * the vdev queue can aggregate them into larger and fewer writes.  Each
 * device is written to in a rotor fashion, sweeping writes through
 * available space then repeating.
 *
 * 7. The L2ARC does not store dirty content.  It never needs to flush
 * write buffers back to disk based storage.
 *
 * 8. If an ARC buffer is written (and dirtied) which also exists in the
 * L2ARC, the now stale L2ARC buffer is immediately dropped.
 *
 * The performance of the L2ARC can be tweaked by a number of tunables, which
 * may be necessary for different workloads:
 *
 *	l2arc_write_max		max write bytes per interval
 *	l2arc_write_boost	extra write bytes during device warmup
 *	l2arc_noprefetch	skip caching prefetched buffers
 *	l2arc_headroom		number of max device writes to precache
 *	l2arc_headroom_boost	when we find compressed buffers during ARC
 *				scanning, we multiply headroom by this
 *				percentage factor for the next scan cycle,
 *				since more compressed buffers are likely to
 *				be present
 *	l2arc_feed_secs		seconds between L2ARC writing
 *
 * Tunables may be removed or added as future performance improvements are
 * integrated, and also may become zpool properties.
 *
 * There are three key functions that control how the L2ARC warms up:
 *
 *	l2arc_write_eligible()	check if a buffer is eligible to cache
 *	l2arc_write_size()	calculate how much to write
 *	l2arc_write_interval()	calculate sleep delay between writes
 *
 * These three functions determine what to write, how much, and how quickly
 * to send writes.
 */

static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
{
	/*
	 * A buffer is *not* eligible for the L2ARC if it:
	 * 1. belongs to a different spa.
	 * 2. is already cached on the L2ARC.
	 * 3. has an I/O in progress (it may be an incomplete read).
	 * 4. is flagged not eligible (zfs property).
	 */
	if (hdr->b_spa != spa_guid) {
		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
		return (B_FALSE);
	}
	if (HDR_HAS_L2HDR(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
		return (B_FALSE);
	}
	if (HDR_IO_IN_PROGRESS(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
		return (B_FALSE);
	}
	if (!HDR_L2CACHE(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static uint64_t
l2arc_write_size(void)
{
	uint64_t size;

	/*
	 * Make sure our globals have meaningful values in case the user
	 * altered them.
	 */
	size = l2arc_write_max;
	if (size == 0) {
		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
		    "be greater than zero, resetting it to the default (%d)",
		    L2ARC_WRITE_SIZE);
		size = l2arc_write_max = L2ARC_WRITE_SIZE;
	}

	if (arc_warm == B_FALSE)
		size += l2arc_write_boost;

	return (size);
}

static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
	clock_t interval, next, now;

	/*
	 * If the ARC lists are busy, increase our write rate; if the
	 * lists are stale, idle back.  This is achieved by checking
	 * how much we previously wrote - if it was more than half of
	 * what we wanted, schedule the next write much sooner.
	 */
	if (l2arc_feed_again && wrote > (wanted / 2))
		interval = (hz * l2arc_feed_min_ms) / 1000;
	else
		interval = hz * l2arc_feed_secs;

	now = ddi_get_lbolt();
	next = MAX(now, MIN(now + interval, began + interval));

	return (next);
}
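
/*
 * Worked example of l2arc_write_interval() (a sketch; the helper is
 * hypothetical and the numbers are illustrative).  With
 * l2arc_feed_secs = 1, l2arc_feed_min_ms = 200 and hz = 1000: if the
 * previous pass wanted 8 MB and wrote 6 MB (more than half), the next
 * wakeup is scheduled 200 ticks after the pass began; if it wrote only
 * 1 MB, the full 1000-tick interval is used instead.
 */
#ifdef ZFS_ARC_EXAMPLES
static clock_t
example_write_interval(void)
{
	clock_t began = ddi_get_lbolt();

	/* Busy case: wrote more than half of what was wanted. */
	return (l2arc_write_interval(began, 8 << 20, 6 << 20));
}
#endif	/* ZFS_ARC_EXAMPLES */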

/*
 * Cycle through L2ARC devices.  This is how L2ARC load balances.
 * If a device is returned, this also returns holding the spa config lock.
 */
static l2arc_dev_t *
l2arc_dev_get_next(void)
{
	l2arc_dev_t *first, *next = NULL;

	/*
	 * Lock out the removal of spas (spa_namespace_lock), then removal
	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
	 * both locks will be dropped and a spa config lock held instead.
	 */
	mutex_enter(&spa_namespace_lock);
	mutex_enter(&l2arc_dev_mtx);

	/* if there are no vdevs, there is nothing to do */
	if (l2arc_ndev == 0)
		goto out;

	first = NULL;
	next = l2arc_dev_last;
	do {
		/* loop around the list looking for a non-faulted vdev */
		if (next == NULL) {
			next = list_head(l2arc_dev_list);
		} else {
			next = list_next(l2arc_dev_list, next);
			if (next == NULL)
				next = list_head(l2arc_dev_list);
		}

		/* if we have come back to the start, bail out */
		if (first == NULL)
			first = next;
		else if (next == first)
			break;

	} while (vdev_is_dead(next->l2ad_vdev));

	/* if we were unable to find any usable vdevs, return NULL */
	if (vdev_is_dead(next->l2ad_vdev))
		next = NULL;

	l2arc_dev_last = next;

out:
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Grab the config lock to prevent the 'next' device from being
	 * removed while we are writing to it.
	 */
	if (next != NULL)
		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
	mutex_exit(&spa_namespace_lock);

	return (next);
}
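
/*
 * Illustrative sketch of the rotor above, reduced to an index into a
 * plain array (hypothetical; the real code walks a linked list under
 * locks).  Selection resumes after the previously used slot, wraps at
 * the end, skips dead entries, and gives up after one full lap.
 */
#ifdef ZFS_ARC_EXAMPLES
static int
example_rotor_next(boolean_t *dead, int ndev, int last)
{
	int i, idx;

	for (i = 1; i <= ndev; i++) {
		idx = (last + i) % ndev;	/* wrap around the list */
		if (!dead[idx])
			return (idx);
	}
	return (-1);	/* one full lap, all devices faulted */
}
#endif	/* ZFS_ARC_EXAMPLES */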

/*
 * Free buffers that were tagged for destruction.
 */
static void
l2arc_do_free_on_write()
{
	list_t *buflist;
	l2arc_data_free_t *df, *df_prev;

	mutex_enter(&l2arc_free_on_write_mtx);
	buflist = l2arc_free_on_write;

	for (df = list_tail(buflist); df; df = df_prev) {
		df_prev = list_prev(buflist, df);
		ASSERT(df->l2df_data != NULL);
		ASSERT(df->l2df_func != NULL);
		df->l2df_func(df->l2df_data, df->l2df_size);
		list_remove(buflist, df);
		kmem_free(df, sizeof (l2arc_data_free_t));
	}

	mutex_exit(&l2arc_free_on_write_mtx);
}

/*
 * A write to a cache device has completed.  Update all headers to allow
 * reads from these buffers to begin.
 */
static void
l2arc_write_done(zio_t *zio)
{
	l2arc_write_callback_t *cb;
	l2arc_dev_t *dev;
	list_t *buflist;
	arc_buf_hdr_t *head, *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	int64_t bytes_dropped = 0;

	cb = zio->io_private;
	ASSERT(cb != NULL);
	dev = cb->l2wcb_dev;
	ASSERT(dev != NULL);
	head = cb->l2wcb_head;
	ASSERT(head != NULL);
	buflist = &dev->l2ad_buflist;
	ASSERT(buflist != NULL);
	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
	    l2arc_write_callback_t *, cb);

	if (zio->io_error != 0)
		ARCSTAT_BUMP(arcstat_l2_writes_error);

	/*
	 * All writes completed, or an error was hit.
	 */
top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  We must retry so we
			 * don't leave the ARC_FLAG_L2_WRITING bit set.
			 */
			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);

			/*
			 * We don't want to rescan the headers we've
			 * already marked as having been written out, so
			 * we reinsert the head node so we can pick up
			 * where we left off.
			 */
			list_remove(buflist, head);
			list_insert_after(buflist, hdr, head);

			mutex_exit(&dev->l2ad_mtx);

			/*
			 * We wait for the hash lock to become available
			 * to try and prevent busy waiting, and increase
			 * the chance we'll be able to acquire the lock
			 * the next time around.
			 */
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		/*
		 * We could not have been moved into the arc_l2c_only
		 * state while in-flight due to our ARC_FLAG_L2_WRITING
		 * bit being set. Let's just ensure that's being enforced.
		 */
		ASSERT(HDR_HAS_L1HDR(hdr));

		/*
		 * We may have allocated a buffer for L2ARC compression,
		 * we must release it to avoid leaking this data.
		 */
		l2arc_release_cdata_buf(hdr);

		if (zio->io_error != 0) {
			/*
			 * Error - drop L2ARC entry.
			 */
			list_remove(buflist, hdr);
			l2arc_trim(hdr);
			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;

			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);

			bytes_dropped += hdr->b_l2hdr.b_asize;
			(void) refcount_remove_many(&dev->l2ad_alloc,
			    hdr->b_l2hdr.b_asize, hdr);
		}

		/*
		 * Allow ARC to begin reads and ghost list evictions to
		 * this L2ARC entry.
		 */
		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;

		mutex_exit(hash_lock);
	}

	atomic_inc_64(&l2arc_writes_done);
	list_remove(buflist, head);
	ASSERT(!HDR_HAS_L1HDR(head));
	kmem_cache_free(hdr_l2only_cache, head);
	mutex_exit(&dev->l2ad_mtx);

	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);

	l2arc_do_free_on_write();

	kmem_free(cb, sizeof (l2arc_write_callback_t));
}

/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 */
static void
l2arc_read_done(zio_t *zio)
{
	l2arc_read_callback_t *cb;
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	int equal;

	ASSERT(zio->io_vd != NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	cb = zio->io_private;
	ASSERT(cb != NULL);
	buf = cb->l2rcb_buf;
	ASSERT(buf != NULL);

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the data was read into a temporary buffer,
	 * move it and free the buffer.
	 */
	if (cb->l2rcb_data != NULL) {
		ASSERT3U(hdr->b_size, <, zio->io_size);
		ASSERT3U(cb->l2rcb_compress, ==, ZIO_COMPRESS_OFF);
		if (zio->io_error == 0)
			bcopy(cb->l2rcb_data, buf->b_data, hdr->b_size);

		/*
		 * The following must be done regardless of whether
		 * there was an error:
		 * - free the temporary buffer
		 * - point zio to the real ARC buffer
		 * - set zio size accordingly
		 * These are required because zio is either re-used for
		 * an I/O of the block in the case of the error
		 * or the zio is passed to arc_read_done() and it
		 * needs real data.
		 */
		zio_data_buf_free(cb->l2rcb_data, zio->io_size);
		zio->io_size = zio->io_orig_size = hdr->b_size;
		zio->io_data = zio->io_orig_data = buf->b_data;
	}

	/*
	 * If the buffer was compressed, decompress it first.
	 */
	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
	ASSERT(zio->io_data != NULL);
	ASSERT3U(zio->io_size, ==, hdr->b_size);
	ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size);

	/*
	 * Check this survived the L2ARC journey.
	 */
	equal = arc_cksum_equal(buf);
	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
		mutex_exit(hash_lock);
		zio->io_private = buf;
		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
		arc_read_done(zio);
	} else {
		mutex_exit(hash_lock);
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			zio->io_error = SET_ERROR(EIO);
		}
		if (!equal)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
			    buf->b_data, hdr->b_size, arc_read_done, buf,
			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}

/*
 * This is the list priority from which the L2ARC will search for pages to
 * cache.  This is used within loops (0..3) to cycle through lists in the
 * desired order.  This order can have a significant effect on cache
 * performance.
 *
 * Currently the metadata lists are hit first, MFU then MRU, followed by
 * the data lists.  This function returns a locked list, and also returns
 * the lock pointer.
 */
static multilist_sublist_t *
l2arc_sublist_lock(int list_num)
{
	multilist_t *ml = NULL;
	unsigned int idx;

	ASSERT(list_num >= 0 && list_num <= 3);

	switch (list_num) {
	case 0:
		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
		break;
	case 1:
		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
		break;
	case 2:
		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
		break;
	case 3:
		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
		break;
	}

	/*
	 * Return a randomly-selected sublist.  This is acceptable
	 * because the caller feeds only a little bit of data for each
	 * call (8MB).  Subsequent calls will result in different
	 * sublists being selected.
	 */
	idx = multilist_get_random_index(ml);
	return (multilist_sublist_lock(ml, idx));
}

/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	arc_buf_hdr_t *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;

	buflist = &dev->l2ad_buflist;

	if (!all && dev->l2ad_first) {
		/*
		 * This is the first sweep through the device.  There is
		 * nothing to evict.
		 */
		return;
	}

	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
		/*
		 * When nearing the end of the device, evict to the end
		 * before the device write hand jumps to the start.
		 */
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  Retry.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		if (HDR_L2_WRITE_HEAD(hdr)) {
			/*
			 * We hit a write head node.  Leave it for
			 * l2arc_write_done().
			 */
			list_remove(buflist, hdr);
			mutex_exit(hash_lock);
			continue;
		}

		if (!all && HDR_HAS_L2HDR(hdr) &&
		    (hdr->b_l2hdr.b_daddr > taddr ||
		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		ASSERT(HDR_HAS_L2HDR(hdr));
		if (!HDR_HAS_L1HDR(hdr)) {
			ASSERT(!HDR_L2_READING(hdr));
			/*
			 * This doesn't exist in the ARC.  Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_size.
			 */
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		} else {
			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(hdr)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
			}

			/* Ensure this header has finished being written */
			ASSERT(!HDR_L2_WRITING(hdr));
			ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);

			arc_hdr_l2hdr_destroy(hdr);
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&dev->l2ad_mtx);
}
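
/*
 * Worked example of the eviction target above (a sketch; the helper is
 * hypothetical).  On a device with l2ad_end = 32 GB and an 8 MB write
 * about to happen: while the hand is in the middle of the device the
 * target is simply hand + 8 MB, but once the hand is within 16 MB
 * (2 * distance) of the end, we evict all the way to l2ad_end so the
 * hand can wrap cleanly back to l2ad_start.
 */
#ifdef ZFS_ARC_EXAMPLES
static uint64_t
example_evict_target(l2arc_dev_t *dev, uint64_t distance)
{
	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance)))
		return (dev->l2ad_end);		/* evict to device end */
	return (dev->l2ad_hand + distance);	/* normal case */
}
#endif	/* ZFS_ARC_EXAMPLES */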

/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 * The headroom_boost is an in-out parameter used to maintain headroom boost
 * state between calls to this function.
 *
 * Returns the number of bytes actually written (which may be smaller than
 * the delta by which the device hand has changed due to alignment).
 */
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
    boolean_t *headroom_boost)
{
	arc_buf_hdr_t *hdr, *hdr_prev, *head;
	uint64_t write_asize, write_sz, headroom,
	    buf_compress_minsz;
	void *buf_data;
	boolean_t full;
	l2arc_write_callback_t *cb;
	zio_t *pio, *wzio;
	uint64_t guid = spa_load_guid(spa);
	const boolean_t do_headroom_boost = *headroom_boost;
	int try;

	ASSERT(dev->l2ad_vdev != NULL);

	/* Lower the flag now, we might want to raise it again later. */
	*headroom_boost = B_FALSE;

	pio = NULL;
	write_sz = write_asize = 0;
	full = B_FALSE;
	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
	head->b_flags |= ARC_FLAG_HAS_L2HDR;

	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
	/*
	 * We will want to try to compress buffers that are at least 2x the
	 * device sector size.
	 */
	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;

	/*
	 * Copy buffers for L2ARC writing.
	 */
	for (try = 0; try <= 3; try++) {
		multilist_sublist_t *mls = l2arc_sublist_lock(try);
		uint64_t passed_sz = 0;

		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);

		/*
		 * L2ARC fast warmup.
		 *
		 * Until the ARC is warm and starts to evict, read from the
		 * head of the ARC lists rather than the tail.
		 */
		if (arc_warm == B_FALSE)
			hdr = multilist_sublist_head(mls);
		else
			hdr = multilist_sublist_tail(mls);
		if (hdr == NULL)
			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);

		headroom = target_sz * l2arc_headroom;
		if (do_headroom_boost)
			headroom = (headroom * l2arc_headroom_boost) / 100;

		for (; hdr; hdr = hdr_prev) {
			kmutex_t *hash_lock;
			uint64_t buf_sz;
			uint64_t buf_a_sz;
			size_t align;

			if (arc_warm == B_FALSE)
				hdr_prev = multilist_sublist_next(mls, hdr);
			else
				hdr_prev = multilist_sublist_prev(mls, hdr);
			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);

			hash_lock = HDR_LOCK(hdr);
			if (!mutex_tryenter(hash_lock)) {
				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
				/*
				 * Skip this buffer rather than waiting.
				 */
				continue;
			}

			passed_sz += hdr->b_size;
			if (passed_sz > headroom) {
				/*
				 * Searched too far.
				 */
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
				break;
			}

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
				continue;
			}

			/*
			 * Assume that the buffer is not going to be compressed
			 * and could take more space on disk because of a larger
			 * disk block size.
			 */
			buf_sz = hdr->b_size;
			align = (size_t)1 << dev->l2ad_vdev->vdev_ashift;
			buf_a_sz = P2ROUNDUP(buf_sz, align);

			if ((write_asize + buf_a_sz) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_full);
				break;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				mutex_enter(&dev->l2ad_mtx);
				list_insert_head(&dev->l2ad_buflist, head);
				mutex_exit(&dev->l2ad_mtx);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
				ARCSTAT_BUMP(arcstat_l2_write_pios);
			}

			/*
			 * Create and add a new L2ARC header.
			 */
			hdr->b_l2hdr.b_dev = dev;
			hdr->b_flags |= ARC_FLAG_L2_WRITING;
			/*
			 * Temporarily stash the data buffer in b_tmp_cdata.
			 * The subsequent write step will pick it up from
			 * there.  This is because we can't access
			 * b_l1hdr.b_buf without holding the hash_lock, which
			 * we in turn can't access without holding the ARC
			 * list locks (which we want to avoid during
			 * compression/writing).
			 */
			hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF;
			hdr->b_l2hdr.b_asize = hdr->b_size;
			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;

			/*
			 * Explicitly set the b_daddr field to a known
			 * value which means "invalid address".  This
			 * enables us to differentiate which stage of
			 * l2arc_write_buffers() the particular header
			 * is in (e.g. this loop, or the one below).
			 * ARC_FLAG_L2_WRITING is not enough to make
			 * this distinction, and we need to know in
			 * order to do proper l2arc vdev accounting in
			 * arc_release() and arc_hdr_destroy().
			 *
			 * Note, we can't use a new flag to distinguish
			 * the two stages because we don't hold the
			 * header's hash_lock below, in the second stage
			 * of this function.  Thus, we can't simply
			 * change the b_flags field to denote that the
			 * IO has been sent.  We can change the b_daddr
			 * field of the L2 portion, though, since we'll
			 * be holding the l2ad_mtx; which is why we're
			 * using it to denote the header's state change.
			 */
			hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;

			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;

			mutex_enter(&dev->l2ad_mtx);
			list_insert_head(&dev->l2ad_buflist, hdr);
			mutex_exit(&dev->l2ad_mtx);

			/*
			 * Compute and store the buffer cksum before
			 * writing.  On debug the cksum is verified first.
			 */
			arc_cksum_verify(hdr->b_l1hdr.b_buf);
			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);

			mutex_exit(hash_lock);

			write_sz += buf_sz;
			write_asize += buf_a_sz;
		}

		multilist_sublist_unlock(mls);

		if (full == B_TRUE)
			break;
	}

	/* No buffers selected for writing? */
	if (pio == NULL) {
		ASSERT0(write_sz);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		return (0);
	}

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * Now start writing the buffers. We're starting at the write head
	 * and work backwards, retracing the course of the buffer selector
	 * loop above.
	 */
	write_asize = 0;
	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
		uint64_t buf_sz;
		boolean_t compress;

		/*
		 * We rely on the L1 portion of the header below, so
		 * it's invalid for this header to have been evicted out
		 * of the ghost cache, prior to being written out. The
		 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
		 */
		ASSERT(HDR_HAS_L1HDR(hdr));

		/*
		 * We shouldn't need to lock the buffer here, since we flagged
		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
		 * take care to only access its L2 cache parameters. In
		 * particular, hdr->l1hdr.b_buf may be invalid by now due to
		 * ARC eviction.
		 */
		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;

		/*
		 * Save a pointer to the original buffer data we had previously
		 * stashed away.
		 */
		buf_data = hdr->b_l1hdr.b_tmp_cdata;

		compress = HDR_L2COMPRESS(hdr) &&
		    hdr->b_l2hdr.b_asize >= buf_compress_minsz;
		if (l2arc_transform_buf(hdr, compress)) {
			/*
			 * If compression succeeded, enable headroom
			 * boost on the next scan cycle.
			 */
			*headroom_boost = B_TRUE;
		}

		/*
		 * Get the new buffer size that accounts for compression
		 * and padding.
		 */
		buf_sz = hdr->b_l2hdr.b_asize;

		/*
		 * We need to do this regardless if buf_sz is zero or
		 * not, otherwise, when this l2hdr is evicted we'll
		 * remove a reference that was never added.
		 */
		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);

		/* Compression may have squashed the buffer to zero length. */
		if (buf_sz != 0) {
			/*
			 * If the data was padded or compressed, then it
			 * is in a new buffer.
			 */
			if (hdr->b_l1hdr.b_tmp_cdata != NULL)
				buf_data = hdr->b_l1hdr.b_tmp_cdata;
			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			write_asize += buf_sz;
			dev->l2ad_hand += buf_sz;
		}
	}

	mutex_exit(&dev->l2ad_mtx);

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_asize);
}
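
/*
 * Worked example of the ashift alignment used throughout the write path
 * (a sketch; P2ROUNDUP is the real macro, the helper is hypothetical).
 * With vdev_ashift = 12 the device sector is 4 KB, so a 5 KB buffer
 * reserves P2ROUNDUP(5120, 4096) = 8192 bytes of device space; this is
 * why write_asize (allocated size) can exceed write_sz (data size).
 */
#ifdef ZFS_ARC_EXAMPLES
static uint64_t
example_aligned_size(uint64_t buf_sz, uint64_t ashift)
{
	size_t align = (size_t)1 << ashift;

	return (P2ROUNDUP(buf_sz, align));
}
#endif	/* ZFS_ARC_EXAMPLES */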

/*
 * Transforms, possibly compresses and pads, an L2ARC buffer.
 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
 * size in l2hdr->b_asize. This routine tries to compress the data and
 * depending on the compression result there are four possible outcomes:
 * *) The buffer was incompressible. The buffer size was already ashift
 *    aligned. The original hdr contents were left untouched except for
 *    b_tmp_cdata, which is reset to NULL. The caller must keep a pointer
 *    to the original data.
 * *) The buffer was incompressible. The buffer size was not ashift aligned.
 *    b_tmp_cdata was replaced with a temporary data buffer which holds a
 *    padded (aligned) copy of the data. Once writing is done, invoke
 *    l2arc_release_cdata_buf on this hdr to free the temporary buffer.
 * *) The buffer was all-zeros, so there is no need to write it to an L2
 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
 *    data buffer which holds the compressed data to be written, and b_asize
 *    tells us how much data there is. b_compress is set to the appropriate
 *    compression algorithm. Once writing is done, invoke
 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
 *
 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
 * buffer was incompressible).
 */
static boolean_t
l2arc_transform_buf(arc_buf_hdr_t *hdr, boolean_t compress)
{
	void *cdata;
	size_t align, asize, csize, len, rounded;

	ASSERT(HDR_HAS_L2HDR(hdr));
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF);
	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);

	len = l2hdr->b_asize;
	align = (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift;
	asize = P2ROUNDUP(len, align);
	cdata = zio_data_buf_alloc(asize);
	ASSERT3P(cdata, !=, NULL);
	if (compress)
		csize = zio_compress_data(ZIO_COMPRESS_LZ4,
		    hdr->b_l1hdr.b_tmp_cdata, cdata, len);
	else
		csize = len;

	if (csize == 0) {
		/* zero block, indicate that there's nothing to write */
		zio_data_buf_free(cdata, asize);
		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
		l2hdr->b_asize = 0;
		hdr->b_l1hdr.b_tmp_cdata = NULL;
		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
		return (B_TRUE);
	}

	rounded = P2ROUNDUP(csize, align);
	ASSERT3U(rounded, <=, asize);
	if (rounded < len) {
		/*
		 * Compression succeeded, we'll keep the cdata around for
		 * writing and release it afterwards.
		 */
		if (rounded > csize) {
			bzero((char *)cdata + csize, rounded - csize);
			csize = rounded;
		}
		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
		l2hdr->b_asize = csize;
		hdr->b_l1hdr.b_tmp_cdata = cdata;
		ARCSTAT_BUMP(arcstat_l2_compress_successes);
		return (B_TRUE);
	} else {
		/*
		 * Compression did not save space.
		 */
		if (P2PHASE(len, align) != 0) {
			/*
			 * Use compression buffer for a copy of data padded to
			 * the proper size. Compression algorithm remains set
			 * to ZIO_COMPRESS_OFF.
			 */
			ASSERT3U(len, <, asize);
			bcopy(hdr->b_l1hdr.b_tmp_cdata, cdata, len);
			bzero((char *)cdata + len, asize - len);
			l2hdr->b_asize = asize;
			hdr->b_l1hdr.b_tmp_cdata = cdata;
			ARCSTAT_BUMP(arcstat_l2_padding_needed);
		} else {
			ASSERT3U(len, ==, asize);
			/*
			 * The original buffer is good as is,
			 * release the compressed buffer.
			 * l2hdr will be left unmodified except for b_tmp_cdata.
			 */
			zio_data_buf_free(cdata, asize);
			hdr->b_l1hdr.b_tmp_cdata = NULL;
		}
		if (compress)
			ARCSTAT_BUMP(arcstat_l2_compress_failures);
		return (B_FALSE);
	}
}
6898 */ 6899 zio_data_buf_free(cdata, asize); 6900 hdr->b_l1hdr.b_tmp_cdata = NULL; 6901 } 6902 if (compress) 6903 ARCSTAT_BUMP(arcstat_l2_compress_failures); 6904 return (B_FALSE); 6905 } 6906} 6907 6908/* 6909 * Decompresses a zio read back from an l2arc device. On success, the 6910 * underlying zio's io_data buffer is overwritten by the uncompressed 6911 * version. On decompression error (corrupt compressed stream), the 6912 * zio->io_error value is set to signal an I/O error. 6913 * 6914 * Please note that the compressed data stream is not checksummed, so 6915 * if the underlying device is experiencing data corruption, we may feed 6916 * corrupt data to the decompressor, so the decompressor needs to be 6917 * able to handle this situation (LZ4 does). 6918 */ 6919static void 6920l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 6921{ 6922 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 6923 6924 if (zio->io_error != 0) { 6925 /* 6926 * An io error has occured, just restore the original io 6927 * size in preparation for a main pool read. 6928 */ 6929 zio->io_orig_size = zio->io_size = hdr->b_size; 6930 return; 6931 } 6932 6933 if (c == ZIO_COMPRESS_EMPTY) { 6934 /* 6935 * An empty buffer results in a null zio, which means we 6936 * need to fill its io_data after we're done restoring the 6937 * buffer's contents. 6938 */ 6939 ASSERT(hdr->b_l1hdr.b_buf != NULL); 6940 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); 6941 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; 6942 } else { 6943 ASSERT(zio->io_data != NULL); 6944 /* 6945 * We copy the compressed data from the start of the arc buffer 6946 * (the zio_read will have pulled in only what we need, the 6947 * rest is garbage which we will overwrite at decompression) 6948 * and then decompress back to the ARC data buffer. This way we 6949 * can minimize copying by simply decompressing back over the 6950 * original compressed data (rather than decompressing to an 6951 * aux buffer and then copying back the uncompressed buffer, 6952 * which is likely to be much larger). 6953 */ 6954 uint64_t csize; 6955 void *cdata; 6956 6957 csize = zio->io_size; 6958 cdata = zio_data_buf_alloc(csize); 6959 bcopy(zio->io_data, cdata, csize); 6960 if (zio_decompress_data(c, cdata, zio->io_data, csize, 6961 hdr->b_size) != 0) 6962 zio->io_error = EIO; 6963 zio_data_buf_free(cdata, csize); 6964 } 6965 6966 /* Restore the expected uncompressed IO size. */ 6967 zio->io_orig_size = zio->io_size = hdr->b_size; 6968} 6969 6970/* 6971 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 6972 * This buffer serves as a temporary holder of compressed or padded data while 6973 * the buffer entry is being written to an l2arc device. Once that is 6974 * done, we can dispose of it. 
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
	size_t align, asize, len;
	enum zio_compress comp = hdr->b_l2hdr.b_compress;

	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));

	if (hdr->b_l1hdr.b_tmp_cdata != NULL) {
		ASSERT(comp != ZIO_COMPRESS_EMPTY);
		len = hdr->b_size;
		align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift;
		asize = P2ROUNDUP(len, align);
		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, asize);
		hdr->b_l1hdr.b_tmp_cdata = NULL;
	} else {
		ASSERT(comp == ZIO_COMPRESS_OFF || comp == ZIO_COMPRESS_EMPTY);
	}
}

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();
	boolean_t headroom_boost = B_FALSE;

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.   This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);

/*
 * This thread feeds the L2ARC at regular intervals. This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();
	boolean_t headroom_boost = B_FALSE;

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa. This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal. l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC. By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	vdev_ashift_optimize(vd);

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	refcount_create(&adddev->l2ad_alloc);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}
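
/*
 * A sketch of the write window configured above, using a hypothetical
 * cache device with vdev_get_min_asize(vd) == 10GB:
 *
 *	l2ad_start = VDEV_LABEL_START_SIZE	   skip the front vdev labels
 *	l2ad_end   = l2ad_start + 10GB		   one byte past the window
 *	l2ad_hand  = l2ad_start			   next offset to write
 *
 * The feed thread advances l2ad_hand linearly as buffers are written
 * and wraps it back to l2ad_start upon reaching l2ad_end, overwriting
 * the oldest data; l2ad_first records that the first pass over the
 * device has not yet wrapped.
 */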

/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT(remdev != NULL);

	/*
	 * Remove device from global list
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references. L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	mutex_destroy(&remdev->l2ad_mtx);
	refcount_destroy(&remdev->l2ad_alloc);
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini();
	 * because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}
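
/*
 * For reference, a minimal sketch of how the lifecycle entry points
 * above are meant to nest (the actual call sites live elsewhere in the
 * tree; as noted in l2arc_fini(), teardown runs via spa_fini() ->
 * dmu_fini()):
 *
 *	l2arc_init();		   set up locks, CVs and the global lists
 *	l2arc_start();		   spawn l2arc_feed_thread (FWRITE mode only)
 *		... pools with cache vdevs come and go, calling
 *		l2arc_add_vdev() / l2arc_remove_vdev() ...
 *	l2arc_stop();		   signal the feed thread and wait for exit
 *	l2arc_fini();		   destroy locks and lists; all devices are
 *				   expected to be gone by this point
 */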