/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes; rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()). Note however that the data associated
 * with the buffer may be evicted prior to the callback. The callback
 * must be made with *no locks held* (to prevent deadlock). Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
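/*
 * Illustrative sketch (editorial addition, not part of the original code):
 * the usual way to take a reference through the hash table is roughly
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(spa, bp, &hash_lock);
 *	if (hdr != NULL) {
 *		... reference or examine hdr while the hash lock is held ...
 *		mutex_exit(hash_lock);
 *	}
 *
 * buf_hash_find() is defined later in this file and returns with the
 * appropriate hash mutex held, or sets *lockp to NULL on a miss.
 */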

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <vm/vm_pageout.h>
#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of iterations through arc_evict_*() before we
 * drop & reacquire the lock.
 */
int arc_evict_iterations = 100;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int			arc_no_grow_shift = 5;


/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern int zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024;	/* 8KB */
u_int zfs_arc_free_target = 0;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);

#ifdef _KERNEL
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
    "Maximum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
    "Minimum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}

/*
 * This must be declared here, before the definition of the corresponding
 * kstat macro, because the macro uses the same names and would otherwise
 * confuse the compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif
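/*
 * Editorial note: on FreeBSD the tunables above appear under the vfs.zfs
 * sysctl tree, so they can be inspected or (where marked CTLFLAG_RW)
 * adjusted at runtime with sysctl(8), for example:
 *
 *	sysctl vfs.zfs.arc_free_target
 *	sysctl vfs.zfs.arc_meta_limit=<bytes>
 */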

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states. These are
 * the only buffers that can be evicted or deleted. Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA. These are buffers that hold dirty block copies
 * before they are written to stable storage. By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed. Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists. The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places. The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	list_t arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_recycle_miss;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread. The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA. This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by various buffers and structures
	 * not actually backed with ARC buffers. This includes bonus
	 * buffers (allocated directly via zio_buf_* functions),
	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
	 * cache), and dnode_t structures (allocated via dnode_t cache).
	 */
	kstat_named_t arcstat_other_size;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state. This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state. This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state. The key thing to note
	 * here is that this size doesn't actually indicate
	 * RAM consumption. The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state. This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state. See the comment above
	 * arcstat_mru_ghost_size for more details.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_hdr_miss;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_cdata_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_asize;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_compress_successes;
	kstat_named_t arcstat_l2_compress_zeros;
	kstat_named_t arcstat_l2_compress_failures;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_duplicate_buffers;
	kstat_named_t arcstat_duplicate_buffers_size;
	kstat_named_t arcstat_duplicate_reads;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "demand_data_hits", KSTAT_DATA_UINT64 },
	{ "demand_data_misses", KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
	{ "mru_hits", KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
	{ "mfu_hits", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
	{ "allocated", KSTAT_DATA_UINT64 },
	{ "deleted", KSTAT_DATA_UINT64 },
	{ "recycle_miss", KSTAT_DATA_UINT64 },
	{ "mutex_miss", KSTAT_DATA_UINT64 },
	{ "evict_skip", KSTAT_DATA_UINT64 },
	{ "evict_l2_cached", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
	{ "hash_elements", KSTAT_DATA_UINT64 },
	{ "hash_elements_max", KSTAT_DATA_UINT64 },
	{ "hash_collisions", KSTAT_DATA_UINT64 },
	{ "hash_chains", KSTAT_DATA_UINT64 },
	{ "hash_chain_max", KSTAT_DATA_UINT64 },
	{ "p", KSTAT_DATA_UINT64 },
	{ "c", KSTAT_DATA_UINT64 },
	{ "c_min", KSTAT_DATA_UINT64 },
	{ "c_max", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
	{ "hdr_size", KSTAT_DATA_UINT64 },
	{ "data_size", KSTAT_DATA_UINT64 },
	{ "metadata_size", KSTAT_DATA_UINT64 },
	{ "other_size", KSTAT_DATA_UINT64 },
	{ "anon_size", KSTAT_DATA_UINT64 },
	{ "anon_evictable_data", KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_size", KSTAT_DATA_UINT64 },
	{ "mru_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_ghost_size", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits", KSTAT_DATA_UINT64 },
	{ "l2_misses", KSTAT_DATA_UINT64 },
	{ "l2_feeds", KSTAT_DATA_UINT64 },
	{ "l2_rw_clash", KSTAT_DATA_UINT64 },
	{ "l2_read_bytes", KSTAT_DATA_UINT64 },
	{ "l2_write_bytes", KSTAT_DATA_UINT64 },
	{ "l2_writes_sent", KSTAT_DATA_UINT64 },
	{ "l2_writes_done", KSTAT_DATA_UINT64 },
	{ "l2_writes_error", KSTAT_DATA_UINT64 },
	{ "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_reading", KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
	{ "l2_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
	{ "l2_io_error", KSTAT_DATA_UINT64 },
	{ "l2_size", KSTAT_DATA_UINT64 },
	{ "l2_asize", KSTAT_DATA_UINT64 },
	{ "l2_hdr_size", KSTAT_DATA_UINT64 },
	{ "l2_compress_successes", KSTAT_DATA_UINT64 },
	{ "l2_compress_zeros", KSTAT_DATA_UINT64 },
	{ "l2_compress_failures", KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2", KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
	{ "l2_write_full", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
	{ "l2_write_pios", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count", KSTAT_DATA_UINT64 },
	{ "duplicate_buffers", KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size", KSTAT_DATA_UINT64 },
	{ "duplicate_reads", KSTAT_DATA_UINT64 },
	{ "arc_meta_used", KSTAT_DATA_UINT64 },
	{ "arc_meta_limit", KSTAT_DATA_UINT64 },
	{ "arc_meta_max", KSTAT_DATA_UINT64 },
	{ "arc_meta_min", KSTAT_DATA_UINT64 }
};

#define ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define ARCSTAT_MAX(stat, val) { \
	uint64_t m; \
	while ((val) > (m = arc_stats.stat.value.ui64) && \
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
		continue; \
}

#define ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) { \
		if (cond2) { \
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else { \
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		} \
	} else { \
		if (cond2) { \
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else { \
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		} \
	}
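/*
 * Editorial example (not verbatim from the code that follows): a demand
 * metadata hit could be classified with something like
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * which bumps exactly one of the {demand,prefetch} x {data,metadata}
 * hit counters declared in arc_stats_t above.
 */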

kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void *acb_private;
	arc_done_func_t *acb_done;
	arc_buf_t *acb_buf;
	zio_t *acb_zio_dummy;
	arc_callback_t *acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void *awcb_private;
	arc_done_func_t *awcb_ready;
	arc_done_func_t *awcb_physdone;
	arc_done_func_t *awcb_done;
	arc_buf_t *awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * of words of pointer overhead. arc_hdr_realloc() is used to switch a header
 * between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void *b_thawed;
#endif

	arc_buf_t *b_buf;
	uint32_t b_datacnt;
	/* for waiting on writes to complete */
	kcondvar_t b_cv;

	/* protected by arc state mutex */
	arc_state_t *b_state;
	list_node_t b_arc_node;

	/* updated atomically */
	clock_t b_arc_access;

	/* self protecting */
	refcount_t b_refcnt;

	arc_callback_t *b_acb;
	/* temporary buffer holder for in-flight compressed data */
	void *b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t *b_dev;	/* L2ARC device */
	uint64_t b_daddr;	/* disk address, offset byte */
	/* real alloc'd buffer size depending on b_compress applied */
	int32_t b_asize;

	list_node_t b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t b_dva;
	uint64_t b_birth;
	/*
	 * Even though this checksum is only set/verified when a buffer is in
	 * the L1 cache, it needs to be in the set of common fields because it
	 * must be preserved from the time before a buffer is written out to
	 * L2ARC until after it is read back in.
	 */
	zio_cksum_t *b_freeze_cksum;

	arc_buf_hdr_t *b_hash_next;
	arc_flags_t b_flags;

	/* immutable */
	int32_t b_size;
	uint64_t b_spa;

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t b_l1hdr;
};

#ifdef _KERNEL
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val <= 0 || val > arc_c_max)
		return (EINVAL);

	arc_meta_limit = val;
	return (0);
}
#endif

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;

#define GHOST_STATE(state) \
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
	(state) == arc_l2c_only)

#define HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)

#define HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define HDR_L2_READING(hdr) \
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)

#define HDR_ISTYPE_METADATA(hdr) \
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/* For storing compression mode in b_flags */
#define HDR_COMPRESS_OFFSET	24
#define HDR_COMPRESS_NBITS	7

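/*
 * With the values above, the compression type occupies a 7-bit field
 * starting at bit 24 of b_flags; the macros below read and write that
 * field as an enum zio_compress value via BF32_GET()/BF32_SET().
 */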
#define HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))

/*
 * Other sizes
 */

#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t ht_lock;
#ifdef _KERNEL
	unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define L2ARC_HEADROOM_BOOST	200
#define L2ARC_FEED_SECS		1		/* caching interval secs */
#define L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to an l2arc disk
 * address. This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out, which happens in the second stage of
 * l2arc_write_buffers().
 */
#define L2ARC_ADDR_UNSET	((uint64_t)(-1))

#define l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
    &ARC_anon.arcs_size, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in anonymous state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
    &ARC_mru.arcs_size, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mru ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
    &ARC_mfu.arcs_size, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mfu ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");

/*
 * L2ARC Internals
 */
struct l2arc_dev {
	vdev_t *l2ad_vdev;		/* vdev */
	spa_t *l2ad_spa;		/* spa */
	uint64_t l2ad_hand;		/* next write location */
	uint64_t l2ad_start;		/* first addr on device */
	uint64_t l2ad_end;		/* last addr on device */
	boolean_t l2ad_first;		/* first sweep through */
	boolean_t l2ad_writing;		/* currently writing */
	kmutex_t l2ad_mtx;		/* lock for buffer list */
	list_t l2ad_buflist;		/* buffer list */
	list_node_t l2ad_node;		/* device list node */
	refcount_t l2ad_alloc;		/* allocated bytes */
};

static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_t *l2rcb_buf;			/* read buffer */
	spa_t *l2rcb_spa;			/* spa */
	blkptr_t l2rcb_bp;			/* original blkptr */
	zbookmark_phys_t l2rcb_zb;		/* original bookmark */
	int l2rcb_flags;			/* original flags */
	enum zio_compress l2rcb_compress;	/* applied compress */
} l2arc_read_callback_t;

typedef struct l2arc_write_callback {
	l2arc_dev_t *l2wcb_dev;		/* device info */
	arc_buf_hdr_t *l2wcb_head;	/* head of write buflist */
} l2arc_write_callback_t;

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	void *l2df_data;
	size_t l2df_size;
	void (*l2df_func)(void *, size_t);
	list_node_t l2df_list_node;
} l2arc_data_free_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static int arc_evict_needed(arc_buf_contents_t);
static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
static void arc_buf_watch(arc_buf_t *);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);

static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);

static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);

static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spa>>8) ^ birth;

	return (crc);
}

#define BUF_EMPTY(buf) \
	((buf)->b_dva.dva_word[0] == 0 && \
	(buf)->b_dva.dva_word[1] == 0)

#define BUF_EQUAL(spa, dva, birth, buf) \
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
	hdr->b_dva.dva_word[0] = 0;
	hdr->b_dva.dva_word[1] = 0;
	hdr->b_birth = 0;
}

static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *hdr;

	mutex_enter(hash_lock);
	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
	    hdr = hdr->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, hdr)) {
			*lockp = hash_lock;
			return (hdr);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table. If there is already an element
 * equal to hdr in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 * If lockp == NULL, the caller is assumed to already hold the hash lock.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fhdr;
	uint32_t i;

	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
	ASSERT(hdr->b_birth != 0);
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (lockp != NULL) {
		*lockp = hash_lock;
		mutex_enter(hash_lock);
	} else {
		ASSERT(MUTEX_HELD(hash_lock));
	}

	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
	    fhdr = fhdr->b_hash_next, i++) {
		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
			return (fhdr);
	}

	hdr->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = hdr;
	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *hdr)
{
	arc_buf_hdr_t *fhdr, **hdrp;
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(hdr));

	hdrp = &buf_hash_table.ht_table[idx];
	while ((fhdr = *hdrp) != hdr) {
		ASSERT(fhdr != NULL);
		hdrp = &fhdr->b_hash_next;
	}
	*hdrp = hdr->b_hash_next;
	hdr->b_hash_next = NULL;
	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_full_cache);
	kmem_cache_destroy(hdr_l2only_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_FULL_SIZE);
	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
	refcount_create(&hdr->b_l1hdr.b_refcnt);
	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_L2ONLY_SIZE);
	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);

	return (0);
}

/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_full_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	cv_destroy(&hdr->b_l1hdr.b_cv);
	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}

/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
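	 * For example, on a machine with 16 GB of physical memory and the
	 * default 8 KB average block size, the sizing loop below settles on
	 * 2M buckets, i.e. 16 MB of bucket pointers (always a power of two).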
	 */
	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
	    NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

/*
 * Transition between the two allocation states for the arc_buf_hdr struct.
 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 * version is used when a cache buffer is only in the L2ARC in order to reduce
 * memory usage.
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;
	} else {
		ASSERT(hdr->b_l1hdr.b_buf == NULL);
		ASSERT0(hdr->b_l1hdr.b_datacnt);
		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
		/*
		 * We might be removing the L1hdr of a buffer which was just
		 * written out to L2ARC. If such a buffer is compressed then we
		 * need to free its b_tmp_cdata before destroying the header.
		 */
		if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
			l2arc_release_cdata_buf(hdr);
		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
1462 */ 1463 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1464 list_remove(&dev->l2ad_buflist, hdr); 1465 1466 mutex_exit(&dev->l2ad_mtx); 1467 1468 /* 1469 * Since we're using the pointer address as the tag when 1470 * incrementing and decrementing the l2ad_alloc refcount, we 1471 * must remove the old pointer (that we're about to destroy) and 1472 * add the new pointer to the refcount. Otherwise we'd remove 1473 * the wrong pointer address when calling arc_hdr_destroy() later. 1474 */ 1475 1476 (void) refcount_remove_many(&dev->l2ad_alloc, 1477 hdr->b_l2hdr.b_asize, hdr); 1478 1479 (void) refcount_add_many(&dev->l2ad_alloc, 1480 nhdr->b_l2hdr.b_asize, nhdr); 1481 1482 buf_discard_identity(hdr); 1483 hdr->b_freeze_cksum = NULL; 1484 kmem_cache_free(old, hdr); 1485 1486 return (nhdr); 1487} 1488 1489 1490#define ARC_MINTIME (hz>>4) /* 62 ms */ 1491 1492static void 1493arc_cksum_verify(arc_buf_t *buf) 1494{ 1495 zio_cksum_t zc; 1496 1497 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1498 return; 1499 1500 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1501 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1502 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1503 return; 1504 } 1505 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1506 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1507 panic("buffer modified while frozen!"); 1508 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1509} 1510 1511static int 1512arc_cksum_equal(arc_buf_t *buf) 1513{ 1514 zio_cksum_t zc; 1515 int equal; 1516 1517 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1518 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1519 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1520 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1521 1522 return (equal); 1523} 1524 1525static void 1526arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1527{ 1528 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1529 return; 1530 1531 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1532 if (buf->b_hdr->b_freeze_cksum != NULL) { 1533 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1534 return; 1535 } 1536 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1537 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1538 buf->b_hdr->b_freeze_cksum); 1539 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1540#ifdef illumos 1541 arc_buf_watch(buf); 1542#endif 1543} 1544 1545#ifdef illumos 1546#ifndef _KERNEL 1547typedef struct procctl { 1548 long cmd; 1549 prwatch_t prwatch; 1550} procctl_t; 1551#endif 1552 1553/* ARGSUSED */ 1554static void 1555arc_buf_unwatch(arc_buf_t *buf) 1556{ 1557#ifndef _KERNEL 1558 if (arc_watch) { 1559 int result; 1560 procctl_t ctl; 1561 ctl.cmd = PCWATCH; 1562 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1563 ctl.prwatch.pr_size = 0; 1564 ctl.prwatch.pr_wflags = 0; 1565 result = write(arc_procfd, &ctl, sizeof (ctl)); 1566 ASSERT3U(result, ==, sizeof (ctl)); 1567 } 1568#endif 1569} 1570 1571/* ARGSUSED */ 1572static void 1573arc_buf_watch(arc_buf_t *buf) 1574{ 1575#ifndef _KERNEL 1576 if (arc_watch) { 1577 int result; 1578 procctl_t ctl; 1579 ctl.cmd = PCWATCH; 1580 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1581 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1582 ctl.prwatch.pr_wflags = WA_WRITE; 1583 result = write(arc_procfd, &ctl, sizeof (ctl)); 1584 ASSERT3U(result, ==, sizeof (ctl)); 1585 } 1586#endif 1587} 1588#endif /* illumos */ 1589 1590static arc_buf_contents_t 1591arc_buf_type(arc_buf_hdr_t *hdr) 1592{ 1593 if (HDR_ISTYPE_METADATA(hdr)) 
{ 1594 return (ARC_BUFC_METADATA); 1595 } else { 1596 return (ARC_BUFC_DATA); 1597 } 1598} 1599 1600static uint32_t 1601arc_bufc_to_flags(arc_buf_contents_t type) 1602{ 1603 switch (type) { 1604 case ARC_BUFC_DATA: 1605 /* metadata field is 0 if buffer contains normal data */ 1606 return (0); 1607 case ARC_BUFC_METADATA: 1608 return (ARC_FLAG_BUFC_METADATA); 1609 default: 1610 break; 1611 } 1612 panic("undefined ARC buffer type!"); 1613 return ((uint32_t)-1); 1614} 1615 1616void 1617arc_buf_thaw(arc_buf_t *buf) 1618{ 1619 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1620 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1621 panic("modifying non-anon buffer!"); 1622 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1623 panic("modifying buffer while i/o in progress!"); 1624 arc_cksum_verify(buf); 1625 } 1626 1627 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1628 if (buf->b_hdr->b_freeze_cksum != NULL) { 1629 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1630 buf->b_hdr->b_freeze_cksum = NULL; 1631 } 1632 1633#ifdef ZFS_DEBUG 1634 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1635 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1636 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1637 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1638 } 1639#endif 1640 1641 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1642 1643#ifdef illumos 1644 arc_buf_unwatch(buf); 1645#endif 1646} 1647 1648void 1649arc_buf_freeze(arc_buf_t *buf) 1650{ 1651 kmutex_t *hash_lock; 1652 1653 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1654 return; 1655 1656 hash_lock = HDR_LOCK(buf->b_hdr); 1657 mutex_enter(hash_lock); 1658 1659 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1660 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1661 arc_cksum_compute(buf, B_FALSE); 1662 mutex_exit(hash_lock); 1663 1664} 1665 1666static void 1667add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1668{ 1669 ASSERT(HDR_HAS_L1HDR(hdr)); 1670 ASSERT(MUTEX_HELD(hash_lock)); 1671 arc_state_t *state = hdr->b_l1hdr.b_state; 1672 1673 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1674 (state != arc_anon)) { 1675 /* We don't use the L2-only state list. */ 1676 if (state != arc_l2c_only) { 1677 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1678 list_t *list = &state->arcs_list[arc_buf_type(hdr)]; 1679 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1680 1681 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1682 mutex_enter(&state->arcs_mtx); 1683 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1684 list_remove(list, hdr); 1685 if (GHOST_STATE(state)) { 1686 ASSERT0(hdr->b_l1hdr.b_datacnt); 1687 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1688 delta = hdr->b_size; 1689 } 1690 ASSERT(delta > 0); 1691 ASSERT3U(*size, >=, delta); 1692 atomic_add_64(size, -delta); 1693 mutex_exit(&state->arcs_mtx); 1694 } 1695 /* remove the prefetch flag if we get a reference */ 1696 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1697 } 1698} 1699 1700static int 1701remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1702{ 1703 int cnt; 1704 arc_state_t *state = hdr->b_l1hdr.b_state; 1705 1706 ASSERT(HDR_HAS_L1HDR(hdr)); 1707 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1708 ASSERT(!GHOST_STATE(state)); 1709 1710 /* 1711 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1712 * check to prevent usage of the arc_l2c_only list. 
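	 *
	 * The value returned is the remaining reference count; arc_buf_free()
	 * uses it to decide whether just this arc_buf_t or the entire header
	 * should be destroyed.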
1713 */ 1714 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1715 (state != arc_anon)) { 1716 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1717 1718 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1719 mutex_enter(&state->arcs_mtx); 1720 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1721 list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); 1722 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1723 atomic_add_64(size, hdr->b_size * 1724 hdr->b_l1hdr.b_datacnt); 1725 mutex_exit(&state->arcs_mtx); 1726 } 1727 return (cnt); 1728} 1729 1730/* 1731 * Move the supplied buffer to the indicated state. The mutex 1732 * for the buffer must be held by the caller. 1733 */ 1734static void 1735arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1736 kmutex_t *hash_lock) 1737{ 1738 arc_state_t *old_state; 1739 int64_t refcnt; 1740 uint32_t datacnt; 1741 uint64_t from_delta, to_delta; 1742 arc_buf_contents_t buftype = arc_buf_type(hdr); 1743 1744 /* 1745 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1746 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1747 * L1 hdr doesn't always exist when we change state to arc_anon before 1748 * destroying a header, in which case reallocating to add the L1 hdr is 1749 * pointless. 1750 */ 1751 if (HDR_HAS_L1HDR(hdr)) { 1752 old_state = hdr->b_l1hdr.b_state; 1753 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1754 datacnt = hdr->b_l1hdr.b_datacnt; 1755 } else { 1756 old_state = arc_l2c_only; 1757 refcnt = 0; 1758 datacnt = 0; 1759 } 1760 1761 ASSERT(MUTEX_HELD(hash_lock)); 1762 ASSERT3P(new_state, !=, old_state); 1763 ASSERT(refcnt == 0 || datacnt > 0); 1764 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1765 ASSERT(old_state != arc_anon || datacnt <= 1); 1766 1767 from_delta = to_delta = datacnt * hdr->b_size; 1768 1769 /* 1770 * If this buffer is evictable, transfer it from the 1771 * old state list to the new state list. 1772 */ 1773 if (refcnt == 0) { 1774 if (old_state != arc_anon && old_state != arc_l2c_only) { 1775 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1776 uint64_t *size = &old_state->arcs_lsize[buftype]; 1777 1778 if (use_mutex) 1779 mutex_enter(&old_state->arcs_mtx); 1780 1781 ASSERT(HDR_HAS_L1HDR(hdr)); 1782 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1783 list_remove(&old_state->arcs_list[buftype], hdr); 1784 1785 /* 1786 * If prefetching out of the ghost cache, 1787 * we will have a non-zero datacnt. 1788 */ 1789 if (GHOST_STATE(old_state) && datacnt == 0) { 1790 /* ghost elements have a ghost size */ 1791 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1792 from_delta = hdr->b_size; 1793 } 1794 ASSERT3U(*size, >=, from_delta); 1795 atomic_add_64(size, -from_delta); 1796 1797 if (use_mutex) 1798 mutex_exit(&old_state->arcs_mtx); 1799 } 1800 if (new_state != arc_anon && new_state != arc_l2c_only) { 1801 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1802 uint64_t *size = &new_state->arcs_lsize[buftype]; 1803 1804 /* 1805 * An L1 header always exists here, since if we're 1806 * moving to some L1-cached state (i.e. not l2c_only or 1807 * anonymous), we realloc the header to add an L1hdr 1808 * beforehand. 
1809 */ 1810 ASSERT(HDR_HAS_L1HDR(hdr)); 1811 if (use_mutex) 1812 mutex_enter(&new_state->arcs_mtx); 1813 1814 list_insert_head(&new_state->arcs_list[buftype], hdr); 1815 1816 /* ghost elements have a ghost size */ 1817 if (GHOST_STATE(new_state)) { 1818 ASSERT0(datacnt); 1819 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1820 to_delta = hdr->b_size; 1821 } 1822 atomic_add_64(size, to_delta); 1823 1824 if (use_mutex) 1825 mutex_exit(&new_state->arcs_mtx); 1826 } 1827 } 1828 1829 ASSERT(!BUF_EMPTY(hdr)); 1830 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1831 buf_hash_remove(hdr); 1832 1833 /* adjust state sizes (ignore arc_l2c_only) */ 1834 if (to_delta && new_state != arc_l2c_only) 1835 atomic_add_64(&new_state->arcs_size, to_delta); 1836 if (from_delta && old_state != arc_l2c_only) { 1837 ASSERT3U(old_state->arcs_size, >=, from_delta); 1838 atomic_add_64(&old_state->arcs_size, -from_delta); 1839 } 1840 if (HDR_HAS_L1HDR(hdr)) 1841 hdr->b_l1hdr.b_state = new_state; 1842 1843 /* 1844 * L2 headers should never be on the L2 state list since they don't 1845 * have L1 headers allocated. 1846 */ 1847 ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1848 list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1849} 1850 1851void 1852arc_space_consume(uint64_t space, arc_space_type_t type) 1853{ 1854 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1855 1856 switch (type) { 1857 case ARC_SPACE_DATA: 1858 ARCSTAT_INCR(arcstat_data_size, space); 1859 break; 1860 case ARC_SPACE_META: 1861 ARCSTAT_INCR(arcstat_metadata_size, space); 1862 break; 1863 case ARC_SPACE_OTHER: 1864 ARCSTAT_INCR(arcstat_other_size, space); 1865 break; 1866 case ARC_SPACE_HDRS: 1867 ARCSTAT_INCR(arcstat_hdr_size, space); 1868 break; 1869 case ARC_SPACE_L2HDRS: 1870 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1871 break; 1872 } 1873 1874 if (type != ARC_SPACE_DATA) 1875 ARCSTAT_INCR(arcstat_meta_used, space); 1876 1877 atomic_add_64(&arc_size, space); 1878} 1879 1880void 1881arc_space_return(uint64_t space, arc_space_type_t type) 1882{ 1883 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1884 1885 switch (type) { 1886 case ARC_SPACE_DATA: 1887 ARCSTAT_INCR(arcstat_data_size, -space); 1888 break; 1889 case ARC_SPACE_META: 1890 ARCSTAT_INCR(arcstat_metadata_size, -space); 1891 break; 1892 case ARC_SPACE_OTHER: 1893 ARCSTAT_INCR(arcstat_other_size, -space); 1894 break; 1895 case ARC_SPACE_HDRS: 1896 ARCSTAT_INCR(arcstat_hdr_size, -space); 1897 break; 1898 case ARC_SPACE_L2HDRS: 1899 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1900 break; 1901 } 1902 1903 if (type != ARC_SPACE_DATA) { 1904 ASSERT(arc_meta_used >= space); 1905 if (arc_meta_max < arc_meta_used) 1906 arc_meta_max = arc_meta_used; 1907 ARCSTAT_INCR(arcstat_meta_used, -space); 1908 } 1909 1910 ASSERT(arc_size >= space); 1911 atomic_add_64(&arc_size, -space); 1912} 1913 1914arc_buf_t * 1915arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1916{ 1917 arc_buf_hdr_t *hdr; 1918 arc_buf_t *buf; 1919 1920 ASSERT3U(size, >, 0); 1921 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1922 ASSERT(BUF_EMPTY(hdr)); 1923 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1924 hdr->b_size = size; 1925 hdr->b_spa = spa_load_guid(spa); 1926 1927 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1928 buf->b_hdr = hdr; 1929 buf->b_data = NULL; 1930 buf->b_efunc = NULL; 1931 buf->b_private = NULL; 1932 buf->b_next = NULL; 1933 1934 hdr->b_flags = arc_bufc_to_flags(type); 1935 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1936 1937 hdr->b_l1hdr.b_buf = buf; 1938 
hdr->b_l1hdr.b_state = arc_anon; 1939 hdr->b_l1hdr.b_arc_access = 0; 1940 hdr->b_l1hdr.b_datacnt = 1; 1941 1942 arc_get_data_buf(buf); 1943 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1944 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1945 1946 return (buf); 1947} 1948 1949static char *arc_onloan_tag = "onloan"; 1950 1951/* 1952 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1953 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1954 * buffers must be returned to the arc before they can be used by the DMU or 1955 * freed. 1956 */ 1957arc_buf_t * 1958arc_loan_buf(spa_t *spa, int size) 1959{ 1960 arc_buf_t *buf; 1961 1962 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1963 1964 atomic_add_64(&arc_loaned_bytes, size); 1965 return (buf); 1966} 1967 1968/* 1969 * Return a loaned arc buffer to the arc. 1970 */ 1971void 1972arc_return_buf(arc_buf_t *buf, void *tag) 1973{ 1974 arc_buf_hdr_t *hdr = buf->b_hdr; 1975 1976 ASSERT(buf->b_data != NULL); 1977 ASSERT(HDR_HAS_L1HDR(hdr)); 1978 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1979 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1980 1981 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1982} 1983 1984/* Detach an arc_buf from a dbuf (tag) */ 1985void 1986arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1987{ 1988 arc_buf_hdr_t *hdr = buf->b_hdr; 1989 1990 ASSERT(buf->b_data != NULL); 1991 ASSERT(HDR_HAS_L1HDR(hdr)); 1992 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1993 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 1994 buf->b_efunc = NULL; 1995 buf->b_private = NULL; 1996 1997 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1998} 1999 2000static arc_buf_t * 2001arc_buf_clone(arc_buf_t *from) 2002{ 2003 arc_buf_t *buf; 2004 arc_buf_hdr_t *hdr = from->b_hdr; 2005 uint64_t size = hdr->b_size; 2006 2007 ASSERT(HDR_HAS_L1HDR(hdr)); 2008 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2009 2010 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2011 buf->b_hdr = hdr; 2012 buf->b_data = NULL; 2013 buf->b_efunc = NULL; 2014 buf->b_private = NULL; 2015 buf->b_next = hdr->b_l1hdr.b_buf; 2016 hdr->b_l1hdr.b_buf = buf; 2017 arc_get_data_buf(buf); 2018 bcopy(from->b_data, buf->b_data, size); 2019 2020 /* 2021 * This buffer already exists in the arc so create a duplicate 2022 * copy for the caller. If the buffer is associated with user data 2023 * then track the size and number of duplicates. These stats will be 2024 * updated as duplicate buffers are created and destroyed. 2025 */ 2026 if (HDR_ISTYPE_DATA(hdr)) { 2027 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2028 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2029 } 2030 hdr->b_l1hdr.b_datacnt += 1; 2031 return (buf); 2032} 2033 2034void 2035arc_buf_add_ref(arc_buf_t *buf, void* tag) 2036{ 2037 arc_buf_hdr_t *hdr; 2038 kmutex_t *hash_lock; 2039 2040 /* 2041 * Check to see if this buffer is evicted. Callers 2042 * must verify b_data != NULL to know if the add_ref 2043 * was successful. 
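	 *
	 * A typical caller therefore looks roughly like:
	 *
	 *	arc_buf_add_ref(buf, tag);
	 *	if (buf->b_data == NULL)
	 *		(the buffer was evicted; fetch the block again)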
	 */
	mutex_enter(&buf->b_evict_lock);
	if (buf->b_data == NULL) {
		mutex_exit(&buf->b_evict_lock);
		return;
	}
	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	mutex_exit(&buf->b_evict_lock);

	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
	    hdr->b_l1hdr.b_state == arc_mfu);

	add_reference(hdr, hash_lock, tag);
	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	ARCSTAT_BUMP(arcstat_hits);
	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
	    data, metadata, hits);
}

static void
arc_buf_free_on_write(void *data, size_t size,
    void (*free_func)(void *, size_t))
{
	l2arc_data_free_t *df;

	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
	df->l2df_data = data;
	df->l2df_size = size;
	df->l2df_func = free_func;
	mutex_enter(&l2arc_free_on_write_mtx);
	list_insert_head(l2arc_free_on_write, df);
	mutex_exit(&l2arc_free_on_write_mtx);
}

/*
 * Free the arc data buffer. If it is an l2arc write in progress,
 * the buffer is placed on l2arc_free_on_write to be freed later.
 */
static void
arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	if (HDR_L2_WRITING(hdr)) {
		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else {
		free_func(buf->b_data, hdr->b_size);
	}
}

/*
 * Free up buf->b_data and if 'remove' is set, then pull the
 * arc_buf_t off of the arc_buf_hdr_t's list and free it.
 */
static void
arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));

	/*
	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
	 * that doesn't exist, the header is in the arc_l2c_only state,
	 * and there isn't anything to free (it's already been freed).
2116 */ 2117 if (!HDR_HAS_L1HDR(hdr)) 2118 return; 2119 2120 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2121 return; 2122 2123 ASSERT(HDR_L2_WRITING(hdr)); 2124 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, 2125 zio_data_buf_free); 2126 2127 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2128 hdr->b_l1hdr.b_tmp_cdata = NULL; 2129} 2130 2131static void 2132arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 2133{ 2134 arc_buf_t **bufp; 2135 2136 /* free up data associated with the buf */ 2137 if (buf->b_data != NULL) { 2138 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2139 uint64_t size = buf->b_hdr->b_size; 2140 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2141 2142 arc_cksum_verify(buf); 2143#ifdef illumos 2144 arc_buf_unwatch(buf); 2145#endif 2146 2147 if (!recycle) { 2148 if (type == ARC_BUFC_METADATA) { 2149 arc_buf_data_free(buf, zio_buf_free); 2150 arc_space_return(size, ARC_SPACE_META); 2151 } else { 2152 ASSERT(type == ARC_BUFC_DATA); 2153 arc_buf_data_free(buf, zio_data_buf_free); 2154 arc_space_return(size, ARC_SPACE_DATA); 2155 } 2156 } 2157 if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2158 uint64_t *cnt = &state->arcs_lsize[type]; 2159 2160 ASSERT(refcount_is_zero( 2161 &buf->b_hdr->b_l1hdr.b_refcnt)); 2162 ASSERT(state != arc_anon && state != arc_l2c_only); 2163 2164 ASSERT3U(*cnt, >=, size); 2165 atomic_add_64(cnt, -size); 2166 } 2167 ASSERT3U(state->arcs_size, >=, size); 2168 atomic_add_64(&state->arcs_size, -size); 2169 buf->b_data = NULL; 2170 2171 /* 2172 * If we're destroying a duplicate buffer make sure 2173 * that the appropriate statistics are updated. 2174 */ 2175 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2176 HDR_ISTYPE_DATA(buf->b_hdr)) { 2177 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2178 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2179 } 2180 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2181 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2182 } 2183 2184 /* only remove the buf if requested */ 2185 if (!remove) 2186 return; 2187 2188 /* remove the buf from the hdr list */ 2189 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2190 bufp = &(*bufp)->b_next) 2191 continue; 2192 *bufp = buf->b_next; 2193 buf->b_next = NULL; 2194 2195 ASSERT(buf->b_efunc == NULL); 2196 2197 /* clean up the buf */ 2198 buf->b_hdr = NULL; 2199 kmem_cache_free(buf_cache, buf); 2200} 2201 2202static void 2203arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 2204{ 2205 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2206 l2arc_dev_t *dev = l2hdr->b_dev; 2207 2208 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 2209 ASSERT(HDR_HAS_L2HDR(hdr)); 2210 2211 list_remove(&dev->l2ad_buflist, hdr); 2212 2213 /* 2214 * We don't want to leak the b_tmp_cdata buffer that was 2215 * allocated in l2arc_write_buffers() 2216 */ 2217 arc_buf_l2_cdata_free(hdr); 2218 2219 /* 2220 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then 2221 * this header is being processed by l2arc_write_buffers() (i.e. 2222 * it's in the first stage of l2arc_write_buffers()). 2223 * Re-affirming that truth here, just to serve as a reminder. If 2224 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or 2225 * may not have its HDR_L2_WRITING flag set. (the write may have 2226 * completed, in which case HDR_L2_WRITING will be false and the 2227 * b_daddr field will point to the address of the buffer on disk). 
2228 */ 2229 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); 2230 2231 /* 2232 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with 2233 * l2arc_write_buffers(). Since we've just removed this header 2234 * from the l2arc buffer list, this header will never reach the 2235 * second stage of l2arc_write_buffers(), which increments the 2236 * accounting stats for this header. Thus, we must be careful 2237 * not to decrement them for this header either. 2238 */ 2239 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { 2240 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2241 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2242 2243 vdev_space_update(dev->l2ad_vdev, 2244 -l2hdr->b_asize, 0, 0); 2245 2246 (void) refcount_remove_many(&dev->l2ad_alloc, 2247 l2hdr->b_asize, hdr); 2248 } 2249 2250 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2251} 2252 2253static void 2254arc_hdr_destroy(arc_buf_hdr_t *hdr) 2255{ 2256 if (HDR_HAS_L1HDR(hdr)) { 2257 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2258 hdr->b_l1hdr.b_datacnt > 0); 2259 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2260 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2261 } 2262 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2263 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2264 2265 if (HDR_HAS_L2HDR(hdr)) { 2266 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 2267 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 2268 2269 if (!buflist_held) 2270 mutex_enter(&dev->l2ad_mtx); 2271 2272 /* 2273 * Even though we checked this conditional above, we 2274 * need to check this again now that we have the 2275 * l2ad_mtx. This is because we could be racing with 2276 * another thread calling l2arc_evict() which might have 2277 * destroyed this header's L2 portion as we were waiting 2278 * to acquire the l2ad_mtx. If that happens, we don't 2279 * want to re-destroy the header's L2 portion. 
2280 */ 2281 if (HDR_HAS_L2HDR(hdr)) { 2282 if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) 2283 trim_map_free(dev->l2ad_vdev, 2284 hdr->b_l2hdr.b_daddr, 2285 hdr->b_l2hdr.b_asize, 0); 2286 arc_hdr_l2hdr_destroy(hdr); 2287 } 2288 2289 if (!buflist_held) 2290 mutex_exit(&dev->l2ad_mtx); 2291 } 2292 2293 if (!BUF_EMPTY(hdr)) 2294 buf_discard_identity(hdr); 2295 if (hdr->b_freeze_cksum != NULL) { 2296 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2297 hdr->b_freeze_cksum = NULL; 2298 } 2299 2300 if (HDR_HAS_L1HDR(hdr)) { 2301 while (hdr->b_l1hdr.b_buf) { 2302 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2303 2304 if (buf->b_efunc != NULL) { 2305 mutex_enter(&arc_eviction_mtx); 2306 mutex_enter(&buf->b_evict_lock); 2307 ASSERT(buf->b_hdr != NULL); 2308 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2309 FALSE); 2310 hdr->b_l1hdr.b_buf = buf->b_next; 2311 buf->b_hdr = &arc_eviction_hdr; 2312 buf->b_next = arc_eviction_list; 2313 arc_eviction_list = buf; 2314 mutex_exit(&buf->b_evict_lock); 2315 mutex_exit(&arc_eviction_mtx); 2316 } else { 2317 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2318 TRUE); 2319 } 2320 } 2321#ifdef ZFS_DEBUG 2322 if (hdr->b_l1hdr.b_thawed != NULL) { 2323 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2324 hdr->b_l1hdr.b_thawed = NULL; 2325 } 2326#endif 2327 } 2328 2329 ASSERT3P(hdr->b_hash_next, ==, NULL); 2330 if (HDR_HAS_L1HDR(hdr)) { 2331 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 2332 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2333 kmem_cache_free(hdr_full_cache, hdr); 2334 } else { 2335 kmem_cache_free(hdr_l2only_cache, hdr); 2336 } 2337} 2338 2339void 2340arc_buf_free(arc_buf_t *buf, void *tag) 2341{ 2342 arc_buf_hdr_t *hdr = buf->b_hdr; 2343 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2344 2345 ASSERT(buf->b_efunc == NULL); 2346 ASSERT(buf->b_data != NULL); 2347 2348 if (hashed) { 2349 kmutex_t *hash_lock = HDR_LOCK(hdr); 2350 2351 mutex_enter(hash_lock); 2352 hdr = buf->b_hdr; 2353 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2354 2355 (void) remove_reference(hdr, hash_lock, tag); 2356 if (hdr->b_l1hdr.b_datacnt > 1) { 2357 arc_buf_destroy(buf, FALSE, TRUE); 2358 } else { 2359 ASSERT(buf == hdr->b_l1hdr.b_buf); 2360 ASSERT(buf->b_efunc == NULL); 2361 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2362 } 2363 mutex_exit(hash_lock); 2364 } else if (HDR_IO_IN_PROGRESS(hdr)) { 2365 int destroy_hdr; 2366 /* 2367 * We are in the middle of an async write. Don't destroy 2368 * this buffer unless the write completes before we finish 2369 * decrementing the reference count. 
2370 */ 2371 mutex_enter(&arc_eviction_mtx); 2372 (void) remove_reference(hdr, NULL, tag); 2373 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2374 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2375 mutex_exit(&arc_eviction_mtx); 2376 if (destroy_hdr) 2377 arc_hdr_destroy(hdr); 2378 } else { 2379 if (remove_reference(hdr, NULL, tag) > 0) 2380 arc_buf_destroy(buf, FALSE, TRUE); 2381 else 2382 arc_hdr_destroy(hdr); 2383 } 2384} 2385 2386boolean_t 2387arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2388{ 2389 arc_buf_hdr_t *hdr = buf->b_hdr; 2390 kmutex_t *hash_lock = HDR_LOCK(hdr); 2391 boolean_t no_callback = (buf->b_efunc == NULL); 2392 2393 if (hdr->b_l1hdr.b_state == arc_anon) { 2394 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2395 arc_buf_free(buf, tag); 2396 return (no_callback); 2397 } 2398 2399 mutex_enter(hash_lock); 2400 hdr = buf->b_hdr; 2401 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2402 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2403 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2404 ASSERT(buf->b_data != NULL); 2405 2406 (void) remove_reference(hdr, hash_lock, tag); 2407 if (hdr->b_l1hdr.b_datacnt > 1) { 2408 if (no_callback) 2409 arc_buf_destroy(buf, FALSE, TRUE); 2410 } else if (no_callback) { 2411 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2412 ASSERT(buf->b_efunc == NULL); 2413 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2414 } 2415 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2416 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2417 mutex_exit(hash_lock); 2418 return (no_callback); 2419} 2420 2421int32_t 2422arc_buf_size(arc_buf_t *buf) 2423{ 2424 return (buf->b_hdr->b_size); 2425} 2426 2427/* 2428 * Called from the DMU to determine if the current buffer should be 2429 * evicted. In order to ensure proper locking, the eviction must be initiated 2430 * from the DMU. Return true if the buffer is associated with user data and 2431 * duplicate buffers still exist. 2432 */ 2433boolean_t 2434arc_buf_eviction_needed(arc_buf_t *buf) 2435{ 2436 arc_buf_hdr_t *hdr; 2437 boolean_t evict_needed = B_FALSE; 2438 2439 if (zfs_disable_dup_eviction) 2440 return (B_FALSE); 2441 2442 mutex_enter(&buf->b_evict_lock); 2443 hdr = buf->b_hdr; 2444 if (hdr == NULL) { 2445 /* 2446 * We are in arc_do_user_evicts(); let that function 2447 * perform the eviction. 2448 */ 2449 ASSERT(buf->b_data == NULL); 2450 mutex_exit(&buf->b_evict_lock); 2451 return (B_FALSE); 2452 } else if (buf->b_data == NULL) { 2453 /* 2454 * We have already been added to the arc eviction list; 2455 * recommend eviction. 2456 */ 2457 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2458 mutex_exit(&buf->b_evict_lock); 2459 return (B_TRUE); 2460 } 2461 2462 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2463 evict_needed = B_TRUE; 2464 2465 mutex_exit(&buf->b_evict_lock); 2466 return (evict_needed); 2467} 2468 2469/* 2470 * Evict buffers from list until we've removed the specified number of 2471 * bytes. Move the removed buffers to the appropriate evict state. 2472 * If the recycle flag is set, then attempt to "recycle" a buffer: 2473 * - look for a buffer to evict that is `bytes' long. 2474 * - return the data block from this buffer rather than freeing it. 2475 * This flag is used by callers that are trying to make space for a 2476 * new buffer in a full arc cache. 2477 * 2478 * This function makes a "best effort". It skips over any buffers 2479 * it can't get a hash_lock on, and so may not catch all candidates. 2480 * It may also return without evicting as much space as requested. 
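 *
 * For example, arc_get_data_buf() uses the recycle path roughly as:
 *
 *	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL)
 *		buf->b_data = zio_buf_alloc(size);
 *
 * i.e. a data block of exactly 'size' bytes is stolen from an existing
 * buffer when possible, and a fresh allocation is made otherwise.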
 */
static void *
arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
    arc_buf_contents_t type)
{
	arc_state_t *evicted_state;
	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
	kmutex_t *hash_lock;
	boolean_t have_lock;
	void *stolen = NULL;
	arc_buf_hdr_t marker = { 0 };
	int count = 0;

	ASSERT(state == arc_mru || state == arc_mfu);

	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

	/*
	 * The ghost list lock must be acquired first in order to prevent
	 * a 3-party deadlock:
	 *
	 *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
	 *    l2ad_mtx in arc_hdr_realloc
	 *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
	 *  - arc_evict acquires arc_*->arcs_mtx, followed by
	 *    arc_*_ghost->arcs_mtx and forms a deadlock cycle.
	 *
	 * This situation is avoided by acquiring the ghost list lock first.
	 */
	mutex_enter(&evicted_state->arcs_mtx);
	mutex_enter(&state->arcs_mtx);

	/*
	 * Decide which "type" (data vs metadata) to recycle from.
	 *
	 * If we are over the metadata limit, recycle from metadata.
	 * If we are under the metadata minimum, recycle from data.
	 * Otherwise, recycle from whichever type has the oldest (least
	 * recently accessed) header.
	 */
	if (recycle) {
		arc_buf_hdr_t *data_hdr =
		    list_tail(&state->arcs_list[ARC_BUFC_DATA]);
		arc_buf_hdr_t *metadata_hdr =
		    list_tail(&state->arcs_list[ARC_BUFC_METADATA]);
		arc_buf_contents_t realtype;

		if (data_hdr == NULL) {
			realtype = ARC_BUFC_METADATA;
		} else if (metadata_hdr == NULL) {
			realtype = ARC_BUFC_DATA;
		} else if (arc_meta_used >= arc_meta_limit) {
			realtype = ARC_BUFC_METADATA;
		} else if (arc_meta_used <= arc_meta_min) {
			realtype = ARC_BUFC_DATA;
		} else if (HDR_HAS_L1HDR(data_hdr) &&
		    HDR_HAS_L1HDR(metadata_hdr) &&
		    data_hdr->b_l1hdr.b_arc_access <
		    metadata_hdr->b_l1hdr.b_arc_access) {
			realtype = ARC_BUFC_DATA;
		} else {
			realtype = ARC_BUFC_METADATA;
		}
		if (realtype != type) {
			/*
			 * If we want to evict from a different list,
			 * we cannot recycle, because DATA vs METADATA
			 * buffers are segregated into different kmem
			 * caches (and vmem arenas).
			 */
			type = realtype;
			recycle = B_FALSE;
		}
	}

	list_t *list = &state->arcs_list[type];

	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(list, hdr);
		/* prefetch buffers have a minimum lifespan */
		if (HDR_IO_IN_PROGRESS(hdr) ||
		    (spa && hdr->b_spa != spa) ||
		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
		    arc_min_prefetch_lifespan)) {
			skipped++;
			continue;
		}
		/* "lookahead" for better eviction candidate */
		if (recycle && hdr->b_size != bytes &&
		    hdr_prev && hdr_prev->b_size == bytes)
			continue;

		/* ignore markers */
		if (hdr->b_spa == 0)
			continue;

		/*
		 * It may take a long time to evict all the bufs requested.
		 * To avoid blocking all arc activity, periodically drop
		 * the arcs_mtx and give other threads a chance to run
		 * before reacquiring the lock.
		 *
		 * If we are looking for a buffer to recycle, we are in
		 * the hot code path, so don't sleep.
		 */
		if (!recycle && count++ > arc_evict_iterations) {
			list_insert_after(list, hdr, &marker);
			mutex_exit(&state->arcs_mtx);
			mutex_exit(&evicted_state->arcs_mtx);
			kpreempt(KPREEMPT_SYNC);
			mutex_enter(&evicted_state->arcs_mtx);
			mutex_enter(&state->arcs_mtx);
			hdr_prev = list_prev(list, &marker);
			list_remove(list, &marker);
			count = 0;
			continue;
		}

		hash_lock = HDR_LOCK(hdr);
		have_lock = MUTEX_HELD(hash_lock);
		if (have_lock || mutex_tryenter(hash_lock)) {
			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
			ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
			while (hdr->b_l1hdr.b_buf) {
				arc_buf_t *buf = hdr->b_l1hdr.b_buf;
				if (!mutex_tryenter(&buf->b_evict_lock)) {
					missed += 1;
					break;
				}
				if (buf->b_data != NULL) {
					bytes_evicted += hdr->b_size;
					if (recycle &&
					    arc_buf_type(hdr) == type &&
					    hdr->b_size == bytes &&
					    !HDR_L2_WRITING(hdr)) {
						stolen = buf->b_data;
						recycle = FALSE;
					}
				}
				if (buf->b_efunc != NULL) {
					mutex_enter(&arc_eviction_mtx);
					arc_buf_destroy(buf,
					    buf->b_data == stolen, FALSE);
					hdr->b_l1hdr.b_buf = buf->b_next;
					buf->b_hdr = &arc_eviction_hdr;
					buf->b_next = arc_eviction_list;
					arc_eviction_list = buf;
					mutex_exit(&arc_eviction_mtx);
					mutex_exit(&buf->b_evict_lock);
				} else {
					mutex_exit(&buf->b_evict_lock);
					arc_buf_destroy(buf,
					    buf->b_data == stolen, TRUE);
				}
			}

			if (HDR_HAS_L2HDR(hdr)) {
				ARCSTAT_INCR(arcstat_evict_l2_cached,
				    hdr->b_size);
			} else {
				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
					ARCSTAT_INCR(arcstat_evict_l2_eligible,
					    hdr->b_size);
				} else {
					ARCSTAT_INCR(
					    arcstat_evict_l2_ineligible,
					    hdr->b_size);
				}
			}

			if (hdr->b_l1hdr.b_datacnt == 0) {
				arc_change_state(evicted_state, hdr, hash_lock);
				ASSERT(HDR_IN_HASH_TABLE(hdr));
				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
			}
			if (!have_lock)
				mutex_exit(hash_lock);
			if (bytes >= 0 && bytes_evicted >= bytes)
				break;
		} else {
			missed += 1;
		}
	}

	mutex_exit(&state->arcs_mtx);
	mutex_exit(&evicted_state->arcs_mtx);

	if (bytes_evicted < bytes)
		dprintf("only evicted %lld bytes from %x",
		    (longlong_t)bytes_evicted, state);

	if (skipped)
		ARCSTAT_INCR(arcstat_evict_skip, skipped);

	if (missed)
		ARCSTAT_INCR(arcstat_mutex_miss, missed);

	/*
	 * Note: we have just evicted some data into the ghost state,
	 * potentially putting the ghost size over the desired size. Rather
	 * than evicting from the ghost list in this hot code path, leave
	 * this chore to the arc_reclaim_thread().
	 */

	return (stolen);
}

/*
 * Remove buffers from list until we've removed the specified number of
 * bytes. Destroy the buffers that are removed.
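 *
 * A negative 'bytes' value means the caller wants the list emptied
 * entirely (see arc_flush()); in that case the loop below waits for
 * contended hash locks instead of skipping those headers.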
 */
static void
arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
{
	arc_buf_hdr_t *hdr, *hdr_prev;
	arc_buf_hdr_t marker = { 0 };
	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint64_t bufs_skipped = 0;
	int count = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->arcs_mtx);
	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(list, hdr);
		if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
			panic("invalid hdr=%p", (void *)hdr);
		if (spa && hdr->b_spa != spa)
			continue;

		/* ignore markers */
		if (hdr->b_spa == 0)
			continue;

		hash_lock = HDR_LOCK(hdr);
		/* caller may be trying to modify this buffer, skip it */
		if (MUTEX_HELD(hash_lock))
			continue;

		/*
		 * It may take a long time to evict all the bufs requested.
		 * To avoid blocking all arc activity, periodically drop
		 * the arcs_mtx and give other threads a chance to run
		 * before reacquiring the lock.
		 */
		if (count++ > arc_evict_iterations) {
			list_insert_after(list, hdr, &marker);
			mutex_exit(&state->arcs_mtx);
			kpreempt(KPREEMPT_SYNC);
			mutex_enter(&state->arcs_mtx);
			hdr_prev = list_prev(list, &marker);
			list_remove(list, &marker);
			count = 0;
			continue;
		}
		if (mutex_tryenter(hash_lock)) {
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT(!HDR_HAS_L1HDR(hdr) ||
			    hdr->b_l1hdr.b_buf == NULL);
			ARCSTAT_BUMP(arcstat_deleted);
			bytes_deleted += hdr->b_size;

			if (HDR_HAS_L2HDR(hdr)) {
				/*
				 * This buffer is cached on the 2nd Level ARC;
				 * don't destroy the header.
				 */
				arc_change_state(arc_l2c_only, hdr, hash_lock);
				/*
				 * dropping from L1+L2 cached to L2-only,
				 * realloc to remove the L1 header.
				 */
				hdr = arc_hdr_realloc(hdr, hdr_full_cache,
				    hdr_l2only_cache);
				mutex_exit(hash_lock);
			} else {
				arc_change_state(arc_anon, hdr, hash_lock);
				mutex_exit(hash_lock);
				arc_hdr_destroy(hdr);
			}

			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else if (bytes < 0) {
			/*
			 * Insert a list marker and then wait for the
			 * hash lock to become available. Once it's
			 * available, restart from where we left off.
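			 *
			 * The marker is a zeroed header, so its b_spa is 0;
			 * that is what the "ignore markers" check near the
			 * top of this loop (and in arc_evict()) keys off of.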
2776 */ 2777 list_insert_after(list, hdr, &marker); 2778 mutex_exit(&state->arcs_mtx); 2779 mutex_enter(hash_lock); 2780 mutex_exit(hash_lock); 2781 mutex_enter(&state->arcs_mtx); 2782 hdr_prev = list_prev(list, &marker); 2783 list_remove(list, &marker); 2784 } else { 2785 bufs_skipped += 1; 2786 } 2787 2788 } 2789 mutex_exit(&state->arcs_mtx); 2790 2791 if (list == &state->arcs_list[ARC_BUFC_DATA] && 2792 (bytes < 0 || bytes_deleted < bytes)) { 2793 list = &state->arcs_list[ARC_BUFC_METADATA]; 2794 goto top; 2795 } 2796 2797 if (bufs_skipped) { 2798 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2799 ASSERT(bytes >= 0); 2800 } 2801 2802 if (bytes_deleted < bytes) 2803 dprintf("only deleted %lld bytes from %p", 2804 (longlong_t)bytes_deleted, state); 2805} 2806 2807static void 2808arc_adjust(void) 2809{ 2810 int64_t adjustment, delta; 2811 2812 /* 2813 * Adjust MRU size 2814 */ 2815 2816 adjustment = MIN((int64_t)(arc_size - arc_c), 2817 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2818 arc_p)); 2819 2820 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2821 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2822 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2823 adjustment -= delta; 2824 } 2825 2826 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2827 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2828 (void) arc_evict(arc_mru, 0, delta, FALSE, 2829 ARC_BUFC_METADATA); 2830 } 2831 2832 /* 2833 * Adjust MFU size 2834 */ 2835 2836 adjustment = arc_size - arc_c; 2837 2838 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2839 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2840 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2841 adjustment -= delta; 2842 } 2843 2844 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2845 int64_t delta = MIN(adjustment, 2846 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2847 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2848 ARC_BUFC_METADATA); 2849 } 2850 2851 /* 2852 * Adjust ghost lists 2853 */ 2854 2855 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2856 2857 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2858 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2859 arc_evict_ghost(arc_mru_ghost, 0, delta); 2860 } 2861 2862 adjustment = 2863 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2864 2865 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2866 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2867 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2868 } 2869} 2870 2871static void 2872arc_do_user_evicts(void) 2873{ 2874 mutex_enter(&arc_eviction_mtx); 2875 while (arc_eviction_list != NULL) { 2876 arc_buf_t *buf = arc_eviction_list; 2877 arc_eviction_list = buf->b_next; 2878 mutex_enter(&buf->b_evict_lock); 2879 buf->b_hdr = NULL; 2880 mutex_exit(&buf->b_evict_lock); 2881 mutex_exit(&arc_eviction_mtx); 2882 2883 if (buf->b_efunc != NULL) 2884 VERIFY0(buf->b_efunc(buf->b_private)); 2885 2886 buf->b_efunc = NULL; 2887 buf->b_private = NULL; 2888 kmem_cache_free(buf_cache, buf); 2889 mutex_enter(&arc_eviction_mtx); 2890 } 2891 mutex_exit(&arc_eviction_mtx); 2892} 2893 2894/* 2895 * Flush all *evictable* data from the cache for the given spa. 2896 * NOTE: this will not touch "active" (i.e. referenced) data. 
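 *
 * A NULL spa flushes evictable data for every pool; a non-NULL spa
 * restricts the flush to buffers tagged with that pool's load guid,
 * in which case a single pass is made over each list rather than
 * looping until the list is empty.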
2897 */ 2898void 2899arc_flush(spa_t *spa) 2900{ 2901 uint64_t guid = 0; 2902 2903 if (spa != NULL) 2904 guid = spa_load_guid(spa); 2905 2906 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2907 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2908 if (spa != NULL) 2909 break; 2910 } 2911 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2912 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2913 if (spa != NULL) 2914 break; 2915 } 2916 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2917 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2918 if (spa != NULL) 2919 break; 2920 } 2921 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2922 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2923 if (spa != NULL) 2924 break; 2925 } 2926 2927 arc_evict_ghost(arc_mru_ghost, guid, -1); 2928 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2929 2930 mutex_enter(&arc_reclaim_thr_lock); 2931 arc_do_user_evicts(); 2932 mutex_exit(&arc_reclaim_thr_lock); 2933 ASSERT(spa || arc_eviction_list == NULL); 2934} 2935 2936void 2937arc_shrink(int64_t to_free) 2938{ 2939 if (arc_c > arc_c_min) { 2940 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 2941 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 2942 if (arc_c > arc_c_min + to_free) 2943 atomic_add_64(&arc_c, -to_free); 2944 else 2945 arc_c = arc_c_min; 2946 2947 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2948 if (arc_c > arc_size) 2949 arc_c = MAX(arc_size, arc_c_min); 2950 if (arc_p > arc_c) 2951 arc_p = (arc_c >> 1); 2952 2953 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 2954 arc_p); 2955 2956 ASSERT(arc_c >= arc_c_min); 2957 ASSERT((int64_t)arc_p >= 0); 2958 } 2959 2960 if (arc_size > arc_c) { 2961 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 2962 uint64_t, arc_c); 2963 arc_adjust(); 2964 } 2965} 2966 2967static long needfree = 0; 2968 2969typedef enum free_memory_reason_t { 2970 FMR_UNKNOWN, 2971 FMR_NEEDFREE, 2972 FMR_LOTSFREE, 2973 FMR_SWAPFS_MINFREE, 2974 FMR_PAGES_PP_MAXIMUM, 2975 FMR_HEAP_ARENA, 2976 FMR_ZIO_ARENA, 2977 FMR_ZIO_FRAG, 2978} free_memory_reason_t; 2979 2980int64_t last_free_memory; 2981free_memory_reason_t last_free_reason; 2982 2983/* 2984 * Additional reserve of pages for pp_reserve. 2985 */ 2986int64_t arc_pages_pp_reserve = 64; 2987 2988/* 2989 * Additional reserve of pages for swapfs. 2990 */ 2991int64_t arc_swapfs_reserve = 64; 2992 2993/* 2994 * Return the amount of memory that can be consumed before reclaim will be 2995 * needed. Positive if there is sufficient free memory, negative indicates 2996 * the amount of memory that needs to be freed up. 2997 */ 2998static int64_t 2999arc_available_memory(void) 3000{ 3001 int64_t lowest = INT64_MAX; 3002 int64_t n; 3003 free_memory_reason_t r = FMR_UNKNOWN; 3004 3005#ifdef _KERNEL 3006 if (needfree > 0) { 3007 n = PAGESIZE * (-needfree); 3008 if (n < lowest) { 3009 lowest = n; 3010 r = FMR_NEEDFREE; 3011 } 3012 } 3013 3014 /* 3015 * Cooperate with pagedaemon when it's time for it to scan 3016 * and reclaim some pages. 3017 */ 3018 n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 3019 if (n < lowest) { 3020 lowest = n; 3021 r = FMR_LOTSFREE; 3022 } 3023 3024#ifdef illumos 3025 /* 3026 * check that we're out of range of the pageout scanner. It starts to 3027 * schedule paging if freemem is less than lotsfree and needfree. 3028 * lotsfree is the high-water mark for pageout, and needfree is the 3029 * number of needed free pages. 
We add extra pages here to make sure 3030 * the scanner doesn't start up while we're freeing memory. 3031 */ 3032 n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 3033 if (n < lowest) { 3034 lowest = n; 3035 r = FMR_LOTSFREE; 3036 } 3037 3038 /* 3039 * check to make sure that swapfs has enough space so that anon 3040 * reservations can still succeed. anon_resvmem() checks that the 3041 * availrmem is greater than swapfs_minfree, and the number of reserved 3042 * swap pages. We also add a bit of extra here just to prevent 3043 * circumstances from getting really dire. 3044 */ 3045 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 3046 desfree - arc_swapfs_reserve); 3047 if (n < lowest) { 3048 lowest = n; 3049 r = FMR_SWAPFS_MINFREE; 3050 } 3051 3052 3053 /* 3054 * Check that we have enough availrmem that memory locking (e.g., via 3055 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3056 * stores the number of pages that cannot be locked; when availrmem 3057 * drops below pages_pp_maximum, page locking mechanisms such as 3058 * page_pp_lock() will fail.) 3059 */ 3060 n = PAGESIZE * (availrmem - pages_pp_maximum - 3061 arc_pages_pp_reserve); 3062 if (n < lowest) { 3063 lowest = n; 3064 r = FMR_PAGES_PP_MAXIMUM; 3065 } 3066 3067#endif /* illumos */ 3068#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3069 /* 3070 * If we're on an i386 platform, it's possible that we'll exhaust the 3071 * kernel heap space before we ever run out of available physical 3072 * memory. Most checks of the size of the heap_area compare against 3073 * tune.t_minarmem, which is the minimum available real memory that we 3074 * can have in the system. However, this is generally fixed at 25 pages 3075 * which is so low that it's useless. In this comparison, we seek to 3076 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3077 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3078 * free) 3079 */ 3080 n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 3081 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 3082 if (n < lowest) { 3083 lowest = n; 3084 r = FMR_HEAP_ARENA; 3085 } 3086#define zio_arena NULL 3087#else 3088#define zio_arena heap_arena 3089#endif 3090 3091 /* 3092 * If zio data pages are being allocated out of a separate heap segment, 3093 * then enforce that the size of available vmem for this arena remains 3094 * above about 1/16th free. 3095 * 3096 * Note: The 1/16th arena free requirement was put in place 3097 * to aggressively evict memory from the arc in order to avoid 3098 * memory fragmentation issues. 3099 */ 3100 if (zio_arena != NULL) { 3101 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 3102 (vmem_size(zio_arena, VMEM_ALLOC) >> 4); 3103 if (n < lowest) { 3104 lowest = n; 3105 r = FMR_ZIO_ARENA; 3106 } 3107 } 3108 3109 /* 3110 * Above limits know nothing about real level of KVA fragmentation. 3111 * Start aggressive reclamation if too little sequential KVA left. 3112 */ 3113 if (lowest > 0) { 3114 n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ? 
3115 -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 3116 INT64_MAX; 3117 if (n < lowest) { 3118 lowest = n; 3119 r = FMR_ZIO_FRAG; 3120 } 3121 } 3122 3123#else /* _KERNEL */ 3124 /* Every 100 calls, free a small amount */ 3125 if (spa_get_random(100) == 0) 3126 lowest = -1024; 3127#endif /* _KERNEL */ 3128 3129 last_free_memory = lowest; 3130 last_free_reason = r; 3131 DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 3132 return (lowest); 3133} 3134 3135 3136/* 3137 * Determine if the system is under memory pressure and is asking 3138 * to reclaim memory. A return value of TRUE indicates that the system 3139 * is under memory pressure and that the arc should adjust accordingly. 3140 */ 3141static boolean_t 3142arc_reclaim_needed(void) 3143{ 3144 return (arc_available_memory() < 0); 3145} 3146 3147extern kmem_cache_t *zio_buf_cache[]; 3148extern kmem_cache_t *zio_data_buf_cache[]; 3149extern kmem_cache_t *range_seg_cache; 3150 3151static __noinline void 3152arc_kmem_reap_now(void) 3153{ 3154 size_t i; 3155 kmem_cache_t *prev_cache = NULL; 3156 kmem_cache_t *prev_data_cache = NULL; 3157 3158 DTRACE_PROBE(arc__kmem_reap_start); 3159#ifdef _KERNEL 3160 if (arc_meta_used >= arc_meta_limit) { 3161 /* 3162 * We are exceeding our meta-data cache limit. 3163 * Purge some DNLC entries to release holds on meta-data. 3164 */ 3165 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3166 } 3167#if defined(__i386) 3168 /* 3169 * Reclaim unused memory from all kmem caches. 3170 */ 3171 kmem_reap(); 3172#endif 3173#endif 3174 3175 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3176 if (zio_buf_cache[i] != prev_cache) { 3177 prev_cache = zio_buf_cache[i]; 3178 kmem_cache_reap_now(zio_buf_cache[i]); 3179 } 3180 if (zio_data_buf_cache[i] != prev_data_cache) { 3181 prev_data_cache = zio_data_buf_cache[i]; 3182 kmem_cache_reap_now(zio_data_buf_cache[i]); 3183 } 3184 } 3185 kmem_cache_reap_now(buf_cache); 3186 kmem_cache_reap_now(hdr_full_cache); 3187 kmem_cache_reap_now(hdr_l2only_cache); 3188 kmem_cache_reap_now(range_seg_cache); 3189 3190#ifdef illumos 3191 if (zio_arena != NULL) { 3192 /* 3193 * Ask the vmem arena to reclaim unused memory from its 3194 * quantum caches. 3195 */ 3196 vmem_qcache_reap(zio_arena); 3197 } 3198#endif 3199 DTRACE_PROBE(arc__kmem_reap_end); 3200} 3201 3202static void 3203arc_reclaim_thread(void *dummy __unused) 3204{ 3205 clock_t growtime = 0; 3206 callb_cpr_t cpr; 3207 3208 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 3209 3210 mutex_enter(&arc_reclaim_thr_lock); 3211 while (arc_thread_exit == 0) { 3212 int64_t free_memory = arc_available_memory(); 3213 if (free_memory < 0) { 3214 3215 arc_no_grow = B_TRUE; 3216 arc_warm = B_TRUE; 3217 3218 /* 3219 * Wait at least zfs_grow_retry (default 60) seconds 3220 * before considering growing. 3221 */ 3222 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 3223 3224 arc_kmem_reap_now(); 3225 3226 /* 3227 * If we are still low on memory, shrink the ARC 3228 * so that we have arc_shrink_min free space. 
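			 *
			 * (For instance, with arc_c at 4 GB and an
			 * arc_shrink_shift of 7, the computation below asks
			 * for roughly 32 MB of headroom plus whatever
			 * deficit arc_available_memory() just reported.)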
			 */
			free_memory = arc_available_memory();

			int64_t to_free =
			    (arc_c >> arc_shrink_shift) - free_memory;
			if (to_free > 0) {
#ifdef _KERNEL
				to_free = MAX(to_free, ptob(needfree));
#endif
				arc_shrink(to_free);
			}
		} else if (free_memory < arc_c >> arc_no_grow_shift) {
			arc_no_grow = B_TRUE;
		} else if (ddi_get_lbolt() >= growtime) {
			arc_no_grow = B_FALSE;
		}

		arc_adjust();

		if (arc_eviction_list != NULL)
			arc_do_user_evicts();

#ifdef _KERNEL
		if (needfree) {
			needfree = 0;
			wakeup(&needfree);
		}
#endif

		/*
		 * This is necessary in order for the mdb ::arc dcmd to
		 * show up to date information. Since the ::arc command
		 * does not call the kstat's update function, without
		 * this call, the command may show stale stats for the
		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
		 * with this change, the data might be up to 1 second
		 * out of date; but that should suffice. The arc_state_t
		 * structures can be queried directly if more accurate
		 * information is needed.
		 */
		if (arc_ksp != NULL)
			arc_ksp->ks_update(arc_ksp, KSTAT_READ);

		/* block until needed, or one second, whichever is shorter */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_reclaim_thr_cv,
		    &arc_reclaim_thr_lock, hz);
		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
	}

	arc_thread_exit = 0;
	cv_broadcast(&arc_reclaim_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
	thread_exit();
}

/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from. This function is only called
 * when we are adding new content to the cache.
 */
static void
arc_adapt(int bytes, arc_state_t *state)
{
	int mult;
	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);

	if (state == arc_l2c_only)
		return;

	ASSERT(bytes > 0);
	/*
	 * Adapt the target size of the MRU list:
	 *	- if we just hit in the MRU ghost list, then increase
	 *	  the target size of the MRU list.
	 *	- if we just hit in the MFU ghost list, then increase
	 *	  the target size of the MFU list by decreasing the
	 *	  target size of the MRU list.
	 */
	if (state == arc_mru_ghost) {
		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */

		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
	} else if (state == arc_mfu_ghost) {
		uint64_t delta;

		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3318 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 3319 mult = MIN(mult, 10); 3320 3321 delta = MIN(bytes * mult, arc_p); 3322 arc_p = MAX(arc_p_min, arc_p - delta); 3323 } 3324 ASSERT((int64_t)arc_p >= 0); 3325 3326 if (arc_reclaim_needed()) { 3327 cv_signal(&arc_reclaim_thr_cv); 3328 return; 3329 } 3330 3331 if (arc_no_grow) 3332 return; 3333 3334 if (arc_c >= arc_c_max) 3335 return; 3336 3337 /* 3338 * If we're within (2 * maxblocksize) bytes of the target 3339 * cache size, increment the target cache size 3340 */ 3341 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3342 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3343 atomic_add_64(&arc_c, (int64_t)bytes); 3344 if (arc_c > arc_c_max) 3345 arc_c = arc_c_max; 3346 else if (state == arc_anon) 3347 atomic_add_64(&arc_p, (int64_t)bytes); 3348 if (arc_p > arc_c) 3349 arc_p = arc_c; 3350 } 3351 ASSERT((int64_t)arc_p >= 0); 3352} 3353 3354/* 3355 * Check if the cache has reached its limits and eviction is required 3356 * prior to insert. 3357 */ 3358static int 3359arc_evict_needed(arc_buf_contents_t type) 3360{ 3361 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 3362 return (1); 3363 3364 if (arc_reclaim_needed()) 3365 return (1); 3366 3367 return (arc_size > arc_c); 3368} 3369 3370/* 3371 * The buffer, supplied as the first argument, needs a data block. 3372 * So, if we are at cache max, determine which cache should be victimized. 3373 * We have the following cases: 3374 * 3375 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 3376 * In this situation if we're out of space, but the resident size of the MFU is 3377 * under the limit, victimize the MFU cache to satisfy this insertion request. 3378 * 3379 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 3380 * Here, we've used up all of the available space for the MRU, so we need to 3381 * evict from our own cache instead. Evict from the set of resident MRU 3382 * entries. 3383 * 3384 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 3385 * c minus p represents the MFU space in the cache, since p is the size of the 3386 * cache that is dedicated to the MRU. In this situation there's still space on 3387 * the MFU side, so the MRU side needs to be victimized. 3388 * 3389 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 3390 * MFU's resident set is consuming more space than it has been allotted. In 3391 * this situation, we must victimize our own cache, the MFU, for this insertion. 3392 */ 3393static void 3394arc_get_data_buf(arc_buf_t *buf) 3395{ 3396 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3397 uint64_t size = buf->b_hdr->b_size; 3398 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3399 3400 arc_adapt(size, state); 3401 3402 /* 3403 * We have not yet reached cache maximum size, 3404 * just allocate a new buffer. 3405 */ 3406 if (!arc_evict_needed(type)) { 3407 if (type == ARC_BUFC_METADATA) { 3408 buf->b_data = zio_buf_alloc(size); 3409 arc_space_consume(size, ARC_SPACE_META); 3410 } else { 3411 ASSERT(type == ARC_BUFC_DATA); 3412 buf->b_data = zio_data_buf_alloc(size); 3413 arc_space_consume(size, ARC_SPACE_DATA); 3414 } 3415 goto out; 3416 } 3417 3418 /* 3419 * If we are prefetching from the mfu ghost list, this buffer 3420 * will end up on the mru list; so steal space from there. 3421 */ 3422 if (state == arc_mfu_ghost) 3423 state = HDR_PREFETCH(buf->b_hdr) ? 
arc_mru : arc_mfu; 3424 else if (state == arc_mru_ghost) 3425 state = arc_mru; 3426 3427 if (state == arc_mru || state == arc_anon) { 3428 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 3429 state = (arc_mfu->arcs_lsize[type] >= size && 3430 arc_p > mru_used) ? arc_mfu : arc_mru; 3431 } else { 3432 /* MFU cases */ 3433 uint64_t mfu_space = arc_c - arc_p; 3434 state = (arc_mru->arcs_lsize[type] >= size && 3435 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 3436 } 3437 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 3438 if (type == ARC_BUFC_METADATA) { 3439 buf->b_data = zio_buf_alloc(size); 3440 arc_space_consume(size, ARC_SPACE_META); 3441 } else { 3442 ASSERT(type == ARC_BUFC_DATA); 3443 buf->b_data = zio_data_buf_alloc(size); 3444 arc_space_consume(size, ARC_SPACE_DATA); 3445 } 3446 ARCSTAT_BUMP(arcstat_recycle_miss); 3447 } 3448 ASSERT(buf->b_data != NULL); 3449out: 3450 /* 3451 * Update the state size. Note that ghost states have a 3452 * "ghost size" and so don't need to be updated. 3453 */ 3454 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3455 arc_buf_hdr_t *hdr = buf->b_hdr; 3456 3457 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); 3458 if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { 3459 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3460 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3461 size); 3462 } 3463 /* 3464 * If we are growing the cache, and we are adding anonymous 3465 * data, and we have outgrown arc_p, update arc_p 3466 */ 3467 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3468 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 3469 arc_p = MIN(arc_c, arc_p + size); 3470 } 3471 ARCSTAT_BUMP(arcstat_allocated); 3472} 3473 3474/* 3475 * This routine is called whenever a buffer is accessed. 3476 * NOTE: the hash lock is dropped in this function. 3477 */ 3478static void 3479arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3480{ 3481 clock_t now; 3482 3483 ASSERT(MUTEX_HELD(hash_lock)); 3484 ASSERT(HDR_HAS_L1HDR(hdr)); 3485 3486 if (hdr->b_l1hdr.b_state == arc_anon) { 3487 /* 3488 * This buffer is not in the cache, and does not 3489 * appear in our "ghost" list. Add the new buffer 3490 * to the MRU state. 3491 */ 3492 3493 ASSERT0(hdr->b_l1hdr.b_arc_access); 3494 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3495 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3496 arc_change_state(arc_mru, hdr, hash_lock); 3497 3498 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3499 now = ddi_get_lbolt(); 3500 3501 /* 3502 * If this buffer is here because of a prefetch, then either: 3503 * - clear the flag if this is a "referencing" read 3504 * (any subsequent access will bump this into the MFU state). 3505 * or 3506 * - move the buffer to the head of the list if this is 3507 * another prefetch (to make it less likely to be evicted). 3508 */ 3509 if (HDR_PREFETCH(hdr)) { 3510 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3511 ASSERT(list_link_active( 3512 &hdr->b_l1hdr.b_arc_node)); 3513 } else { 3514 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3515 ARCSTAT_BUMP(arcstat_mru_hits); 3516 } 3517 hdr->b_l1hdr.b_arc_access = now; 3518 return; 3519 } 3520 3521 /* 3522 * This buffer has been "accessed" only once so far, 3523 * but it is still in the cache. Move it to the MFU 3524 * state. 3525 */ 3526 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3527 /* 3528 * More than 125ms have passed since we 3529 * instantiated this buffer. Move it to the 3530 * most frequently used state. 
3531 */ 3532 hdr->b_l1hdr.b_arc_access = now; 3533 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3534 arc_change_state(arc_mfu, hdr, hash_lock); 3535 } 3536 ARCSTAT_BUMP(arcstat_mru_hits); 3537 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3538 arc_state_t *new_state; 3539 /* 3540 * This buffer has been "accessed" recently, but 3541 * was evicted from the cache. Move it to the 3542 * MFU state. 3543 */ 3544 3545 if (HDR_PREFETCH(hdr)) { 3546 new_state = arc_mru; 3547 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3548 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3549 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3550 } else { 3551 new_state = arc_mfu; 3552 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3553 } 3554 3555 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3556 arc_change_state(new_state, hdr, hash_lock); 3557 3558 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3559 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3560 /* 3561 * This buffer has been accessed more than once and is 3562 * still in the cache. Keep it in the MFU state. 3563 * 3564 * NOTE: an add_reference() that occurred when we did 3565 * the arc_read() will have kicked this off the list. 3566 * If it was a prefetch, we will explicitly move it to 3567 * the head of the list now. 3568 */ 3569 if ((HDR_PREFETCH(hdr)) != 0) { 3570 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3571 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 3572 } 3573 ARCSTAT_BUMP(arcstat_mfu_hits); 3574 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3575 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3576 arc_state_t *new_state = arc_mfu; 3577 /* 3578 * This buffer has been accessed more than once but has 3579 * been evicted from the cache. Move it back to the 3580 * MFU state. 3581 */ 3582 3583 if (HDR_PREFETCH(hdr)) { 3584 /* 3585 * This is a prefetch access... 3586 * move this block back to the MRU state. 3587 */ 3588 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3589 new_state = arc_mru; 3590 } 3591 3592 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3593 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3594 arc_change_state(new_state, hdr, hash_lock); 3595 3596 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3597 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3598 /* 3599 * This buffer is on the 2nd Level ARC. 
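 * Only its copy on the cache device survived eviction from main
 * memory; a fresh access means it is being pulled back into the
 * primary cache, so, much like a ghost-list hit, it is promoted
 * straight to the MFU state below.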
3600 */ 3601 3602 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3603 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3604 arc_change_state(arc_mfu, hdr, hash_lock); 3605 } else { 3606 ASSERT(!"invalid arc state"); 3607 } 3608} 3609 3610/* a generic arc_done_func_t which you can use */ 3611/* ARGSUSED */ 3612void 3613arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3614{ 3615 if (zio == NULL || zio->io_error == 0) 3616 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3617 VERIFY(arc_buf_remove_ref(buf, arg)); 3618} 3619 3620/* a generic arc_done_func_t */ 3621void 3622arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3623{ 3624 arc_buf_t **bufp = arg; 3625 if (zio && zio->io_error) { 3626 VERIFY(arc_buf_remove_ref(buf, arg)); 3627 *bufp = NULL; 3628 } else { 3629 *bufp = buf; 3630 ASSERT(buf->b_data); 3631 } 3632} 3633 3634static void 3635arc_read_done(zio_t *zio) 3636{ 3637 arc_buf_hdr_t *hdr; 3638 arc_buf_t *buf; 3639 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3640 kmutex_t *hash_lock = NULL; 3641 arc_callback_t *callback_list, *acb; 3642 int freeable = FALSE; 3643 3644 buf = zio->io_private; 3645 hdr = buf->b_hdr; 3646 3647 /* 3648 * The hdr was inserted into hash-table and removed from lists 3649 * prior to starting I/O. We should find this header, since 3650 * it's in the hash table, and it should be legit since it's 3651 * not possible to evict it during the I/O. The only possible 3652 * reason for it not to be found is if we were freed during the 3653 * read. 3654 */ 3655 if (HDR_IN_HASH_TABLE(hdr)) { 3656 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3657 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3658 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3659 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3660 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3661 3662 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3663 &hash_lock); 3664 3665 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3666 hash_lock == NULL) || 3667 (found == hdr && 3668 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3669 (found == hdr && HDR_L2_READING(hdr))); 3670 } 3671 3672 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3673 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3674 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3675 3676 /* byteswap if necessary */ 3677 callback_list = hdr->b_l1hdr.b_acb; 3678 ASSERT(callback_list != NULL); 3679 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3680 dmu_object_byteswap_t bswap = 3681 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3682 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3683 byteswap_uint64_array : 3684 dmu_ot_byteswap[bswap].ob_func; 3685 func(buf->b_data, hdr->b_size); 3686 } 3687 3688 arc_cksum_compute(buf, B_FALSE); 3689#ifdef illumos 3690 arc_buf_watch(buf); 3691#endif 3692 3693 if (hash_lock && zio->io_error == 0 && 3694 hdr->b_l1hdr.b_state == arc_anon) { 3695 /* 3696 * Only call arc_access on anonymous buffers. This is because 3697 * if we've issued an I/O for an evicted buffer, we've already 3698 * called arc_access (to prevent any simultaneous readers from 3699 * getting confused). 
3700 */ 3701 arc_access(hdr, hash_lock); 3702 } 3703 3704 /* create copies of the data buffer for the callers */ 3705 abuf = buf; 3706 for (acb = callback_list; acb; acb = acb->acb_next) { 3707 if (acb->acb_done) { 3708 if (abuf == NULL) { 3709 ARCSTAT_BUMP(arcstat_duplicate_reads); 3710 abuf = arc_buf_clone(buf); 3711 } 3712 acb->acb_buf = abuf; 3713 abuf = NULL; 3714 } 3715 } 3716 hdr->b_l1hdr.b_acb = NULL; 3717 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 3718 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3719 if (abuf == buf) { 3720 ASSERT(buf->b_efunc == NULL); 3721 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 3722 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 3723 } 3724 3725 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 3726 callback_list != NULL); 3727 3728 if (zio->io_error != 0) { 3729 hdr->b_flags |= ARC_FLAG_IO_ERROR; 3730 if (hdr->b_l1hdr.b_state != arc_anon) 3731 arc_change_state(arc_anon, hdr, hash_lock); 3732 if (HDR_IN_HASH_TABLE(hdr)) 3733 buf_hash_remove(hdr); 3734 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3735 } 3736 3737 /* 3738 * Broadcast before we drop the hash_lock to avoid the possibility 3739 * that the hdr (and hence the cv) might be freed before we get to 3740 * the cv_broadcast(). 3741 */ 3742 cv_broadcast(&hdr->b_l1hdr.b_cv); 3743 3744 if (hash_lock != NULL) { 3745 mutex_exit(hash_lock); 3746 } else { 3747 /* 3748 * This block was freed while we waited for the read to 3749 * complete. It has been removed from the hash table and 3750 * moved to the anonymous state (so that it won't show up 3751 * in the cache). 3752 */ 3753 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3754 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3755 } 3756 3757 /* execute each callback and free its structure */ 3758 while ((acb = callback_list) != NULL) { 3759 if (acb->acb_done) 3760 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3761 3762 if (acb->acb_zio_dummy != NULL) { 3763 acb->acb_zio_dummy->io_error = zio->io_error; 3764 zio_nowait(acb->acb_zio_dummy); 3765 } 3766 3767 callback_list = acb->acb_next; 3768 kmem_free(acb, sizeof (arc_callback_t)); 3769 } 3770 3771 if (freeable) 3772 arc_hdr_destroy(hdr); 3773} 3774 3775/* 3776 * "Read" the block at the specified DVA (in bp) via the 3777 * cache. If the block is found in the cache, invoke the provided 3778 * callback immediately and return. Note that the `zio' parameter 3779 * in the callback will be NULL in this case, since no IO was 3780 * required. If the block is not in the cache pass the read request 3781 * on to the spa with a substitute callback function, so that the 3782 * requested block will be added to the cache. 3783 * 3784 * If a read request arrives for a block that has a read in-progress, 3785 * either wait for the in-progress read to complete (and return the 3786 * results); or, if this is a read with a "done" func, add a record 3787 * to the read to invoke the "done" func when the read completes, 3788 * and return; or just return. 3789 * 3790 * arc_read_done() will invoke all the requested "done" functions 3791 * for readers of this block. 
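 *
 * A minimal caller-side sketch (illustrative only; the variable
 * names and flag choices below are hypothetical and not part of
 * this file):
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *
 *	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * With ARC_FLAG_WAIT the call blocks until the read completes;
 * arc_getbuf_func() (defined above) then leaves the resulting
 * arc_buf_t in *abuf on success, or NULL if the zio failed.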
3792 */ 3793int 3794arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3795 void *private, zio_priority_t priority, int zio_flags, 3796 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 3797{ 3798 arc_buf_hdr_t *hdr = NULL; 3799 arc_buf_t *buf = NULL; 3800 kmutex_t *hash_lock = NULL; 3801 zio_t *rzio; 3802 uint64_t guid = spa_load_guid(spa); 3803 3804 ASSERT(!BP_IS_EMBEDDED(bp) || 3805 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 3806 3807top: 3808 if (!BP_IS_EMBEDDED(bp)) { 3809 /* 3810 * Embedded BP's have no DVA and require no I/O to "read". 3811 * Create an anonymous arc buf to back it. 3812 */ 3813 hdr = buf_hash_find(guid, bp, &hash_lock); 3814 } 3815 3816 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 3817 3818 *arc_flags |= ARC_FLAG_CACHED; 3819 3820 if (HDR_IO_IN_PROGRESS(hdr)) { 3821 3822 if (*arc_flags & ARC_FLAG_WAIT) { 3823 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 3824 mutex_exit(hash_lock); 3825 goto top; 3826 } 3827 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 3828 3829 if (done) { 3830 arc_callback_t *acb = NULL; 3831 3832 acb = kmem_zalloc(sizeof (arc_callback_t), 3833 KM_SLEEP); 3834 acb->acb_done = done; 3835 acb->acb_private = private; 3836 if (pio != NULL) 3837 acb->acb_zio_dummy = zio_null(pio, 3838 spa, NULL, NULL, NULL, zio_flags); 3839 3840 ASSERT(acb->acb_done != NULL); 3841 acb->acb_next = hdr->b_l1hdr.b_acb; 3842 hdr->b_l1hdr.b_acb = acb; 3843 add_reference(hdr, hash_lock, private); 3844 mutex_exit(hash_lock); 3845 return (0); 3846 } 3847 mutex_exit(hash_lock); 3848 return (0); 3849 } 3850 3851 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 3852 hdr->b_l1hdr.b_state == arc_mfu); 3853 3854 if (done) { 3855 add_reference(hdr, hash_lock, private); 3856 /* 3857 * If this block is already in use, create a new 3858 * copy of the data so that we will be guaranteed 3859 * that arc_release() will always succeed. 
3860 */ 3861 buf = hdr->b_l1hdr.b_buf; 3862 ASSERT(buf); 3863 ASSERT(buf->b_data); 3864 if (HDR_BUF_AVAILABLE(hdr)) { 3865 ASSERT(buf->b_efunc == NULL); 3866 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3867 } else { 3868 buf = arc_buf_clone(buf); 3869 } 3870 3871 } else if (*arc_flags & ARC_FLAG_PREFETCH && 3872 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3873 hdr->b_flags |= ARC_FLAG_PREFETCH; 3874 } 3875 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3876 arc_access(hdr, hash_lock); 3877 if (*arc_flags & ARC_FLAG_L2CACHE) 3878 hdr->b_flags |= ARC_FLAG_L2CACHE; 3879 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3880 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3881 mutex_exit(hash_lock); 3882 ARCSTAT_BUMP(arcstat_hits); 3883 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3884 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3885 data, metadata, hits); 3886 3887 if (done) 3888 done(NULL, buf, private); 3889 } else { 3890 uint64_t size = BP_GET_LSIZE(bp); 3891 arc_callback_t *acb; 3892 vdev_t *vd = NULL; 3893 uint64_t addr = 0; 3894 boolean_t devw = B_FALSE; 3895 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3896 int32_t b_asize = 0; 3897 3898 if (hdr == NULL) { 3899 /* this block is not in the cache */ 3900 arc_buf_hdr_t *exists = NULL; 3901 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3902 buf = arc_buf_alloc(spa, size, private, type); 3903 hdr = buf->b_hdr; 3904 if (!BP_IS_EMBEDDED(bp)) { 3905 hdr->b_dva = *BP_IDENTITY(bp); 3906 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3907 exists = buf_hash_insert(hdr, &hash_lock); 3908 } 3909 if (exists != NULL) { 3910 /* somebody beat us to the hash insert */ 3911 mutex_exit(hash_lock); 3912 buf_discard_identity(hdr); 3913 (void) arc_buf_remove_ref(buf, private); 3914 goto top; /* restart the IO request */ 3915 } 3916 3917 /* if this is a prefetch, we don't have a reference */ 3918 if (*arc_flags & ARC_FLAG_PREFETCH) { 3919 (void) remove_reference(hdr, hash_lock, 3920 private); 3921 hdr->b_flags |= ARC_FLAG_PREFETCH; 3922 } 3923 if (*arc_flags & ARC_FLAG_L2CACHE) 3924 hdr->b_flags |= ARC_FLAG_L2CACHE; 3925 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3926 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3927 if (BP_GET_LEVEL(bp) > 0) 3928 hdr->b_flags |= ARC_FLAG_INDIRECT; 3929 } else { 3930 /* 3931 * This block is in the ghost cache. If it was L2-only 3932 * (and thus didn't have an L1 hdr), we realloc the 3933 * header to add an L1 hdr. 
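 * An L2-only header has no valid b_l1hdr, and therefore no buffer
 * list, reference count or ARC state; reallocating it from
 * hdr_l2only_cache to hdr_full_cache below gives it those fields
 * before the block is read back into memory.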
3934 */ 3935 if (!HDR_HAS_L1HDR(hdr)) { 3936 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 3937 hdr_full_cache); 3938 } 3939 3940 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 3941 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3942 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3943 ASSERT(hdr->b_l1hdr.b_buf == NULL); 3944 3945 /* if this is a prefetch, we don't have a reference */ 3946 if (*arc_flags & ARC_FLAG_PREFETCH) 3947 hdr->b_flags |= ARC_FLAG_PREFETCH; 3948 else 3949 add_reference(hdr, hash_lock, private); 3950 if (*arc_flags & ARC_FLAG_L2CACHE) 3951 hdr->b_flags |= ARC_FLAG_L2CACHE; 3952 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3953 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3954 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3955 buf->b_hdr = hdr; 3956 buf->b_data = NULL; 3957 buf->b_efunc = NULL; 3958 buf->b_private = NULL; 3959 buf->b_next = NULL; 3960 hdr->b_l1hdr.b_buf = buf; 3961 ASSERT0(hdr->b_l1hdr.b_datacnt); 3962 hdr->b_l1hdr.b_datacnt = 1; 3963 arc_get_data_buf(buf); 3964 arc_access(hdr, hash_lock); 3965 } 3966 3967 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 3968 3969 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3970 acb->acb_done = done; 3971 acb->acb_private = private; 3972 3973 ASSERT(hdr->b_l1hdr.b_acb == NULL); 3974 hdr->b_l1hdr.b_acb = acb; 3975 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 3976 3977 if (HDR_HAS_L2HDR(hdr) && 3978 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 3979 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 3980 addr = hdr->b_l2hdr.b_daddr; 3981 b_compress = HDR_GET_COMPRESS(hdr); 3982 b_asize = hdr->b_l2hdr.b_asize; 3983 /* 3984 * Lock out device removal. 3985 */ 3986 if (vdev_is_dead(vd) || 3987 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3988 vd = NULL; 3989 } 3990 3991 if (hash_lock != NULL) 3992 mutex_exit(hash_lock); 3993 3994 /* 3995 * At this point, we have a level 1 cache miss. Try again in 3996 * L2ARC if possible. 3997 */ 3998 ASSERT3U(hdr->b_size, ==, size); 3999 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 4000 uint64_t, size, zbookmark_phys_t *, zb); 4001 ARCSTAT_BUMP(arcstat_misses); 4002 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4003 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4004 data, metadata, misses); 4005#ifdef _KERNEL 4006 curthread->td_ru.ru_inblock++; 4007#endif 4008 4009 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4010 /* 4011 * Read from the L2ARC if the following are true: 4012 * 1. The L2ARC vdev was previously cached. 4013 * 2. This buffer still has L2ARC metadata. 4014 * 3. This buffer isn't currently writing to the L2ARC. 4015 * 4. The L2ARC entry wasn't evicted, which may 4016 * also have invalidated the vdev. 4017 * 5. This isn't prefetch and l2arc_noprefetch is set. 4018 */ 4019 if (HDR_HAS_L2HDR(hdr) && 4020 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4021 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4022 l2arc_read_callback_t *cb; 4023 4024 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4025 ARCSTAT_BUMP(arcstat_l2_hits); 4026 4027 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4028 KM_SLEEP); 4029 cb->l2rcb_buf = buf; 4030 cb->l2rcb_spa = spa; 4031 cb->l2rcb_bp = *bp; 4032 cb->l2rcb_zb = *zb; 4033 cb->l2rcb_flags = zio_flags; 4034 cb->l2rcb_compress = b_compress; 4035 4036 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4037 addr + size < vd->vdev_psize - 4038 VDEV_LABEL_END_SIZE); 4039 4040 /* 4041 * l2arc read. The SCL_L2ARC lock will be 4042 * released by l2arc_read_done(). 4043 * Issue a null zio if the underlying buffer 4044 * was squashed to zero size by compression. 
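 * (ZIO_COMPRESS_EMPTY marks a buffer whose contents compressed
 * away entirely, e.g. a block of zeroes; nothing was written to the
 * cache device for it, so no physical read is needed and the
 * decompression step driven by l2arc_read_done() reconstructs the
 * data instead.)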
4045 */ 4046 if (b_compress == ZIO_COMPRESS_EMPTY) { 4047 rzio = zio_null(pio, spa, vd, 4048 l2arc_read_done, cb, 4049 zio_flags | ZIO_FLAG_DONT_CACHE | 4050 ZIO_FLAG_CANFAIL | 4051 ZIO_FLAG_DONT_PROPAGATE | 4052 ZIO_FLAG_DONT_RETRY); 4053 } else { 4054 rzio = zio_read_phys(pio, vd, addr, 4055 b_asize, buf->b_data, 4056 ZIO_CHECKSUM_OFF, 4057 l2arc_read_done, cb, priority, 4058 zio_flags | ZIO_FLAG_DONT_CACHE | 4059 ZIO_FLAG_CANFAIL | 4060 ZIO_FLAG_DONT_PROPAGATE | 4061 ZIO_FLAG_DONT_RETRY, B_FALSE); 4062 } 4063 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4064 zio_t *, rzio); 4065 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4066 4067 if (*arc_flags & ARC_FLAG_NOWAIT) { 4068 zio_nowait(rzio); 4069 return (0); 4070 } 4071 4072 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4073 if (zio_wait(rzio) == 0) 4074 return (0); 4075 4076 /* l2arc read error; goto zio_read() */ 4077 } else { 4078 DTRACE_PROBE1(l2arc__miss, 4079 arc_buf_hdr_t *, hdr); 4080 ARCSTAT_BUMP(arcstat_l2_misses); 4081 if (HDR_L2_WRITING(hdr)) 4082 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4083 spa_config_exit(spa, SCL_L2ARC, vd); 4084 } 4085 } else { 4086 if (vd != NULL) 4087 spa_config_exit(spa, SCL_L2ARC, vd); 4088 if (l2arc_ndev != 0) { 4089 DTRACE_PROBE1(l2arc__miss, 4090 arc_buf_hdr_t *, hdr); 4091 ARCSTAT_BUMP(arcstat_l2_misses); 4092 } 4093 } 4094 4095 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4096 arc_read_done, buf, priority, zio_flags, zb); 4097 4098 if (*arc_flags & ARC_FLAG_WAIT) 4099 return (zio_wait(rzio)); 4100 4101 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4102 zio_nowait(rzio); 4103 } 4104 return (0); 4105} 4106 4107void 4108arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4109{ 4110 ASSERT(buf->b_hdr != NULL); 4111 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4112 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4113 func == NULL); 4114 ASSERT(buf->b_efunc == NULL); 4115 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4116 4117 buf->b_efunc = func; 4118 buf->b_private = private; 4119} 4120 4121/* 4122 * Notify the arc that a block was freed, and thus will never be used again. 4123 */ 4124void 4125arc_freed(spa_t *spa, const blkptr_t *bp) 4126{ 4127 arc_buf_hdr_t *hdr; 4128 kmutex_t *hash_lock; 4129 uint64_t guid = spa_load_guid(spa); 4130 4131 ASSERT(!BP_IS_EMBEDDED(bp)); 4132 4133 hdr = buf_hash_find(guid, bp, &hash_lock); 4134 if (hdr == NULL) 4135 return; 4136 if (HDR_BUF_AVAILABLE(hdr)) { 4137 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4138 add_reference(hdr, hash_lock, FTAG); 4139 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4140 mutex_exit(hash_lock); 4141 4142 arc_release(buf, FTAG); 4143 (void) arc_buf_remove_ref(buf, FTAG); 4144 } else { 4145 mutex_exit(hash_lock); 4146 } 4147 4148} 4149 4150/* 4151 * Clear the user eviction callback set by arc_set_callback(), first calling 4152 * it if it exists. Because the presence of a callback keeps an arc_buf cached 4153 * clearing the callback may result in the arc_buf being destroyed. However, 4154 * it will not result in the *last* arc_buf being destroyed, hence the data 4155 * will remain cached in the ARC. We make a copy of the arc buffer here so 4156 * that we can process the callback without holding any locks. 4157 * 4158 * It's possible that the callback is already in the process of being cleared 4159 * by another thread. In this case we can not clear the callback. 4160 * 4161 * Returns B_TRUE if the callback was successfully called and cleared. 
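 *
 * A return of B_FALSE means arc_do_user_evicts() already owns this
 * buffer; the callback will be (or has already been) invoked from
 * that path instead, so the caller must not assume it ran here.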
4162 */ 4163boolean_t 4164arc_clear_callback(arc_buf_t *buf) 4165{ 4166 arc_buf_hdr_t *hdr; 4167 kmutex_t *hash_lock; 4168 arc_evict_func_t *efunc = buf->b_efunc; 4169 void *private = buf->b_private; 4170 4171 mutex_enter(&buf->b_evict_lock); 4172 hdr = buf->b_hdr; 4173 if (hdr == NULL) { 4174 /* 4175 * We are in arc_do_user_evicts(). 4176 */ 4177 ASSERT(buf->b_data == NULL); 4178 mutex_exit(&buf->b_evict_lock); 4179 return (B_FALSE); 4180 } else if (buf->b_data == NULL) { 4181 /* 4182 * We are on the eviction list; process this buffer now 4183 * but let arc_do_user_evicts() do the reaping. 4184 */ 4185 buf->b_efunc = NULL; 4186 mutex_exit(&buf->b_evict_lock); 4187 VERIFY0(efunc(private)); 4188 return (B_TRUE); 4189 } 4190 hash_lock = HDR_LOCK(hdr); 4191 mutex_enter(hash_lock); 4192 hdr = buf->b_hdr; 4193 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4194 4195 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4196 hdr->b_l1hdr.b_datacnt); 4197 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4198 hdr->b_l1hdr.b_state == arc_mfu); 4199 4200 buf->b_efunc = NULL; 4201 buf->b_private = NULL; 4202 4203 if (hdr->b_l1hdr.b_datacnt > 1) { 4204 mutex_exit(&buf->b_evict_lock); 4205 arc_buf_destroy(buf, FALSE, TRUE); 4206 } else { 4207 ASSERT(buf == hdr->b_l1hdr.b_buf); 4208 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4209 mutex_exit(&buf->b_evict_lock); 4210 } 4211 4212 mutex_exit(hash_lock); 4213 VERIFY0(efunc(private)); 4214 return (B_TRUE); 4215} 4216 4217/* 4218 * Release this buffer from the cache, making it an anonymous buffer. This 4219 * must be done after a read and prior to modifying the buffer contents. 4220 * If the buffer has more than one reference, we must make 4221 * a new hdr for the buffer. 4222 */ 4223void 4224arc_release(arc_buf_t *buf, void *tag) 4225{ 4226 arc_buf_hdr_t *hdr = buf->b_hdr; 4227 4228 /* 4229 * It would be nice to assert that if it's DMU metadata (level > 4230 * 0 || it's the dnode file), then it must be syncing context. 4231 * But we don't know that information at this level. 4232 */ 4233 4234 mutex_enter(&buf->b_evict_lock); 4235 /* 4236 * We don't grab the hash lock prior to this check, because if 4237 * the buffer's header is in the arc_anon state, it won't be 4238 * linked into the hash table. 4239 */ 4240 if (hdr->b_l1hdr.b_state == arc_anon) { 4241 mutex_exit(&buf->b_evict_lock); 4242 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4243 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4244 ASSERT(!HDR_HAS_L2HDR(hdr)); 4245 ASSERT(BUF_EMPTY(hdr)); 4246 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4247 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4248 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4249 4250 ASSERT3P(buf->b_efunc, ==, NULL); 4251 ASSERT3P(buf->b_private, ==, NULL); 4252 4253 hdr->b_l1hdr.b_arc_access = 0; 4254 arc_buf_thaw(buf); 4255 4256 return; 4257 } 4258 4259 kmutex_t *hash_lock = HDR_LOCK(hdr); 4260 mutex_enter(hash_lock); 4261 4262 /* 4263 * This assignment is only valid as long as the hash_lock is 4264 * held, we must be careful not to reference state or the 4265 * b_state field after dropping the lock. 
4266 */ 4267 arc_state_t *state = hdr->b_l1hdr.b_state; 4268 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4269 ASSERT3P(state, !=, arc_anon); 4270 4271 /* this buffer is not on any list */ 4272 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4273 4274 if (HDR_HAS_L2HDR(hdr)) { 4275 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4276 4277 /* 4278 * We have to recheck this conditional again now that 4279 * we're holding the l2ad_mtx to prevent a race with 4280 * another thread which might be concurrently calling 4281 * l2arc_evict(). In that case, l2arc_evict() might have 4282 * destroyed the header's L2 portion as we were waiting 4283 * to acquire the l2ad_mtx. 4284 */ 4285 if (HDR_HAS_L2HDR(hdr)) { 4286 if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) 4287 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4288 hdr->b_l2hdr.b_daddr, 4289 hdr->b_l2hdr.b_asize, 0); 4290 arc_hdr_l2hdr_destroy(hdr); 4291 } 4292 4293 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4294 } 4295 4296 /* 4297 * Do we have more than one buf? 4298 */ 4299 if (hdr->b_l1hdr.b_datacnt > 1) { 4300 arc_buf_hdr_t *nhdr; 4301 arc_buf_t **bufp; 4302 uint64_t blksz = hdr->b_size; 4303 uint64_t spa = hdr->b_spa; 4304 arc_buf_contents_t type = arc_buf_type(hdr); 4305 uint32_t flags = hdr->b_flags; 4306 4307 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4308 /* 4309 * Pull the data off of this hdr and attach it to 4310 * a new anonymous hdr. 4311 */ 4312 (void) remove_reference(hdr, hash_lock, tag); 4313 bufp = &hdr->b_l1hdr.b_buf; 4314 while (*bufp != buf) 4315 bufp = &(*bufp)->b_next; 4316 *bufp = buf->b_next; 4317 buf->b_next = NULL; 4318 4319 ASSERT3P(state, !=, arc_l2c_only); 4320 ASSERT3U(state->arcs_size, >=, hdr->b_size); 4321 atomic_add_64(&state->arcs_size, -hdr->b_size); 4322 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4323 ASSERT3P(state, !=, arc_l2c_only); 4324 uint64_t *size = &state->arcs_lsize[type]; 4325 ASSERT3U(*size, >=, hdr->b_size); 4326 atomic_add_64(size, -hdr->b_size); 4327 } 4328 4329 /* 4330 * We're releasing a duplicate user data buffer, update 4331 * our statistics accordingly. 
4332 */ 4333 if (HDR_ISTYPE_DATA(hdr)) { 4334 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4335 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4336 -hdr->b_size); 4337 } 4338 hdr->b_l1hdr.b_datacnt -= 1; 4339 arc_cksum_verify(buf); 4340#ifdef illumos 4341 arc_buf_unwatch(buf); 4342#endif 4343 4344 mutex_exit(hash_lock); 4345 4346 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4347 nhdr->b_size = blksz; 4348 nhdr->b_spa = spa; 4349 4350 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4351 nhdr->b_flags |= arc_bufc_to_flags(type); 4352 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4353 4354 nhdr->b_l1hdr.b_buf = buf; 4355 nhdr->b_l1hdr.b_datacnt = 1; 4356 nhdr->b_l1hdr.b_state = arc_anon; 4357 nhdr->b_l1hdr.b_arc_access = 0; 4358 nhdr->b_freeze_cksum = NULL; 4359 4360 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4361 buf->b_hdr = nhdr; 4362 mutex_exit(&buf->b_evict_lock); 4363 atomic_add_64(&arc_anon->arcs_size, blksz); 4364 } else { 4365 mutex_exit(&buf->b_evict_lock); 4366 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4367 /* protected by hash lock */ 4368 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4369 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4370 arc_change_state(arc_anon, hdr, hash_lock); 4371 hdr->b_l1hdr.b_arc_access = 0; 4372 mutex_exit(hash_lock); 4373 4374 buf_discard_identity(hdr); 4375 arc_buf_thaw(buf); 4376 } 4377 buf->b_efunc = NULL; 4378 buf->b_private = NULL; 4379} 4380 4381int 4382arc_released(arc_buf_t *buf) 4383{ 4384 int released; 4385 4386 mutex_enter(&buf->b_evict_lock); 4387 released = (buf->b_data != NULL && 4388 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4389 mutex_exit(&buf->b_evict_lock); 4390 return (released); 4391} 4392 4393#ifdef ZFS_DEBUG 4394int 4395arc_referenced(arc_buf_t *buf) 4396{ 4397 int referenced; 4398 4399 mutex_enter(&buf->b_evict_lock); 4400 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4401 mutex_exit(&buf->b_evict_lock); 4402 return (referenced); 4403} 4404#endif 4405 4406static void 4407arc_write_ready(zio_t *zio) 4408{ 4409 arc_write_callback_t *callback = zio->io_private; 4410 arc_buf_t *buf = callback->awcb_buf; 4411 arc_buf_hdr_t *hdr = buf->b_hdr; 4412 4413 ASSERT(HDR_HAS_L1HDR(hdr)); 4414 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4415 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4416 callback->awcb_ready(zio, buf, callback->awcb_private); 4417 4418 /* 4419 * If the IO is already in progress, then this is a re-write 4420 * attempt, so we need to thaw and re-compute the cksum. 4421 * It is the responsibility of the callback to handle the 4422 * accounting for any re-write attempt. 4423 */ 4424 if (HDR_IO_IN_PROGRESS(hdr)) { 4425 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4426 if (hdr->b_freeze_cksum != NULL) { 4427 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4428 hdr->b_freeze_cksum = NULL; 4429 } 4430 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4431 } 4432 arc_cksum_compute(buf, B_FALSE); 4433 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4434} 4435 4436/* 4437 * The SPA calls this callback for each physical write that happens on behalf 4438 * of a logical write. See the comment in dbuf_write_physdone() for details. 
4439 */ 4440static void 4441arc_write_physdone(zio_t *zio) 4442{ 4443 arc_write_callback_t *cb = zio->io_private; 4444 if (cb->awcb_physdone != NULL) 4445 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4446} 4447 4448static void 4449arc_write_done(zio_t *zio) 4450{ 4451 arc_write_callback_t *callback = zio->io_private; 4452 arc_buf_t *buf = callback->awcb_buf; 4453 arc_buf_hdr_t *hdr = buf->b_hdr; 4454 4455 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4456 4457 if (zio->io_error == 0) { 4458 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4459 buf_discard_identity(hdr); 4460 } else { 4461 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4462 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4463 } 4464 } else { 4465 ASSERT(BUF_EMPTY(hdr)); 4466 } 4467 4468 /* 4469 * If the block to be written was all-zero or compressed enough to be 4470 * embedded in the BP, no write was performed so there will be no 4471 * dva/birth/checksum. The buffer must therefore remain anonymous 4472 * (and uncached). 4473 */ 4474 if (!BUF_EMPTY(hdr)) { 4475 arc_buf_hdr_t *exists; 4476 kmutex_t *hash_lock; 4477 4478 ASSERT(zio->io_error == 0); 4479 4480 arc_cksum_verify(buf); 4481 4482 exists = buf_hash_insert(hdr, &hash_lock); 4483 if (exists != NULL) { 4484 /* 4485 * This can only happen if we overwrite for 4486 * sync-to-convergence, because we remove 4487 * buffers from the hash table when we arc_free(). 4488 */ 4489 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4490 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4491 panic("bad overwrite, hdr=%p exists=%p", 4492 (void *)hdr, (void *)exists); 4493 ASSERT(refcount_is_zero( 4494 &exists->b_l1hdr.b_refcnt)); 4495 arc_change_state(arc_anon, exists, hash_lock); 4496 mutex_exit(hash_lock); 4497 arc_hdr_destroy(exists); 4498 exists = buf_hash_insert(hdr, &hash_lock); 4499 ASSERT3P(exists, ==, NULL); 4500 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4501 /* nopwrite */ 4502 ASSERT(zio->io_prop.zp_nopwrite); 4503 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4504 panic("bad nopwrite, hdr=%p exists=%p", 4505 (void *)hdr, (void *)exists); 4506 } else { 4507 /* Dedup */ 4508 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4509 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4510 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4511 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4512 } 4513 } 4514 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4515 /* if it's not anon, we are doing a scrub */ 4516 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4517 arc_access(hdr, hash_lock); 4518 mutex_exit(hash_lock); 4519 } else { 4520 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4521 } 4522 4523 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4524 callback->awcb_done(zio, buf, callback->awcb_private); 4525 4526 kmem_free(callback, sizeof (arc_write_callback_t)); 4527} 4528 4529zio_t * 4530arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4531 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4532 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4533 arc_done_func_t *done, void *private, zio_priority_t priority, 4534 int zio_flags, const zbookmark_phys_t *zb) 4535{ 4536 arc_buf_hdr_t *hdr = buf->b_hdr; 4537 arc_write_callback_t *callback; 4538 zio_t *zio; 4539 4540 ASSERT(ready != NULL); 4541 ASSERT(done != NULL); 4542 ASSERT(!HDR_IO_ERROR(hdr)); 4543 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4544 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4545 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4546 if (l2arc) 4547 hdr->b_flags |= ARC_FLAG_L2CACHE; 4548 if (l2arc_compress) 4549 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 
4550 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4551 callback->awcb_ready = ready; 4552 callback->awcb_physdone = physdone; 4553 callback->awcb_done = done; 4554 callback->awcb_private = private; 4555 callback->awcb_buf = buf; 4556 4557 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4558 arc_write_ready, arc_write_physdone, arc_write_done, callback, 4559 priority, zio_flags, zb); 4560 4561 return (zio); 4562} 4563 4564static int 4565arc_memory_throttle(uint64_t reserve, uint64_t txg) 4566{ 4567#ifdef _KERNEL 4568 uint64_t available_memory = ptob(freemem); 4569 static uint64_t page_load = 0; 4570 static uint64_t last_txg = 0; 4571 4572#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4573 available_memory = 4574 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4575#endif 4576 4577 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 4578 return (0); 4579 4580 if (txg > last_txg) { 4581 last_txg = txg; 4582 page_load = 0; 4583 } 4584 /* 4585 * If we are in pageout, we know that memory is already tight, 4586 * the arc is already going to be evicting, so we just want to 4587 * continue to let page writes occur as quickly as possible. 4588 */ 4589 if (curproc == pageproc) { 4590 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4591 return (SET_ERROR(ERESTART)); 4592 /* Note: reserve is inflated, so we deflate */ 4593 page_load += reserve / 8; 4594 return (0); 4595 } else if (page_load > 0 && arc_reclaim_needed()) { 4596 /* memory is low, delay before restarting */ 4597 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4598 return (SET_ERROR(EAGAIN)); 4599 } 4600 page_load = 0; 4601#endif 4602 return (0); 4603} 4604 4605void 4606arc_tempreserve_clear(uint64_t reserve) 4607{ 4608 atomic_add_64(&arc_tempreserve, -reserve); 4609 ASSERT((int64_t)arc_tempreserve >= 0); 4610} 4611 4612int 4613arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4614{ 4615 int error; 4616 uint64_t anon_size; 4617 4618 if (reserve > arc_c/4 && !arc_no_grow) { 4619 arc_c = MIN(arc_c_max, reserve * 4); 4620 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4621 } 4622 if (reserve > arc_c) 4623 return (SET_ERROR(ENOMEM)); 4624 4625 /* 4626 * Don't count loaned bufs as in flight dirty data to prevent long 4627 * network delays from blocking transactions that are ready to be 4628 * assigned to a txg. 4629 */ 4630 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 4631 4632 /* 4633 * Writes will, almost always, require additional memory allocations 4634 * in order to compress/encrypt/etc the data. We therefore need to 4635 * make sure that there is sufficient available memory for this. 4636 */ 4637 error = arc_memory_throttle(reserve, txg); 4638 if (error != 0) 4639 return (error); 4640 4641 /* 4642 * Throttle writes when the amount of dirty data in the cache 4643 * gets too large. We try to keep the cache less than half full 4644 * of dirty blocks so that our sync times don't grow too large. 4645 * Note: if two requests come in concurrently, we might let them 4646 * both succeed, when one of them should fail. Not a huge deal. 
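 *
 * As a worked example (illustrative numbers only): with arc_c at
 * 1 GB, the check below starts failing requests with ERESTART once
 * the new reserve plus outstanding tempreserve plus anonymous data
 * would exceed 512 MB while the anonymous data alone already
 * exceeds 256 MB.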
4647 */ 4648 4649 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 4650 anon_size > arc_c / 4) { 4651 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 4652 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 4653 arc_tempreserve>>10, 4654 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 4655 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 4656 reserve>>10, arc_c>>10); 4657 return (SET_ERROR(ERESTART)); 4658 } 4659 atomic_add_64(&arc_tempreserve, reserve); 4660 return (0); 4661} 4662 4663static void 4664arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4665 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4666{ 4667 size->value.ui64 = state->arcs_size; 4668 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4669 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 4670} 4671 4672static int 4673arc_kstat_update(kstat_t *ksp, int rw) 4674{ 4675 arc_stats_t *as = ksp->ks_data; 4676 4677 if (rw == KSTAT_WRITE) { 4678 return (EACCES); 4679 } else { 4680 arc_kstat_update_state(arc_anon, 4681 &as->arcstat_anon_size, 4682 &as->arcstat_anon_evictable_data, 4683 &as->arcstat_anon_evictable_metadata); 4684 arc_kstat_update_state(arc_mru, 4685 &as->arcstat_mru_size, 4686 &as->arcstat_mru_evictable_data, 4687 &as->arcstat_mru_evictable_metadata); 4688 arc_kstat_update_state(arc_mru_ghost, 4689 &as->arcstat_mru_ghost_size, 4690 &as->arcstat_mru_ghost_evictable_data, 4691 &as->arcstat_mru_ghost_evictable_metadata); 4692 arc_kstat_update_state(arc_mfu, 4693 &as->arcstat_mfu_size, 4694 &as->arcstat_mfu_evictable_data, 4695 &as->arcstat_mfu_evictable_metadata); 4696 arc_kstat_update_state(arc_mfu_ghost, 4697 &as->arcstat_mfu_ghost_size, 4698 &as->arcstat_mfu_ghost_evictable_data, 4699 &as->arcstat_mfu_ghost_evictable_metadata); 4700 } 4701 4702 return (0); 4703} 4704 4705#ifdef _KERNEL 4706static eventhandler_tag arc_event_lowmem = NULL; 4707 4708static void 4709arc_lowmem(void *arg __unused, int howto __unused) 4710{ 4711 4712 mutex_enter(&arc_reclaim_thr_lock); 4713 /* XXX: Memory deficit should be passed as argument. */ 4714 needfree = btoc(arc_c >> arc_shrink_shift); 4715 DTRACE_PROBE(arc__needfree); 4716 cv_signal(&arc_reclaim_thr_cv); 4717 4718 /* 4719 * It is unsafe to block here in arbitrary threads, because we can come 4720 * here from ARC itself and may hold ARC locks and thus risk a deadlock 4721 * with ARC reclaim thread. 4722 */ 4723 if (curproc == pageproc) 4724 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 4725 mutex_exit(&arc_reclaim_thr_lock); 4726} 4727#endif 4728 4729void 4730arc_init(void) 4731{ 4732 int i, prefetch_tunable_set = 0; 4733 4734 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4735 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 4736 4737 /* Convert seconds to clock ticks */ 4738 arc_min_prefetch_lifespan = 1 * hz; 4739 4740 /* Start out with 1/8 of all memory */ 4741 arc_c = kmem_size() / 8; 4742 4743#ifdef illumos 4744#ifdef _KERNEL 4745 /* 4746 * On architectures where the physical memory can be larger 4747 * than the addressable space (intel in 32-bit mode), we may 4748 * need to limit the cache to 1/8 of VM size. 
4749 */ 4750 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 4751#endif 4752#endif /* illumos */ 4753 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 4754 arc_c_min = MAX(arc_c / 4, 16 << 20); 4755 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 4756 if (arc_c * 8 >= 1 << 30) 4757 arc_c_max = (arc_c * 8) - (1 << 30); 4758 else 4759 arc_c_max = arc_c_min; 4760 arc_c_max = MAX(arc_c * 5, arc_c_max); 4761 4762#ifdef _KERNEL 4763 /* 4764 * Allow the tunables to override our calculations if they are 4765 * reasonable (ie. over 16MB) 4766 */ 4767 if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) 4768 arc_c_max = zfs_arc_max; 4769 if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) 4770 arc_c_min = zfs_arc_min; 4771#endif 4772 4773 arc_c = arc_c_max; 4774 arc_p = (arc_c >> 1); 4775 4776 /* limit meta-data to 1/4 of the arc capacity */ 4777 arc_meta_limit = arc_c_max / 4; 4778 4779 /* Allow the tunable to override if it is reasonable */ 4780 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4781 arc_meta_limit = zfs_arc_meta_limit; 4782 4783 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4784 arc_c_min = arc_meta_limit / 2; 4785 4786 if (zfs_arc_meta_min > 0) { 4787 arc_meta_min = zfs_arc_meta_min; 4788 } else { 4789 arc_meta_min = arc_c_min / 2; 4790 } 4791 4792 if (zfs_arc_grow_retry > 0) 4793 arc_grow_retry = zfs_arc_grow_retry; 4794 4795 if (zfs_arc_shrink_shift > 0) 4796 arc_shrink_shift = zfs_arc_shrink_shift; 4797 4798 /* 4799 * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 4800 */ 4801 if (arc_no_grow_shift >= arc_shrink_shift) 4802 arc_no_grow_shift = arc_shrink_shift - 1; 4803 4804 if (zfs_arc_p_min_shift > 0) 4805 arc_p_min_shift = zfs_arc_p_min_shift; 4806 4807 /* if kmem_flags are set, lets try to use less memory */ 4808 if (kmem_debugging()) 4809 arc_c = arc_c / 2; 4810 if (arc_c < arc_c_min) 4811 arc_c = arc_c_min; 4812 4813 zfs_arc_min = arc_c_min; 4814 zfs_arc_max = arc_c_max; 4815 4816 arc_anon = &ARC_anon; 4817 arc_mru = &ARC_mru; 4818 arc_mru_ghost = &ARC_mru_ghost; 4819 arc_mfu = &ARC_mfu; 4820 arc_mfu_ghost = &ARC_mfu_ghost; 4821 arc_l2c_only = &ARC_l2c_only; 4822 arc_size = 0; 4823 4824 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4825 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4826 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4827 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4828 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4829 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4830 4831 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 4832 sizeof (arc_buf_hdr_t), 4833 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4834 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 4835 sizeof (arc_buf_hdr_t), 4836 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4837 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 4838 sizeof (arc_buf_hdr_t), 4839 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4840 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 4841 sizeof (arc_buf_hdr_t), 4842 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4843 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 4844 sizeof (arc_buf_hdr_t), 4845 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4846 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 4847 sizeof (arc_buf_hdr_t), 4848 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4849 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 4850 sizeof 
(arc_buf_hdr_t), 4851 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4852 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 4853 sizeof (arc_buf_hdr_t), 4854 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4855 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 4856 sizeof (arc_buf_hdr_t), 4857 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4858 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 4859 sizeof (arc_buf_hdr_t), 4860 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4861 4862 buf_init(); 4863 4864 arc_thread_exit = 0; 4865 arc_eviction_list = NULL; 4866 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4867 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4868 4869 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4870 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4871 4872 if (arc_ksp != NULL) { 4873 arc_ksp->ks_data = &arc_stats; 4874 arc_ksp->ks_update = arc_kstat_update; 4875 kstat_install(arc_ksp); 4876 } 4877 4878 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4879 TS_RUN, minclsyspri); 4880 4881#ifdef _KERNEL 4882 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 4883 EVENTHANDLER_PRI_FIRST); 4884#endif 4885 4886 arc_dead = FALSE; 4887 arc_warm = B_FALSE; 4888 4889 /* 4890 * Calculate maximum amount of dirty data per pool. 4891 * 4892 * If it has been set by /etc/system, take that. 4893 * Otherwise, use a percentage of physical memory defined by 4894 * zfs_dirty_data_max_percent (default 10%) with a cap at 4895 * zfs_dirty_data_max_max (default 4GB). 4896 */ 4897 if (zfs_dirty_data_max == 0) { 4898 zfs_dirty_data_max = ptob(physmem) * 4899 zfs_dirty_data_max_percent / 100; 4900 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4901 zfs_dirty_data_max_max); 4902 } 4903 4904#ifdef _KERNEL 4905 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4906 prefetch_tunable_set = 1; 4907 4908#ifdef __i386__ 4909 if (prefetch_tunable_set == 0) { 4910 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4911 "-- to enable,\n"); 4912 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4913 "to /boot/loader.conf.\n"); 4914 zfs_prefetch_disable = 1; 4915 } 4916#else 4917 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4918 prefetch_tunable_set == 0) { 4919 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4920 "than 4GB of RAM is present;\n" 4921 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4922 "to /boot/loader.conf.\n"); 4923 zfs_prefetch_disable = 1; 4924 } 4925#endif 4926 /* Warn about ZFS memory and address space requirements. 
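 * The checks below flag configurations with less than roughly
 * 448 MB (256 + 128 + 64 MB) of physical memory, or a kmem arena
 * smaller than 512 MB, as likely to be unstable.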
*/ 4927 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4928 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4929 "expect unstable behavior.\n"); 4930 } 4931 if (kmem_size() < 512 * (1 << 20)) { 4932 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4933 "expect unstable behavior.\n"); 4934 printf(" Consider tuning vm.kmem_size and " 4935 "vm.kmem_size_max\n"); 4936 printf(" in /boot/loader.conf.\n"); 4937 } 4938#endif 4939} 4940 4941void 4942arc_fini(void) 4943{ 4944 mutex_enter(&arc_reclaim_thr_lock); 4945 arc_thread_exit = 1; 4946 cv_signal(&arc_reclaim_thr_cv); 4947 while (arc_thread_exit != 0) 4948 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4949 mutex_exit(&arc_reclaim_thr_lock); 4950 4951 arc_flush(NULL); 4952 4953 arc_dead = TRUE; 4954 4955 if (arc_ksp != NULL) { 4956 kstat_delete(arc_ksp); 4957 arc_ksp = NULL; 4958 } 4959 4960 mutex_destroy(&arc_eviction_mtx); 4961 mutex_destroy(&arc_reclaim_thr_lock); 4962 cv_destroy(&arc_reclaim_thr_cv); 4963 4964 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 4965 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 4966 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 4967 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 4968 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 4969 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 4970 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 4971 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 4972 4973 mutex_destroy(&arc_anon->arcs_mtx); 4974 mutex_destroy(&arc_mru->arcs_mtx); 4975 mutex_destroy(&arc_mru_ghost->arcs_mtx); 4976 mutex_destroy(&arc_mfu->arcs_mtx); 4977 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 4978 mutex_destroy(&arc_l2c_only->arcs_mtx); 4979 4980 buf_fini(); 4981 4982 ASSERT0(arc_loaned_bytes); 4983 4984#ifdef _KERNEL 4985 if (arc_event_lowmem != NULL) 4986 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4987#endif 4988} 4989 4990/* 4991 * Level 2 ARC 4992 * 4993 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4994 * It uses dedicated storage devices to hold cached data, which are populated 4995 * using large infrequent writes. The main role of this cache is to boost 4996 * the performance of random read workloads. The intended L2ARC devices 4997 * include short-stroked disks, solid state disks, and other media with 4998 * substantially faster read latency than disk. 4999 * 5000 * +-----------------------+ 5001 * | ARC | 5002 * +-----------------------+ 5003 * | ^ ^ 5004 * | | | 5005 * l2arc_feed_thread() arc_read() 5006 * | | | 5007 * | l2arc read | 5008 * V | | 5009 * +---------------+ | 5010 * | L2ARC | | 5011 * +---------------+ | 5012 * | ^ | 5013 * l2arc_write() | | 5014 * | | | 5015 * V | | 5016 * +-------+ +-------+ 5017 * | vdev | | vdev | 5018 * | cache | | cache | 5019 * +-------+ +-------+ 5020 * +=========+ .-----. 5021 * : L2ARC : |-_____-| 5022 * : devices : | Disks | 5023 * +=========+ `-_____-' 5024 * 5025 * Read requests are satisfied from the following sources, in order: 5026 * 5027 * 1) ARC 5028 * 2) vdev cache of L2ARC devices 5029 * 3) L2ARC devices 5030 * 4) vdev cache of disks 5031 * 5) disks 5032 * 5033 * Some L2ARC device types exhibit extremely slow write performance. 5034 * To accommodate for this there are some significant differences between 5035 * the L2ARC and traditional cache design: 5036 * 5037 * 1. There is no eviction path from the ARC to the L2ARC. 
Evictions from 5038 * the ARC behave as usual, freeing buffers and placing headers on ghost 5039 * lists. The ARC does not send buffers to the L2ARC during eviction as 5040 * this would add inflated write latencies for all ARC memory pressure. 5041 * 5042 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5043 * It does this by periodically scanning buffers from the eviction-end of 5044 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5045 * not already there. It scans until a headroom of buffers is satisfied, 5046 * which itself is a buffer for ARC eviction. If a compressible buffer is 5047 * found during scanning and selected for writing to an L2ARC device, we 5048 * temporarily boost scanning headroom during the next scan cycle to make 5049 * sure we adapt to compression effects (which might significantly reduce 5050 * the data volume we write to L2ARC). The thread that does this is 5051 * l2arc_feed_thread(), illustrated below; example sizes are included to 5052 * provide a better sense of ratio than this diagram: 5053 * 5054 * head --> tail 5055 * +---------------------+----------+ 5056 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5057 * +---------------------+----------+ | o L2ARC eligible 5058 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5059 * +---------------------+----------+ | 5060 * 15.9 Gbytes ^ 32 Mbytes | 5061 * headroom | 5062 * l2arc_feed_thread() 5063 * | 5064 * l2arc write hand <--[oooo]--' 5065 * | 8 Mbyte 5066 * | write max 5067 * V 5068 * +==============================+ 5069 * L2ARC dev |####|#|###|###| |####| ... | 5070 * +==============================+ 5071 * 32 Gbytes 5072 * 5073 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5074 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5075 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5076 * safe to say that this is an uncommon case, since buffers at the end of 5077 * the ARC lists have moved there due to inactivity. 5078 * 5079 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5080 * then the L2ARC simply misses copying some buffers. This serves as a 5081 * pressure valve to prevent heavy read workloads from both stalling the ARC 5082 * with waits and clogging the L2ARC with writes. This also helps prevent 5083 * the potential for the L2ARC to churn if it attempts to cache content too 5084 * quickly, such as during backups of the entire pool. 5085 * 5086 * 5. After system boot and before the ARC has filled main memory, there are 5087 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5088 * lists can remain mostly static. Instead of searching from tail of these 5089 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5090 * for eligible buffers, greatly increasing its chance of finding them. 5091 * 5092 * The L2ARC device write speed is also boosted during this time so that 5093 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5094 * there are no L2ARC reads, and no fear of degrading read performance 5095 * through increased writes. 5096 * 5097 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5098 * the vdev queue can aggregate them into larger and fewer writes. Each 5099 * device is written to in a rotor fashion, sweeping writes through 5100 * available space then repeating. 5101 * 5102 * 7. The L2ARC does not store dirty content. 
It never needs to flush 5103 * write buffers back to disk based storage. 5104 * 5105 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5106 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5107 * 5108 * The performance of the L2ARC can be tweaked by a number of tunables, which 5109 * may be necessary for different workloads: 5110 * 5111 * l2arc_write_max max write bytes per interval 5112 * l2arc_write_boost extra write bytes during device warmup 5113 * l2arc_noprefetch skip caching prefetched buffers 5114 * l2arc_headroom number of max device writes to precache 5115 * l2arc_headroom_boost when we find compressed buffers during ARC 5116 * scanning, we multiply headroom by this 5117 * percentage factor for the next scan cycle, 5118 * since more compressed buffers are likely to 5119 * be present 5120 * l2arc_feed_secs seconds between L2ARC writing 5121 * 5122 * Tunables may be removed or added as future performance improvements are 5123 * integrated, and also may become zpool properties. 5124 * 5125 * There are three key functions that control how the L2ARC warms up: 5126 * 5127 * l2arc_write_eligible() check if a buffer is eligible to cache 5128 * l2arc_write_size() calculate how much to write 5129 * l2arc_write_interval() calculate sleep delay between writes 5130 * 5131 * These three functions determine what to write, how much, and how quickly 5132 * to send writes. 5133 */ 5134 5135static boolean_t 5136l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5137{ 5138 /* 5139 * A buffer is *not* eligible for the L2ARC if it: 5140 * 1. belongs to a different spa. 5141 * 2. is already cached on the L2ARC. 5142 * 3. has an I/O in progress (it may be an incomplete read). 5143 * 4. is flagged not eligible (zfs property). 5144 */ 5145 if (hdr->b_spa != spa_guid) { 5146 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5147 return (B_FALSE); 5148 } 5149 if (HDR_HAS_L2HDR(hdr)) { 5150 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5151 return (B_FALSE); 5152 } 5153 if (HDR_IO_IN_PROGRESS(hdr)) { 5154 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5155 return (B_FALSE); 5156 } 5157 if (!HDR_L2CACHE(hdr)) { 5158 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5159 return (B_FALSE); 5160 } 5161 5162 return (B_TRUE); 5163} 5164 5165static uint64_t 5166l2arc_write_size(void) 5167{ 5168 uint64_t size; 5169 5170 /* 5171 * Make sure our globals have meaningful values in case the user 5172 * altered them. 5173 */ 5174 size = l2arc_write_max; 5175 if (size == 0) { 5176 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5177 "be greater than zero, resetting it to the default (%d)", 5178 L2ARC_WRITE_SIZE); 5179 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5180 } 5181 5182 if (arc_warm == B_FALSE) 5183 size += l2arc_write_boost; 5184 5185 return (size); 5186 5187} 5188 5189static clock_t 5190l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5191{ 5192 clock_t interval, next, now; 5193 5194 /* 5195 * If the ARC lists are busy, increase our write rate; if the 5196 * lists are stale, idle back. This is achieved by checking 5197 * how much we previously wrote - if it was more than half of 5198 * what we wanted, schedule the next write much sooner. 
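 * For example (assuming the default tunables of l2arc_feed_secs = 1
 * and l2arc_feed_min_ms = 200): after a feed cycle that wrote more
 * than half of its target, and with l2arc_feed_again set, the next
 * write is scheduled roughly 200ms out instead of a full second
 * later.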
5199 */ 5200 if (l2arc_feed_again && wrote > (wanted / 2)) 5201 interval = (hz * l2arc_feed_min_ms) / 1000; 5202 else 5203 interval = hz * l2arc_feed_secs; 5204 5205 now = ddi_get_lbolt(); 5206 next = MAX(now, MIN(now + interval, began + interval)); 5207 5208 return (next); 5209} 5210 5211/* 5212 * Cycle through L2ARC devices. This is how L2ARC load balances. 5213 * If a device is returned, this also returns holding the spa config lock. 5214 */ 5215static l2arc_dev_t * 5216l2arc_dev_get_next(void) 5217{ 5218 l2arc_dev_t *first, *next = NULL; 5219 5220 /* 5221 * Lock out the removal of spas (spa_namespace_lock), then removal 5222 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5223 * both locks will be dropped and a spa config lock held instead. 5224 */ 5225 mutex_enter(&spa_namespace_lock); 5226 mutex_enter(&l2arc_dev_mtx); 5227 5228 /* if there are no vdevs, there is nothing to do */ 5229 if (l2arc_ndev == 0) 5230 goto out; 5231 5232 first = NULL; 5233 next = l2arc_dev_last; 5234 do { 5235 /* loop around the list looking for a non-faulted vdev */ 5236 if (next == NULL) { 5237 next = list_head(l2arc_dev_list); 5238 } else { 5239 next = list_next(l2arc_dev_list, next); 5240 if (next == NULL) 5241 next = list_head(l2arc_dev_list); 5242 } 5243 5244 /* if we have come back to the start, bail out */ 5245 if (first == NULL) 5246 first = next; 5247 else if (next == first) 5248 break; 5249 5250 } while (vdev_is_dead(next->l2ad_vdev)); 5251 5252 /* if we were unable to find any usable vdevs, return NULL */ 5253 if (vdev_is_dead(next->l2ad_vdev)) 5254 next = NULL; 5255 5256 l2arc_dev_last = next; 5257 5258out: 5259 mutex_exit(&l2arc_dev_mtx); 5260 5261 /* 5262 * Grab the config lock to prevent the 'next' device from being 5263 * removed while we are writing to it. 5264 */ 5265 if (next != NULL) 5266 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5267 mutex_exit(&spa_namespace_lock); 5268 5269 return (next); 5270} 5271 5272/* 5273 * Free buffers that were tagged for destruction. 5274 */ 5275static void 5276l2arc_do_free_on_write() 5277{ 5278 list_t *buflist; 5279 l2arc_data_free_t *df, *df_prev; 5280 5281 mutex_enter(&l2arc_free_on_write_mtx); 5282 buflist = l2arc_free_on_write; 5283 5284 for (df = list_tail(buflist); df; df = df_prev) { 5285 df_prev = list_prev(buflist, df); 5286 ASSERT(df->l2df_data != NULL); 5287 ASSERT(df->l2df_func != NULL); 5288 df->l2df_func(df->l2df_data, df->l2df_size); 5289 list_remove(buflist, df); 5290 kmem_free(df, sizeof (l2arc_data_free_t)); 5291 } 5292 5293 mutex_exit(&l2arc_free_on_write_mtx); 5294} 5295 5296/* 5297 * A write to a cache device has completed. Update all headers to allow 5298 * reads from these buffers to begin. 5299 */ 5300static void 5301l2arc_write_done(zio_t *zio) 5302{ 5303 l2arc_write_callback_t *cb; 5304 l2arc_dev_t *dev; 5305 list_t *buflist; 5306 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5307 kmutex_t *hash_lock; 5308 int64_t bytes_dropped = 0; 5309 5310 cb = zio->io_private; 5311 ASSERT(cb != NULL); 5312 dev = cb->l2wcb_dev; 5313 ASSERT(dev != NULL); 5314 head = cb->l2wcb_head; 5315 ASSERT(head != NULL); 5316 buflist = &dev->l2ad_buflist; 5317 ASSERT(buflist != NULL); 5318 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5319 l2arc_write_callback_t *, cb); 5320 5321 if (zio->io_error != 0) 5322 ARCSTAT_BUMP(arcstat_l2_writes_error); 5323 5324 mutex_enter(&dev->l2ad_mtx); 5325 5326 /* 5327 * All writes completed, or an error was hit. 
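	 * Walk the device buflist backwards from the dummy write-head
	 * marker that l2arc_write_buffers() inserted, re-enabling reads
	 * on each header (or dropping its L2 entry if the write failed).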
5328 */ 5329 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5330 hdr_prev = list_prev(buflist, hdr); 5331 5332 hash_lock = HDR_LOCK(hdr); 5333 if (!mutex_tryenter(hash_lock)) { 5334 /* 5335 * This buffer misses out. It may be in a stage 5336 * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5337 * left set, denying reads to this buffer. 5338 */ 5339 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5340 continue; 5341 } 5342 5343 /* 5344 * It's possible that this buffer got evicted from the L1 cache 5345 * before we grabbed the vdev + hash locks, in which case 5346 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5347 * Only free the buffer if we still have an L1 hdr. 5348 */ 5349 if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5350 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5351 l2arc_release_cdata_buf(hdr); 5352 5353 if (zio->io_error != 0) { 5354 /* 5355 * Error - drop L2ARC entry. 5356 */ 5357 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5358 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5359 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5360 5361 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5362 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5363 5364 bytes_dropped += hdr->b_l2hdr.b_asize; 5365 (void) refcount_remove_many(&dev->l2ad_alloc, 5366 hdr->b_l2hdr.b_asize, hdr); 5367 } 5368 5369 /* 5370 * Allow ARC to begin reads to this L2ARC entry. 5371 */ 5372 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5373 5374 mutex_exit(hash_lock); 5375 } 5376 5377 atomic_inc_64(&l2arc_writes_done); 5378 list_remove(buflist, head); 5379 ASSERT(!HDR_HAS_L1HDR(head)); 5380 kmem_cache_free(hdr_l2only_cache, head); 5381 mutex_exit(&dev->l2ad_mtx); 5382 5383 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5384 5385 l2arc_do_free_on_write(); 5386 5387 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5388} 5389 5390/* 5391 * A read to a cache device completed. Validate buffer contents before 5392 * handing over to the regular ARC routines. 5393 */ 5394static void 5395l2arc_read_done(zio_t *zio) 5396{ 5397 l2arc_read_callback_t *cb; 5398 arc_buf_hdr_t *hdr; 5399 arc_buf_t *buf; 5400 kmutex_t *hash_lock; 5401 int equal; 5402 5403 ASSERT(zio->io_vd != NULL); 5404 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 5405 5406 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 5407 5408 cb = zio->io_private; 5409 ASSERT(cb != NULL); 5410 buf = cb->l2rcb_buf; 5411 ASSERT(buf != NULL); 5412 5413 hash_lock = HDR_LOCK(buf->b_hdr); 5414 mutex_enter(hash_lock); 5415 hdr = buf->b_hdr; 5416 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5417 5418 /* 5419 * If the buffer was compressed, decompress it first. 5420 */ 5421 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 5422 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 5423 ASSERT(zio->io_data != NULL); 5424 5425 /* 5426 * Check this survived the L2ARC journey. 5427 */ 5428 equal = arc_cksum_equal(buf); 5429 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 5430 mutex_exit(hash_lock); 5431 zio->io_private = buf; 5432 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 5433 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 5434 arc_read_done(zio); 5435 } else { 5436 mutex_exit(hash_lock); 5437 /* 5438 * Buffer didn't survive caching. Increment stats and 5439 * reissue to the original storage device. 
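		 * (Either the device returned bad data - the block no
		 * longer matches its in-core ARC checksum - the read
		 * itself failed, or the header was evicted from the L2ARC
		 * while the read was in flight; in all cases the block is
		 * re-read from the main pool below.)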
5440 */ 5441 if (zio->io_error != 0) { 5442 ARCSTAT_BUMP(arcstat_l2_io_error); 5443 } else { 5444 zio->io_error = SET_ERROR(EIO); 5445 } 5446 if (!equal) 5447 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 5448 5449 /* 5450 * If there's no waiter, issue an async i/o to the primary 5451 * storage now. If there *is* a waiter, the caller must 5452 * issue the i/o in a context where it's OK to block. 5453 */ 5454 if (zio->io_waiter == NULL) { 5455 zio_t *pio = zio_unique_parent(zio); 5456 5457 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 5458 5459 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 5460 buf->b_data, zio->io_size, arc_read_done, buf, 5461 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 5462 } 5463 } 5464 5465 kmem_free(cb, sizeof (l2arc_read_callback_t)); 5466} 5467 5468/* 5469 * This is the list priority from which the L2ARC will search for pages to 5470 * cache. This is used within loops (0..3) to cycle through lists in the 5471 * desired order. This order can have a significant effect on cache 5472 * performance. 5473 * 5474 * Currently the metadata lists are hit first, MFU then MRU, followed by 5475 * the data lists. This function returns a locked list, and also returns 5476 * the lock pointer. 5477 */ 5478static list_t * 5479l2arc_list_locked(int list_num, kmutex_t **lock) 5480{ 5481 list_t *list = NULL; 5482 5483 ASSERT(list_num >= 0 && list_num <= 3); 5484 5485 switch (list_num) { 5486 case 0: 5487 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 5488 *lock = &arc_mfu->arcs_mtx; 5489 break; 5490 case 1: 5491 list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 5492 *lock = &arc_mru->arcs_mtx; 5493 break; 5494 case 2: 5495 list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 5496 *lock = &arc_mfu->arcs_mtx; 5497 break; 5498 case 3: 5499 list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 5500 *lock = &arc_mru->arcs_mtx; 5501 break; 5502 } 5503 5504 ASSERT(!(MUTEX_HELD(*lock))); 5505 mutex_enter(*lock); 5506 return (list); 5507} 5508 5509/* 5510 * Evict buffers from the device write hand to the distance specified in 5511 * bytes. This distance may span populated buffers, it may span nothing. 5512 * This is clearing a region on the L2ARC device ready for writing. 5513 * If the 'all' boolean is set, every buffer is evicted. 5514 */ 5515static void 5516l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 5517{ 5518 list_t *buflist; 5519 arc_buf_hdr_t *hdr, *hdr_prev; 5520 kmutex_t *hash_lock; 5521 uint64_t taddr; 5522 5523 buflist = &dev->l2ad_buflist; 5524 5525 if (!all && dev->l2ad_first) { 5526 /* 5527 * This is the first sweep through the device. There is 5528 * nothing to evict. 5529 */ 5530 return; 5531 } 5532 5533 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 5534 /* 5535 * When nearing the end of the device, evict to the end 5536 * before the device write hand jumps to the start. 5537 */ 5538 taddr = dev->l2ad_end; 5539 } else { 5540 taddr = dev->l2ad_hand + distance; 5541 } 5542 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 5543 uint64_t, taddr, boolean_t, all); 5544 5545top: 5546 mutex_enter(&dev->l2ad_mtx); 5547 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 5548 hdr_prev = list_prev(buflist, hdr); 5549 5550 hash_lock = HDR_LOCK(hdr); 5551 if (!mutex_tryenter(hash_lock)) { 5552 /* 5553 * Missed the hash lock. Retry. 
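			 * Drop l2ad_mtx, wait for the current holder of the
			 * hash lock to finish, and restart the scan from the
			 * tail of the buflist.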
5554 */ 5555 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 5556 mutex_exit(&dev->l2ad_mtx); 5557 mutex_enter(hash_lock); 5558 mutex_exit(hash_lock); 5559 goto top; 5560 } 5561 5562 if (HDR_L2_WRITE_HEAD(hdr)) { 5563 /* 5564 * We hit a write head node. Leave it for 5565 * l2arc_write_done(). 5566 */ 5567 list_remove(buflist, hdr); 5568 mutex_exit(hash_lock); 5569 continue; 5570 } 5571 5572 if (!all && HDR_HAS_L2HDR(hdr) && 5573 (hdr->b_l2hdr.b_daddr > taddr || 5574 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 5575 /* 5576 * We've evicted to the target address, 5577 * or the end of the device. 5578 */ 5579 mutex_exit(hash_lock); 5580 break; 5581 } 5582 5583 ASSERT(HDR_HAS_L2HDR(hdr)); 5584 if (!HDR_HAS_L1HDR(hdr)) { 5585 ASSERT(!HDR_L2_READING(hdr)); 5586 /* 5587 * This doesn't exist in the ARC. Destroy. 5588 * arc_hdr_destroy() will call list_remove() 5589 * and decrement arcstat_l2_size. 5590 */ 5591 arc_change_state(arc_anon, hdr, hash_lock); 5592 arc_hdr_destroy(hdr); 5593 } else { 5594 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 5595 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 5596 /* 5597 * Invalidate issued or about to be issued 5598 * reads, since we may be about to write 5599 * over this location. 5600 */ 5601 if (HDR_L2_READING(hdr)) { 5602 ARCSTAT_BUMP(arcstat_l2_evict_reading); 5603 hdr->b_flags |= ARC_FLAG_L2_EVICTED; 5604 } 5605 5606 arc_hdr_l2hdr_destroy(hdr); 5607 } 5608 mutex_exit(hash_lock); 5609 } 5610 mutex_exit(&dev->l2ad_mtx); 5611} 5612 5613/* 5614 * Find and write ARC buffers to the L2ARC device. 5615 * 5616 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 5617 * for reading until they have completed writing. 5618 * The headroom_boost is an in-out parameter used to maintain headroom boost 5619 * state between calls to this function. 5620 * 5621 * Returns the number of bytes actually written (which may be smaller than 5622 * the delta by which the device hand has changed due to alignment). 5623 */ 5624static uint64_t 5625l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 5626 boolean_t *headroom_boost) 5627{ 5628 arc_buf_hdr_t *hdr, *hdr_prev, *head; 5629 list_t *list; 5630 uint64_t write_asize, write_psize, write_sz, headroom, 5631 buf_compress_minsz; 5632 void *buf_data; 5633 kmutex_t *list_lock; 5634 boolean_t full; 5635 l2arc_write_callback_t *cb; 5636 zio_t *pio, *wzio; 5637 uint64_t guid = spa_load_guid(spa); 5638 const boolean_t do_headroom_boost = *headroom_boost; 5639 int try; 5640 5641 ASSERT(dev->l2ad_vdev != NULL); 5642 5643 /* Lower the flag now, we might want to raise it again later. */ 5644 *headroom_boost = B_FALSE; 5645 5646 pio = NULL; 5647 write_sz = write_asize = write_psize = 0; 5648 full = B_FALSE; 5649 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 5650 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 5651 head->b_flags |= ARC_FLAG_HAS_L2HDR; 5652 5653 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 5654 /* 5655 * We will want to try to compress buffers that are at least 2x the 5656 * device sector size. 5657 */ 5658 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 5659 5660 /* 5661 * Copy buffers for L2ARC writing. 5662 */ 5663 mutex_enter(&dev->l2ad_mtx); 5664 for (try = 0; try <= 3; try++) { 5665 uint64_t passed_sz = 0; 5666 5667 list = l2arc_list_locked(try, &list_lock); 5668 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 5669 5670 /* 5671 * L2ARC fast warmup. 5672 * 5673 * Until the ARC is warm and starts to evict, read from the 5674 * head of the ARC lists rather than the tail. 
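		 * (While arc_warm is B_FALSE the loop below also walks the
		 * list with list_next() instead of list_prev(), so the scan
		 * direction matches the chosen starting point.)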
		 */
		if (arc_warm == B_FALSE)
			hdr = list_head(list);
		else
			hdr = list_tail(list);
		if (hdr == NULL)
			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);

		headroom = target_sz * l2arc_headroom;
		if (do_headroom_boost)
			headroom = (headroom * l2arc_headroom_boost) / 100;

		for (; hdr; hdr = hdr_prev) {
			kmutex_t *hash_lock;
			uint64_t buf_sz;

			if (arc_warm == B_FALSE)
				hdr_prev = list_next(list, hdr);
			else
				hdr_prev = list_prev(list, hdr);
			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);

			hash_lock = HDR_LOCK(hdr);
			if (!mutex_tryenter(hash_lock)) {
				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
				/*
				 * Skip this buffer rather than waiting.
				 */
				continue;
			}

			passed_sz += hdr->b_size;
			if (passed_sz > headroom) {
				/*
				 * Searched too far.
				 */
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
				break;
			}

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
				continue;
			}

			if ((write_sz + hdr->b_size) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_full);
				break;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				list_insert_head(&dev->l2ad_buflist, head);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
				ARCSTAT_BUMP(arcstat_l2_write_pios);
			}

			/*
			 * Create and add a new L2ARC header.
			 */
			hdr->b_l2hdr.b_dev = dev;
			hdr->b_flags |= ARC_FLAG_L2_WRITING;
			/*
			 * Temporarily stash the data buffer in b_tmp_cdata.
			 * The subsequent write step will pick it up from
			 * there.  This is because we can't access
			 * b_l1hdr.b_buf without holding the hash_lock, which
			 * we in turn can't access without holding the ARC
			 * list locks (which we want to avoid during
			 * compression/writing).
			 */
			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
			hdr->b_l2hdr.b_asize = hdr->b_size;
			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;

			/*
			 * Explicitly set the b_daddr field to a known
			 * value which means "invalid address".  This
			 * enables us to differentiate which stage of
			 * l2arc_write_buffers() the particular header
			 * is in (e.g. this loop, or the one below).
			 * ARC_FLAG_L2_WRITING is not enough to make
			 * this distinction, and we need to know in
			 * order to do proper l2arc vdev accounting in
			 * arc_release() and arc_hdr_destroy().
			 *
			 * Note, we can't use a new flag to distinguish
			 * the two stages because we don't hold the
			 * header's hash_lock below, in the second stage
			 * of this function.  Thus, we can't simply
			 * change the b_flags field to denote that the
			 * IO has been sent.  We can change the b_daddr
			 * field of the L2 portion, though, since we'll
			 * be holding the l2ad_mtx; which is why we're
			 * using it to denote the header's state change.
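			 *
			 * (arc_release() and arc_hdr_destroy() can thus tell
			 * a header whose b_daddr is still L2ARC_ADDR_UNSET
			 * apart from one whose write has already been issued
			 * in the second loop.)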
			 */
			hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;

			buf_sz = hdr->b_size;
			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;

			list_insert_head(&dev->l2ad_buflist, hdr);

			/*
			 * Compute and store the buffer cksum before
			 * writing.  On debug builds the cksum is verified
			 * first.
			 */
			arc_cksum_verify(hdr->b_l1hdr.b_buf);
			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);

			mutex_exit(hash_lock);

			write_sz += buf_sz;
		}

		mutex_exit(list_lock);

		if (full == B_TRUE)
			break;
	}

	/* No buffers selected for writing? */
	if (pio == NULL) {
		ASSERT0(write_sz);
		mutex_exit(&dev->l2ad_mtx);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		return (0);
	}

	/*
	 * Now start writing the buffers.  We're starting at the write head
	 * and work backwards, retracing the course of the buffer selector
	 * loop above.
	 */
	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
		uint64_t buf_sz;

		/*
		 * We shouldn't need to lock the buffer here, since we flagged
		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
		 * take care to only access its L2 cache parameters.  In
		 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
		 * ARC eviction.
		 */
		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;

		if ((HDR_L2COMPRESS(hdr)) &&
		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
			if (l2arc_compress_buf(hdr)) {
				/*
				 * If compression succeeded, enable headroom
				 * boost on the next scan cycle.
				 */
				*headroom_boost = B_TRUE;
			}
		}

		/*
		 * Pick up the buffer data we had previously stashed away
		 * (and now potentially also compressed).
		 */
		buf_data = hdr->b_l1hdr.b_tmp_cdata;
		buf_sz = hdr->b_l2hdr.b_asize;

		/*
		 * If the data has not been compressed, then clear b_tmp_cdata
		 * to make sure that it points only to a temporary compression
		 * buffer.
		 */
		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
			hdr->b_l1hdr.b_tmp_cdata = NULL;

		/*
		 * We need to do this regardless of whether buf_sz is zero;
		 * otherwise, when this l2hdr is evicted we'll remove a
		 * reference that was never added.
		 */
		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);

		/* Compression may have squashed the buffer to zero length. */
		if (buf_sz != 0) {
			uint64_t buf_p_sz;

			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			write_asize += buf_sz;

			/*
			 * Keep the clock hand suitably device-aligned.
			 */
			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
			write_psize += buf_p_sz;
			dev->l2ad_hand += buf_p_sz;
		}
	}

	mutex_exit(&dev->l2ad_mtx);

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
5903 * l2arc_evict() will already have evicted ahead for this case. 5904 */ 5905 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5906 dev->l2ad_hand = dev->l2ad_start; 5907 dev->l2ad_first = B_FALSE; 5908 } 5909 5910 dev->l2ad_writing = B_TRUE; 5911 (void) zio_wait(pio); 5912 dev->l2ad_writing = B_FALSE; 5913 5914 return (write_asize); 5915} 5916 5917/* 5918 * Compresses an L2ARC buffer. 5919 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its 5920 * size in l2hdr->b_asize. This routine tries to compress the data and 5921 * depending on the compression result there are three possible outcomes: 5922 * *) The buffer was incompressible. The original l2hdr contents were left 5923 * untouched and are ready for writing to an L2 device. 5924 * *) The buffer was all-zeros, so there is no need to write it to an L2 5925 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 5926 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 5927 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 5928 * data buffer which holds the compressed data to be written, and b_asize 5929 * tells us how much data there is. b_compress is set to the appropriate 5930 * compression algorithm. Once writing is done, invoke 5931 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 5932 * 5933 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 5934 * buffer was incompressible). 5935 */ 5936static boolean_t 5937l2arc_compress_buf(arc_buf_hdr_t *hdr) 5938{ 5939 void *cdata; 5940 size_t csize, len, rounded; 5941 ASSERT(HDR_HAS_L2HDR(hdr)); 5942 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 5943 5944 ASSERT(HDR_HAS_L1HDR(hdr)); 5945 ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); 5946 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 5947 5948 len = l2hdr->b_asize; 5949 cdata = zio_data_buf_alloc(len); 5950 ASSERT3P(cdata, !=, NULL); 5951 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, 5952 cdata, l2hdr->b_asize); 5953 5954 if (csize == 0) { 5955 /* zero block, indicate that there's nothing to write */ 5956 zio_data_buf_free(cdata, len); 5957 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); 5958 l2hdr->b_asize = 0; 5959 hdr->b_l1hdr.b_tmp_cdata = NULL; 5960 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 5961 return (B_TRUE); 5962 } 5963 5964 rounded = P2ROUNDUP(csize, 5965 (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); 5966 if (rounded < len) { 5967 /* 5968 * Compression succeeded, we'll keep the cdata around for 5969 * writing and release it afterwards. 5970 */ 5971 if (rounded > csize) { 5972 bzero((char *)cdata + csize, rounded - csize); 5973 csize = rounded; 5974 } 5975 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); 5976 l2hdr->b_asize = csize; 5977 hdr->b_l1hdr.b_tmp_cdata = cdata; 5978 ARCSTAT_BUMP(arcstat_l2_compress_successes); 5979 return (B_TRUE); 5980 } else { 5981 /* 5982 * Compression failed, release the compressed buffer. 5983 * l2hdr will be left unmodified. 5984 */ 5985 zio_data_buf_free(cdata, len); 5986 ARCSTAT_BUMP(arcstat_l2_compress_failures); 5987 return (B_FALSE); 5988 } 5989} 5990 5991/* 5992 * Decompresses a zio read back from an l2arc device. On success, the 5993 * underlying zio's io_data buffer is overwritten by the uncompressed 5994 * version. On decompression error (corrupt compressed stream), the 5995 * zio->io_error value is set to signal an I/O error. 
 *
 * Please note that the compressed data stream is not checksummed, so if the
 * underlying device is experiencing data corruption we may feed corrupt data
 * to the decompressor; the decompressor therefore needs to be able to handle
 * this situation (LZ4 does).
 */
static void
l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
{
	ASSERT(L2ARC_IS_VALID_COMPRESS(c));

	if (zio->io_error != 0) {
		/*
		 * An I/O error has occurred; just restore the original I/O
		 * size in preparation for a main pool read.
		 */
		zio->io_orig_size = zio->io_size = hdr->b_size;
		return;
	}

	if (c == ZIO_COMPRESS_EMPTY) {
		/*
		 * An empty buffer results in a null zio, which means we
		 * need to fill its io_data after we're done restoring the
		 * buffer's contents.
		 */
		ASSERT(hdr->b_l1hdr.b_buf != NULL);
		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
	} else {
		ASSERT(zio->io_data != NULL);
		/*
		 * We copy the compressed data from the start of the arc buffer
		 * (the zio_read will have pulled in only what we need, the
		 * rest is garbage which we will overwrite at decompression)
		 * and then decompress back to the ARC data buffer.  This way
		 * we can minimize copying by simply decompressing back over
		 * the original compressed data (rather than decompressing to
		 * an aux buffer and then copying back the uncompressed buffer,
		 * which is likely to be much larger).
		 */
		uint64_t csize;
		void *cdata;

		csize = zio->io_size;
		cdata = zio_data_buf_alloc(csize);
		bcopy(zio->io_data, cdata, csize);
		if (zio_decompress_data(c, cdata, zio->io_data, csize,
		    hdr->b_size) != 0)
			zio->io_error = EIO;
		zio_data_buf_free(cdata, csize);
	}

	/* Restore the expected uncompressed IO size. */
	zio->io_orig_size = zio->io_size = hdr->b_size;
}

/*
 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
 * This buffer serves as a temporary holder of compressed data while
 * the buffer entry is being written to an l2arc device.  Once that is
 * done, we can dispose of it.
 */
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
		/*
		 * If the data was compressed, then we've allocated a
		 * temporary buffer for it, so now we need to release it.
		 */
		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
		    hdr->b_size);
		hdr->b_l1hdr.b_tmp_cdata = NULL;
	} else {
		ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
	}
}

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
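 *
 * Each pass is, in outline (a summary of the code below, not a
 * specification):
 *
 *	next = now + hz;
 *	dev = l2arc_dev_get_next();		(pick device, hold SCL_L2ARC)
 *	size = l2arc_write_size();
 *	l2arc_evict(dev, size, B_FALSE);	(clear space ahead of the hand)
 *	wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 *	next = l2arc_write_interval(begin, size, wrote);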
6080 */ 6081static void 6082l2arc_feed_thread(void *dummy __unused) 6083{ 6084 callb_cpr_t cpr; 6085 l2arc_dev_t *dev; 6086 spa_t *spa; 6087 uint64_t size, wrote; 6088 clock_t begin, next = ddi_get_lbolt(); 6089 boolean_t headroom_boost = B_FALSE; 6090 6091 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 6092 6093 mutex_enter(&l2arc_feed_thr_lock); 6094 6095 while (l2arc_thread_exit == 0) { 6096 CALLB_CPR_SAFE_BEGIN(&cpr); 6097 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 6098 next - ddi_get_lbolt()); 6099 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 6100 next = ddi_get_lbolt() + hz; 6101 6102 /* 6103 * Quick check for L2ARC devices. 6104 */ 6105 mutex_enter(&l2arc_dev_mtx); 6106 if (l2arc_ndev == 0) { 6107 mutex_exit(&l2arc_dev_mtx); 6108 continue; 6109 } 6110 mutex_exit(&l2arc_dev_mtx); 6111 begin = ddi_get_lbolt(); 6112 6113 /* 6114 * This selects the next l2arc device to write to, and in 6115 * doing so the next spa to feed from: dev->l2ad_spa. This 6116 * will return NULL if there are now no l2arc devices or if 6117 * they are all faulted. 6118 * 6119 * If a device is returned, its spa's config lock is also 6120 * held to prevent device removal. l2arc_dev_get_next() 6121 * will grab and release l2arc_dev_mtx. 6122 */ 6123 if ((dev = l2arc_dev_get_next()) == NULL) 6124 continue; 6125 6126 spa = dev->l2ad_spa; 6127 ASSERT(spa != NULL); 6128 6129 /* 6130 * If the pool is read-only then force the feed thread to 6131 * sleep a little longer. 6132 */ 6133 if (!spa_writeable(spa)) { 6134 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 6135 spa_config_exit(spa, SCL_L2ARC, dev); 6136 continue; 6137 } 6138 6139 /* 6140 * Avoid contributing to memory pressure. 6141 */ 6142 if (arc_reclaim_needed()) { 6143 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 6144 spa_config_exit(spa, SCL_L2ARC, dev); 6145 continue; 6146 } 6147 6148 ARCSTAT_BUMP(arcstat_l2_feeds); 6149 6150 size = l2arc_write_size(); 6151 6152 /* 6153 * Evict L2ARC buffers that will be overwritten. 6154 */ 6155 l2arc_evict(dev, size, B_FALSE); 6156 6157 /* 6158 * Write ARC buffers. 6159 */ 6160 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 6161 6162 /* 6163 * Calculate interval between writes. 6164 */ 6165 next = l2arc_write_interval(begin, size, wrote); 6166 spa_config_exit(spa, SCL_L2ARC, dev); 6167 } 6168 6169 l2arc_thread_exit = 0; 6170 cv_broadcast(&l2arc_feed_thr_cv); 6171 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 6172 thread_exit(); 6173} 6174 6175boolean_t 6176l2arc_vdev_present(vdev_t *vd) 6177{ 6178 l2arc_dev_t *dev; 6179 6180 mutex_enter(&l2arc_dev_mtx); 6181 for (dev = list_head(l2arc_dev_list); dev != NULL; 6182 dev = list_next(l2arc_dev_list, dev)) { 6183 if (dev->l2ad_vdev == vd) 6184 break; 6185 } 6186 mutex_exit(&l2arc_dev_mtx); 6187 6188 return (dev != NULL); 6189} 6190 6191/* 6192 * Add a vdev for use by the L2ARC. By this point the spa has already 6193 * validated the vdev and opened it. 6194 */ 6195void 6196l2arc_add_vdev(spa_t *spa, vdev_t *vd) 6197{ 6198 l2arc_dev_t *adddev; 6199 6200 ASSERT(!l2arc_vdev_present(vd)); 6201 6202 vdev_ashift_optimize(vd); 6203 6204 /* 6205 * Create a new l2arc device entry. 
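	 * The usable region starts just past the vdev labels and the write
	 * hand begins at l2ad_start; l2ad_first stays set until the hand has
	 * wrapped once, so the first sweep never needs to evict.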
6206 */ 6207 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 6208 adddev->l2ad_spa = spa; 6209 adddev->l2ad_vdev = vd; 6210 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 6211 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 6212 adddev->l2ad_hand = adddev->l2ad_start; 6213 adddev->l2ad_first = B_TRUE; 6214 adddev->l2ad_writing = B_FALSE; 6215 6216 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 6217 /* 6218 * This is a list of all ARC buffers that are still valid on the 6219 * device. 6220 */ 6221 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 6222 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 6223 6224 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 6225 refcount_create(&adddev->l2ad_alloc); 6226 6227 /* 6228 * Add device to global list 6229 */ 6230 mutex_enter(&l2arc_dev_mtx); 6231 list_insert_head(l2arc_dev_list, adddev); 6232 atomic_inc_64(&l2arc_ndev); 6233 mutex_exit(&l2arc_dev_mtx); 6234} 6235 6236/* 6237 * Remove a vdev from the L2ARC. 6238 */ 6239void 6240l2arc_remove_vdev(vdev_t *vd) 6241{ 6242 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 6243 6244 /* 6245 * Find the device by vdev 6246 */ 6247 mutex_enter(&l2arc_dev_mtx); 6248 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 6249 nextdev = list_next(l2arc_dev_list, dev); 6250 if (vd == dev->l2ad_vdev) { 6251 remdev = dev; 6252 break; 6253 } 6254 } 6255 ASSERT(remdev != NULL); 6256 6257 /* 6258 * Remove device from global list 6259 */ 6260 list_remove(l2arc_dev_list, remdev); 6261 l2arc_dev_last = NULL; /* may have been invalidated */ 6262 atomic_dec_64(&l2arc_ndev); 6263 mutex_exit(&l2arc_dev_mtx); 6264 6265 /* 6266 * Clear all buflists and ARC references. L2ARC device flush. 6267 */ 6268 l2arc_evict(remdev, 0, B_TRUE); 6269 list_destroy(&remdev->l2ad_buflist); 6270 mutex_destroy(&remdev->l2ad_mtx); 6271 refcount_destroy(&remdev->l2ad_alloc); 6272 kmem_free(remdev, sizeof (l2arc_dev_t)); 6273} 6274 6275void 6276l2arc_init(void) 6277{ 6278 l2arc_thread_exit = 0; 6279 l2arc_ndev = 0; 6280 l2arc_writes_sent = 0; 6281 l2arc_writes_done = 0; 6282 6283 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 6284 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 6285 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 6286 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 6287 6288 l2arc_dev_list = &L2ARC_dev_list; 6289 l2arc_free_on_write = &L2ARC_free_on_write; 6290 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 6291 offsetof(l2arc_dev_t, l2ad_node)); 6292 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 6293 offsetof(l2arc_data_free_t, l2df_list_node)); 6294} 6295 6296void 6297l2arc_fini(void) 6298{ 6299 /* 6300 * This is called from dmu_fini(), which is called from spa_fini(); 6301 * Because of this, we can assume that all l2arc devices have 6302 * already been removed when the pools themselves were removed. 
6303 */ 6304 6305 l2arc_do_free_on_write(); 6306 6307 mutex_destroy(&l2arc_feed_thr_lock); 6308 cv_destroy(&l2arc_feed_thr_cv); 6309 mutex_destroy(&l2arc_dev_mtx); 6310 mutex_destroy(&l2arc_free_on_write_mtx); 6311 6312 list_destroy(l2arc_dev_list); 6313 list_destroy(l2arc_free_on_write); 6314} 6315 6316void 6317l2arc_start(void) 6318{ 6319 if (!(spa_mode_global & FWRITE)) 6320 return; 6321 6322 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 6323 TS_RUN, minclsyspri); 6324} 6325 6326void 6327l2arc_stop(void) 6328{ 6329 if (!(spa_mode_global & FWRITE)) 6330 return; 6331 6332 mutex_enter(&l2arc_feed_thr_lock); 6333 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 6334 l2arc_thread_exit = 1; 6335 while (l2arc_thread_exit != 0) 6336 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 6337 mutex_exit(&l2arc_feed_thr_lock); 6338} 6339