/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache from growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.
 * So when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal ARC algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * ARC list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each ARC state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an ARC list lock you
 * must use mutex_tryenter() to avoid deadlock (an illustrative
 * sketch follows this comment).  Also note that the active state
 * mutex must be held before the ghost state mutex.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
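
/*
 * To illustrate the trylock rule above: this is a sketch only, not a
 * function in this file, and `list_mtx' and `hash_mtx' are placeholder
 * names rather than identifiers the ARC actually uses.
 *
 *	mutex_enter(list_mtx);			// ARC list lock first
 *	if (mutex_tryenter(hash_mtx)) {
 *		// Both locks held; safe to examine the header.
 *		mutex_exit(hash_mtx);
 *	} else {
 *		// A hash-lock holder may be waiting on this list lock;
 *		// since we never block here, no ABBA deadlock can form.
 *		// Skip this header (cf. the arcstat_mutex_miss stat).
 *	}
 *	mutex_exit(list_mtx);
 */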

/*
 * ARC operation:
 *
 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 * This structure can point either to a block that is still in the cache or to
 * one that is only accessible in an L2 ARC device, or it can provide
 * information about a block that was recently evicted.  If a block is
 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 * information to retrieve it from the L2ARC device.  This information is
 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t.  A block
 * that is in this state cannot access the data directly.
 *
 * Blocks that are actively being referenced or have not been evicted
 * are cached in the L1ARC.  The L1ARC (l1arc_buf_hdr_t) is a structure within
 * the arc_buf_hdr_t that will point to the data block in memory.  A block can
 * only be read by a consumer if it has an l1arc_buf_hdr_t.  The L1ARC
 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 *
 * The L1ARC's data pointer may or may not be uncompressed.  The ARC has the
 * ability to store the physical data (b_pabd) associated with the DVA of the
 * arc_buf_hdr_t.  Since the b_pabd is a copy of the on-disk physical block,
 * it will match its on-disk compression characteristics.  This behavior can be
 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE.  When the
 * compressed ARC functionality is disabled, the b_pabd will point to an
 * uncompressed version of the on-disk data.
 *
 * Data in the L1ARC is not accessed by consumers of the ARC directly.  Each
 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 * consumer.  The ARC will provide references to this data and will keep it
 * cached until it is no longer in use.  The ARC caches only the L1ARC's
 * physical data block and will evict any arc_buf_t that is no longer
 * referenced.  The amount of memory consumed by the arc_buf_ts' data buffers
 * can be seen via the "overhead_size" kstat.
 *
 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 * compressed form.  The typical case is that consumers will want uncompressed
 * data, and when that happens a new data buffer is allocated where the data is
 * decompressed for them to use.  Currently the only consumer who wants
 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 * exists on disk.  When this happens, the arc_buf_t's data buffer is shared
 * with the arc_buf_hdr_t.
 *
 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's.
 * The first one is owned by a compressed send consumer (and therefore
 * references the same compressed data buffer as the arc_buf_hdr_t) and the
 * second could be used by any other consumer (and has its own uncompressed
 * copy of the data buffer).
 *
 *   arc_buf_hdr_t
 *   +-----------+
 *   | fields    |
 *   | common to |
 *   | L1- and   |
 *   | L2ARC     |
 *   +-----------+
 *   | l2arc_buf_hdr_t
 *   |           |
 *   +-----------+
 *   | l1arc_buf_hdr_t
 *   |           |              arc_buf_t
 *   | b_buf     +------------>+-----------+      arc_buf_t
 *   | b_pabd    +-+           |b_next     +---->+-----------+
 *   +-----------+ |           |-----------|     |b_next     +-->NULL
 *                 |           |b_comp = T |     +-----------+
 *                 |           |b_data     +-+   |b_comp = F |
 *                 |           +-----------+ |   |b_data     +-+
 *                 +->+------+               |   +-----------+ |
 *        compressed  |      |               |                 |
 *           data     |      |<--------------+                 | uncompressed
 *                    +------+  compressed,                    |     data
 *                              shared                         +-->+------+
 *                               data                              |      |
 *                                                                 |      |
 *                                                                 +------+
 *
 * When a consumer reads a block, the ARC must first look to see if the
 * arc_buf_hdr_t is cached.  If the hdr is cached then the ARC allocates a new
 * arc_buf_t and either copies uncompressed data into a new data buffer from an
 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 * hdr is compressed and the desired compression characteristics of the
 * arc_buf_t consumer.  If the arc_buf_t ends up sharing data with the
 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 * be anywhere in the hdr's list.
 *
 * The diagram below shows an example of an uncompressed ARC hdr that is
 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 * the last element in the buf list):
 *
 *                arc_buf_hdr_t
 *                +-----------+
 *                |           |
 *                |           |
 *                |           |
 *                +-----------+
 * l2arc_buf_hdr_t|           |
 *                |           |
 *                +-----------+
 * l1arc_buf_hdr_t|           |
 *                |           |                 arc_buf_t    (shared)
 *                |    b_buf  +------------>+---------+      arc_buf_t
 *                |           |             |b_next   +---->+---------+
 *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
 *                +-----------+ |           |         |     +---------+
 *                              |           |b_data   +-+   |         |
 *                              |           +---------+ |   |b_data   +-+
 *                              +->+------+             |   +---------+ |
 *                                 |      |             |               |
 *                   uncompressed  |      |             |               |
 *                        data     +------+             |               |
 *                                    ^                 +->+------+     |
 *                                    |     uncompressed   |      |     |
 *                                    |         data       |      |     |
 *                                    |                    +------+     |
 *                                    +---------------------------------+
 *
 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 * since the physical block is about to be rewritten.  The new data contents
 * will be contained in the arc_buf_t.  As the I/O pipeline performs the write,
 * it may compress the data before writing it to disk.  The ARC will be called
 * with the transformed data and will bcopy the transformed on-disk block into
 * a newly allocated b_pabd.
 * Writes are always done into buffers which have either been loaned
 * (and hence are new and don't have other readers) or buffers which
 * have been released (and hence have their own hdr, if there were
 * originally other readers of the buf's original hdr).  This ensures that
 * the ARC only needs to update a single buf and its hdr after a write occurs.
 *
 * When the L2ARC is in use, it will also take advantage of the b_pabd.  The
 * L2ARC will always write the contents of b_pabd to the L2ARC.  This means
 * that when compressed ARC is enabled, the L2ARC blocks are identical
 * to the on-disk block in the main data pool.  This provides a significant
 * advantage since the ARC can leverage the bp's checksum when reading from the
 * L2ARC to determine if the contents are valid.  However, if the compressed
 * ARC is disabled, then the L2ARC's block must be transformed to look
 * like the physical block in the main data pool before comparing the
 * checksum and determining its validity.
 */
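
/*
 * To make the read path described above concrete, here is an
 * illustrative sketch (pseudocode, not this file's actual helper) of
 * how a new arc_buf_t gets its data on a cache hit:
 *
 *	if (consumer wants compressed data && hdr's b_pabd is compressed)
 *		share b_pabd with the new arc_buf_t;
 *	else if (hdr's b_pabd is compressed)
 *		decompress b_pabd into a newly allocated b_data;
 *	else if (nothing else is sharing the uncompressed b_pabd)
 *		share b_pabd (this buf must be last in the b_buf list);
 *	else
 *		bcopy the data from an existing uncompressed arc_buf_t;
 */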

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/multilist.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <sys/zthr.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>
#include <sys/aggsum.h>
#include <sys/cityhash.h>

#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

/*
 * This thread's job is to keep enough free memory in the system, by
 * calling arc_kmem_reap_now() plus arc_shrink(), which improves
 * arc_available_memory().
 */
static zthr_t		*arc_reap_zthr;

/*
 * This thread's job is to keep arc_size under arc_c, by calling
 * arc_adjust(), which improves arc_is_overflowing().
 */
static zthr_t		*arc_adjust_zthr;

static kmutex_t		arc_adjust_lock;
static kcondvar_t	arc_adjust_waiters_cv;
static boolean_t	arc_adjust_needed = B_FALSE;

static kmutex_t		arc_dnlc_evicts_lock;
static kcondvar_t	arc_dnlc_evicts_cv;
static boolean_t	arc_dnlc_evicts_thread_exit;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist.  A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/* number of seconds before growing cache again */
int arc_grow_retry = 60;

/*
 * Minimum time between calls to arc_kmem_reap_soon().  Note that this will
 * be converted to ticks, so with the default hz=100, a setting of 15 ms
 * will actually wait 2 ticks, or 20ms.
 */
int arc_kmem_cache_reap_retry_ms = 1000;

/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
int arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
int arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow (see the worked example below).
 */
int arc_no_grow_shift = 5;
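
/*
 * A worked example of the two shifts above, assuming arc_c = 4GB with
 * the default values: growth requires arc_c >> arc_no_grow_shift =
 * 4GB >> 5 = 128MB of free memory, while one shrink step releases only
 * arc_c >> arc_shrink_shift = 4GB >> 7 = 32MB.  Because 32MB < 128MB,
 * shrinking the ARC can never, by itself, free enough memory to
 * immediately re-enable growth, which is why arc_no_grow_shift must be
 * less than arc_shrink_shift.
 */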

/*
 * minimum lifespan of a prefetch block (in milliseconds;
 * initialized in arc_init())
 */
static int		zfs_arc_min_prefetch_ms = 1;
static int		zfs_arc_min_prescient_prefetch_ms = 6;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static boolean_t arc_initialized;
extern boolean_t zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * log2 fraction of the zio arena to keep free.
 */
int arc_zio_arena_free_shift = 2;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_no_grow_shift = 0;
int zfs_arc_p_min_shift = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;

/*
 * ARC dirty data constraints for arc_tempreserve_space() throttle
 */
uint_t zfs_arc_dirty_limit_percent = 50;	/* total dirty data limit */
uint_t zfs_arc_anon_limit_percent = 25;		/* anon block dirty limit */
uint_t zfs_arc_pool_dirty_percent = 20;		/* each pool's anon allowance */

boolean_t zfs_compressed_arc_enabled = B_TRUE;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);

#if defined(__FreeBSD__) && defined(_KERNEL)
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
    0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
    "log2(fraction of ARC which must be free to allow growing)");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
    &arc_grow_retry, 0,
    "Wait in seconds before considering growing ARC");
SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
    &zfs_compressed_arc_enabled, 0,
    "Enable compressed ARC");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN,
    &arc_kmem_cache_reap_retry_ms, 0,
    "Interval between ARC kmem_cache reapings");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}

/*
 * Must be declared here, before the definition of the corresponding kstat
 * macro, which uses the same names and would otherwise confuse the compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they
 * are linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	refcount_t arcs_size;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;
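
/*
 * An illustration of how the accounting above is maintained (a sketch;
 * the file's actual helpers and tags differ): when the last hold on a
 * buffer is dropped it becomes evictable, so its bytes are added to
 * arcs_esize for its buffer type while remaining counted in arcs_size;
 * taking a hold reverses this:
 *
 *	(void) refcount_add_many(&state->arcs_esize[type], size, tag);
 *	...
 *	(void) refcount_remove_many(&state->arcs_esize[type], size, tag);
 *
 * arcs_size only changes when data enters or leaves the state, so the
 * sum of the arcs_esize counts never exceeds arcs_size.
 */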

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped when updating the access state due to the
	 * header having already been released after acquiring the hash lock.
	 */
	kstat_named_t arcstat_access_skip;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
	 */
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_size;
	/*
	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
	 * Note that the compressed bytes may match the uncompressed bytes
	 * if the block is either not compressed or compressed arc is disabled.
	 */
	kstat_named_t arcstat_compressed_size;
	/*
	 * Uncompressed size of the data stored in b_pabd.  If compressed
	 * arc is disabled then this value will be identical to the stat
	 * above.
	 */
	kstat_named_t arcstat_uncompressed_size;
	/*
	 * Number of bytes stored in all the arc_buf_t's.  This is classified
	 * as "overhead" since this data is typically short-lived and will
	 * be evicted from the arc when it becomes unreferenced unless the
	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
	 * values have been set (see comment in dbuf.c for more information).
	 */
	kstat_named_t arcstat_overhead_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers.  This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA.  This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA.  This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by various buffers and structures
	 * not actually backed with ARC buffers.  This includes bonus
This includes bonus 650286574Smav * buffers (allocated directly via zio_buf_* functions), 651286574Smav * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 652286574Smav * cache), and dnode_t structures (allocated via dnode_t cache). 653332540Smav * Not updated directly; only synced in arc_kstat_update. 654286574Smav */ 655208373Smm kstat_named_t arcstat_other_size; 656286574Smav /* 657286574Smav * Total number of bytes consumed by ARC buffers residing in the 658286574Smav * arc_anon state. This includes *all* buffers in the arc_anon 659286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 660286574Smav * are all included in this value. 661332540Smav * Not updated directly; only synced in arc_kstat_update. 662286574Smav */ 663286574Smav kstat_named_t arcstat_anon_size; 664286574Smav /* 665286574Smav * Number of bytes consumed by ARC buffers that meet the 666286574Smav * following criteria: backing buffers of type ARC_BUFC_DATA, 667286574Smav * residing in the arc_anon state, and are eligible for eviction 668286574Smav * (e.g. have no outstanding holds on the buffer). 669332540Smav * Not updated directly; only synced in arc_kstat_update. 670286574Smav */ 671286574Smav kstat_named_t arcstat_anon_evictable_data; 672286574Smav /* 673286574Smav * Number of bytes consumed by ARC buffers that meet the 674286574Smav * following criteria: backing buffers of type ARC_BUFC_METADATA, 675286574Smav * residing in the arc_anon state, and are eligible for eviction 676286574Smav * (e.g. have no outstanding holds on the buffer). 677332540Smav * Not updated directly; only synced in arc_kstat_update. 678286574Smav */ 679286574Smav kstat_named_t arcstat_anon_evictable_metadata; 680286574Smav /* 681286574Smav * Total number of bytes consumed by ARC buffers residing in the 682286574Smav * arc_mru state. This includes *all* buffers in the arc_mru 683286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 684286574Smav * are all included in this value. 685332540Smav * Not updated directly; only synced in arc_kstat_update. 686286574Smav */ 687286574Smav kstat_named_t arcstat_mru_size; 688286574Smav /* 689286574Smav * Number of bytes consumed by ARC buffers that meet the 690286574Smav * following criteria: backing buffers of type ARC_BUFC_DATA, 691286574Smav * residing in the arc_mru state, and are eligible for eviction 692286574Smav * (e.g. have no outstanding holds on the buffer). 693332540Smav * Not updated directly; only synced in arc_kstat_update. 694286574Smav */ 695286574Smav kstat_named_t arcstat_mru_evictable_data; 696286574Smav /* 697286574Smav * Number of bytes consumed by ARC buffers that meet the 698286574Smav * following criteria: backing buffers of type ARC_BUFC_METADATA, 699286574Smav * residing in the arc_mru state, and are eligible for eviction 700286574Smav * (e.g. have no outstanding holds on the buffer). 701332540Smav * Not updated directly; only synced in arc_kstat_update. 702286574Smav */ 703286574Smav kstat_named_t arcstat_mru_evictable_metadata; 704286574Smav /* 705286574Smav * Total number of bytes that *would have been* consumed by ARC 706286574Smav * buffers in the arc_mru_ghost state. The key thing to note 707286574Smav * here, is the fact that this size doesn't actually indicate 708286574Smav * RAM consumption. The ghost lists only consist of headers and 709286574Smav * don't actually have ARC buffers linked off of these headers. 
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state.  This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state.  See the comment above
	 * arcstat_mru_ghost_size for more details.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_lsize;
	kstat_named_t arcstat_l2_psize;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_async_upgrade_sync;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
	kstat_named_t arcstat_demand_hit_prescient_prefetch;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "allocated",			KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "access_skip",		KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "compressed_size",		KSTAT_DATA_UINT64 },
	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
	{ "overhead_size",		KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "metadata_size",		KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "anon_size",			KSTAT_DATA_UINT64 },
	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_size",			KSTAT_DATA_UINT64 },
	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size",			KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_asize",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
	{ "l2_write_full",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
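
/*
 * Typical usage of the max-tracking macro above (a sketch; the hash
 * table insert path does something along these lines):
 *
 *	ARCSTAT_BUMP(arcstat_hash_elements);
 *	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 *
 * ARCSTAT_MAX() loops on atomic_cas_64() so concurrent updaters can
 * race to record a new maximum without taking a lock; a stale read
 * simply causes another pass around the loop.
 */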

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
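
/*
 * For example, a hedged sketch of how a read hit might be classified
 * (the predicates here are illustrative):
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits, or
 * arcstat_prefetch_metadata_hits, depending on the two conditions.
 */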

kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

/* compressed size of entire arc */
#define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
#define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
/* number of bytes in the arc from arc_buf_t's */
#define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)

/*
 * There are also some ARC variables that we want to export, but that are
 * updated so often that having the canonical representation be the statistic
 * variable causes a performance bottleneck.  We want to use aggsum_t's for
 * these instead, but still be able to export the kstat in the same way as
 * before.  The solution is to always use the aggsum version, except in the
 * kstat update callback (see the sketch below).
 */
aggsum_t arc_size;
aggsum_t arc_meta_used;
aggsum_t astat_data_size;
aggsum_t astat_metadata_size;
aggsum_t astat_hdr_size;
aggsum_t astat_other_size;
aggsum_t astat_l2_hdr_size;
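
/*
 * A sketch of the aggsum pattern described above: hot paths only add
 * deltas, and just the kstat update callback pays the cost of
 * materializing the current value (the surrounding code here is
 * illustrative):
 *
 *	aggsum_add(&arc_size, space);		// cheap, scales with CPUs
 *	...
 *	// in the kstat update callback:
 *	ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
 *
 * aggsum_value() is comparatively expensive, which is why the aggsum,
 * not the kstat, is the canonical representation.
 */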

static int		arc_no_grow;	/* Don't try to grow cache size */
static hrtime_t		arc_growtime;
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_read_done_func_t	*acb_done;
	arc_buf_t		*acb_buf;
	boolean_t		acb_compressed;
	zio_t			*acb_zio_dummy;
	zio_t			*acb_zio_head;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void			*awcb_private;
	arc_write_done_func_t	*awcb_ready;
	arc_write_done_func_t	*awcb_children_ready;
	arc_write_done_func_t	*awcb_physdone;
	arc_write_done_func_t	*awcb_done;
	arc_buf_t		*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache.  The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately to save a
 * couple of words in pointers.  arc_hdr_realloc() is used to switch a header
 * between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;
#ifdef ZFS_DEBUG
	/*
	 * Used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;

	arc_buf_contents_t	b_type;
	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/*
	 * This field stores the size of the data buffer after
	 * compression, and is set in the arc's zio completion handlers.
	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
	 *
	 * While the block pointers can store up to 32MB in their psize
	 * field, we can only store up to 32MB minus 512B.  This is due
	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
	 * a field of zeros represents 512B in the bp).  We can't use a
	 * bias of 1 since we need to reserve a psize of zero, here, to
	 * represent holes and embedded blocks.
	 *
	 * This isn't a problem in practice, since the maximum size of a
	 * buffer is limited to 16MB, so we never need to store 32MB in
	 * this field.  Even in the upstream illumos code base, the
	 * maximum size of a buffer is limited to 16MB.
	 */
	uint16_t		b_psize;

	/*
	 * This field stores the size of the data buffer before
	 * compression, and cannot change once set.  It is in units
	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
	 */
	uint16_t		b_lsize;	/* immutable */
	uint64_t		b_spa;		/* immutable */
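
	/*
	 * A worked example of the SPA_MINBLOCKSIZE encoding above: a
	 * 512B physical block is stored as b_psize = 1, and a 128KB
	 * block as b_psize = 128K / 512 = 256.  The conversion is a
	 * shift by SPA_MINBLOCKSHIFT, along the lines of (illustrative,
	 * mirroring the accessors used elsewhere in this file):
	 *
	 *	bytes = (uint64_t)hdr->b_psize << SPA_MINBLOCKSHIFT;
	 *	hdr->b_psize = (uint16_t)(bytes >> SPA_MINBLOCKSHIFT);
	 */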
Undefined when in l2arc_only state */ 1140286570Smav l1arc_buf_hdr_t b_l1hdr; 1141168404Spjd}; 1142168404Spjd 1143302265Ssmh#if defined(__FreeBSD__) && defined(_KERNEL) 1144275748Sdelphijstatic int 1145275748Sdelphijsysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 1146275748Sdelphij{ 1147275748Sdelphij uint64_t val; 1148275748Sdelphij int err; 1149275748Sdelphij 1150275748Sdelphij val = arc_meta_limit; 1151275748Sdelphij err = sysctl_handle_64(oidp, &val, 0, req); 1152275748Sdelphij if (err != 0 || req->newptr == NULL) 1153275748Sdelphij return (err); 1154275748Sdelphij 1155275748Sdelphij if (val <= 0 || val > arc_c_max) 1156275748Sdelphij return (EINVAL); 1157275748Sdelphij 1158275748Sdelphij arc_meta_limit = val; 1159275748Sdelphij return (0); 1160275748Sdelphij} 1161302265Ssmh 1162302265Ssmhstatic int 1163323667Sbaptsysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) 1164323667Sbapt{ 1165323667Sbapt uint32_t val; 1166323667Sbapt int err; 1167323667Sbapt 1168323667Sbapt val = arc_no_grow_shift; 1169323667Sbapt err = sysctl_handle_32(oidp, &val, 0, req); 1170323667Sbapt if (err != 0 || req->newptr == NULL) 1171323667Sbapt return (err); 1172323667Sbapt 1173323667Sbapt if (val >= arc_shrink_shift) 1174323667Sbapt return (EINVAL); 1175323667Sbapt 1176323667Sbapt arc_no_grow_shift = val; 1177323667Sbapt return (0); 1178323667Sbapt} 1179323667Sbapt 1180323667Sbaptstatic int 1181302265Ssmhsysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) 1182302265Ssmh{ 1183302265Ssmh uint64_t val; 1184302265Ssmh int err; 1185302265Ssmh 1186302265Ssmh val = zfs_arc_max; 1187302265Ssmh err = sysctl_handle_64(oidp, &val, 0, req); 1188302265Ssmh if (err != 0 || req->newptr == NULL) 1189302265Ssmh return (err); 1190302265Ssmh 1191302382Ssmh if (zfs_arc_max == 0) { 1192302382Ssmh /* Loader tunable so blindly set */ 1193302382Ssmh zfs_arc_max = val; 1194302382Ssmh return (0); 1195302382Ssmh } 1196302382Ssmh 1197302265Ssmh if (val < arc_abs_min || val > kmem_size()) 1198302265Ssmh return (EINVAL); 1199302265Ssmh if (val < arc_c_min) 1200302265Ssmh return (EINVAL); 1201302265Ssmh if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) 1202302265Ssmh return (EINVAL); 1203302265Ssmh 1204302265Ssmh arc_c_max = val; 1205302265Ssmh 1206302265Ssmh arc_c = arc_c_max; 1207302265Ssmh arc_p = (arc_c >> 1); 1208302265Ssmh 1209302265Ssmh if (zfs_arc_meta_limit == 0) { 1210302265Ssmh /* limit meta-data to 1/4 of the arc capacity */ 1211302265Ssmh arc_meta_limit = arc_c_max / 4; 1212302265Ssmh } 1213302265Ssmh 1214302265Ssmh /* if kmem_flags are set, lets try to use less memory */ 1215302265Ssmh if (kmem_debugging()) 1216302265Ssmh arc_c = arc_c / 2; 1217302265Ssmh 1218302265Ssmh zfs_arc_max = arc_c; 1219302265Ssmh 1220302265Ssmh return (0); 1221302265Ssmh} 1222302265Ssmh 1223302265Ssmhstatic int 1224302265Ssmhsysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) 1225302265Ssmh{ 1226302265Ssmh uint64_t val; 1227302265Ssmh int err; 1228302265Ssmh 1229302265Ssmh val = zfs_arc_min; 1230302265Ssmh err = sysctl_handle_64(oidp, &val, 0, req); 1231302265Ssmh if (err != 0 || req->newptr == NULL) 1232302265Ssmh return (err); 1233302265Ssmh 1234302382Ssmh if (zfs_arc_min == 0) { 1235302382Ssmh /* Loader tunable so blindly set */ 1236302382Ssmh zfs_arc_min = val; 1237302382Ssmh return (0); 1238302382Ssmh } 1239302382Ssmh 1240302265Ssmh if (val < arc_abs_min || val > arc_c_max) 1241302265Ssmh return (EINVAL); 1242302265Ssmh 1243302265Ssmh arc_c_min = val; 1244302265Ssmh 1245302265Ssmh if (zfs_arc_meta_min == 0) 1246302265Ssmh arc_meta_min = arc_c_min / 2; 
1247302265Ssmh 1248302265Ssmh if (arc_c < arc_c_min) 1249302265Ssmh arc_c = arc_c_min; 1250302265Ssmh 1251302265Ssmh zfs_arc_min = arc_c_min; 1252302265Ssmh 1253302265Ssmh return (0); 1254302265Ssmh} 1255275748Sdelphij#endif 1256275748Sdelphij 1257168404Spjd#define GHOST_STATE(state) \ 1258185029Spjd ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 1259185029Spjd (state) == arc_l2c_only) 1260168404Spjd 1261275811Sdelphij#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 1262275811Sdelphij#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 1263275811Sdelphij#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 1264275811Sdelphij#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 1265339034Ssef#define HDR_PRESCIENT_PREFETCH(hdr) \ 1266339034Ssef ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) 1267307265Smav#define HDR_COMPRESSION_ENABLED(hdr) \ 1268307265Smav ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) 1269286570Smav 1270275811Sdelphij#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 1271275811Sdelphij#define HDR_L2_READING(hdr) \ 1272307265Smav (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 1273307265Smav ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 1274275811Sdelphij#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 1275275811Sdelphij#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 1276275811Sdelphij#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 1277307265Smav#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) 1278168404Spjd 1279286570Smav#define HDR_ISTYPE_METADATA(hdr) \ 1280307265Smav ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 1281286570Smav#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 1282286570Smav 1283286570Smav#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 1284286570Smav#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 1285286570Smav 1286307265Smav/* For storing compression mode in b_flags */ 1287307265Smav#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) 1288307265Smav 1289307265Smav#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ 1290307265Smav HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) 1291307265Smav#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ 1292307265Smav HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); 1293307265Smav 1294307265Smav#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) 1295321535Smav#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) 1296321535Smav#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) 1297307265Smav 1298168404Spjd/* 1299185029Spjd * Other sizes 1300185029Spjd */ 1301185029Spjd 1302286570Smav#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 1303286570Smav#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 1304185029Spjd 1305185029Spjd/* 1306168404Spjd * Hash table routines 1307168404Spjd */ 1308168404Spjd 1309205253Skmacy#define HT_LOCK_PAD CACHE_LINE_SIZE 1310168404Spjd 1311168404Spjdstruct ht_lock { 1312168404Spjd kmutex_t ht_lock; 1313168404Spjd#ifdef _KERNEL 1314168404Spjd unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 1315168404Spjd#endif 1316168404Spjd}; 1317168404Spjd 1318168404Spjd#define BUF_LOCKS 256 1319168404Spjdtypedef struct buf_hash_table { 1320168404Spjd uint64_t ht_mask; 1321168404Spjd arc_buf_hdr_t **ht_table; 1322205264Skmacy struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 1323168404Spjd} 
buf_hash_table_t; 1324168404Spjd 1325168404Spjdstatic buf_hash_table_t buf_hash_table; 1326168404Spjd 1327168404Spjd#define BUF_HASH_INDEX(spa, dva, birth) \ 1328168404Spjd (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 1329168404Spjd#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 1330168404Spjd#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 1331219089Spjd#define HDR_LOCK(hdr) \ 1332219089Spjd (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 1333168404Spjd 1334168404Spjduint64_t zfs_crc64_table[256]; 1335168404Spjd 1336185029Spjd/* 1337185029Spjd * Level 2 ARC 1338185029Spjd */ 1339185029Spjd 1340272707Savg#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 1341251478Sdelphij#define L2ARC_HEADROOM 2 /* num of writes */ 1342251478Sdelphij/* 1343251478Sdelphij * If we discover during ARC scan any buffers to be compressed, we boost 1344251478Sdelphij * our headroom for the next scanning cycle by this percentage multiple. 1345251478Sdelphij */ 1346251478Sdelphij#define L2ARC_HEADROOM_BOOST 200 1347208373Smm#define L2ARC_FEED_SECS 1 /* caching interval secs */ 1348208373Smm#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 1349185029Spjd 1350185029Spjd#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 1351185029Spjd#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 1352185029Spjd 1353251631Sdelphij/* L2ARC Performance Tunables */ 1354185029Spjduint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 1355185029Spjduint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 1356185029Spjduint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 1357251478Sdelphijuint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 1358185029Spjduint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 1359208373Smmuint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 1360219089Spjdboolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 1361208373Smmboolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 1362208373Smmboolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 1363185029Spjd 1364345121SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, 1365205231Skmacy &l2arc_write_max, 0, "max write size"); 1366345121SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, 1367205231Skmacy &l2arc_write_boost, 0, "extra write during warmup"); 1368345121SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, 1369205231Skmacy &l2arc_headroom, 0, "number of dev writes"); 1370345121SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, 1371205231Skmacy &l2arc_feed_secs, 0, "interval seconds"); 1372345121SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, 1373208373Smm &l2arc_feed_min_ms, 0, "min interval milliseconds"); 1374205231Skmacy 1375345121SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, 1376205231Skmacy &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 1377345121SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, 1378208373Smm &l2arc_feed_again, 0, "turbo warmup"); 1379345121SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, 1380208373Smm &l2arc_norw, 0, "no reads during writes"); 1381205231Skmacy 1382217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 1383286770Smav &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); 1384307265SmavSYSCTL_UQUAD(_vfs_zfs, 
OID_AUTO, anon_metadata_esize, CTLFLAG_RD, 1385307265Smav &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1386307265Smav "size of evictable metadata in anonymous state"); 1387307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, 1388307265Smav &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1389307265Smav "size of evictable data in anonymous state"); 1390205231Skmacy 1391217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1392286770Smav &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); 1393307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, 1394307265Smav &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1395307265Smav "size of evictable metadata in mru state"); 1396307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, 1397307265Smav &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1398307265Smav "size of evictable data in mru state"); 1399205231Skmacy 1400217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1401286770Smav &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); 1402307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, 1403307265Smav &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1404205231Skmacy "size of evictable metadata in mru ghost state"); 1405307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, 1406307265Smav &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1407205231Skmacy "size of evictable data in mru ghost state"); 1408205231Skmacy 1409217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1410286770Smav &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); 1411307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, 1412307265Smav &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1413307265Smav "size of evictable metadata in mfu state"); 1414307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, 1415307265Smav &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1416307265Smav "size of evictable data in mfu state"); 1417205231Skmacy 1418217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1419286770Smav &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); 1420307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, 1421307265Smav &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1422205231Skmacy "size of evictable metadata in mfu ghost state"); 1423307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, 1424307265Smav &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1425205231Skmacy "size of evictable data in mfu ghost state"); 1426205231Skmacy 1427217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1428286770Smav &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state"); 1429205231Skmacy 1430339034SsefSYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW, 1431339034Ssef &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms"); 1432339034SsefSYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW, 1433339034Ssef &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms"); 1434339034Ssef 1435185029Spjd/* 1436185029Spjd * L2ARC Internals 1437185029Spjd */ 1438286570Smavstruct l2arc_dev { 1439185029Spjd vdev_t *l2ad_vdev; /* vdev */ 1440185029Spjd spa_t *l2ad_spa; /* spa */ 1441185029Spjd uint64_t l2ad_hand; /* next write location */ 1442185029Spjd uint64_t l2ad_start; /* first addr on device */ 1443185029Spjd uint64_t l2ad_end; /* last addr on device */ 1444185029Spjd boolean_t l2ad_first; /* first
sweep through */ 1445208373Smm boolean_t l2ad_writing; /* currently writing */ 1446286570Smav kmutex_t l2ad_mtx; /* lock for buffer list */ 1447286570Smav list_t l2ad_buflist; /* buffer list */ 1448185029Spjd list_node_t l2ad_node; /* device list node */ 1449286598Smav refcount_t l2ad_alloc; /* allocated bytes */ 1450286570Smav}; 1451185029Spjd 1452185029Spjdstatic list_t L2ARC_dev_list; /* device list */ 1453185029Spjdstatic list_t *l2arc_dev_list; /* device list pointer */ 1454185029Spjdstatic kmutex_t l2arc_dev_mtx; /* device list mutex */ 1455185029Spjdstatic l2arc_dev_t *l2arc_dev_last; /* last device used */ 1456185029Spjdstatic list_t L2ARC_free_on_write; /* free after write buf list */ 1457185029Spjdstatic list_t *l2arc_free_on_write; /* free after write list ptr */ 1458185029Spjdstatic kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1459185029Spjdstatic uint64_t l2arc_ndev; /* number of devices */ 1460185029Spjd 1461185029Spjdtypedef struct l2arc_read_callback { 1462321535Smav arc_buf_hdr_t *l2rcb_hdr; /* read header */ 1463251478Sdelphij blkptr_t l2rcb_bp; /* original blkptr */ 1464268123Sdelphij zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1465251478Sdelphij int l2rcb_flags; /* original flags */ 1466321613Smav abd_t *l2rcb_abd; /* temporary buffer */ 1467185029Spjd} l2arc_read_callback_t; 1468185029Spjd 1469185029Spjdtypedef struct l2arc_write_callback { 1470185029Spjd l2arc_dev_t *l2wcb_dev; /* device info */ 1471185029Spjd arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1472185029Spjd} l2arc_write_callback_t; 1473185029Spjd 1474185029Spjdtypedef struct l2arc_data_free { 1475185029Spjd /* protected by l2arc_free_on_write_mtx */ 1476321610Smav abd_t *l2df_abd; 1477185029Spjd size_t l2df_size; 1478307265Smav arc_buf_contents_t l2df_type; 1479185029Spjd list_node_t l2df_list_node; 1480185029Spjd} l2arc_data_free_t; 1481185029Spjd 1482185029Spjdstatic kmutex_t l2arc_feed_thr_lock; 1483185029Spjdstatic kcondvar_t l2arc_feed_thr_cv; 1484185029Spjdstatic uint8_t l2arc_thread_exit; 1485185029Spjd 1486349216Savgstatic abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); 1487307265Smavstatic void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); 1488349216Savgstatic void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); 1489321610Smavstatic void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); 1490307265Smavstatic void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); 1491321610Smavstatic void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); 1492321610Smavstatic void arc_hdr_free_pabd(arc_buf_hdr_t *); 1493349216Savgstatic void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t); 1494275811Sdelphijstatic void arc_access(arc_buf_hdr_t *, kmutex_t *); 1495286763Smavstatic boolean_t arc_is_overflowing(); 1496275811Sdelphijstatic void arc_buf_watch(arc_buf_t *); 1497275811Sdelphij 1498286570Smavstatic arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1499286570Smavstatic uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1500307265Smavstatic inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1501307265Smavstatic inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1502286570Smav 1503275811Sdelphijstatic boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1504275811Sdelphijstatic void l2arc_read_done(zio_t *); 1505185029Spjd 1506290191Savgstatic void 1507290191Savgl2arc_trim(const arc_buf_hdr_t *hdr) 1508290191Savg{ 1509290191Savg l2arc_dev_t *dev = 
hdr->b_l2hdr.b_dev; 1510290191Savg 1511290191Savg ASSERT(HDR_HAS_L2HDR(hdr)); 1512290191Savg ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 1513290191Savg 1514307265Smav if (HDR_GET_PSIZE(hdr) != 0) { 1515290191Savg trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 1516307265Smav HDR_GET_PSIZE(hdr), 0); 1517290191Savg } 1518290191Savg} 1519290191Savg 1520332540Smav/* 1521332540Smav * We use Cityhash for this. It's fast, and has good hash properties without 1522332540Smav * requiring any large static buffers. 1523332540Smav */ 1524168404Spjdstatic uint64_t 1525209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1526168404Spjd{ 1527332540Smav return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); 1528168404Spjd} 1529168404Spjd 1530307265Smav#define HDR_EMPTY(hdr) \ 1531307265Smav ((hdr)->b_dva.dva_word[0] == 0 && \ 1532307265Smav (hdr)->b_dva.dva_word[1] == 0) 1533168404Spjd 1534307265Smav#define HDR_EQUAL(spa, dva, birth, hdr) \ 1535307265Smav ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1536307265Smav ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1537307265Smav ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) 1538168404Spjd 1539219089Spjdstatic void 1540219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr) 1541219089Spjd{ 1542219089Spjd hdr->b_dva.dva_word[0] = 0; 1543219089Spjd hdr->b_dva.dva_word[1] = 0; 1544219089Spjd hdr->b_birth = 0; 1545219089Spjd} 1546219089Spjd 1547168404Spjdstatic arc_buf_hdr_t * 1548268075Sdelphijbuf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1549168404Spjd{ 1550268075Sdelphij const dva_t *dva = BP_IDENTITY(bp); 1551268075Sdelphij uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1552168404Spjd uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1553168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1554275811Sdelphij arc_buf_hdr_t *hdr; 1555168404Spjd 1556168404Spjd mutex_enter(hash_lock); 1557275811Sdelphij for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1558275811Sdelphij hdr = hdr->b_hash_next) { 1559307265Smav if (HDR_EQUAL(spa, dva, birth, hdr)) { 1560168404Spjd *lockp = hash_lock; 1561275811Sdelphij return (hdr); 1562168404Spjd } 1563168404Spjd } 1564168404Spjd mutex_exit(hash_lock); 1565168404Spjd *lockp = NULL; 1566168404Spjd return (NULL); 1567168404Spjd} 1568168404Spjd 1569168404Spjd/* 1570168404Spjd * Insert an entry into the hash table. If there is already an element 1571168404Spjd * equal to elem in the hash table, then the already existing element 1572168404Spjd * will be returned and the new element will not be inserted. 1573168404Spjd * Otherwise returns NULL. 1574286570Smav * If lockp == NULL, the caller is assumed to already hold the hash lock. 
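 *
 * A minimal lookup-then-insert sketch (illustrative only; real
 * callers such as arc_read() add full error and race handling):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *found = buf_hash_find(guid, bp, &hash_lock);
 *	if (found == NULL) {
 *		... allocate and initialize a new hdr ...
 *		found = buf_hash_insert(hdr, &hash_lock);
 *		if (found != NULL)
 *			... lost the race; someone inserted first ...
 *	}
 *	mutex_exit(hash_lock);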
1575168404Spjd */ 1576168404Spjdstatic arc_buf_hdr_t * 1577275811Sdelphijbuf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1578168404Spjd{ 1579275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1580168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1581275811Sdelphij arc_buf_hdr_t *fhdr; 1582168404Spjd uint32_t i; 1583168404Spjd 1584275811Sdelphij ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1585275811Sdelphij ASSERT(hdr->b_birth != 0); 1586275811Sdelphij ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1587286570Smav 1588286570Smav if (lockp != NULL) { 1589286570Smav *lockp = hash_lock; 1590286570Smav mutex_enter(hash_lock); 1591286570Smav } else { 1592286570Smav ASSERT(MUTEX_HELD(hash_lock)); 1593286570Smav } 1594286570Smav 1595275811Sdelphij for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1596275811Sdelphij fhdr = fhdr->b_hash_next, i++) { 1597307265Smav if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1598275811Sdelphij return (fhdr); 1599168404Spjd } 1600168404Spjd 1601275811Sdelphij hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1602275811Sdelphij buf_hash_table.ht_table[idx] = hdr; 1603307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1604168404Spjd 1605168404Spjd /* collect some hash table performance data */ 1606168404Spjd if (i > 0) { 1607168404Spjd ARCSTAT_BUMP(arcstat_hash_collisions); 1608168404Spjd if (i == 1) 1609168404Spjd ARCSTAT_BUMP(arcstat_hash_chains); 1610168404Spjd 1611168404Spjd ARCSTAT_MAX(arcstat_hash_chain_max, i); 1612168404Spjd } 1613168404Spjd 1614168404Spjd ARCSTAT_BUMP(arcstat_hash_elements); 1615168404Spjd ARCSTAT_MAXSTAT(arcstat_hash_elements); 1616168404Spjd 1617168404Spjd return (NULL); 1618168404Spjd} 1619168404Spjd 1620168404Spjdstatic void 1621275811Sdelphijbuf_hash_remove(arc_buf_hdr_t *hdr) 1622168404Spjd{ 1623275811Sdelphij arc_buf_hdr_t *fhdr, **hdrp; 1624275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1625168404Spjd 1626168404Spjd ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1627275811Sdelphij ASSERT(HDR_IN_HASH_TABLE(hdr)); 1628168404Spjd 1629275811Sdelphij hdrp = &buf_hash_table.ht_table[idx]; 1630275811Sdelphij while ((fhdr = *hdrp) != hdr) { 1631307265Smav ASSERT3P(fhdr, !=, NULL); 1632275811Sdelphij hdrp = &fhdr->b_hash_next; 1633168404Spjd } 1634275811Sdelphij *hdrp = hdr->b_hash_next; 1635275811Sdelphij hdr->b_hash_next = NULL; 1636307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1637168404Spjd 1638168404Spjd /* collect some hash table performance data */ 1639168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1640168404Spjd 1641168404Spjd if (buf_hash_table.ht_table[idx] && 1642168404Spjd buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1643168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1644168404Spjd} 1645168404Spjd 1646168404Spjd/* 1647168404Spjd * Global data structures and functions for the buf kmem cache. 
1648168404Spjd */ 1649286570Smavstatic kmem_cache_t *hdr_full_cache; 1650286570Smavstatic kmem_cache_t *hdr_l2only_cache; 1651168404Spjdstatic kmem_cache_t *buf_cache; 1652168404Spjd 1653168404Spjdstatic void 1654168404Spjdbuf_fini(void) 1655168404Spjd{ 1656168404Spjd int i; 1657168404Spjd 1658168404Spjd kmem_free(buf_hash_table.ht_table, 1659168404Spjd (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1660168404Spjd for (i = 0; i < BUF_LOCKS; i++) 1661168404Spjd mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1662286570Smav kmem_cache_destroy(hdr_full_cache); 1663286570Smav kmem_cache_destroy(hdr_l2only_cache); 1664168404Spjd kmem_cache_destroy(buf_cache); 1665168404Spjd} 1666168404Spjd 1667168404Spjd/* 1668168404Spjd * Constructor callback - called when the cache is empty 1669168404Spjd * and a new buf is requested. 1670168404Spjd */ 1671168404Spjd/* ARGSUSED */ 1672168404Spjdstatic int 1673286570Smavhdr_full_cons(void *vbuf, void *unused, int kmflag) 1674168404Spjd{ 1675275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1676168404Spjd 1677286570Smav bzero(hdr, HDR_FULL_SIZE); 1678286570Smav cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1679286570Smav refcount_create(&hdr->b_l1hdr.b_refcnt); 1680286570Smav mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1681286763Smav multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1682286570Smav arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1683185029Spjd 1684168404Spjd return (0); 1685168404Spjd} 1686168404Spjd 1687185029Spjd/* ARGSUSED */ 1688185029Spjdstatic int 1689286570Smavhdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1690286570Smav{ 1691286570Smav arc_buf_hdr_t *hdr = vbuf; 1692286570Smav 1693286570Smav bzero(hdr, HDR_L2ONLY_SIZE); 1694286570Smav arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1695286570Smav 1696286570Smav return (0); 1697286570Smav} 1698286570Smav 1699286570Smav/* ARGSUSED */ 1700286570Smavstatic int 1701185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag) 1702185029Spjd{ 1703185029Spjd arc_buf_t *buf = vbuf; 1704185029Spjd 1705185029Spjd bzero(buf, sizeof (arc_buf_t)); 1706219089Spjd mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1707208373Smm arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1708208373Smm 1709185029Spjd return (0); 1710185029Spjd} 1711185029Spjd 1712168404Spjd/* 1713168404Spjd * Destructor callback - called when a cached buf is 1714168404Spjd * no longer required. 
1715168404Spjd */ 1716168404Spjd/* ARGSUSED */ 1717168404Spjdstatic void 1718286570Smavhdr_full_dest(void *vbuf, void *unused) 1719168404Spjd{ 1720275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1721168404Spjd 1722307265Smav ASSERT(HDR_EMPTY(hdr)); 1723286570Smav cv_destroy(&hdr->b_l1hdr.b_cv); 1724286570Smav refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1725286570Smav mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1726286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1727286570Smav arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1728168404Spjd} 1729168404Spjd 1730185029Spjd/* ARGSUSED */ 1731185029Spjdstatic void 1732286570Smavhdr_l2only_dest(void *vbuf, void *unused) 1733286570Smav{ 1734286570Smav arc_buf_hdr_t *hdr = vbuf; 1735286570Smav 1736307265Smav ASSERT(HDR_EMPTY(hdr)); 1737286570Smav arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1738286570Smav} 1739286570Smav 1740286570Smav/* ARGSUSED */ 1741286570Smavstatic void 1742185029Spjdbuf_dest(void *vbuf, void *unused) 1743185029Spjd{ 1744185029Spjd arc_buf_t *buf = vbuf; 1745185029Spjd 1746219089Spjd mutex_destroy(&buf->b_evict_lock); 1747208373Smm arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1748185029Spjd} 1749185029Spjd 1750168404Spjd/* 1751168404Spjd * Reclaim callback -- invoked when memory is low. 1752168404Spjd */ 1753168404Spjd/* ARGSUSED */ 1754168404Spjdstatic void 1755168404Spjdhdr_recl(void *unused) 1756168404Spjd{ 1757168404Spjd dprintf("hdr_recl called\n"); 1758168404Spjd /* 1759168404Spjd * umem calls the reclaim func when we destroy the buf cache, 1760168404Spjd * which is after we do arc_fini(). 1761168404Spjd */ 1762346686Smav if (arc_initialized) 1763346686Smav zthr_wakeup(arc_reap_zthr); 1764168404Spjd} 1765168404Spjd 1766168404Spjdstatic void 1767168404Spjdbuf_init(void) 1768168404Spjd{ 1769168404Spjd uint64_t *ct; 1770168404Spjd uint64_t hsize = 1ULL << 12; 1771168404Spjd int i, j; 1772168404Spjd 1773168404Spjd /* 1774168404Spjd * The hash table is big enough to fill all of physical memory 1775269230Sdelphij * with an average block size of zfs_arc_average_blocksize (default 8K). 1776269230Sdelphij * By default, the table will take up 1777269230Sdelphij * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
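 *
 * Worked example (illustrative, assuming 8-byte pointers and the
 * 8K default): with 16GB of physmem the sizing loop below stops at
 * hsize = 2^21, since 2^21 * 8K first reaches 16GB; the table is
 * then 2^21 * 8 bytes = 16MB, i.e. the 1MB per GB stated above.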
1778168404Spjd */ 1779269230Sdelphij while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1780168404Spjd hsize <<= 1; 1781168404Spjdretry: 1782168404Spjd buf_hash_table.ht_mask = hsize - 1; 1783168404Spjd buf_hash_table.ht_table = 1784168404Spjd kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1785168404Spjd if (buf_hash_table.ht_table == NULL) { 1786168404Spjd ASSERT(hsize > (1ULL << 8)); 1787168404Spjd hsize >>= 1; 1788168404Spjd goto retry; 1789168404Spjd } 1790168404Spjd 1791286570Smav hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1792286570Smav 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1793286570Smav hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1794286570Smav HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1795286570Smav NULL, NULL, 0); 1796168404Spjd buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1797185029Spjd 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1798168404Spjd 1799168404Spjd for (i = 0; i < 256; i++) 1800168404Spjd for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1801168404Spjd *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1802168404Spjd 1803168404Spjd for (i = 0; i < BUF_LOCKS; i++) { 1804168404Spjd mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1805168404Spjd NULL, MUTEX_DEFAULT, NULL); 1806168404Spjd } 1807168404Spjd} 1808168404Spjd 1809321535Smav/* 1810321535Smav * This is the size that the buf occupies in memory. If the buf is compressed, 1811321535Smav * it will correspond to the compressed size. You should use this method of 1812321535Smav * getting the buf size unless you explicitly need the logical size. 1813321535Smav */ 1814321535Smavint32_t 1815321535Smavarc_buf_size(arc_buf_t *buf) 1816321535Smav{ 1817321535Smav return (ARC_BUF_COMPRESSED(buf) ? 1818321535Smav HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); 1819321535Smav} 1820321535Smav 1821321535Smavint32_t 1822321535Smavarc_buf_lsize(arc_buf_t *buf) 1823321535Smav{ 1824321535Smav return (HDR_GET_LSIZE(buf->b_hdr)); 1825321535Smav} 1826321535Smav 1827321535Smavenum zio_compress 1828321535Smavarc_get_compression(arc_buf_t *buf) 1829321535Smav{ 1830321535Smav return (ARC_BUF_COMPRESSED(buf) ? 1831321535Smav HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); 1832321535Smav} 1833321535Smav 1834307265Smav#define ARC_MINTIME (hz>>4) /* 62 ms */ 1835307265Smav 1836307265Smavstatic inline boolean_t 1837307265Smavarc_buf_is_shared(arc_buf_t *buf) 1838286570Smav{ 1839307265Smav boolean_t shared = (buf->b_data != NULL && 1840321610Smav buf->b_hdr->b_l1hdr.b_pabd != NULL && 1841321610Smav abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && 1842321610Smav buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); 1843307265Smav IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); 1844321535Smav IMPLY(shared, ARC_BUF_SHARED(buf)); 1845321535Smav IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); 1846321535Smav 1847321535Smav /* 1848321535Smav * It would be nice to assert arc_can_share() too, but the "hdr isn't 1849321535Smav * already being shared" requirement prevents us from doing that. 1850321535Smav */ 1851321535Smav 1852307265Smav return (shared); 1853307265Smav} 1854286570Smav 1855321535Smav/* 1856321535Smav * Free the checksum associated with this header. If there is no checksum, this 1857321535Smav * is a no-op. 
1858321535Smav */ 1859307265Smavstatic inline void 1860307265Smavarc_cksum_free(arc_buf_hdr_t *hdr) 1861307265Smav{ 1862307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1863307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1864307265Smav if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1865307265Smav kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); 1866307265Smav hdr->b_l1hdr.b_freeze_cksum = NULL; 1867286570Smav } 1868307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1869286570Smav} 1870286570Smav 1871321535Smav/* 1872321535Smav * Return true iff at least one of the bufs on hdr is not compressed. 1873321535Smav */ 1874321535Smavstatic boolean_t 1875321535Smavarc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) 1876321535Smav{ 1877321535Smav for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { 1878321535Smav if (!ARC_BUF_COMPRESSED(b)) { 1879321535Smav return (B_TRUE); 1880321535Smav } 1881321535Smav } 1882321535Smav return (B_FALSE); 1883321535Smav} 1884321535Smav 1885321535Smav/* 1886321535Smav * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data 1887321535Smav * matches the checksum that is stored in the hdr. If there is no checksum, 1888321535Smav * or if the buf is compressed, this is a no-op. 1889321535Smav */ 1890168404Spjdstatic void 1891168404Spjdarc_cksum_verify(arc_buf_t *buf) 1892168404Spjd{ 1893307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 1894168404Spjd zio_cksum_t zc; 1895168404Spjd 1896168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1897168404Spjd return; 1898168404Spjd 1899321535Smav if (ARC_BUF_COMPRESSED(buf)) { 1900321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 1901321535Smav arc_hdr_has_uncompressed_buf(hdr)); 1902321535Smav return; 1903321535Smav } 1904321535Smav 1905307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1906307265Smav 1907307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1908307265Smav if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { 1909307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1910168404Spjd return; 1911168404Spjd } 1912321535Smav 1913321535Smav fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); 1914307265Smav if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) 1915168404Spjd panic("buffer modified while frozen!"); 1916307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1917168404Spjd} 1918168404Spjd 1919307265Smavstatic boolean_t 1920307265Smavarc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) 1921185029Spjd{ 1922307265Smav enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); 1923307265Smav boolean_t valid_cksum; 1924185029Spjd 1925307265Smav ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); 1926307265Smav VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); 1927185029Spjd 1928307265Smav /* 1929307265Smav * We rely on the blkptr's checksum to determine if the block 1930307265Smav * is valid or not. When compressed arc is enabled, the l2arc 1931307265Smav * writes the block to the l2arc just as it appears in the pool. 1932307265Smav * This allows us to use the blkptr's checksum to validate the 1933307265Smav * data that we just read off of the l2arc without having to store 1934307265Smav * a separate checksum in the arc_buf_hdr_t. However, if compressed 1935307265Smav * arc is disabled, then the data written to the l2arc is always 1936307265Smav * uncompressed and won't match the block as it exists in the main 1937307265Smav * pool. When this is the case, we must first compress it if it is 1938307265Smav * compressed on the main pool before we can validate the checksum. 
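 *
 * In outline, the code below does (a sketch, not a spec):
 *	if (!HDR_COMPRESSION_ENABLED(hdr) && bp compression != OFF)
 *		recompress zio->io_abd to psize, zero-pad the tail,
 *		and push the result as a transform;
 *	valid_cksum = (zio_checksum_error_impl(...) == 0);
 *	pop the transforms and return valid_cksum;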
1939307265Smav */ 1940307265Smav if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { 1941307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 1942307265Smav uint64_t lsize = HDR_GET_LSIZE(hdr); 1943307265Smav uint64_t csize; 1944307265Smav 1945329490Smav abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); 1946329490Smav csize = zio_compress_data(compress, zio->io_abd, 1947329490Smav abd_to_buf(cdata), lsize); 1948321610Smav 1949307265Smav ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); 1950307265Smav if (csize < HDR_GET_PSIZE(hdr)) { 1951307265Smav /* 1952307265Smav * Compressed blocks are always a multiple of the 1953307265Smav * smallest ashift in the pool. Ideally, we would 1954307265Smav * like to round up the csize to the next 1955307265Smav * spa_min_ashift but that value may have changed 1956307265Smav * since the block was last written. Instead, 1957307265Smav * we rely on the fact that the hdr's psize 1958307265Smav * was set to the psize of the block when it was 1959307265Smav * last written. We set the csize to that value 1960307265Smav * and zero out any part that should not contain 1961307265Smav * data. 1962307265Smav */ 1963329490Smav abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); 1964307265Smav csize = HDR_GET_PSIZE(hdr); 1965307265Smav } 1966329490Smav zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); 1967307265Smav } 1968307265Smav 1969307265Smav /* 1970307265Smav * Block pointers always store the checksum for the logical data. 1971307265Smav * If the block pointer has the gang bit set, then the checksum 1972307265Smav * it represents is for the reconstituted data and not for an 1973307265Smav * individual gang member. The zio pipeline, however, must be able to 1974307265Smav * determine the checksum of each of the gang constituents so it 1975307265Smav * treats the checksum comparison differently than what we need 1976307265Smav * for l2arc blocks. This prevents us from using the 1977307265Smav * zio_checksum_error() interface directly. Instead we must call the 1978307265Smav * zio_checksum_error_impl() so that we can ensure the checksum is 1979307265Smav * generated using the correct checksum algorithm and accounts for the 1980307265Smav * logical I/O size and not just a gang fragment. 1981307265Smav */ 1982307265Smav valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, 1983321610Smav BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, 1984307265Smav zio->io_offset, NULL) == 0); 1985307265Smav zio_pop_transforms(zio); 1986307265Smav return (valid_cksum); 1987185029Spjd} 1988185029Spjd 1989321535Smav/* 1990321535Smav * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a 1991321535Smav * checksum and attaches it to the buf's hdr so that we can ensure that the buf 1992321535Smav * isn't modified later on. If buf is compressed or there is already a checksum 1993321535Smav * on the hdr, this is a no-op (we only checksum uncompressed bufs). 
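 *
 * Typical lifecycle under ZFS_DEBUG_MODIFY (illustrative):
 *	arc_buf_freeze(buf);	computes and stores b_freeze_cksum
 *	... buf contents must stay stable while frozen ...
 *	arc_buf_thaw(buf);	verifies the checksum, then frees it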
1994321535Smav */ 1995168404Spjdstatic void 1996307265Smavarc_cksum_compute(arc_buf_t *buf) 1997168404Spjd{ 1998307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 1999307265Smav 2000307265Smav if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 2001168404Spjd return; 2002168404Spjd 2003307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2004321535Smav 2005286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 2006307265Smav if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 2007321535Smav ASSERT(arc_hdr_has_uncompressed_buf(hdr)); 2008307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2009168404Spjd return; 2010321535Smav } else if (ARC_BUF_COMPRESSED(buf)) { 2011321535Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2012321535Smav return; 2013168404Spjd } 2014321535Smav 2015321535Smav ASSERT(!ARC_BUF_COMPRESSED(buf)); 2016307265Smav hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), 2017307265Smav KM_SLEEP); 2018321535Smav fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, 2019307265Smav hdr->b_l1hdr.b_freeze_cksum); 2020307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2021240133Smm#ifdef illumos 2022240133Smm arc_buf_watch(buf); 2023277300Ssmh#endif 2024168404Spjd} 2025168404Spjd 2026240133Smm#ifdef illumos 2027240133Smm#ifndef _KERNEL 2028240133Smmtypedef struct procctl { 2029240133Smm long cmd; 2030240133Smm prwatch_t prwatch; 2031240133Smm} procctl_t; 2032240133Smm#endif 2033240133Smm 2034240133Smm/* ARGSUSED */ 2035240133Smmstatic void 2036240133Smmarc_buf_unwatch(arc_buf_t *buf) 2037240133Smm{ 2038240133Smm#ifndef _KERNEL 2039240133Smm if (arc_watch) { 2040240133Smm int result; 2041240133Smm procctl_t ctl; 2042240133Smm ctl.cmd = PCWATCH; 2043240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 2044240133Smm ctl.prwatch.pr_size = 0; 2045240133Smm ctl.prwatch.pr_wflags = 0; 2046240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 2047240133Smm ASSERT3U(result, ==, sizeof (ctl)); 2048240133Smm } 2049240133Smm#endif 2050240133Smm} 2051240133Smm 2052240133Smm/* ARGSUSED */ 2053240133Smmstatic void 2054240133Smmarc_buf_watch(arc_buf_t *buf) 2055240133Smm{ 2056240133Smm#ifndef _KERNEL 2057240133Smm if (arc_watch) { 2058240133Smm int result; 2059240133Smm procctl_t ctl; 2060240133Smm ctl.cmd = PCWATCH; 2061240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 2062321535Smav ctl.prwatch.pr_size = arc_buf_size(buf); 2063240133Smm ctl.prwatch.pr_wflags = WA_WRITE; 2064240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 2065240133Smm ASSERT3U(result, ==, sizeof (ctl)); 2066240133Smm } 2067240133Smm#endif 2068240133Smm} 2069240133Smm#endif /* illumos */ 2070240133Smm 2071286570Smavstatic arc_buf_contents_t 2072286570Smavarc_buf_type(arc_buf_hdr_t *hdr) 2073286570Smav{ 2074307265Smav arc_buf_contents_t type; 2075286570Smav if (HDR_ISTYPE_METADATA(hdr)) { 2076307265Smav type = ARC_BUFC_METADATA; 2077286570Smav } else { 2078307265Smav type = ARC_BUFC_DATA; 2079286570Smav } 2080307265Smav VERIFY3U(hdr->b_type, ==, type); 2081307265Smav return (type); 2082286570Smav} 2083286570Smav 2084321535Smavboolean_t 2085321535Smavarc_is_metadata(arc_buf_t *buf) 2086321535Smav{ 2087321535Smav return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); 2088321535Smav} 2089321535Smav 2090286570Smavstatic uint32_t 2091286570Smavarc_bufc_to_flags(arc_buf_contents_t type) 2092286570Smav{ 2093286570Smav switch (type) { 2094286570Smav case ARC_BUFC_DATA: 2095286570Smav /* metadata field is 0 if buffer contains normal data */ 2096286570Smav return (0); 2097286570Smav case ARC_BUFC_METADATA: 2098286570Smav return 
(ARC_FLAG_BUFC_METADATA); 2099286570Smav default: 2100286570Smav break; 2101286570Smav } 2102286570Smav panic("undefined ARC buffer type!"); 2103286570Smav return ((uint32_t)-1); 2104286570Smav} 2105286570Smav 2106168404Spjdvoid 2107168404Spjdarc_buf_thaw(arc_buf_t *buf) 2108168404Spjd{ 2109307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2110307265Smav 2111321535Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2112321535Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2113321535Smav 2114321535Smav arc_cksum_verify(buf); 2115321535Smav 2116321535Smav /* 2117321535Smav * Compressed buffers do not manipulate the b_freeze_cksum or 2118321535Smav * allocate b_thawed. 2119321535Smav */ 2120321535Smav if (ARC_BUF_COMPRESSED(buf)) { 2121321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2122321535Smav arc_hdr_has_uncompressed_buf(hdr)); 2123321535Smav return; 2124185029Spjd } 2125168404Spjd 2126307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2127307265Smav arc_cksum_free(hdr); 2128219089Spjd 2129307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 2130286570Smav#ifdef ZFS_DEBUG 2131219089Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 2132307265Smav if (hdr->b_l1hdr.b_thawed != NULL) 2133307265Smav kmem_free(hdr->b_l1hdr.b_thawed, 1); 2134307265Smav hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 2135219089Spjd } 2136286570Smav#endif 2137219089Spjd 2138307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2139240133Smm 2140240133Smm#ifdef illumos 2141240133Smm arc_buf_unwatch(buf); 2142277300Ssmh#endif 2143168404Spjd} 2144168404Spjd 2145168404Spjdvoid 2146168404Spjdarc_buf_freeze(arc_buf_t *buf) 2147168404Spjd{ 2148307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2149219089Spjd kmutex_t *hash_lock; 2150219089Spjd 2151168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 2152168404Spjd return; 2153168404Spjd 2154321535Smav if (ARC_BUF_COMPRESSED(buf)) { 2155321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2156321535Smav arc_hdr_has_uncompressed_buf(hdr)); 2157321535Smav return; 2158321535Smav } 2159321535Smav 2160307265Smav hash_lock = HDR_LOCK(hdr); 2161219089Spjd mutex_enter(hash_lock); 2162219089Spjd 2163307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2164307265Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || 2165307265Smav hdr->b_l1hdr.b_state == arc_anon); 2166307265Smav arc_cksum_compute(buf); 2167219089Spjd mutex_exit(hash_lock); 2168168404Spjd} 2169168404Spjd 2170307265Smav/* 2171307265Smav * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, 2172307265Smav * the following functions should be used to ensure that the flags are 2173307265Smav * updated in a thread-safe way. When manipulating the flags either 2174307265Smav * the hash_lock must be held or the hdr must be undiscoverable. This 2175307265Smav * ensures that we're not racing with any other threads when updating 2176307265Smav * the flags. 
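 *
 * For example (a sketch, not taken from a real call site):
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *	mutex_enter(hash_lock);
 *	arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 *	mutex_exit(hash_lock);
 * A freshly allocated hdr that has not been hashed yet satisfies
 * HDR_EMPTY() and may have its flags changed without the lock.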
2177307265Smav */ 2178307265Smavstatic inline void 2179307265Smavarc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2180307265Smav{ 2181307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2182307265Smav hdr->b_flags |= flags; 2183307265Smav} 2184307265Smav 2185307265Smavstatic inline void 2186307265Smavarc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2187307265Smav{ 2188307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2189307265Smav hdr->b_flags &= ~flags; 2190307265Smav} 2191307265Smav 2192307265Smav/* 2193307265Smav * Setting the compression bits in the arc_buf_hdr_t's b_flags is 2194307265Smav * done in a special way since we have to clear and set bits 2195307265Smav * at the same time. Consumers that wish to set the compression bits 2196307265Smav * must use this function to ensure that the flags are updated in a 2197307265Smav * thread-safe manner. 2198307265Smav */ 2199168404Spjdstatic void 2200307265Smavarc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) 2201168404Spjd{ 2202307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2203307265Smav 2204307265Smav /* 2205307265Smav * Holes and embedded blocks will always have a psize = 0 so 2206307265Smav * we ignore the compression of the blkptr and set the 2207307265Smav * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. 2208307265Smav * Holes and embedded blocks remain anonymous so we don't 2209307265Smav * want to uncompress them. Mark them as uncompressed. 2210307265Smav */ 2211307265Smav if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { 2212307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2213307265Smav HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); 2214307265Smav ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); 2215307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 2216307265Smav } else { 2217307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2218307265Smav HDR_SET_COMPRESS(hdr, cmp); 2219307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); 2220307265Smav ASSERT(HDR_COMPRESSION_ENABLED(hdr)); 2221307265Smav } 2222307265Smav} 2223307265Smav 2224321535Smav/* 2225321535Smav * Looks for another buf on the same hdr which has the data decompressed, copies 2226321535Smav * from it, and returns true. If no such buf exists, returns false. 2227321535Smav */ 2228321535Smavstatic boolean_t 2229321535Smavarc_buf_try_copy_decompressed_data(arc_buf_t *buf) 2230321535Smav{ 2231321535Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2232321535Smav boolean_t copied = B_FALSE; 2233321535Smav 2234321535Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2235321535Smav ASSERT3P(buf->b_data, !=, NULL); 2236321535Smav ASSERT(!ARC_BUF_COMPRESSED(buf)); 2237321535Smav 2238321535Smav for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; 2239321535Smav from = from->b_next) { 2240321535Smav /* can't use our own data buffer */ 2241321535Smav if (from == buf) { 2242321535Smav continue; 2243321535Smav } 2244321535Smav 2245321535Smav if (!ARC_BUF_COMPRESSED(from)) { 2246321535Smav bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); 2247321535Smav copied = B_TRUE; 2248321535Smav break; 2249321535Smav } 2250321535Smav } 2251321535Smav 2252321535Smav /* 2253321535Smav * There were no decompressed bufs, so there should not be a 2254321535Smav * checksum on the hdr either.
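 * (b_freeze_cksum is only ever computed over uncompressed contents
 * and is freed once the last uncompressed buf goes away, so the two
 * conditions below should track each other.)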
2255321535Smav */ 2256321535Smav EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); 2257321535Smav 2258321535Smav return (copied); 2259321535Smav} 2260321535Smav 2261321535Smav/* 2262321535Smav * Given a buf that has a data buffer attached to it, this function will 2263321535Smav * efficiently fill the buf with data of the specified compression setting from 2264321535Smav * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr 2265321535Smav * are already sharing a data buf, no copy is performed. 2266321535Smav * 2267321535Smav * If the buf is marked as compressed but uncompressed data was requested, this 2268321535Smav * will allocate a new data buffer for the buf, remove that flag, and fill the 2269321535Smav * buf with uncompressed data. You can't request a compressed buf on a hdr with 2270321535Smav * uncompressed data, and (since we haven't added support for it yet) if you 2271321535Smav * want compressed data your buf must already be marked as compressed and have 2272321535Smav * the correct-sized data buffer. 2273321535Smav */ 2274307265Smavstatic int 2275321535Smavarc_buf_fill(arc_buf_t *buf, boolean_t compressed) 2276307265Smav{ 2277307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2278321535Smav boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 2279307265Smav dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; 2280307265Smav 2281321535Smav ASSERT3P(buf->b_data, !=, NULL); 2282321535Smav IMPLY(compressed, hdr_compressed); 2283321535Smav IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); 2284321535Smav 2285321535Smav if (hdr_compressed == compressed) { 2286321535Smav if (!arc_buf_is_shared(buf)) { 2287321610Smav abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, 2288321535Smav arc_buf_size(buf)); 2289321535Smav } 2290321535Smav } else { 2291321535Smav ASSERT(hdr_compressed); 2292321535Smav ASSERT(!compressed); 2293321535Smav ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); 2294321535Smav 2295307265Smav /* 2296321535Smav * If the buf is sharing its data with the hdr, unlink it and 2297321535Smav * allocate a new data buffer for the buf. 2298307265Smav */ 2299321535Smav if (arc_buf_is_shared(buf)) { 2300321535Smav ASSERT(ARC_BUF_COMPRESSED(buf)); 2301321535Smav 2302321535Smav /* We need to give the buf its own b_data */ 2303321535Smav buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 2304321535Smav buf->b_data = 2305321535Smav arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2306321535Smav arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 2307321535Smav 2308321535Smav /* Previously overhead was 0; just add new overhead */ 2309321535Smav ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); 2310321535Smav } else if (ARC_BUF_COMPRESSED(buf)) { 2311321535Smav /* We need to reallocate the buf's b_data */ 2312321535Smav arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), 2313321535Smav buf); 2314321535Smav buf->b_data = 2315321535Smav arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2316321535Smav 2317321535Smav /* We increased the size of b_data; update overhead */ 2318321535Smav ARCSTAT_INCR(arcstat_overhead_size, 2319321535Smav HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); 2320307265Smav } 2321321535Smav 2322321535Smav /* 2323321535Smav * Regardless of the buf's previous compression settings, it 2324321535Smav * should not be compressed at the end of this function.
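 * (Both branches above leave buf->b_data pointing at a private,
 * lsize-byte allocation, so dropping the flag here is safe.)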
2325321535Smav */ 2326321535Smav buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 2327321535Smav 2328321535Smav /* 2329321535Smav * Try copying the data from another buf which already has a 2330321535Smav * decompressed version. If that's not possible, it's time to 2331321535Smav * bite the bullet and decompress the data from the hdr. 2332321535Smav */ 2333321535Smav if (arc_buf_try_copy_decompressed_data(buf)) { 2334321535Smav /* Skip byteswapping and checksumming (already done) */ 2335321535Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); 2336321535Smav return (0); 2337321535Smav } else { 2338321535Smav int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), 2339321610Smav hdr->b_l1hdr.b_pabd, buf->b_data, 2340321535Smav HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2341321535Smav 2342321535Smav /* 2343321535Smav * Absent hardware errors or software bugs, this should 2344321535Smav * be impossible, but log it anyway so we can debug it. 2345321535Smav */ 2346321535Smav if (error != 0) { 2347321535Smav zfs_dbgmsg( 2348321535Smav "hdr %p, compress %d, psize %d, lsize %d", 2349321535Smav hdr, HDR_GET_COMPRESS(hdr), 2350321535Smav HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2351321535Smav return (SET_ERROR(EIO)); 2352321535Smav } 2353321535Smav } 2354307265Smav } 2355321535Smav 2356321535Smav /* Byteswap the buf's data if necessary */ 2357307265Smav if (bswap != DMU_BSWAP_NUMFUNCS) { 2358307265Smav ASSERT(!HDR_SHARED_DATA(hdr)); 2359307265Smav ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); 2360307265Smav dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); 2361307265Smav } 2362321535Smav 2363321535Smav /* Compute the hdr's checksum if necessary */ 2364307265Smav arc_cksum_compute(buf); 2365321535Smav 2366307265Smav return (0); 2367307265Smav} 2368307265Smav 2369321535Smavint 2370321535Smavarc_decompress(arc_buf_t *buf) 2371321535Smav{ 2372321535Smav return (arc_buf_fill(buf, B_FALSE)); 2373321535Smav} 2374321535Smav 2375307265Smav/* 2376321610Smav * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 2377307265Smav */ 2378307265Smavstatic uint64_t 2379307265Smavarc_hdr_size(arc_buf_hdr_t *hdr) 2380307265Smav{ 2381307265Smav uint64_t size; 2382307265Smav 2383307265Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 2384307265Smav HDR_GET_PSIZE(hdr) > 0) { 2385307265Smav size = HDR_GET_PSIZE(hdr); 2386307265Smav } else { 2387307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); 2388307265Smav size = HDR_GET_LSIZE(hdr); 2389307265Smav } 2390307265Smav return (size); 2391307265Smav} 2392307265Smav 2393307265Smav/* 2394307265Smav * Increment the amount of evictable space in the arc_state_t's refcount. 2395307265Smav * We account for the space used by the hdr and the arc buf individually 2396307265Smav * so that we can add and remove them from the refcount individually. 
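 *
 * For example (illustrative): a hdr with b_pabd and two bufs, one of
 * them shared with the hdr, contributes arc_hdr_size(hdr) plus the
 * arc_buf_size() of the one unshared buf; the shared buf is skipped
 * because its bytes are already counted through b_pabd.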
2397307265Smav */ 2398307265Smavstatic void 2399307265Smavarc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) 2400307265Smav{ 2401307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 2402307265Smav 2403286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2404307265Smav 2405307265Smav if (GHOST_STATE(state)) { 2406307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 2407307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2408321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2409321535Smav (void) refcount_add_many(&state->arcs_esize[type], 2410321535Smav HDR_GET_LSIZE(hdr), hdr); 2411307265Smav return; 2412307265Smav } 2413307265Smav 2414307265Smav ASSERT(!GHOST_STATE(state)); 2415321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 2416307265Smav (void) refcount_add_many(&state->arcs_esize[type], 2417307265Smav arc_hdr_size(hdr), hdr); 2418307265Smav } 2419307265Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2420307265Smav buf = buf->b_next) { 2421321535Smav if (arc_buf_is_shared(buf)) 2422307265Smav continue; 2423321535Smav (void) refcount_add_many(&state->arcs_esize[type], 2424321535Smav arc_buf_size(buf), buf); 2425307265Smav } 2426307265Smav} 2427307265Smav 2428307265Smav/* 2429307265Smav * Decrement the amount of evictable space in the arc_state_t's refcount. 2430307265Smav * We account for the space used by the hdr and the arc buf individually 2431307265Smav * so that we can add and remove them from the refcount individually. 2432307265Smav */ 2433307265Smavstatic void 2434321535Smavarc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) 2435307265Smav{ 2436307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 2437307265Smav 2438307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2439307265Smav 2440307265Smav if (GHOST_STATE(state)) { 2441307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 2442307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2443321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2444307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2445321535Smav HDR_GET_LSIZE(hdr), hdr); 2446307265Smav return; 2447307265Smav } 2448307265Smav 2449307265Smav ASSERT(!GHOST_STATE(state)); 2450321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 2451307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2452307265Smav arc_hdr_size(hdr), hdr); 2453307265Smav } 2454307265Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2455307265Smav buf = buf->b_next) { 2456321535Smav if (arc_buf_is_shared(buf)) 2457307265Smav continue; 2458307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2459321535Smav arc_buf_size(buf), buf); 2460307265Smav } 2461307265Smav} 2462307265Smav 2463307265Smav/* 2464307265Smav * Add a reference to this hdr indicating that someone is actively 2465307265Smav * referencing that memory. When the refcount transitions from 0 to 1, 2466307265Smav * we remove it from the respective arc_state_t list to indicate that 2467307265Smav * it is not evictable. 
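 *
 * Concretely (illustrative): the first add_reference() on an arc_mru
 * hdr removes it from the mru multilist and subtracts it from
 * arcs_esize, so the eviction code can no longer see it; the matching
 * remove_reference() reverses both steps.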
2468307265Smav */ 2469307265Smavstatic void 2470307265Smavadd_reference(arc_buf_hdr_t *hdr, void *tag) 2471307265Smav{ 2472307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2473307265Smav if (!MUTEX_HELD(HDR_LOCK(hdr))) { 2474307265Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 2475307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2476307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2477307265Smav } 2478307265Smav 2479286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 2480168404Spjd 2481286570Smav if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 2482286570Smav (state != arc_anon)) { 2483286570Smav /* We don't use the L2-only state list. */ 2484286570Smav if (state != arc_l2c_only) { 2485321553Smav multilist_remove(state->arcs_list[arc_buf_type(hdr)], 2486307265Smav hdr); 2487321535Smav arc_evictable_space_decrement(hdr, state); 2488168404Spjd } 2489185029Spjd /* remove the prefetch flag if we get a reference */ 2490307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 2491168404Spjd } 2492168404Spjd} 2493168404Spjd 2494307265Smav/* 2495307265Smav * Remove a reference from this hdr. When the reference transitions from 2496307265Smav * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's 2497307265Smav * list making it eligible for eviction. 2498307265Smav */ 2499168404Spjdstatic int 2500275811Sdelphijremove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 2501168404Spjd{ 2502168404Spjd int cnt; 2503286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 2504168404Spjd 2505286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2506168404Spjd ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 2507168404Spjd ASSERT(!GHOST_STATE(state)); 2508168404Spjd 2509286570Smav /* 2510286570Smav * arc_l2c_only counts as a ghost state so we don't need to explicitly 2511286570Smav * check to prevent usage of the arc_l2c_only list. 2512286570Smav */ 2513286570Smav if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 2514168404Spjd (state != arc_anon)) { 2515321553Smav multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); 2516307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 2517307265Smav arc_evictable_space_increment(hdr, state); 2518168404Spjd } 2519168404Spjd return (cnt); 2520168404Spjd} 2521168404Spjd 2522168404Spjd/* 2523286763Smav * Move the supplied buffer to the indicated state. The hash lock 2524168404Spjd * for the buffer must be held by the caller. 2525168404Spjd */ 2526168404Spjdstatic void 2527275811Sdelphijarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 2528275811Sdelphij kmutex_t *hash_lock) 2529168404Spjd{ 2530286570Smav arc_state_t *old_state; 2531286570Smav int64_t refcnt; 2532307265Smav uint32_t bufcnt; 2533307265Smav boolean_t update_old, update_new; 2534286570Smav arc_buf_contents_t buftype = arc_buf_type(hdr); 2535168404Spjd 2536286570Smav /* 2537286570Smav * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 2538286570Smav * in arc_read() when bringing a buffer out of the L2ARC. However, the 2539286570Smav * L1 hdr doesn't always exist when we change state to arc_anon before 2540286570Smav * destroying a header, in which case reallocating to add the L1 hdr is 2541286570Smav * pointless. 
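 * (For instance, the read path turns an L2-only hit into a full
 * header with arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache)
 * before moving it out of arc_l2c_only here; illustrative sketch of
 * one such caller.)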
2542286570Smav */ 2543286570Smav if (HDR_HAS_L1HDR(hdr)) { 2544286570Smav old_state = hdr->b_l1hdr.b_state; 2545286570Smav refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 2546307265Smav bufcnt = hdr->b_l1hdr.b_bufcnt; 2547321610Smav update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); 2548286570Smav } else { 2549286570Smav old_state = arc_l2c_only; 2550286570Smav refcnt = 0; 2551307265Smav bufcnt = 0; 2552307265Smav update_old = B_FALSE; 2553286570Smav } 2554307265Smav update_new = update_old; 2555286570Smav 2556168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 2557258632Savg ASSERT3P(new_state, !=, old_state); 2558307265Smav ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); 2559307265Smav ASSERT(old_state != arc_anon || bufcnt <= 1); 2560168404Spjd 2561168404Spjd /* 2562168404Spjd * If this buffer is evictable, transfer it from the 2563168404Spjd * old state list to the new state list. 2564168404Spjd */ 2565168404Spjd if (refcnt == 0) { 2566286570Smav if (old_state != arc_anon && old_state != arc_l2c_only) { 2567286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2568321553Smav multilist_remove(old_state->arcs_list[buftype], hdr); 2569168404Spjd 2570307265Smav if (GHOST_STATE(old_state)) { 2571307265Smav ASSERT0(bufcnt); 2572307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2573307265Smav update_old = B_TRUE; 2574168404Spjd } 2575321535Smav arc_evictable_space_decrement(hdr, old_state); 2576168404Spjd } 2577286570Smav if (new_state != arc_anon && new_state != arc_l2c_only) { 2578168404Spjd 2579286570Smav /* 2580286570Smav * An L1 header always exists here, since if we're 2581286570Smav * moving to some L1-cached state (i.e. not l2c_only or 2582286570Smav * anonymous), we realloc the header to add an L1hdr 2583286570Smav * beforehand. 2584286570Smav */ 2585286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2586321553Smav multilist_insert(new_state->arcs_list[buftype], hdr); 2587168404Spjd 2588168404Spjd if (GHOST_STATE(new_state)) { 2589307265Smav ASSERT0(bufcnt); 2590307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2591307265Smav update_new = B_TRUE; 2592168404Spjd } 2593307265Smav arc_evictable_space_increment(hdr, new_state); 2594168404Spjd } 2595168404Spjd } 2596168404Spjd 2597307265Smav ASSERT(!HDR_EMPTY(hdr)); 2598275811Sdelphij if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 2599275811Sdelphij buf_hash_remove(hdr); 2600168404Spjd 2601286570Smav /* adjust state sizes (ignore arc_l2c_only) */ 2602286766Smav 2603307265Smav if (update_new && new_state != arc_l2c_only) { 2604286766Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2605286766Smav if (GHOST_STATE(new_state)) { 2606307265Smav ASSERT0(bufcnt); 2607286766Smav 2608286766Smav /* 2609307265Smav * When moving a header to a ghost state, we first 2610286766Smav * remove all arc buffers. Thus, we'll have a 2611307265Smav * bufcnt of zero, and no arc buffer to use for 2612286766Smav * the reference. As a result, we use the arc 2613286766Smav * header pointer for the reference. 2614286766Smav */ 2615286766Smav (void) refcount_add_many(&new_state->arcs_size, 2616307265Smav HDR_GET_LSIZE(hdr), hdr); 2617321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2618286766Smav } else { 2619307265Smav uint32_t buffers = 0; 2620286766Smav 2621286766Smav /* 2622286766Smav * Each individual buffer holds a unique reference, 2623286766Smav * thus we must remove each of these references one 2624286766Smav * at a time. 
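 *
 * Stated as an informal invariant (not asserted in this exact form
 * anywhere): for a non-ghost state,
 *
 *	arcs_size contribution of a hdr =
 *	    arc_hdr_size(hdr)		(if b_pabd != NULL)
 *	  + sum of arc_buf_size(buf)	(over non-shared bufs only)
 *
 * A shared buf's bytes are already covered by the hdr's b_pabd entry,
 * which is why the loop below skips shared bufs.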
2625286766Smav */ 2626286766Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2627286766Smav buf = buf->b_next) { 2628307265Smav ASSERT3U(bufcnt, !=, 0); 2629307265Smav buffers++; 2630307265Smav 2631307265Smav /* 2632307265Smav * When the arc_buf_t is sharing the data 2633307265Smav * block with the hdr, the owner of the 2634307265Smav * reference belongs to the hdr. Only 2635307265Smav * add to the refcount if the arc_buf_t is 2636307265Smav * not shared. 2637307265Smav */ 2638321535Smav if (arc_buf_is_shared(buf)) 2639307265Smav continue; 2640307265Smav 2641286766Smav (void) refcount_add_many(&new_state->arcs_size, 2642321535Smav arc_buf_size(buf), buf); 2643286766Smav } 2644307265Smav ASSERT3U(bufcnt, ==, buffers); 2645307265Smav 2646321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 2647307265Smav (void) refcount_add_many(&new_state->arcs_size, 2648307265Smav arc_hdr_size(hdr), hdr); 2649307265Smav } else { 2650307265Smav ASSERT(GHOST_STATE(old_state)); 2651307265Smav } 2652286766Smav } 2653286766Smav } 2654286766Smav 2655307265Smav if (update_old && old_state != arc_l2c_only) { 2656286766Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2657286766Smav if (GHOST_STATE(old_state)) { 2658307265Smav ASSERT0(bufcnt); 2659321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2660307265Smav 2661286766Smav /* 2662286766Smav * When moving a header off of a ghost state, 2663307265Smav * the header will not contain any arc buffers. 2664307265Smav * We use the arc header pointer for the reference 2665307265Smav * which is exactly what we did when we put the 2666307265Smav * header on the ghost state. 2667286766Smav */ 2668286766Smav 2669286766Smav (void) refcount_remove_many(&old_state->arcs_size, 2670307265Smav HDR_GET_LSIZE(hdr), hdr); 2671286766Smav } else { 2672307265Smav uint32_t buffers = 0; 2673286766Smav 2674286766Smav /* 2675286766Smav * Each individual buffer holds a unique reference, 2676286766Smav * thus we must remove each of these references one 2677286766Smav * at a time. 2678286766Smav */ 2679286766Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2680286766Smav buf = buf->b_next) { 2681321535Smav ASSERT3U(bufcnt, !=, 0); 2682307265Smav buffers++; 2683307265Smav 2684307265Smav /* 2685307265Smav * When the arc_buf_t is sharing the data 2686307265Smav * block with the hdr, the owner of the 2687307265Smav * reference belongs to the hdr. Only 2688307265Smav * add to the refcount if the arc_buf_t is 2689307265Smav * not shared. 2690307265Smav */ 2691321535Smav if (arc_buf_is_shared(buf)) 2692307265Smav continue; 2693307265Smav 2694286766Smav (void) refcount_remove_many( 2695321535Smav &old_state->arcs_size, arc_buf_size(buf), 2696307265Smav buf); 2697286766Smav } 2698307265Smav ASSERT3U(bufcnt, ==, buffers); 2699321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2700307265Smav (void) refcount_remove_many( 2701307265Smav &old_state->arcs_size, arc_hdr_size(hdr), hdr); 2702286766Smav } 2703168404Spjd } 2704286766Smav 2705286570Smav if (HDR_HAS_L1HDR(hdr)) 2706286570Smav hdr->b_l1hdr.b_state = new_state; 2707185029Spjd 2708286570Smav /* 2709286570Smav * L2 headers should never be on the L2 state list since they don't 2710286570Smav * have L1 headers allocated. 
2711286570Smav */ 2712321553Smav ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2713321553Smav multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2714168404Spjd} 2715168404Spjd 2716185029Spjdvoid 2717208373Smmarc_space_consume(uint64_t space, arc_space_type_t type) 2718185029Spjd{ 2719208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2720208373Smm 2721208373Smm switch (type) { 2722208373Smm case ARC_SPACE_DATA: 2723332540Smav aggsum_add(&astat_data_size, space); 2724208373Smm break; 2725286574Smav case ARC_SPACE_META: 2726332540Smav aggsum_add(&astat_metadata_size, space); 2727286574Smav break; 2728208373Smm case ARC_SPACE_OTHER: 2729332540Smav aggsum_add(&astat_other_size, space); 2730208373Smm break; 2731208373Smm case ARC_SPACE_HDRS: 2732332540Smav aggsum_add(&astat_hdr_size, space); 2733208373Smm break; 2734208373Smm case ARC_SPACE_L2HDRS: 2735332540Smav aggsum_add(&astat_l2_hdr_size, space); 2736208373Smm break; 2737208373Smm } 2738208373Smm 2739286574Smav if (type != ARC_SPACE_DATA) 2740332540Smav aggsum_add(&arc_meta_used, space); 2741286574Smav 2742332540Smav aggsum_add(&arc_size, space); 2743185029Spjd} 2744185029Spjd 2745185029Spjdvoid 2746208373Smmarc_space_return(uint64_t space, arc_space_type_t type) 2747185029Spjd{ 2748208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2749208373Smm 2750208373Smm switch (type) { 2751208373Smm case ARC_SPACE_DATA: 2752332540Smav aggsum_add(&astat_data_size, -space); 2753208373Smm break; 2754286574Smav case ARC_SPACE_META: 2755332540Smav aggsum_add(&astat_metadata_size, -space); 2756286574Smav break; 2757208373Smm case ARC_SPACE_OTHER: 2758332540Smav aggsum_add(&astat_other_size, -space); 2759208373Smm break; 2760208373Smm case ARC_SPACE_HDRS: 2761332540Smav aggsum_add(&astat_hdr_size, -space); 2762208373Smm break; 2763208373Smm case ARC_SPACE_L2HDRS: 2764332540Smav aggsum_add(&astat_l2_hdr_size, -space); 2765208373Smm break; 2766208373Smm } 2767208373Smm 2768286574Smav if (type != ARC_SPACE_DATA) { 2769332540Smav ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); 2770332540Smav /* 2771332540Smav * We use the upper bound here rather than the precise value 2772332540Smav * because the arc_meta_max value doesn't need to be 2773332540Smav * precise. It's only consumed by humans via arcstats. 2774332540Smav */ 2775332540Smav if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) 2776332540Smav arc_meta_max = aggsum_upper_bound(&arc_meta_used); 2777332540Smav aggsum_add(&arc_meta_used, -space); 2778286574Smav } 2779286574Smav 2780332540Smav ASSERT(aggsum_compare(&arc_size, space) >= 0); 2781332540Smav aggsum_add(&arc_size, -space); 2782185029Spjd} 2783185029Spjd 2784307265Smav/* 2785321535Smav * Given a hdr and a buf, returns whether that buf can share its b_data buffer 2786321610Smav * with the hdr's b_pabd. 2787307265Smav */ 2788321535Smavstatic boolean_t 2789321535Smavarc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2790168404Spjd{ 2791321535Smav /* 2792321535Smav * The criteria for sharing a hdr's data are: 2793321535Smav * 1. the hdr's compression matches the buf's compression 2794321535Smav * 2. the hdr doesn't need to be byteswapped 2795321535Smav * 3. the hdr isn't already being shared 2796321535Smav * 4. the buf is either compressed or it is the last buf in the hdr list 2797321535Smav * 2798321535Smav * Criterion #4 maintains the invariant that shared uncompressed 2799321535Smav * bufs must be the final buf in the hdr's b_buf list. 
Reading this, you 2800321535Smav * might ask, "if a compressed buf is allocated first, won't that be the 2801321535Smav * last thing in the list?", but in that case it's impossible to create 2802321535Smav * a shared uncompressed buf anyway (because the hdr must be compressed 2803321535Smav * to have the compressed buf). You might also think that #3 is 2804321535Smav * sufficient to make this guarantee, however it's possible 2805321535Smav * (specifically in the rare L2ARC write race mentioned in 2806321535Smav * arc_buf_alloc_impl()) there will be an existing uncompressed buf that 2807321535Smav * is sharable, but wasn't at the time of its allocation. Rather than 2808321535Smav * allow a new shared uncompressed buf to be created and then shuffle 2809321535Smav * the list around to make it the last element, this simply disallows 2810321535Smav * sharing if the new buf isn't the first to be added. 2811321535Smav */ 2812321535Smav ASSERT3P(buf->b_hdr, ==, hdr); 2813321535Smav boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; 2814321535Smav boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; 2815321535Smav return (buf_compressed == hdr_compressed && 2816321535Smav hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && 2817321535Smav !HDR_SHARED_DATA(hdr) && 2818321535Smav (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); 2819321535Smav} 2820321535Smav 2821321535Smav/* 2822321535Smav * Allocate a buf for this hdr. If you care about the data that's in the hdr, 2823321535Smav * or if you want a compressed buffer, pass those flags in. Returns 0 if the 2824321535Smav * copy was made successfully, or an error code otherwise. 2825321535Smav */ 2826321535Smavstatic int 2827321535Smavarc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, 2828321535Smav boolean_t fill, arc_buf_t **ret) 2829321535Smav{ 2830168404Spjd arc_buf_t *buf; 2831168404Spjd 2832307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2833307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 2834307265Smav VERIFY(hdr->b_type == ARC_BUFC_DATA || 2835307265Smav hdr->b_type == ARC_BUFC_METADATA); 2836321535Smav ASSERT3P(ret, !=, NULL); 2837321535Smav ASSERT3P(*ret, ==, NULL); 2838286570Smav 2839321535Smav buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2840168404Spjd buf->b_hdr = hdr; 2841168404Spjd buf->b_data = NULL; 2842321535Smav buf->b_next = hdr->b_l1hdr.b_buf; 2843321535Smav buf->b_flags = 0; 2844286570Smav 2845307265Smav add_reference(hdr, tag); 2846286570Smav 2847307265Smav /* 2848307265Smav * We're about to change the hdr's b_flags. We must either 2849307265Smav * hold the hash_lock or be undiscoverable. 2850307265Smav */ 2851307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2852307265Smav 2853307265Smav /* 2854321535Smav * Only honor requests for compressed bufs if the hdr is actually 2855321535Smav * compressed. 2856307265Smav */ 2857321535Smav if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 2858321535Smav buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; 2859321535Smav 2860321535Smav /* 2861321535Smav * If the hdr's data can be shared then we share the data buffer and 2862321535Smav * set the appropriate bit in the hdr's b_flags to indicate the hdr is 2863321610Smav * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new 2864321535Smav * buffer to store the buf's data. 2865321535Smav * 2866321610Smav * There are two additional restrictions here because we're sharing 2867321610Smav * hdr -> buf instead of the usual buf -> hdr. 
First, the hdr can't be 2868321610Smav * actively involved in an L2ARC write, because if this buf is used by 2869321610Smav * an arc_write() then the hdr's data buffer will be released when the 2870321535Smav * write completes, even though the L2ARC write might still be using it. 2871321610Smav * Second, the hdr's ABD must be linear so that the buf's user doesn't 2872321610Smav * need to be ABD-aware. 2873321535Smav */ 2874321610Smav boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && 2875321610Smav abd_is_linear(hdr->b_l1hdr.b_pabd); 2876321535Smav 2877321535Smav /* Set up b_data and sharing */ 2878321535Smav if (can_share) { 2879321610Smav buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); 2880321535Smav buf->b_flags |= ARC_BUF_FLAG_SHARED; 2881307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 2882307265Smav } else { 2883321535Smav buf->b_data = 2884321535Smav arc_get_data_buf(hdr, arc_buf_size(buf), buf); 2885321535Smav ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 2886307265Smav } 2887307265Smav VERIFY3P(buf->b_data, !=, NULL); 2888307265Smav 2889286570Smav hdr->b_l1hdr.b_buf = buf; 2890307265Smav hdr->b_l1hdr.b_bufcnt += 1; 2891286570Smav 2892321535Smav /* 2893321535Smav * If the user wants the data from the hdr, we need to either copy or 2894321535Smav * decompress the data. 2895321535Smav */ 2896321535Smav if (fill) { 2897321535Smav return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); 2898321535Smav } 2899321535Smav 2900321535Smav return (0); 2901307265Smav} 2902168404Spjd 2903321535Smavstatic char *arc_onloan_tag = "onloan"; 2904321535Smav 2905321535Smavstatic inline void 2906321535Smavarc_loaned_bytes_update(int64_t delta) 2907307265Smav{ 2908321535Smav atomic_add_64(&arc_loaned_bytes, delta); 2909307265Smav 2910321535Smav /* assert that it did not wrap around */ 2911321535Smav ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 2912168404Spjd} 2913168404Spjd 2914209962Smm/* 2915209962Smm * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2916209962Smm * flight data by arc_tempreserve_space() until they are "returned". Loaned 2917209962Smm * buffers must be returned to the arc before they can be used by the DMU or 2918209962Smm * freed. 2919209962Smm */ 2920209962Smmarc_buf_t * 2921321535Smavarc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) 2922209962Smm{ 2923321535Smav arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, 2924321535Smav is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); 2925209962Smm 2926332551Smav arc_loaned_bytes_update(arc_buf_size(buf)); 2927209962Smm 2928209962Smm return (buf); 2929209962Smm} 2930209962Smm 2931321535Smavarc_buf_t * 2932321535Smavarc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, 2933321535Smav enum zio_compress compression_type) 2934321535Smav{ 2935321535Smav arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, 2936321535Smav psize, lsize, compression_type); 2937321535Smav 2938332551Smav arc_loaned_bytes_update(arc_buf_size(buf)); 2939321535Smav 2940321535Smav return (buf); 2941321535Smav} 2942321535Smav 2943321535Smav 2944209962Smm/* 2945209962Smm * Return a loaned arc buffer to the arc. 
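 *
 * A sketch of the intended call pairing ("db" here is a hypothetical
 * consumer tag, not a name used in this file):
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, B_FALSE, size);
 *	... fill buf->b_data ...
 *	arc_return_buf(buf, db);
 *
 * The return swaps the arc_onloan_tag reference for db, and
 * arc_loaned_bytes rises on loan and falls on return, so a loan that
 * is never returned shows up as a permanently non-zero
 * arc_loaned_bytes.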
2946209962Smm */ 2947209962Smmvoid 2948209962Smmarc_return_buf(arc_buf_t *buf, void *tag) 2949209962Smm{ 2950209962Smm arc_buf_hdr_t *hdr = buf->b_hdr; 2951209962Smm 2952307265Smav ASSERT3P(buf->b_data, !=, NULL); 2953286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2954286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2955286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2956209962Smm 2957321535Smav arc_loaned_bytes_update(-arc_buf_size(buf)); 2958209962Smm} 2959209962Smm 2960219089Spjd/* Detach an arc_buf from a dbuf (tag) */ 2961219089Spjdvoid 2962219089Spjdarc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2963219089Spjd{ 2964286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2965219089Spjd 2966307265Smav ASSERT3P(buf->b_data, !=, NULL); 2967286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2968286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2969286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2970219089Spjd 2971321535Smav arc_loaned_bytes_update(arc_buf_size(buf)); 2972219089Spjd} 2973219089Spjd 2974274172Savgstatic void 2975321610Smavl2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) 2976274172Savg{ 2977307265Smav l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); 2978274172Savg 2979321610Smav df->l2df_abd = abd; 2980274172Savg df->l2df_size = size; 2981307265Smav df->l2df_type = type; 2982274172Savg mutex_enter(&l2arc_free_on_write_mtx); 2983274172Savg list_insert_head(l2arc_free_on_write, df); 2984274172Savg mutex_exit(&l2arc_free_on_write_mtx); 2985274172Savg} 2986274172Savg 2987168404Spjdstatic void 2988307265Smavarc_hdr_free_on_write(arc_buf_hdr_t *hdr) 2989185029Spjd{ 2990307265Smav arc_state_t *state = hdr->b_l1hdr.b_state; 2991307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 2992307265Smav uint64_t size = arc_hdr_size(hdr); 2993240133Smm 2994307265Smav /* protected by hash lock, if in the hash table */ 2995307265Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 2996307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2997307265Smav ASSERT(state != arc_anon && state != arc_l2c_only); 2998307265Smav 2999307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 3000307265Smav size, hdr); 3001185029Spjd } 3002307265Smav (void) refcount_remove_many(&state->arcs_size, size, hdr); 3003315834Savg if (type == ARC_BUFC_METADATA) { 3004315834Savg arc_space_return(size, ARC_SPACE_META); 3005315834Savg } else { 3006315834Savg ASSERT(type == ARC_BUFC_DATA); 3007315834Savg arc_space_return(size, ARC_SPACE_DATA); 3008315834Savg } 3009307265Smav 3010321610Smav l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); 3011185029Spjd} 3012185029Spjd 3013307265Smav/* 3014307265Smav * Share the arc_buf_t's data with the hdr. Whenever we are sharing the 3015307265Smav * data buffer, we transfer the refcount ownership to the hdr and update 3016307265Smav * the appropriate kstats. 3017307265Smav */ 3018185029Spjdstatic void 3019307265Smavarc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3020274172Savg{ 3021307265Smav arc_state_t *state = hdr->b_l1hdr.b_state; 3022297848Savg 3023321535Smav ASSERT(arc_can_share(hdr, buf)); 3024321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3025307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3026274172Savg 3027286570Smav /* 3028307265Smav * Start sharing the data buffer. We transfer the 3029307265Smav * refcount ownership to the hdr since it always owns 3030307265Smav * the refcount whenever an arc_buf_t is shared. 
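 *
 * Conceptually (a sketch of the effect, not of the refcount
 * implementation), the transfer below is equivalent to
 *
 *	refcount_remove_many(&state->arcs_size, nbytes, buf);
 *	refcount_add_many(&state->arcs_size, nbytes, hdr);
 *
 * done without the count ever changing: only the recorded owner of
 * those bytes moves from the buf to the hdr.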
3031286570Smav */ 3032307265Smav refcount_transfer_ownership(&state->arcs_size, buf, hdr); 3033321610Smav hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); 3034321610Smav abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, 3035321610Smav HDR_ISTYPE_METADATA(hdr)); 3036307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 3037321535Smav buf->b_flags |= ARC_BUF_FLAG_SHARED; 3038274172Savg 3039286763Smav /* 3040307265Smav * Since we've transferred ownership to the hdr we need 3041307265Smav * to increment its compressed and uncompressed kstats and 3042307265Smav * decrement the overhead size. 3043286763Smav */ 3044307265Smav ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 3045307265Smav ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3046321535Smav ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); 3047307265Smav} 3048274172Savg 3049307265Smavstatic void 3050307265Smavarc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3051307265Smav{ 3052307265Smav arc_state_t *state = hdr->b_l1hdr.b_state; 3053286570Smav 3054307265Smav ASSERT(arc_buf_is_shared(buf)); 3055321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3056307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3057307265Smav 3058286763Smav /* 3059307265Smav * We are no longer sharing this buffer so we need 3060307265Smav * to transfer its ownership to the rightful owner. 3061286763Smav */ 3062307265Smav refcount_transfer_ownership(&state->arcs_size, hdr, buf); 3063307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 3064321610Smav abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); 3065321610Smav abd_put(hdr->b_l1hdr.b_pabd); 3066321610Smav hdr->b_l1hdr.b_pabd = NULL; 3067321535Smav buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 3068286763Smav 3069297848Savg /* 3070307265Smav * Since the buffer is no longer shared between 3071307265Smav * the arc buf and the hdr, count it as overhead. 3072297848Savg */ 3073307265Smav ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 3074307265Smav ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3075321535Smav ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 3076274172Savg} 3077274172Savg 3078286767Smav/* 3079321535Smav * Remove an arc_buf_t from the hdr's buf list and return the last 3080321535Smav * arc_buf_t on the list. If no buffers remain on the list then return 3081321535Smav * NULL. 3082286767Smav */ 3083321535Smavstatic arc_buf_t * 3084321535Smavarc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3085321535Smav{ 3086321535Smav ASSERT(HDR_HAS_L1HDR(hdr)); 3087321535Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3088321535Smav 3089321535Smav arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; 3090321535Smav arc_buf_t *lastbuf = NULL; 3091321535Smav 3092321535Smav /* 3093321535Smav * Remove the buf from the hdr list and locate the last 3094321535Smav * remaining buffer on the list. 3095321535Smav */ 3096321535Smav while (*bufp != NULL) { 3097321535Smav if (*bufp == buf) 3098321535Smav *bufp = buf->b_next; 3099321535Smav 3100321535Smav /* 3101321535Smav * If we've removed a buffer in the middle of 3102321535Smav * the list then update the lastbuf and update 3103321535Smav * bufp. 
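 *
 * A worked trace of the loop below: removing B from the list
 * A -> B -> C first keeps A (lastbuf = A), then unlinks B (so *bufp
 * becomes C), then keeps C (lastbuf = C) and stops, leaving the list
 * A -> C and returning C, the remaining tail.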
3104321535Smav */ 3105321535Smav if (*bufp != NULL) { 3106321535Smav lastbuf = *bufp; 3107321535Smav bufp = &(*bufp)->b_next; 3108321535Smav } 3109321535Smav } 3110321535Smav buf->b_next = NULL; 3111321535Smav ASSERT3P(lastbuf, !=, buf); 3112321535Smav IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); 3113321535Smav IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); 3114321535Smav IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); 3115321535Smav 3116321535Smav return (lastbuf); 3117321535Smav} 3118321535Smav 3119321535Smav/* 3120321535Smav * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's 3121321535Smav * list and free it. 3122321535Smav */ 3123274172Savgstatic void 3124321535Smavarc_buf_destroy_impl(arc_buf_t *buf) 3125168404Spjd{ 3126307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 3127168404Spjd 3128307265Smav /* 3129321535Smav * Free up the data associated with the buf but only if we're not 3130321535Smav * sharing this with the hdr. If we are sharing it with the hdr, the 3131321535Smav * hdr is responsible for doing the free. 3132307265Smav */ 3133286570Smav if (buf->b_data != NULL) { 3134307265Smav /* 3135307265Smav * We're about to change the hdr's b_flags. We must either 3136307265Smav * hold the hash_lock or be undiscoverable. 3137307265Smav */ 3138307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3139168404Spjd 3140168404Spjd arc_cksum_verify(buf); 3141240133Smm#ifdef illumos 3142240133Smm arc_buf_unwatch(buf); 3143277300Ssmh#endif 3144219089Spjd 3145321535Smav if (arc_buf_is_shared(buf)) { 3146307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 3147286763Smav } else { 3148321535Smav uint64_t size = arc_buf_size(buf); 3149307265Smav arc_free_data_buf(hdr, buf->b_data, size, buf); 3150307265Smav ARCSTAT_INCR(arcstat_overhead_size, -size); 3151168404Spjd } 3152168404Spjd buf->b_data = NULL; 3153242845Sdelphij 3154307265Smav ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 3155307265Smav hdr->b_l1hdr.b_bufcnt -= 1; 3156168404Spjd } 3157168404Spjd 3158321535Smav arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 3159168404Spjd 3160321535Smav if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { 3161307265Smav /* 3162321535Smav * If the current arc_buf_t is sharing its data buffer with the 3163321610Smav * hdr, then reassign the hdr's b_pabd to share it with the new 3164321535Smav * buffer at the end of the list. The shared buffer is always 3165321535Smav * the last one on the hdr's buffer list. 3166321535Smav * 3167321535Smav * There is an equivalent case for compressed bufs, but since 3168321535Smav * they aren't guaranteed to be the last buf in the list and 3169321535Smav * that is an exceedingly rare case, we just allow that space to be 3170321535Smav * wasted temporarily. 3171307265Smav */ 3172321535Smav if (lastbuf != NULL) { 3173321535Smav /* Only one buf can be shared at once */ 3174321535Smav VERIFY(!arc_buf_is_shared(lastbuf)); 3175321535Smav /* hdr is uncompressed so can't have compressed buf */ 3176321535Smav VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); 3177168404Spjd 3178321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3179321610Smav arc_hdr_free_pabd(hdr); 3180168404Spjd 3181321535Smav /* 3182321535Smav * We must set up a new shared block between the 3183321535Smav * last buffer and the hdr. The data would have 3184321535Smav * been allocated by the arc buf so we need to transfer 3185321535Smav * ownership to the hdr since it's now being shared. 
3186321535Smav */ 3187321535Smav arc_share_buf(hdr, lastbuf); 3188321535Smav } 3189321535Smav } else if (HDR_SHARED_DATA(hdr)) { 3190307265Smav /* 3191321535Smav * Uncompressed shared buffers are always at the end 3192321535Smav * of the list. Compressed buffers don't have the 3193321535Smav * same requirements. This makes it hard to 3194321535Smav * simply assert that the lastbuf is shared so 3195321535Smav * we rely on the hdr's compression flags to determine 3196321535Smav * if we have a compressed, shared buffer. 3197307265Smav */ 3198321535Smav ASSERT3P(lastbuf, !=, NULL); 3199321535Smav ASSERT(arc_buf_is_shared(lastbuf) || 3200321535Smav HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 3201307265Smav } 3202307265Smav 3203321535Smav /* 3204321535Smav * Free the checksum if we're removing the last uncompressed buf from 3205321535Smav * this hdr. 3206321535Smav */ 3207321535Smav if (!arc_hdr_has_uncompressed_buf(hdr)) { 3208307265Smav arc_cksum_free(hdr); 3209321535Smav } 3210307265Smav 3211168404Spjd /* clean up the buf */ 3212168404Spjd buf->b_hdr = NULL; 3213168404Spjd kmem_cache_free(buf_cache, buf); 3214168404Spjd} 3215168404Spjd 3216168404Spjdstatic void 3217349216Savgarc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t do_adapt) 3218286598Smav{ 3219307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 3220307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 3221307265Smav ASSERT(!HDR_SHARED_DATA(hdr)); 3222286598Smav 3223321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3224349216Savg hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, do_adapt); 3225307265Smav hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3226321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3227307265Smav 3228307265Smav ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 3229307265Smav ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3230307265Smav} 3231307265Smav 3232307265Smavstatic void 3233321610Smavarc_hdr_free_pabd(arc_buf_hdr_t *hdr) 3234307265Smav{ 3235307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 3236321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3237307265Smav 3238307265Smav /* 3239307265Smav * If the hdr is currently being written to the l2arc then 3240307265Smav * we defer freeing the data by adding it to the l2arc_free_on_write 3241307265Smav * list. The l2arc will free the data once it's finished 3242307265Smav * writing it to the l2arc device. 
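 *
 * This is a free-on-write deferral; the shape of the pattern in
 * isolation (hypothetical names, not this file's API):
 *
 *	if (write_in_flight(obj))
 *		list_insert_head(&free_later, obj);
 *	else
 *		free_now(obj);
 *
 * with the write-completion path draining free_later, so memory is
 * never reclaimed while the device might still be reading from it.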
3243307265Smav */ 3244307265Smav if (HDR_L2_WRITING(hdr)) { 3245307265Smav arc_hdr_free_on_write(hdr); 3246307265Smav ARCSTAT_BUMP(arcstat_l2_free_on_write); 3247307265Smav } else { 3248321610Smav arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, 3249307265Smav arc_hdr_size(hdr), hdr); 3250307265Smav } 3251321610Smav hdr->b_l1hdr.b_pabd = NULL; 3252307265Smav hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3253307265Smav 3254307265Smav ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 3255307265Smav ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3256307265Smav} 3257307265Smav 3258307265Smavstatic arc_buf_hdr_t * 3259307265Smavarc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, 3260321535Smav enum zio_compress compression_type, arc_buf_contents_t type) 3261307265Smav{ 3262307265Smav arc_buf_hdr_t *hdr; 3263307265Smav 3264307265Smav VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); 3265307265Smav 3266307265Smav hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 3267307265Smav ASSERT(HDR_EMPTY(hdr)); 3268307265Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3269307265Smav ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); 3270307265Smav HDR_SET_PSIZE(hdr, psize); 3271307265Smav HDR_SET_LSIZE(hdr, lsize); 3272307265Smav hdr->b_spa = spa; 3273307265Smav hdr->b_type = type; 3274307265Smav hdr->b_flags = 0; 3275307265Smav arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); 3276321535Smav arc_hdr_set_compress(hdr, compression_type); 3277307265Smav 3278307265Smav hdr->b_l1hdr.b_state = arc_anon; 3279307265Smav hdr->b_l1hdr.b_arc_access = 0; 3280307265Smav hdr->b_l1hdr.b_bufcnt = 0; 3281307265Smav hdr->b_l1hdr.b_buf = NULL; 3282307265Smav 3283307265Smav /* 3284307265Smav * Allocate the hdr's buffer. This will contain either 3285307265Smav * the compressed or uncompressed data depending on the block 3286307265Smav * it references and compressed arc enablement. 3287307265Smav */ 3288349216Savg arc_hdr_alloc_pabd(hdr, B_TRUE); 3289307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3290307265Smav 3291307265Smav return (hdr); 3292307265Smav} 3293307265Smav 3294307265Smav/* 3295307265Smav * Transition between the two allocation states for the arc_buf_hdr struct. 3296307265Smav * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 3297307265Smav * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 3298307265Smav * version is used when a cache buffer is only in the L2ARC in order to reduce 3299307265Smav * memory usage. 
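 *
 * A minimal picture of the two footprints (a schematic layout; the
 * real structure carries more than this):
 *
 *	hdr_l2only:	[ common fields ]
 *	hdr_full:	[ common fields ][ l1hdr fields ]
 *
 * Only the common prefix, HDR_L2ONLY_SIZE bytes, is copied by the
 * bcopy() below when converting between the two caches.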
3300307265Smav */ 3301307265Smavstatic arc_buf_hdr_t * 3302307265Smavarc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 3303307265Smav{ 3304286598Smav ASSERT(HDR_HAS_L2HDR(hdr)); 3305286598Smav 3306307265Smav arc_buf_hdr_t *nhdr; 3307307265Smav l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3308286598Smav 3309307265Smav ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 3310307265Smav (old == hdr_l2only_cache && new == hdr_full_cache)); 3311307265Smav 3312307265Smav nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 3313307265Smav 3314307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 3315307265Smav buf_hash_remove(hdr); 3316307265Smav 3317307265Smav bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 3318307265Smav 3319307265Smav if (new == hdr_full_cache) { 3320307265Smav arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); 3321307265Smav /* 3322307265Smav * arc_access and arc_change_state need to be aware that a 3323307265Smav * header has just come out of L2ARC, so we set its state to 3324307265Smav * l2c_only even though it's about to change. 3325307265Smav */ 3326307265Smav nhdr->b_l1hdr.b_state = arc_l2c_only; 3327307265Smav 3328307265Smav /* Verify previous threads set to NULL before freeing */ 3329321610Smav ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); 3330307265Smav } else { 3331307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 3332307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 3333307265Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3334307265Smav 3335307265Smav /* 3336307265Smav * If we've reached here, we must have been called from 3337307265Smav * arc_evict_hdr(), as such we should have already been 3338307265Smav * removed from any ghost list we were previously on 3339307265Smav * (which protects us from racing with arc_evict_state), 3340307265Smav * thus no locking is needed during this check. 3341307265Smav */ 3342307265Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3343307265Smav 3344307265Smav /* 3345307265Smav * A buffer must not be moved into the arc_l2c_only 3346307265Smav * state if it's not finished being written out to the 3347321610Smav * l2arc device. Otherwise, the b_l1hdr.b_pabd field 3348307265Smav * might try to be accessed, even though it was removed. 3349307265Smav */ 3350307265Smav VERIFY(!HDR_L2_WRITING(hdr)); 3351321610Smav VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3352307265Smav 3353307265Smav#ifdef ZFS_DEBUG 3354307265Smav if (hdr->b_l1hdr.b_thawed != NULL) { 3355307265Smav kmem_free(hdr->b_l1hdr.b_thawed, 1); 3356307265Smav hdr->b_l1hdr.b_thawed = NULL; 3357307265Smav } 3358307265Smav#endif 3359307265Smav 3360307265Smav arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); 3361307265Smav } 3362286598Smav /* 3363307265Smav * The header has been reallocated so we need to re-insert it into any 3364307265Smav * lists it was on. 3365286598Smav */ 3366307265Smav (void) buf_hash_insert(nhdr, NULL); 3367286598Smav 3368307265Smav ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 3369307265Smav 3370307265Smav mutex_enter(&dev->l2ad_mtx); 3371307265Smav 3372286598Smav /* 3373307265Smav * We must place the realloc'ed header back into the list at 3374307265Smav * the same spot. Otherwise, if it's placed earlier in the list, 3375307265Smav * l2arc_write_buffers() could find it during the function's 3376307265Smav * write phase, and try to write it out to the l2arc. 
3377286598Smav */ 3378307265Smav list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 3379307265Smav list_remove(&dev->l2ad_buflist, hdr); 3380286598Smav 3381307265Smav mutex_exit(&dev->l2ad_mtx); 3382307265Smav 3383286598Smav /* 3384307265Smav * Since we're using the pointer address as the tag when 3385307265Smav * incrementing and decrementing the l2ad_alloc refcount, we 3386307265Smav * must remove the old pointer (that we're about to destroy) and 3387307265Smav * add the new pointer to the refcount. Otherwise we'd remove 3388307265Smav * the wrong pointer address when calling arc_hdr_destroy() later. 3389286598Smav */ 3390286598Smav 3391307265Smav (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); 3392307265Smav (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); 3393286598Smav 3394307265Smav buf_discard_identity(hdr); 3395307265Smav kmem_cache_free(old, hdr); 3396286598Smav 3397307265Smav return (nhdr); 3398286598Smav} 3399286598Smav 3400307265Smav/* 3401307265Smav * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. 3402307265Smav * The buf is returned thawed since we expect the consumer to modify it. 3403307265Smav */ 3404307265Smavarc_buf_t * 3405321535Smavarc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) 3406307265Smav{ 3407307265Smav arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, 3408307265Smav ZIO_COMPRESS_OFF, type); 3409307265Smav ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3410321535Smav 3411321535Smav arc_buf_t *buf = NULL; 3412321535Smav VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); 3413307265Smav arc_buf_thaw(buf); 3414321535Smav 3415307265Smav return (buf); 3416307265Smav} 3417307265Smav 3418321535Smav/* 3419321535Smav * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this 3420321535Smav * for bufs containing metadata. 3421321535Smav */ 3422321535Smavarc_buf_t * 3423321535Smavarc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, 3424321535Smav enum zio_compress compression_type) 3425321535Smav{ 3426321535Smav ASSERT3U(lsize, >, 0); 3427321535Smav ASSERT3U(lsize, >=, psize); 3428321535Smav ASSERT(compression_type > ZIO_COMPRESS_OFF); 3429321535Smav ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); 3430321535Smav 3431321535Smav arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 3432321535Smav compression_type, ARC_BUFC_DATA); 3433321535Smav ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3434321535Smav 3435321535Smav arc_buf_t *buf = NULL; 3436321535Smav VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); 3437321535Smav arc_buf_thaw(buf); 3438321535Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3439321535Smav 3440321610Smav if (!arc_buf_is_shared(buf)) { 3441321610Smav /* 3442321610Smav * To ensure that the hdr has the correct data in it if we call 3443321610Smav * arc_decompress() on this buf before it's been written to 3444321610Smav * disk, it's easiest if we just set up sharing between the 3445321610Smav * buf and the hdr. 
3446321610Smav */ 3447321610Smav ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); 3448321610Smav arc_hdr_free_pabd(hdr); 3449321610Smav arc_share_buf(hdr, buf); 3450321610Smav } 3451321610Smav 3452321535Smav return (buf); 3453321535Smav} 3454321535Smav 3455286598Smavstatic void 3456307265Smavarc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 3457307265Smav{ 3458307265Smav l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 3459307265Smav l2arc_dev_t *dev = l2hdr->b_dev; 3460323754Savg uint64_t psize = arc_hdr_size(hdr); 3461307265Smav 3462307265Smav ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 3463307265Smav ASSERT(HDR_HAS_L2HDR(hdr)); 3464307265Smav 3465307265Smav list_remove(&dev->l2ad_buflist, hdr); 3466307265Smav 3467323754Savg ARCSTAT_INCR(arcstat_l2_psize, -psize); 3468323754Savg ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); 3469307265Smav 3470323754Savg vdev_space_update(dev->l2ad_vdev, -psize, 0, 0); 3471307265Smav 3472323754Savg (void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr); 3473307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 3474307265Smav} 3475307265Smav 3476307265Smavstatic void 3477168404Spjdarc_hdr_destroy(arc_buf_hdr_t *hdr) 3478168404Spjd{ 3479286570Smav if (HDR_HAS_L1HDR(hdr)) { 3480286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL || 3481307265Smav hdr->b_l1hdr.b_bufcnt > 0); 3482286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3483286570Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3484286570Smav } 3485168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3486286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 3487168404Spjd 3488307265Smav if (!HDR_EMPTY(hdr)) 3489307265Smav buf_discard_identity(hdr); 3490307265Smav 3491286570Smav if (HDR_HAS_L2HDR(hdr)) { 3492286598Smav l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3493286598Smav boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 3494286570Smav 3495286598Smav if (!buflist_held) 3496286598Smav mutex_enter(&dev->l2ad_mtx); 3497219089Spjd 3498286570Smav /* 3499286598Smav * Even though we checked this conditional above, we 3500286598Smav * need to check this again now that we have the 3501286598Smav * l2ad_mtx. This is because we could be racing with 3502286598Smav * another thread calling l2arc_evict() which might have 3503286598Smav * destroyed this header's L2 portion as we were waiting 3504286598Smav * to acquire the l2ad_mtx. If that happens, we don't 3505286598Smav * want to re-destroy the header's L2 portion. 
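 *
 * In miniature, this is the standard re-check-under-lock idiom
 * (hypothetical names):
 *
 *	if (FLAG_SET(x)) {
 *		mutex_enter(&m);
 *		if (FLAG_SET(x))
 *			teardown(x);
 *		mutex_exit(&m);
 *	}
 *
 * Only the second, locked test is authoritative; the first merely
 * avoids taking the mutex when there is clearly nothing to do.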
3506286570Smav */ 3507286598Smav if (HDR_HAS_L2HDR(hdr)) { 3508290191Savg l2arc_trim(hdr); 3509286598Smav arc_hdr_l2hdr_destroy(hdr); 3510286598Smav } 3511286570Smav 3512219089Spjd if (!buflist_held) 3513286598Smav mutex_exit(&dev->l2ad_mtx); 3514185029Spjd } 3515185029Spjd 3516307265Smav if (HDR_HAS_L1HDR(hdr)) { 3517307265Smav arc_cksum_free(hdr); 3518286776Smav 3519307265Smav while (hdr->b_l1hdr.b_buf != NULL) 3520321535Smav arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); 3521286570Smav 3522286570Smav#ifdef ZFS_DEBUG 3523286570Smav if (hdr->b_l1hdr.b_thawed != NULL) { 3524286570Smav kmem_free(hdr->b_l1hdr.b_thawed, 1); 3525286570Smav hdr->b_l1hdr.b_thawed = NULL; 3526286570Smav } 3527286570Smav#endif 3528307265Smav 3529321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 3530321610Smav arc_hdr_free_pabd(hdr); 3531307265Smav } 3532219089Spjd } 3533168404Spjd 3534168404Spjd ASSERT3P(hdr->b_hash_next, ==, NULL); 3535286570Smav if (HDR_HAS_L1HDR(hdr)) { 3536286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3537286570Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 3538286570Smav kmem_cache_free(hdr_full_cache, hdr); 3539286570Smav } else { 3540286570Smav kmem_cache_free(hdr_l2only_cache, hdr); 3541286570Smav } 3542168404Spjd} 3543168404Spjd 3544168404Spjdvoid 3545307265Smavarc_buf_destroy(arc_buf_t *buf, void* tag) 3546168404Spjd{ 3547168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 3548168404Spjd kmutex_t *hash_lock = HDR_LOCK(hdr); 3549168404Spjd 3550286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 3551307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 3552307265Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3553307265Smav VERIFY0(remove_reference(hdr, NULL, tag)); 3554307265Smav arc_hdr_destroy(hdr); 3555307265Smav return; 3556168404Spjd } 3557168404Spjd 3558168404Spjd mutex_enter(hash_lock); 3559307265Smav ASSERT3P(hdr, ==, buf->b_hdr); 3560307265Smav ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 3561219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3562307265Smav ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); 3563307265Smav ASSERT3P(buf->b_data, !=, NULL); 3564168404Spjd 3565168404Spjd (void) remove_reference(hdr, hash_lock, tag); 3566321535Smav arc_buf_destroy_impl(buf); 3567168404Spjd mutex_exit(hash_lock); 3568168404Spjd} 3569168404Spjd 3570168404Spjd/* 3571286763Smav * Evict the arc_buf_hdr that is provided as a parameter. The resultant 3572286763Smav * state of the header is dependent on its state prior to entering this 3573286763Smav * function. The following transitions are possible: 3574185029Spjd * 3575286763Smav * - arc_mru -> arc_mru_ghost 3576286763Smav * - arc_mfu -> arc_mfu_ghost 3577286763Smav * - arc_mru_ghost -> arc_l2c_only 3578286763Smav * - arc_mru_ghost -> deleted 3579286763Smav * - arc_mfu_ghost -> arc_l2c_only 3580286763Smav * - arc_mfu_ghost -> deleted 3581168404Spjd */ 3582286763Smavstatic int64_t 3583286763Smavarc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3584168404Spjd{ 3585286763Smav arc_state_t *evicted_state, *state; 3586286763Smav int64_t bytes_evicted = 0; 3587339034Ssef int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? 
3588339034Ssef zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; 3589168404Spjd 3590286763Smav ASSERT(MUTEX_HELD(hash_lock)); 3591286763Smav ASSERT(HDR_HAS_L1HDR(hdr)); 3592168404Spjd 3593286763Smav state = hdr->b_l1hdr.b_state; 3594286763Smav if (GHOST_STATE(state)) { 3595286763Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3596307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 3597206796Spjd 3598286763Smav /* 3599286763Smav * l2arc_write_buffers() relies on a header's L1 portion 3600321610Smav * (i.e. its b_pabd field) during its write phase. 3601286763Smav * Thus, we cannot push a header onto the arc_l2c_only 3602286763Smav * state (removing its L1 piece) until the header is 3603286763Smav * done being written to the l2arc. 3604286763Smav */ 3605286763Smav if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { 3606286763Smav ARCSTAT_BUMP(arcstat_evict_l2_skip); 3607286763Smav return (bytes_evicted); 3608286763Smav } 3609286762Smav 3610286763Smav ARCSTAT_BUMP(arcstat_deleted); 3611307265Smav bytes_evicted += HDR_GET_LSIZE(hdr); 3612286762Smav 3613286763Smav DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 3614286763Smav 3615321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3616286763Smav if (HDR_HAS_L2HDR(hdr)) { 3617275780Sdelphij /* 3618286763Smav * This buffer is cached on the 2nd Level ARC; 3619286763Smav * don't destroy the header. 3620275780Sdelphij */ 3621286763Smav arc_change_state(arc_l2c_only, hdr, hash_lock); 3622286763Smav /* 3623286763Smav * dropping from L1+L2 cached to L2-only, 3624286763Smav * realloc to remove the L1 header. 3625286763Smav */ 3626286763Smav hdr = arc_hdr_realloc(hdr, hdr_full_cache, 3627286763Smav hdr_l2only_cache); 3628286763Smav } else { 3629286763Smav arc_change_state(arc_anon, hdr, hash_lock); 3630286763Smav arc_hdr_destroy(hdr); 3631275780Sdelphij } 3632286763Smav return (bytes_evicted); 3633275780Sdelphij } 3634275780Sdelphij 3635286763Smav ASSERT(state == arc_mru || state == arc_mfu); 3636286763Smav evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 3637206796Spjd 3638286763Smav /* prefetch buffers have a minimum lifespan */ 3639286763Smav if (HDR_IO_IN_PROGRESS(hdr) || 3640286763Smav ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 3641339034Ssef ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { 3642286763Smav ARCSTAT_BUMP(arcstat_evict_skip); 3643286763Smav return (bytes_evicted); 3644286763Smav } 3645286763Smav 3646286763Smav ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3647286763Smav while (hdr->b_l1hdr.b_buf) { 3648286763Smav arc_buf_t *buf = hdr->b_l1hdr.b_buf; 3649286763Smav if (!mutex_tryenter(&buf->b_evict_lock)) { 3650286763Smav ARCSTAT_BUMP(arcstat_mutex_miss); 3651286763Smav break; 3652168404Spjd } 3653286763Smav if (buf->b_data != NULL) 3654307265Smav bytes_evicted += HDR_GET_LSIZE(hdr); 3655307265Smav mutex_exit(&buf->b_evict_lock); 3656321535Smav arc_buf_destroy_impl(buf); 3657286763Smav } 3658258632Savg 3659286763Smav if (HDR_HAS_L2HDR(hdr)) { 3660307265Smav ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); 3661286763Smav } else { 3662307265Smav if (l2arc_write_eligible(hdr->b_spa, hdr)) { 3663307265Smav ARCSTAT_INCR(arcstat_evict_l2_eligible, 3664307265Smav HDR_GET_LSIZE(hdr)); 3665307265Smav } else { 3666307265Smav ARCSTAT_INCR(arcstat_evict_l2_ineligible, 3667307265Smav HDR_GET_LSIZE(hdr)); 3668307265Smav } 3669286763Smav } 3670258632Savg 3671307265Smav if (hdr->b_l1hdr.b_bufcnt == 0) { 3672307265Smav arc_cksum_free(hdr); 3673307265Smav 3674307265Smav bytes_evicted += arc_hdr_size(hdr); 3675307265Smav 3676307265Smav /* 3677307265Smav * If this hdr is being evicted and has a compressed 3678307265Smav * buffer then we discard it here before we change states. 3679307265Smav * This ensures that the accounting is updated correctly 3680321610Smav * in arc_free_data_impl(). 3681307265Smav */ 3682321610Smav arc_hdr_free_pabd(hdr); 3683307265Smav 3684286763Smav arc_change_state(evicted_state, hdr, hash_lock); 3685286763Smav ASSERT(HDR_IN_HASH_TABLE(hdr)); 3686307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 3687286763Smav DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 3688286763Smav } 3689286763Smav 3690286763Smav return (bytes_evicted); 3691286763Smav} 3692286763Smav 3693286763Smavstatic uint64_t 3694286763Smavarc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 3695286763Smav uint64_t spa, int64_t bytes) 3696286763Smav{ 3697286763Smav multilist_sublist_t *mls; 3698286763Smav uint64_t bytes_evicted = 0; 3699286763Smav arc_buf_hdr_t *hdr; 3700286763Smav kmutex_t *hash_lock; 3701286763Smav int evict_count = 0; 3702286763Smav 3703286763Smav ASSERT3P(marker, !=, NULL); 3704286763Smav IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3705286763Smav 3706286763Smav mls = multilist_sublist_lock(ml, idx); 3707286763Smav 3708286763Smav for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 3709286763Smav hdr = multilist_sublist_prev(mls, marker)) { 3710286763Smav if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 3711286763Smav (evict_count >= zfs_arc_evict_batch_limit)) 3712286763Smav break; 3713286763Smav 3714258632Savg /* 3715286763Smav * To keep our iteration location, move the marker 3716286763Smav * forward. Since we're not holding hdr's hash lock, we 3717286763Smav * must be very careful and not remove 'hdr' from the 3718286763Smav * sublist. 
Otherwise, other consumers might mistakenly treat 3719286763Smav * 'hdr' as not being on a sublist when they call the 3720286763Smav * multilist_link_active() function (they all rely on 3721286763Smav * the hash lock protecting concurrent insertions and 3722286763Smav * removals). multilist_sublist_move_forward() was 3723286763Smav * specifically implemented to ensure this is the case 3724286763Smav * (only 'marker' will be removed and re-inserted). 3725258632Savg */ 3726286763Smav multilist_sublist_move_forward(mls, marker); 3727286763Smav 3728286763Smav /* 3729286763Smav * The only case where the b_spa field should ever be 3730286763Smav * zero is the marker headers inserted by 3731286763Smav * arc_evict_state(). It's possible for multiple threads 3732286763Smav * to be calling arc_evict_state() concurrently (e.g. 3733286763Smav * dsl_pool_close() and zio_inject_fault()), so we must 3734286763Smav * skip any markers we see from these other threads. 3735286763Smav */ 3736286763Smav if (hdr->b_spa == 0) 3737258632Savg continue; 3738286763Smav 3739286763Smav /* we're only interested in evicting buffers of a certain spa */ 3740286763Smav if (spa != 0 && hdr->b_spa != spa) { 3741286763Smav ARCSTAT_BUMP(arcstat_evict_skip); 3742286763Smav continue; 3743258632Savg } 3744258632Savg 3745275811Sdelphij hash_lock = HDR_LOCK(hdr); 3746208373Smm 3747286763Smav /* 3748286763Smav * We aren't calling this function from any code path 3749286763Smav * that would already be holding a hash lock, so we're 3750286763Smav * asserting on this assumption to be defensive in case 3751286763Smav * this ever changes. Without this check, it would be 3752286763Smav * possible to incorrectly increment arcstat_mutex_miss 3753286763Smav * below (e.g. if the code changed such that we called 3754286763Smav * this function with a hash lock held). 3755286763Smav */ 3756286763Smav ASSERT(!MUTEX_HELD(hash_lock)); 3757208373Smm 3758286763Smav if (mutex_tryenter(hash_lock)) { 3759286763Smav uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 3760286763Smav mutex_exit(hash_lock); 3761286763Smav 3762286763Smav bytes_evicted += evicted; 3763286763Smav 3764286763Smav /* 3765286763Smav * If evicted is zero, arc_evict_hdr() must have 3766286763Smav * decided to skip this header; don't increment 3767286763Smav * evict_count in this case. 3768286763Smav */ 3769286763Smav if (evicted != 0) 3770286763Smav evict_count++; 3771286763Smav 3772286763Smav /* 3773286763Smav * If arc_size isn't overflowing, signal any 3774286763Smav * threads that might happen to be waiting. 3775286763Smav * 3776286763Smav * For each header evicted, we wake up a single 3777286763Smav * thread. If we used cv_broadcast, we could 3778286763Smav * wake up "too many" threads, causing arc_size 3779286763Smav * to significantly overflow arc_c; since 3780321610Smav * arc_get_data_impl() doesn't check for overflow 3781286763Smav * when it's woken up (it doesn't because it's 3782286763Smav * possible for the ARC to be overflowing while 3783286763Smav * full of un-evictable buffers, and the 3784286763Smav * function should proceed in this case). 3785286763Smav * 3786286763Smav * If threads are left sleeping, due to not 3787346686Smav * using cv_broadcast here, they will be woken 3788346686Smav * up via cv_broadcast in arc_adjust_cb() just 3789346686Smav * before arc_adjust_zthr sleeps. 
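 *
 * For reference, the waiter side of this handshake (a sketch
 * in the style of arc_get_data_impl(), not a copy of it)
 * looks roughly like
 *
 *	mutex_enter(&arc_adjust_lock);
 *	if (arc_is_overflowing()) {
 *		zthr_wakeup(arc_adjust_zthr);
 *		cv_wait(&arc_adjust_waiters_cv,
 *		    &arc_adjust_lock);
 *	}
 *	mutex_exit(&arc_adjust_lock);
 *
 * so each cv_signal() below releases at most one such
 * allocator.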
3790286763Smav */ 3791346686Smav mutex_enter(&arc_adjust_lock); 3792286763Smav if (!arc_is_overflowing()) 3793346686Smav cv_signal(&arc_adjust_waiters_cv); 3794346686Smav mutex_exit(&arc_adjust_lock); 3795168404Spjd } else { 3796286763Smav ARCSTAT_BUMP(arcstat_mutex_miss); 3797168404Spjd } 3798168404Spjd } 3799168404Spjd 3800286763Smav multilist_sublist_unlock(mls); 3801206796Spjd 3802286763Smav return (bytes_evicted); 3803286763Smav} 3804168404Spjd 3805286763Smav/* 3806286763Smav * Evict buffers from the given arc state, until we've removed the 3807286763Smav * specified number of bytes. Move the removed buffers to the 3808286763Smav * appropriate evict state. 3809286763Smav * 3810286763Smav * This function makes a "best effort". It skips over any buffers 3811286763Smav * it can't get a hash_lock on, and so, may not catch all candidates. 3812286763Smav * It may also return without evicting as much space as requested. 3813286763Smav * 3814286763Smav * If bytes is specified using the special value ARC_EVICT_ALL, this 3815286763Smav * will evict all available (i.e. unlocked and evictable) buffers from 3816286763Smav * the given arc state; which is used by arc_flush(). 3817286763Smav */ 3818286763Smavstatic uint64_t 3819286763Smavarc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 3820286763Smav arc_buf_contents_t type) 3821286763Smav{ 3822286763Smav uint64_t total_evicted = 0; 3823321553Smav multilist_t *ml = state->arcs_list[type]; 3824286763Smav int num_sublists; 3825286763Smav arc_buf_hdr_t **markers; 3826168404Spjd 3827286763Smav IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3828168404Spjd 3829286763Smav num_sublists = multilist_get_num_sublists(ml); 3830286763Smav 3831185029Spjd /* 3832286763Smav * If we've tried to evict from each sublist, made some 3833286763Smav * progress, but still have not hit the target number of bytes 3834286763Smav * to evict, we want to keep trying. The markers allow us to 3835286763Smav * pick up where we left off for each individual sublist, rather 3836286763Smav * than starting from the tail each time. 3837185029Spjd */ 3838286763Smav markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 3839286763Smav for (int i = 0; i < num_sublists; i++) { 3840286763Smav markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 3841185029Spjd 3842286763Smav /* 3843286763Smav * A b_spa of 0 is used to indicate that this header is 3844286763Smav * a marker. This fact is used in arc_adjust_type() and 3845286763Smav * arc_evict_state_impl(). 3846286763Smav */ 3847286763Smav markers[i]->b_spa = 0; 3848168404Spjd 3849286763Smav multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3850286763Smav multilist_sublist_insert_tail(mls, markers[i]); 3851286763Smav multilist_sublist_unlock(mls); 3852286763Smav } 3853168404Spjd 3854286763Smav /* 3855286763Smav * While we haven't hit our target number of bytes to evict, or 3856286763Smav * we're evicting all available buffers. 3857286763Smav */ 3858286763Smav while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 3859286763Smav /* 3860286763Smav * Start eviction using a randomly selected sublist, 3861286763Smav * this is to try and evenly balance eviction across all 3862286763Smav * sublists. Always starting at the same sublist 3863286763Smav * (e.g. index 0) would cause evictions to favor certain 3864286763Smav * sublists over others. 
3865286763Smav */ 3866286763Smav int sublist_idx = multilist_get_random_index(ml); 3867286763Smav uint64_t scan_evicted = 0; 3868219089Spjd 3869286763Smav for (int i = 0; i < num_sublists; i++) { 3870286763Smav uint64_t bytes_remaining; 3871286763Smav uint64_t bytes_evicted; 3872219089Spjd 3873286763Smav if (bytes == ARC_EVICT_ALL) 3874286763Smav bytes_remaining = ARC_EVICT_ALL; 3875286763Smav else if (total_evicted < bytes) 3876286763Smav bytes_remaining = bytes - total_evicted; 3877286763Smav else 3878286763Smav break; 3879258632Savg 3880286763Smav bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 3881286763Smav markers[sublist_idx], spa, bytes_remaining); 3882286763Smav 3883286763Smav scan_evicted += bytes_evicted; 3884286763Smav total_evicted += bytes_evicted; 3885286763Smav 3886286763Smav /* we've reached the end, wrap to the beginning */ 3887286763Smav if (++sublist_idx >= num_sublists) 3888286763Smav sublist_idx = 0; 3889286763Smav } 3890286763Smav 3891258632Savg /* 3892286763Smav * If we didn't evict anything during this scan, we have 3893286763Smav * no reason to believe we'll evict more during another 3894286763Smav * scan, so break the loop. 3895258632Savg */ 3896286763Smav if (scan_evicted == 0) { 3897286763Smav /* This isn't possible, let's make that obvious */ 3898286763Smav ASSERT3S(bytes, !=, 0); 3899185029Spjd 3900286763Smav /* 3901286763Smav * When bytes is ARC_EVICT_ALL, the only way to 3902286763Smav * break the loop is when scan_evicted is zero. 3903286763Smav * In that case, we actually have evicted enough, 3904286763Smav * so we don't want to increment the kstat. 3905286763Smav */ 3906286763Smav if (bytes != ARC_EVICT_ALL) { 3907286763Smav ASSERT3S(total_evicted, <, bytes); 3908286763Smav ARCSTAT_BUMP(arcstat_evict_not_enough); 3909185029Spjd } 3910185029Spjd 3911286763Smav break; 3912258632Savg } 3913286763Smav } 3914258632Savg 3915286763Smav for (int i = 0; i < num_sublists; i++) { 3916286763Smav multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3917286763Smav multilist_sublist_remove(mls, markers[i]); 3918286763Smav multilist_sublist_unlock(mls); 3919286763Smav 3920286763Smav kmem_cache_free(hdr_full_cache, markers[i]); 3921168404Spjd } 3922286763Smav kmem_free(markers, sizeof (*markers) * num_sublists); 3923206796Spjd 3924286763Smav return (total_evicted); 3925286763Smav} 3926286763Smav 3927286763Smav/* 3928286763Smav * Flush all "evictable" data of the given type from the arc state 3929286763Smav * specified. This will not evict any "active" buffers (i.e. referenced). 3930286763Smav * 3931307265Smav * When 'retry' is set to B_FALSE, the function will make a single pass 3932286763Smav * over the state and evict any buffers that it can. Since it doesn't 3933286763Smav * continually retry the eviction, it might end up leaving some buffers 3934286763Smav * in the ARC due to lock misses. 3935286763Smav * 3936307265Smav * When 'retry' is set to B_TRUE, the function will continually retry the 3937286763Smav * eviction until *all* evictable buffers have been removed from the 3938286763Smav * state. As a result, if concurrent insertions into the state are 3939286763Smav * allowed (e.g. if the ARC isn't shutting down), this function might 3940286763Smav * wind up in an infinite loop, continually trying to evict buffers. 
3941286763Smav */ 3942286763Smavstatic uint64_t 3943286763Smavarc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 3944286763Smav boolean_t retry) 3945286763Smav{ 3946286763Smav uint64_t evicted = 0; 3947286763Smav 3948307265Smav while (refcount_count(&state->arcs_esize[type]) != 0) { 3949286763Smav evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 3950286763Smav 3951286763Smav if (!retry) 3952286763Smav break; 3953185029Spjd } 3954185029Spjd 3955286763Smav return (evicted); 3956286763Smav} 3957286763Smav 3958286763Smav/* 3959286763Smav * Evict the specified number of bytes from the state specified, 3960286763Smav * restricting eviction to the spa and type given. This function 3961286763Smav * prevents us from trying to evict more from a state's list than 3962286763Smav * is "evictable", and to skip evicting altogether when passed a 3963286763Smav * negative value for "bytes". In contrast, arc_evict_state() will 3964286763Smav * evict everything it can, when passed a negative value for "bytes". 3965286763Smav */ 3966286763Smavstatic uint64_t 3967286763Smavarc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 3968286763Smav arc_buf_contents_t type) 3969286763Smav{ 3970286763Smav int64_t delta; 3971286763Smav 3972307265Smav if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { 3973307265Smav delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); 3974286763Smav return (arc_evict_state(state, spa, delta, type)); 3975168404Spjd } 3976168404Spjd 3977286763Smav return (0); 3978168404Spjd} 3979168404Spjd 3980286763Smav/* 3981286763Smav * Evict metadata buffers from the cache, such that arc_meta_used is 3982286763Smav * capped by the arc_meta_limit tunable. 3983286763Smav */ 3984286763Smavstatic uint64_t 3985332540Smavarc_adjust_meta(uint64_t meta_used) 3986286763Smav{ 3987286763Smav uint64_t total_evicted = 0; 3988286763Smav int64_t target; 3989286763Smav 3990286763Smav /* 3991286763Smav * If we're over the meta limit, we want to evict enough 3992286763Smav * metadata to get back under the meta limit. We don't want to 3993286763Smav * evict so much that we drop the MRU below arc_p, though. If 3994286763Smav * we're over the meta limit more than we're over arc_p, we 3995286763Smav * evict some from the MRU here, and some from the MFU below. 3996286763Smav */ 3997332540Smav target = MIN((int64_t)(meta_used - arc_meta_limit), 3998286766Smav (int64_t)(refcount_count(&arc_anon->arcs_size) + 3999286766Smav refcount_count(&arc_mru->arcs_size) - arc_p)); 4000286763Smav 4001286763Smav total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4002286763Smav 4003286763Smav /* 4004286763Smav * Similar to the above, we want to evict enough bytes to get us 4005286763Smav * below the meta limit, but not so much as to drop us below the 4006321535Smav * space allotted to the MFU (which is defined as arc_c - arc_p). 4007286763Smav */ 4008332540Smav target = MIN((int64_t)(meta_used - arc_meta_limit), 4009332540Smav (int64_t)(refcount_count(&arc_mfu->arcs_size) - 4010332540Smav (arc_c - arc_p))); 4011286763Smav 4012286763Smav total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4013286763Smav 4014286763Smav return (total_evicted); 4015286763Smav} 4016286763Smav 4017286763Smav/* 4018286763Smav * Return the type of the oldest buffer in the given arc state 4019286763Smav * 4020286763Smav * This function will select a random sublist of type ARC_BUFC_DATA and 4021286763Smav * a random sublist of type ARC_BUFC_METADATA. 
The tail of each sublist 4022286763Smav * is compared, and the type which contains the "older" buffer will be 4023286763Smav * returned. 4024286763Smav */ 4025286763Smavstatic arc_buf_contents_t 4026286763Smavarc_adjust_type(arc_state_t *state) 4027286763Smav{ 4028321553Smav multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; 4029321553Smav multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; 4030286763Smav int data_idx = multilist_get_random_index(data_ml); 4031286763Smav int meta_idx = multilist_get_random_index(meta_ml); 4032286763Smav multilist_sublist_t *data_mls; 4033286763Smav multilist_sublist_t *meta_mls; 4034286763Smav arc_buf_contents_t type; 4035286763Smav arc_buf_hdr_t *data_hdr; 4036286763Smav arc_buf_hdr_t *meta_hdr; 4037286763Smav 4038286763Smav /* 4039286763Smav * We keep the sublist lock until we're finished, to prevent 4040286763Smav * the headers from being destroyed via arc_evict_state(). 4041286763Smav */ 4042286763Smav data_mls = multilist_sublist_lock(data_ml, data_idx); 4043286763Smav meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 4044286763Smav 4045286763Smav /* 4046286763Smav * These two loops are to ensure we skip any markers that 4047286763Smav * might be at the tail of the lists due to arc_evict_state(). 4048286763Smav */ 4049286763Smav 4050286763Smav for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 4051286763Smav data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 4052286763Smav if (data_hdr->b_spa != 0) 4053286763Smav break; 4054286763Smav } 4055286763Smav 4056286763Smav for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 4057286763Smav meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 4058286763Smav if (meta_hdr->b_spa != 0) 4059286763Smav break; 4060286763Smav } 4061286763Smav 4062286763Smav if (data_hdr == NULL && meta_hdr == NULL) { 4063286763Smav type = ARC_BUFC_DATA; 4064286763Smav } else if (data_hdr == NULL) { 4065286763Smav ASSERT3P(meta_hdr, !=, NULL); 4066286763Smav type = ARC_BUFC_METADATA; 4067286763Smav } else if (meta_hdr == NULL) { 4068286763Smav ASSERT3P(data_hdr, !=, NULL); 4069286763Smav type = ARC_BUFC_DATA; 4070286763Smav } else { 4071286763Smav ASSERT3P(data_hdr, !=, NULL); 4072286763Smav ASSERT3P(meta_hdr, !=, NULL); 4073286763Smav 4074286763Smav /* The headers can't be on the sublist without an L1 header */ 4075286763Smav ASSERT(HDR_HAS_L1HDR(data_hdr)); 4076286763Smav ASSERT(HDR_HAS_L1HDR(meta_hdr)); 4077286763Smav 4078286763Smav if (data_hdr->b_l1hdr.b_arc_access < 4079286763Smav meta_hdr->b_l1hdr.b_arc_access) { 4080286763Smav type = ARC_BUFC_DATA; 4081286763Smav } else { 4082286763Smav type = ARC_BUFC_METADATA; 4083286763Smav } 4084286763Smav } 4085286763Smav 4086286763Smav multilist_sublist_unlock(meta_mls); 4087286763Smav multilist_sublist_unlock(data_mls); 4088286763Smav 4089286763Smav return (type); 4090286763Smav} 4091286763Smav 4092286763Smav/* 4093286763Smav * Evict buffers from the cache, such that arc_size is capped by arc_c. 4094286763Smav */ 4095286763Smavstatic uint64_t 4096168404Spjdarc_adjust(void) 4097168404Spjd{ 4098286763Smav uint64_t total_evicted = 0; 4099286763Smav uint64_t bytes; 4100286763Smav int64_t target; 4101332540Smav uint64_t asize = aggsum_value(&arc_size); 4102332540Smav uint64_t ameta = aggsum_value(&arc_meta_used); 4103168404Spjd 4104208373Smm /* 4105286763Smav * If we're over arc_meta_limit, we want to correct that before 4106286763Smav * potentially evicting data buffers below. 
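	 *
	 * Worked example (hypothetical numbers): with arc_meta_used at
	 * 600MB against an arc_meta_limit of 512MB, arc_adjust_meta()
	 * below is asked to shed the 88MB excess, taking what it can
	 * from the MRU without dropping it below arc_p, and any
	 * remainder from the MFU.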
4107286763Smav */ 4108332540Smav total_evicted += arc_adjust_meta(ameta); 4109286763Smav 4110286763Smav /* 4111208373Smm * Adjust MRU size 4112286763Smav * 4113286763Smav * If we're over the target cache size, we want to evict enough 4114286763Smav * from the list to get back to our target size. We don't want 4115286763Smav * to evict too much from the MRU, such that it drops below 4116286763Smav * arc_p. So, if we're over our target cache size more than 4117286763Smav * the MRU is over arc_p, we'll evict enough to get back to 4118286763Smav * arc_p here, and then evict more from the MFU below. 4119208373Smm */ 4120332540Smav target = MIN((int64_t)(asize - arc_c), 4121286766Smav (int64_t)(refcount_count(&arc_anon->arcs_size) + 4122332540Smav refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); 4123208373Smm 4124286763Smav /* 4125286763Smav * If we're below arc_meta_min, always prefer to evict data. 4126286763Smav * Otherwise, try to satisfy the requested number of bytes to 4127286763Smav * evict from the type which contains older buffers; in an 4128286763Smav * effort to keep newer buffers in the cache regardless of their 4129286763Smav * type. If we cannot satisfy the number of bytes from this 4130286763Smav * type, spill over into the next type. 4131286763Smav */ 4132286763Smav if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 4133332540Smav ameta > arc_meta_min) { 4134286763Smav bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4135286763Smav total_evicted += bytes; 4136168404Spjd 4137286763Smav /* 4138286763Smav * If we couldn't evict our target number of bytes from 4139286763Smav * metadata, we try to get the rest from data. 4140286763Smav */ 4141286763Smav target -= bytes; 4142286763Smav 4143286763Smav total_evicted += 4144286763Smav arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4145286763Smav } else { 4146286763Smav bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4147286763Smav total_evicted += bytes; 4148286763Smav 4149286763Smav /* 4150286763Smav * If we couldn't evict our target number of bytes from 4151286763Smav * data, we try to get the rest from metadata. 4152286763Smav */ 4153286763Smav target -= bytes; 4154286763Smav 4155286763Smav total_evicted += 4156286763Smav arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4157185029Spjd } 4158185029Spjd 4159208373Smm /* 4160338456Smarkj * Re-sum ARC stats after the first round of evictions. 4161338456Smarkj */ 4162338456Smarkj asize = aggsum_value(&arc_size); 4163338456Smarkj ameta = aggsum_value(&arc_meta_used); 4164338456Smarkj 4165338456Smarkj /* 4166208373Smm * Adjust MFU size 4167286763Smav * 4168286763Smav * Now that we've tried to evict enough from the MRU to get its 4169286763Smav * size back to arc_p, if we're still above the target cache 4170286763Smav * size, we evict the rest from the MFU. 4171208373Smm */ 4172332540Smav target = asize - arc_c; 4173168404Spjd 4174286764Smav if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 4175332540Smav ameta > arc_meta_min) { 4176286763Smav bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4177286763Smav total_evicted += bytes; 4178208373Smm 4179286763Smav /* 4180286763Smav * If we couldn't evict our target number of bytes from 4181286763Smav * metadata, we try to get the rest from data. 
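		 *
		 * E.g. (hypothetical numbers): if the target was 100MB
		 * but only 60MB of metadata proved evictable, the 40MB
		 * shortfall is requested from ARC_BUFC_DATA below.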
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
	} else {
		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
		total_evicted += bytes;

		/*
		 * If we couldn't evict our target number of bytes from
		 * data, we try to get the rest from metadata.
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
	}

	/*
	 * Adjust ghost lists
	 *
	 * In addition to the above, the ARC also defines target values
	 * for the ghost lists. The sum of the mru list and mru ghost
	 * list should never exceed the target size of the cache, and
	 * the sum of the mru list, mfu list, mru ghost list, and mfu
	 * ghost list should never exceed twice the target size of the
	 * cache. The following logic enforces these limits on the ghost
	 * caches, and evicts from them as needed.
	 */
	target = refcount_count(&arc_mru->arcs_size) +
	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;

	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	target -= bytes;

	total_evicted +=
	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);

	/*
	 * We assume the sum of the mru list and mfu list is less than
	 * or equal to arc_c (we enforced this above), which means we
	 * can use the simpler of the two equations below:
	 *
	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
	 *		    mru ghost + mfu ghost <= arc_c
	 */
	target = refcount_count(&arc_mru_ghost->arcs_size) +
	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;

	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	target -= bytes;

	total_evicted +=
	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);

	return (total_evicted);
}

void
arc_flush(spa_t *spa, boolean_t retry)
{
	uint64_t guid = 0;

	/*
	 * If retry is B_TRUE, a spa must not be specified since we have
	 * no good way to determine if all of a spa's buffers have been
	 * evicted from an arc state.
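	 *
	 * Usage sketch (assumed callers, for illustration only):
	 * arc_flush(NULL, B_TRUE) evicts everything and retries until
	 * all lists are empty, as during ARC shutdown, while
	 * arc_flush(spa, B_FALSE) makes a single best-effort pass over
	 * that pool's buffers.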
4254286763Smav */ 4255286763Smav ASSERT(!retry || spa == 0); 4256286763Smav 4257286570Smav if (spa != NULL) 4258228103Smm guid = spa_load_guid(spa); 4259209962Smm 4260286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 4261286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 4262168404Spjd 4263286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 4264286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 4265168404Spjd 4266286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 4267286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 4268286763Smav 4269286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 4270286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 4271168404Spjd} 4272168404Spjd 4273346686Smavstatic void 4274346686Smavarc_reduce_target_size(int64_t to_free) 4275168404Spjd{ 4276332540Smav uint64_t asize = aggsum_value(&arc_size); 4277168404Spjd if (arc_c > arc_c_min) { 4278272483Ssmh DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 4279272483Ssmh arc_c_min, uint64_t, arc_p, uint64_t, to_free); 4280168404Spjd if (arc_c > arc_c_min + to_free) 4281168404Spjd atomic_add_64(&arc_c, -to_free); 4282168404Spjd else 4283168404Spjd arc_c = arc_c_min; 4284168404Spjd 4285168404Spjd atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 4286332540Smav if (asize < arc_c) 4287332540Smav arc_c = MAX(asize, arc_c_min); 4288168404Spjd if (arc_p > arc_c) 4289168404Spjd arc_p = (arc_c >> 1); 4290272483Ssmh 4291272483Ssmh DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 4292272483Ssmh arc_p); 4293272483Ssmh 4294168404Spjd ASSERT(arc_c >= arc_c_min); 4295168404Spjd ASSERT((int64_t)arc_p >= 0); 4296168404Spjd } 4297168404Spjd 4298332540Smav if (asize > arc_c) { 4299332540Smav DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize, 4300270759Ssmh uint64_t, arc_c); 4301346686Smav /* See comment in arc_adjust_cb_check() on why lock+flag */ 4302346686Smav mutex_enter(&arc_adjust_lock); 4303346686Smav arc_adjust_needed = B_TRUE; 4304346686Smav mutex_exit(&arc_adjust_lock); 4305346686Smav zthr_wakeup(arc_adjust_zthr); 4306270759Ssmh } 4307168404Spjd} 4308168404Spjd 4309286625Smavtypedef enum free_memory_reason_t { 4310286625Smav FMR_UNKNOWN, 4311286625Smav FMR_NEEDFREE, 4312286625Smav FMR_LOTSFREE, 4313286625Smav FMR_SWAPFS_MINFREE, 4314286625Smav FMR_PAGES_PP_MAXIMUM, 4315286625Smav FMR_HEAP_ARENA, 4316286625Smav FMR_ZIO_ARENA, 4317286625Smav FMR_ZIO_FRAG, 4318286625Smav} free_memory_reason_t; 4319286625Smav 4320286625Smavint64_t last_free_memory; 4321286625Smavfree_memory_reason_t last_free_reason; 4322286625Smav 4323286625Smav/* 4324286625Smav * Additional reserve of pages for pp_reserve. 4325286625Smav */ 4326286625Smavint64_t arc_pages_pp_reserve = 64; 4327286625Smav 4328286625Smav/* 4329286625Smav * Additional reserve of pages for swapfs. 4330286625Smav */ 4331286625Smavint64_t arc_swapfs_reserve = 64; 4332286625Smav 4333286625Smav/* 4334286625Smav * Return the amount of memory that can be consumed before reclaim will be 4335286625Smav * needed. Positive if there is sufficient free memory, negative indicates 4336286625Smav * the amount of memory that needs to be freed up. 
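 *
 * Interpretation sketch (hypothetical values): a return of +64MB means
 * roughly that much more may be consumed before reclaim is needed; a
 * return of -16MB means about 16MB should be freed. last_free_reason
 * records which of the limits computed below was the binding one.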
4337286625Smav */ 4338286625Smavstatic int64_t 4339286625Smavarc_available_memory(void) 4340168404Spjd{ 4341286625Smav int64_t lowest = INT64_MAX; 4342286625Smav int64_t n; 4343286625Smav free_memory_reason_t r = FMR_UNKNOWN; 4344168404Spjd 4345168404Spjd#ifdef _KERNEL 4346330061Savg#ifdef __FreeBSD__ 4347191902Skmacy /* 4348212780Savg * Cooperate with pagedaemon when it's time for it to scan 4349212780Savg * and reclaim some pages. 4350191902Skmacy */ 4351286655Smav n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 4352286625Smav if (n < lowest) { 4353286625Smav lowest = n; 4354286625Smav r = FMR_LOTSFREE; 4355270759Ssmh } 4356191902Skmacy 4357330061Savg#else 4358330061Savg if (needfree > 0) { 4359330061Savg n = PAGESIZE * (-needfree); 4360330061Savg if (n < lowest) { 4361330061Savg lowest = n; 4362330061Savg r = FMR_NEEDFREE; 4363330061Savg } 4364330061Savg } 4365330061Savg 4366168404Spjd /* 4367185029Spjd * check that we're out of range of the pageout scanner. It starts to 4368185029Spjd * schedule paging if freemem is less than lotsfree and needfree. 4369185029Spjd * lotsfree is the high-water mark for pageout, and needfree is the 4370185029Spjd * number of needed free pages. We add extra pages here to make sure 4371185029Spjd * the scanner doesn't start up while we're freeing memory. 4372185029Spjd */ 4373286625Smav n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 4374286625Smav if (n < lowest) { 4375286625Smav lowest = n; 4376286625Smav r = FMR_LOTSFREE; 4377286625Smav } 4378185029Spjd 4379185029Spjd /* 4380168404Spjd * check to make sure that swapfs has enough space so that anon 4381185029Spjd * reservations can still succeed. anon_resvmem() checks that the 4382168404Spjd * availrmem is greater than swapfs_minfree, and the number of reserved 4383168404Spjd * swap pages. We also add a bit of extra here just to prevent 4384168404Spjd * circumstances from getting really dire. 4385168404Spjd */ 4386286625Smav n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 4387286625Smav desfree - arc_swapfs_reserve); 4388286625Smav if (n < lowest) { 4389286625Smav lowest = n; 4390286625Smav r = FMR_SWAPFS_MINFREE; 4391286625Smav } 4392168404Spjd 4393286625Smav 4394168404Spjd /* 4395272483Ssmh * Check that we have enough availrmem that memory locking (e.g., via 4396272483Ssmh * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 4397272483Ssmh * stores the number of pages that cannot be locked; when availrmem 4398272483Ssmh * drops below pages_pp_maximum, page locking mechanisms such as 4399272483Ssmh * page_pp_lock() will fail.) 4400272483Ssmh */ 4401286625Smav n = PAGESIZE * (availrmem - pages_pp_maximum - 4402286625Smav arc_pages_pp_reserve); 4403286625Smav if (n < lowest) { 4404286625Smav lowest = n; 4405286625Smav r = FMR_PAGES_PP_MAXIMUM; 4406286625Smav } 4407272483Ssmh 4408330061Savg#endif /* __FreeBSD__ */ 4409272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4410272483Ssmh /* 4411168404Spjd * If we're on an i386 platform, it's possible that we'll exhaust the 4412168404Spjd * kernel heap space before we ever run out of available physical 4413168404Spjd * memory. Most checks of the size of the heap_area compare against 4414168404Spjd * tune.t_minarmem, which is the minimum available real memory that we 4415168404Spjd * can have in the system. However, this is generally fixed at 25 pages 4416168404Spjd * which is so low that it's useless. 
In this comparison, we seek to 4417168404Spjd * calculate the total heap-size, and reclaim if more than 3/4ths of the 4418185029Spjd * heap is allocated. (Or, in the calculation, if less than 1/4th is 4419168404Spjd * free) 4420168404Spjd */ 4421286655Smav n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 4422286628Smav (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 4423286625Smav if (n < lowest) { 4424286625Smav lowest = n; 4425286625Smav r = FMR_HEAP_ARENA; 4426270861Ssmh } 4427281026Smav#define zio_arena NULL 4428281026Smav#else 4429281026Smav#define zio_arena heap_arena 4430270861Ssmh#endif 4431281026Smav 4432272483Ssmh /* 4433272483Ssmh * If zio data pages are being allocated out of a separate heap segment, 4434272483Ssmh * then enforce that the size of available vmem for this arena remains 4435331383Smav * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. 4436272483Ssmh * 4437331383Smav * Note that reducing the arc_zio_arena_free_shift keeps more virtual 4438331383Smav * memory (in the zio_arena) free, which can avoid memory 4439331383Smav * fragmentation issues. 4440272483Ssmh */ 4441286625Smav if (zio_arena != NULL) { 4442286655Smav n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 4443331383Smav (vmem_size(zio_arena, VMEM_ALLOC) >> 4444331383Smav arc_zio_arena_free_shift); 4445286625Smav if (n < lowest) { 4446286625Smav lowest = n; 4447286625Smav r = FMR_ZIO_ARENA; 4448286625Smav } 4449286625Smav } 4450281026Smav 4451281026Smav /* 4452281026Smav * Above limits know nothing about real level of KVA fragmentation. 4453281026Smav * Start aggressive reclamation if too little sequential KVA left. 4454281026Smav */ 4455286625Smav if (lowest > 0) { 4456317470Ssmh n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ? 4457286655Smav -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 4458286655Smav INT64_MAX; 4459286625Smav if (n < lowest) { 4460286625Smav lowest = n; 4461286625Smav r = FMR_ZIO_FRAG; 4462286625Smav } 4463281109Smav } 4464281026Smav 4465272483Ssmh#else /* _KERNEL */ 4466286625Smav /* Every 100 calls, free a small amount */ 4467168404Spjd if (spa_get_random(100) == 0) 4468286625Smav lowest = -1024; 4469272483Ssmh#endif /* _KERNEL */ 4470270759Ssmh 4471286625Smav last_free_memory = lowest; 4472286625Smav last_free_reason = r; 4473286625Smav DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 4474286625Smav return (lowest); 4475168404Spjd} 4476168404Spjd 4477286625Smav 4478286625Smav/* 4479286625Smav * Determine if the system is under memory pressure and is asking 4480307265Smav * to reclaim memory. A return value of B_TRUE indicates that the system 4481286625Smav * is under memory pressure and that the arc should adjust accordingly. 
4482286625Smav */ 4483286625Smavstatic boolean_t 4484286625Smavarc_reclaim_needed(void) 4485286625Smav{ 4486286625Smav return (arc_available_memory() < 0); 4487286625Smav} 4488286625Smav 4489208454Spjdextern kmem_cache_t *zio_buf_cache[]; 4490208454Spjdextern kmem_cache_t *zio_data_buf_cache[]; 4491272527Sdelphijextern kmem_cache_t *range_seg_cache; 4492321610Smavextern kmem_cache_t *abd_chunk_cache; 4493208454Spjd 4494278040Ssmhstatic __noinline void 4495346686Smavarc_kmem_reap_soon(void) 4496168404Spjd{ 4497168404Spjd size_t i; 4498168404Spjd kmem_cache_t *prev_cache = NULL; 4499168404Spjd kmem_cache_t *prev_data_cache = NULL; 4500168404Spjd 4501272483Ssmh DTRACE_PROBE(arc__kmem_reap_start); 4502168404Spjd#ifdef _KERNEL 4503332540Smav if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) { 4504185029Spjd /* 4505185029Spjd * We are exceeding our meta-data cache limit. 4506185029Spjd * Purge some DNLC entries to release holds on meta-data. 4507185029Spjd */ 4508185029Spjd dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 4509185029Spjd } 4510168404Spjd#if defined(__i386) 4511168404Spjd /* 4512168404Spjd * Reclaim unused memory from all kmem caches. 4513168404Spjd */ 4514168404Spjd kmem_reap(); 4515168404Spjd#endif 4516168404Spjd#endif 4517168404Spjd 4518168404Spjd for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 4519168404Spjd if (zio_buf_cache[i] != prev_cache) { 4520168404Spjd prev_cache = zio_buf_cache[i]; 4521332528Smav kmem_cache_reap_soon(zio_buf_cache[i]); 4522168404Spjd } 4523168404Spjd if (zio_data_buf_cache[i] != prev_data_cache) { 4524168404Spjd prev_data_cache = zio_data_buf_cache[i]; 4525332528Smav kmem_cache_reap_soon(zio_data_buf_cache[i]); 4526168404Spjd } 4527168404Spjd } 4528332528Smav kmem_cache_reap_soon(abd_chunk_cache); 4529332528Smav kmem_cache_reap_soon(buf_cache); 4530332528Smav kmem_cache_reap_soon(hdr_full_cache); 4531332528Smav kmem_cache_reap_soon(hdr_l2only_cache); 4532332528Smav kmem_cache_reap_soon(range_seg_cache); 4533272483Ssmh 4534277300Ssmh#ifdef illumos 4535286625Smav if (zio_arena != NULL) { 4536286625Smav /* 4537286625Smav * Ask the vmem arena to reclaim unused memory from its 4538286625Smav * quantum caches. 4539286625Smav */ 4540272483Ssmh vmem_qcache_reap(zio_arena); 4541286625Smav } 4542272483Ssmh#endif 4543272483Ssmh DTRACE_PROBE(arc__kmem_reap_end); 4544168404Spjd} 4545168404Spjd 4546346686Smav/* ARGSUSED */ 4547346686Smavstatic boolean_t 4548346686Smavarc_adjust_cb_check(void *arg, zthr_t *zthr) 4549346686Smav{ 4550346686Smav /* 4551346686Smav * This is necessary in order for the mdb ::arc dcmd to 4552346686Smav * show up to date information. Since the ::arc command 4553346686Smav * does not call the kstat's update function, without 4554346686Smav * this call, the command may show stale stats for the 4555346686Smav * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 4556346686Smav * with this change, the data might be up to 1 second 4557346686Smav * out of date(the arc_adjust_zthr has a maximum sleep 4558346686Smav * time of 1 second); but that should suffice. The 4559346686Smav * arc_state_t structures can be queried directly if more 4560346686Smav * accurate information is needed. 
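	 *
	 * In short: arc_get_data_impl() sets arc_adjust_needed under
	 * arc_adjust_lock before calling zthr_wakeup(), so this check
	 * can simply read the flag; waiters blocked on
	 * arc_adjust_waiters_cv are woken by arc_adjust_cb() instead.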
4561346686Smav */ 4562346686Smav if (arc_ksp != NULL) 4563346686Smav arc_ksp->ks_update(arc_ksp, KSTAT_READ); 4564346686Smav 4565346686Smav /* 4566346686Smav * We have to rely on arc_get_data_impl() to tell us when to adjust, 4567346686Smav * rather than checking if we are overflowing here, so that we are 4568346686Smav * sure to not leave arc_get_data_impl() waiting on 4569346686Smav * arc_adjust_waiters_cv. If we have become "not overflowing" since 4570346686Smav * arc_get_data_impl() checked, we need to wake it up. We could 4571346686Smav * broadcast the CV here, but arc_get_data_impl() may have not yet 4572346686Smav * gone to sleep. We would need to use a mutex to ensure that this 4573346686Smav * function doesn't broadcast until arc_get_data_impl() has gone to 4574346686Smav * sleep (e.g. the arc_adjust_lock). However, the lock ordering of 4575346686Smav * such a lock would necessarily be incorrect with respect to the 4576346686Smav * zthr_lock, which is held before this function is called, and is 4577346686Smav * held by arc_get_data_impl() when it calls zthr_wakeup(). 4578346686Smav */ 4579346686Smav return (arc_adjust_needed); 4580346686Smav} 4581346686Smav 4582286763Smav/* 4583346686Smav * Keep arc_size under arc_c by running arc_adjust which evicts data 4584346686Smav * from the ARC. */ 4585331399Smav/* ARGSUSED */ 4586346686Smavstatic int 4587346686Smavarc_adjust_cb(void *arg, zthr_t *zthr) 4588168404Spjd{ 4589346686Smav uint64_t evicted = 0; 4590168404Spjd 4591346686Smav /* Evict from cache */ 4592346686Smav evicted = arc_adjust(); 4593168404Spjd 4594346686Smav /* 4595346686Smav * If evicted is zero, we couldn't evict anything 4596346686Smav * via arc_adjust(). This could be due to hash lock 4597346686Smav * collisions, but more likely due to the majority of 4598346686Smav * arc buffers being unevictable. Therefore, even if 4599346686Smav * arc_size is above arc_c, another pass is unlikely to 4600346686Smav * be helpful and could potentially cause us to enter an 4601346686Smav * infinite loop. Additionally, zthr_iscancelled() is 4602346686Smav * checked here so that if the arc is shutting down, the 4603346686Smav * broadcast will wake any remaining arc adjust waiters. 4604346686Smav */ 4605346686Smav mutex_enter(&arc_adjust_lock); 4606346686Smav arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && 4607346686Smav evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; 4608346686Smav if (!arc_adjust_needed) { 4609307265Smav /* 4610346686Smav * We're either no longer overflowing, or we 4611346686Smav * can't evict anything more, so we should wake 4612346686Smav * up any waiters. 4613307265Smav */ 4614346686Smav cv_broadcast(&arc_adjust_waiters_cv); 4615346686Smav } 4616346686Smav mutex_exit(&arc_adjust_lock); 4617307265Smav 4618346686Smav return (0); 4619346686Smav} 4620286763Smav 4621346686Smav/* ARGSUSED */ 4622346686Smavstatic boolean_t 4623346686Smavarc_reap_cb_check(void *arg, zthr_t *zthr) 4624346686Smav{ 4625346686Smav int64_t free_memory = arc_available_memory(); 4626346686Smav 4627346686Smav /* 4628346686Smav * If a kmem reap is already active, don't schedule more. We must 4629346686Smav * check for this because kmem_cache_reap_soon() won't actually 4630346686Smav * block on the cache being reaped (this is to prevent callers from 4631346686Smav * becoming implicitly blocked by a system-wide kmem reap -- which, 4632346686Smav * on a system with many, many full magazines, can take minutes). 
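	 *
	 * Summary of the outcomes below: if free_memory < 0, reap now
	 * and block growth until arc_growtime (arc_grow_retry, default
	 * 60, seconds from now); else if free_memory is below
	 * arc_c >> arc_no_grow_shift, block growth only; otherwise,
	 * once arc_growtime has passed, allow growth again.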
	 */
	if (!kmem_cache_reap_active() && free_memory < 0) {
		arc_no_grow = B_TRUE;
		arc_warm = B_TRUE;
		/*
		 * Wait at least zfs_grow_retry (default 60) seconds
		 * before considering growing.
		 */
		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
		return (B_TRUE);
	} else if (free_memory < arc_c >> arc_no_grow_shift) {
		arc_no_grow = B_TRUE;
	} else if (gethrtime() >= arc_growtime) {
		arc_no_grow = B_FALSE;
	}

	return (B_FALSE);
}

/*
 * Keep enough free memory in the system by reaping the ARC's kmem
 * caches. To cause more slabs to be reapable, we may reduce the
 * target size of the cache (arc_c), causing the arc_adjust_cb()
 * to free more buffers.
 */
/* ARGSUSED */
static int
arc_reap_cb(void *arg, zthr_t *zthr)
{
	int64_t free_memory;

	/*
	 * Kick off asynchronous kmem_reap()'s of all our caches.
	 */
	arc_kmem_reap_soon();

	/*
	 * Wait at least arc_kmem_cache_reap_retry_ms between
	 * arc_kmem_reap_soon() calls. Without this check it is possible to
	 * end up in a situation where we spend lots of time reaping
	 * caches, while we're near arc_c_min. Waiting here also gives the
	 * subsequent free memory check a chance of finding that the
	 * asynchronous reap has already freed enough memory, and we don't
	 * need to call arc_reduce_target_size().
	 */
	delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);

	/*
	 * Reduce the target size as needed to maintain the amount of free
	 * memory in the system at a fraction of the arc_size (1/128th by
	 * default). If oversubscribed (free_memory < 0) then reduce the
	 * target arc_size by the deficit amount plus the fractional
	 * amount. If free memory is positive but less than the fractional
	 * amount, reduce by what is needed to hit the fractional amount.
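	 *
	 * Worked example (hypothetical sizes, arc_shrink_shift == 7):
	 * with arc_c == 4GB the fractional amount is 32MB. If
	 * free_memory is -10MB, to_free is 42MB (deficit plus
	 * fraction); if free_memory is +8MB, to_free is 24MB, just
	 * enough to restore the 32MB cushion.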
	 */
	free_memory = arc_available_memory();

	int64_t to_free =
	    (arc_c >> arc_shrink_shift) - free_memory;
	if (to_free > 0) {
#ifdef _KERNEL
#ifdef illumos
		to_free = MAX(to_free, ptob(needfree));
#endif
#endif
		arc_reduce_target_size(to_free);
	}

	return (0);
}

static u_int arc_dnlc_evicts_arg;
extern struct vfsops zfs_vfsops;

static void
arc_dnlc_evicts_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	u_int percent;

	CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_dnlc_evicts_lock);
	while (!arc_dnlc_evicts_thread_exit) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
		CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
		if (arc_dnlc_evicts_arg != 0) {
			percent = arc_dnlc_evicts_arg;
			mutex_exit(&arc_dnlc_evicts_lock);
#ifdef _KERNEL
			vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
#endif
			mutex_enter(&arc_dnlc_evicts_lock);
			/*
			 * Clear our token only after vnlru_free()
			 * pass is done, to avoid false queueing of
			 * the requests.
			 */
			arc_dnlc_evicts_arg = 0;
		}
	}
	arc_dnlc_evicts_thread_exit = FALSE;
	cv_broadcast(&arc_dnlc_evicts_cv);
	CALLB_CPR_EXIT(&cpr);
	thread_exit();
}

void
dnlc_reduce_cache(void *arg)
{
	u_int percent;

	percent = (u_int)(uintptr_t)arg;
	mutex_enter(&arc_dnlc_evicts_lock);
	if (arc_dnlc_evicts_arg == 0) {
		arc_dnlc_evicts_arg = percent;
		cv_broadcast(&arc_dnlc_evicts_cv);
	}
	mutex_exit(&arc_dnlc_evicts_lock);
}

/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from. This function is only called
 * when we are adding new content to the cache.
 */
static void
arc_adapt(int bytes, arc_state_t *state)
{
	int mult;
	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
	int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
	int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);

	if (state == arc_l2c_only)
		return;

	ASSERT(bytes > 0);
	/*
	 * Adapt the target size of the MRU list:
	 *	- if we just hit in the MRU ghost list, then increase
	 *	  the target size of the MRU list.
	 *	- if we just hit in the MFU ghost list, then increase
	 *	  the target size of the MFU list by decreasing the
	 *	  target size of the MRU list.
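	 *
	 * Worked example (hypothetical sizes): on an MRU ghost hit with
	 * mrug_size == 100MB and mfug_size == 300MB, mult is 3 (capped
	 * at 10 below), so arc_p grows by 3 * bytes, bounded by
	 * arc_c - arc_p_min; an MFU ghost hit shrinks arc_p
	 * symmetrically, never below arc_p_min.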
4780168404Spjd */ 4781168404Spjd if (state == arc_mru_ghost) { 4782286766Smav mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 4783209275Smm mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 4784168404Spjd 4785208373Smm arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 4786168404Spjd } else if (state == arc_mfu_ghost) { 4787208373Smm uint64_t delta; 4788208373Smm 4789286766Smav mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); 4790209275Smm mult = MIN(mult, 10); 4791168404Spjd 4792208373Smm delta = MIN(bytes * mult, arc_p); 4793208373Smm arc_p = MAX(arc_p_min, arc_p - delta); 4794168404Spjd } 4795168404Spjd ASSERT((int64_t)arc_p >= 0); 4796168404Spjd 4797346686Smav /* 4798346686Smav * Wake reap thread if we do not have any available memory 4799346686Smav */ 4800168404Spjd if (arc_reclaim_needed()) { 4801346686Smav zthr_wakeup(arc_reap_zthr); 4802168404Spjd return; 4803168404Spjd } 4804168404Spjd 4805168404Spjd if (arc_no_grow) 4806168404Spjd return; 4807168404Spjd 4808168404Spjd if (arc_c >= arc_c_max) 4809168404Spjd return; 4810168404Spjd 4811168404Spjd /* 4812168404Spjd * If we're within (2 * maxblocksize) bytes of the target 4813168404Spjd * cache size, increment the target cache size 4814168404Spjd */ 4815332540Smav if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) > 4816332540Smav 0) { 4817272483Ssmh DTRACE_PROBE1(arc__inc_adapt, int, bytes); 4818168404Spjd atomic_add_64(&arc_c, (int64_t)bytes); 4819168404Spjd if (arc_c > arc_c_max) 4820168404Spjd arc_c = arc_c_max; 4821168404Spjd else if (state == arc_anon) 4822168404Spjd atomic_add_64(&arc_p, (int64_t)bytes); 4823168404Spjd if (arc_p > arc_c) 4824168404Spjd arc_p = arc_c; 4825168404Spjd } 4826168404Spjd ASSERT((int64_t)arc_p >= 0); 4827168404Spjd} 4828168404Spjd 4829168404Spjd/* 4830286763Smav * Check if arc_size has grown past our upper threshold, determined by 4831286763Smav * zfs_arc_overflow_shift. 4832168404Spjd */ 4833286763Smavstatic boolean_t 4834286763Smavarc_is_overflowing(void) 4835168404Spjd{ 4836286763Smav /* Always allow at least one block of overflow */ 4837286763Smav uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 4838286763Smav arc_c >> zfs_arc_overflow_shift); 4839185029Spjd 4840332540Smav /* 4841332540Smav * We just compare the lower bound here for performance reasons. Our 4842332540Smav * primary goals are to make sure that the arc never grows without 4843332540Smav * bound, and that it can reach its maximum size. This check 4844332540Smav * accomplishes both goals. The maximum amount we could run over by is 4845332540Smav * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block 4846332540Smav * in the ARC. In practice, that's in the tens of MB, which is low 4847332540Smav * enough to be safe. 
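	 *
	 * Worked example (hypothetical size, assuming the default
	 * zfs_arc_overflow_shift of 8): with arc_c == 8GB the allowance
	 * is MAX(SPA_MAXBLOCKSIZE, 8GB >> 8) == 32MB, so the ARC counts
	 * as overflowing once the lower bound of arc_size exceeds
	 * arc_c + 32MB.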
	 */
	return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
}

static abd_t *
arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
    boolean_t do_adapt)
{
	arc_buf_contents_t type = arc_buf_type(hdr);

	arc_get_data_impl(hdr, size, tag, do_adapt);
	if (type == ARC_BUFC_METADATA) {
		return (abd_alloc(size, B_TRUE));
	} else {
		ASSERT(type == ARC_BUFC_DATA);
		return (abd_alloc(size, B_FALSE));
	}
}

static void *
arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
	arc_buf_contents_t type = arc_buf_type(hdr);

	arc_get_data_impl(hdr, size, tag, B_TRUE);
	if (type == ARC_BUFC_METADATA) {
		return (zio_buf_alloc(size));
	} else {
		ASSERT(type == ARC_BUFC_DATA);
		return (zio_data_buf_alloc(size));
	}
}

/*
 * Allocate a block and return it to the caller. If we are hitting the
 * hard limit for the cache size, we must sleep, waiting for the eviction
 * thread to catch up. If we're past the target size but below the hard
 * limit, we'll only signal the reclaim thread and continue on.
 */
static void
arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
    boolean_t do_adapt)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;
	arc_buf_contents_t type = arc_buf_type(hdr);

	if (do_adapt)
		arc_adapt(size, state);

	/*
	 * If arc_size is currently overflowing, and has grown past our
	 * upper limit, we must be adding data faster than the evict
	 * thread can evict. Thus, to ensure we don't compound the
	 * problem by adding more data and forcing arc_size to grow even
	 * further past its target size, we halt and wait for the
	 * eviction thread to catch up.
	 *
	 * It's also possible that the reclaim thread is unable to evict
	 * enough buffers to get arc_size below the overflow limit (e.g.
	 * due to buffers being un-evictable, or hash lock collisions).
	 * In this case, we want to proceed regardless of whether we're
	 * overflowing; thus we don't use a while loop here.
	 */
	if (arc_is_overflowing()) {
		mutex_enter(&arc_adjust_lock);

		/*
		 * Now that we've acquired the lock, we may no longer be
		 * over the overflow limit, let's check.
		 *
		 * We're ignoring the case of spurious wake ups. If that
		 * were to happen, it'd let this thread consume an ARC
		 * buffer before it should have (i.e. before we're under
		 * the overflow limit and were signalled by the reclaim
		 * thread). As long as that is a rare occurrence, it
		 * shouldn't cause any harm.
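		 *
		 * This is also why the cv_wait() below is not wrapped
		 * in a loop: at worst, a spurious wakeup lets one
		 * allocation proceed slightly early; it never corrupts
		 * any state.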
4922286763Smav */ 4923286763Smav if (arc_is_overflowing()) { 4924346686Smav arc_adjust_needed = B_TRUE; 4925346686Smav zthr_wakeup(arc_adjust_zthr); 4926346686Smav (void) cv_wait(&arc_adjust_waiters_cv, 4927346686Smav &arc_adjust_lock); 4928168404Spjd } 4929346686Smav mutex_exit(&arc_adjust_lock); 4930168404Spjd } 4931168404Spjd 4932307265Smav VERIFY3U(hdr->b_type, ==, type); 4933286763Smav if (type == ARC_BUFC_METADATA) { 4934286763Smav arc_space_consume(size, ARC_SPACE_META); 4935168404Spjd } else { 4936286763Smav arc_space_consume(size, ARC_SPACE_DATA); 4937168404Spjd } 4938286763Smav 4939168404Spjd /* 4940168404Spjd * Update the state size. Note that ghost states have a 4941168404Spjd * "ghost size" and so don't need to be updated. 4942168404Spjd */ 4943307265Smav if (!GHOST_STATE(state)) { 4944168404Spjd 4945307265Smav (void) refcount_add_many(&state->arcs_size, size, tag); 4946286763Smav 4947286763Smav /* 4948286763Smav * If this is reached via arc_read, the link is 4949286763Smav * protected by the hash lock. If reached via 4950286763Smav * arc_buf_alloc, the header should not be accessed by 4951286763Smav * any other thread. And, if reached via arc_read_done, 4952286763Smav * the hash lock will protect it if it's found in the 4953286763Smav * hash table; otherwise no other thread should be 4954286763Smav * trying to [add|remove]_reference it. 4955286763Smav */ 4956286763Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4957286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4958307265Smav (void) refcount_add_many(&state->arcs_esize[type], 4959307265Smav size, tag); 4960168404Spjd } 4961307265Smav 4962168404Spjd /* 4963168404Spjd * If we are growing the cache, and we are adding anonymous 4964168404Spjd * data, and we have outgrown arc_p, update arc_p 4965168404Spjd */ 4966332540Smav if (aggsum_compare(&arc_size, arc_c) < 0 && 4967332540Smav hdr->b_l1hdr.b_state == arc_anon && 4968286766Smav (refcount_count(&arc_anon->arcs_size) + 4969286766Smav refcount_count(&arc_mru->arcs_size) > arc_p)) 4970168404Spjd arc_p = MIN(arc_c, arc_p + size); 4971168404Spjd } 4972205231Skmacy ARCSTAT_BUMP(arcstat_allocated); 4973168404Spjd} 4974168404Spjd 4975321610Smavstatic void 4976321610Smavarc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) 4977321610Smav{ 4978321610Smav arc_free_data_impl(hdr, size, tag); 4979321610Smav abd_free(abd); 4980321610Smav} 4981321610Smav 4982321610Smavstatic void 4983321610Smavarc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) 4984321610Smav{ 4985321610Smav arc_buf_contents_t type = arc_buf_type(hdr); 4986321610Smav 4987321610Smav arc_free_data_impl(hdr, size, tag); 4988321610Smav if (type == ARC_BUFC_METADATA) { 4989321610Smav zio_buf_free(buf, size); 4990321610Smav } else { 4991321610Smav ASSERT(type == ARC_BUFC_DATA); 4992321610Smav zio_data_buf_free(buf, size); 4993321610Smav } 4994321610Smav} 4995321610Smav 4996168404Spjd/* 4997307265Smav * Free the arc data buffer. 
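 * This is the accounting counterpart of arc_get_data_impl(): the bytes
 * are removed from arcs_size (and from arcs_esize, if the header is
 * evictable) and returned to the ARC_SPACE_* counters before the
 * caller frees the backing abd or buffer.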
4998307265Smav */ 4999307265Smavstatic void 5000321610Smavarc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 5001307265Smav{ 5002307265Smav arc_state_t *state = hdr->b_l1hdr.b_state; 5003307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 5004307265Smav 5005307265Smav /* protected by hash lock, if in the hash table */ 5006307265Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 5007307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5008307265Smav ASSERT(state != arc_anon && state != arc_l2c_only); 5009307265Smav 5010307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 5011307265Smav size, tag); 5012307265Smav } 5013307265Smav (void) refcount_remove_many(&state->arcs_size, size, tag); 5014307265Smav 5015307265Smav VERIFY3U(hdr->b_type, ==, type); 5016307265Smav if (type == ARC_BUFC_METADATA) { 5017307265Smav arc_space_return(size, ARC_SPACE_META); 5018307265Smav } else { 5019307265Smav ASSERT(type == ARC_BUFC_DATA); 5020307265Smav arc_space_return(size, ARC_SPACE_DATA); 5021307265Smav } 5022307265Smav} 5023307265Smav 5024307265Smav/* 5025168404Spjd * This routine is called whenever a buffer is accessed. 5026168404Spjd * NOTE: the hash lock is dropped in this function. 5027168404Spjd */ 5028168404Spjdstatic void 5029275811Sdelphijarc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 5030168404Spjd{ 5031219089Spjd clock_t now; 5032219089Spjd 5033168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 5034286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5035168404Spjd 5036286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 5037168404Spjd /* 5038168404Spjd * This buffer is not in the cache, and does not 5039168404Spjd * appear in our "ghost" list. Add the new buffer 5040168404Spjd * to the MRU state. 5041168404Spjd */ 5042168404Spjd 5043286570Smav ASSERT0(hdr->b_l1hdr.b_arc_access); 5044286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5045275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 5046275811Sdelphij arc_change_state(arc_mru, hdr, hash_lock); 5047168404Spjd 5048286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru) { 5049219089Spjd now = ddi_get_lbolt(); 5050219089Spjd 5051168404Spjd /* 5052168404Spjd * If this buffer is here because of a prefetch, then either: 5053168404Spjd * - clear the flag if this is a "referencing" read 5054168404Spjd * (any subsequent access will bump this into the MFU state). 5055168404Spjd * or 5056168404Spjd * - move the buffer to the head of the list if this is 5057168404Spjd * another prefetch (to make it less likely to be evicted). 5058168404Spjd */ 5059339034Ssef if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5060286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5061286763Smav /* link protected by hash lock */ 5062286763Smav ASSERT(multilist_link_active( 5063286570Smav &hdr->b_l1hdr.b_arc_node)); 5064168404Spjd } else { 5065339034Ssef arc_hdr_clear_flags(hdr, 5066339034Ssef ARC_FLAG_PREFETCH | 5067339034Ssef ARC_FLAG_PRESCIENT_PREFETCH); 5068168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 5069168404Spjd } 5070286570Smav hdr->b_l1hdr.b_arc_access = now; 5071168404Spjd return; 5072168404Spjd } 5073168404Spjd 5074168404Spjd /* 5075168404Spjd * This buffer has been "accessed" only once so far, 5076168404Spjd * but it is still in the cache. Move it to the MFU 5077168404Spjd * state. 5078168404Spjd */ 5079286570Smav if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 5080168404Spjd /* 5081168404Spjd * More than 125ms have passed since we 5082168404Spjd * instantiated this buffer. 
Move it to the 5083168404Spjd * most frequently used state. 5084168404Spjd */ 5085286570Smav hdr->b_l1hdr.b_arc_access = now; 5086275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5087275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 5088168404Spjd } 5089168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 5090286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 5091168404Spjd arc_state_t *new_state; 5092168404Spjd /* 5093168404Spjd * This buffer has been "accessed" recently, but 5094168404Spjd * was evicted from the cache. Move it to the 5095168404Spjd * MFU state. 5096168404Spjd */ 5097168404Spjd 5098339034Ssef if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5099168404Spjd new_state = arc_mru; 5100339034Ssef if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { 5101339034Ssef arc_hdr_clear_flags(hdr, 5102339034Ssef ARC_FLAG_PREFETCH | 5103339034Ssef ARC_FLAG_PRESCIENT_PREFETCH); 5104339034Ssef } 5105275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 5106168404Spjd } else { 5107168404Spjd new_state = arc_mfu; 5108275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5109168404Spjd } 5110168404Spjd 5111286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5112275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 5113168404Spjd 5114168404Spjd ARCSTAT_BUMP(arcstat_mru_ghost_hits); 5115286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu) { 5116168404Spjd /* 5117168404Spjd * This buffer has been accessed more than once and is 5118168404Spjd * still in the cache. Keep it in the MFU state. 5119168404Spjd * 5120168404Spjd * NOTE: an add_reference() that occurred when we did 5121168404Spjd * the arc_read() will have kicked this off the list. 5122168404Spjd * If it was a prefetch, we will explicitly move it to 5123168404Spjd * the head of the list now. 5124168404Spjd */ 5125339034Ssef 5126168404Spjd ARCSTAT_BUMP(arcstat_mfu_hits); 5127286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5128286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 5129168404Spjd arc_state_t *new_state = arc_mfu; 5130168404Spjd /* 5131168404Spjd * This buffer has been accessed more than once but has 5132168404Spjd * been evicted from the cache. Move it back to the 5133168404Spjd * MFU state. 5134168404Spjd */ 5135168404Spjd 5136339034Ssef if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5137168404Spjd /* 5138168404Spjd * This is a prefetch access... 5139168404Spjd * move this block back to the MRU state. 5140168404Spjd */ 5141168404Spjd new_state = arc_mru; 5142168404Spjd } 5143168404Spjd 5144286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5145275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5146275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 5147168404Spjd 5148168404Spjd ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 5149286570Smav } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 5150185029Spjd /* 5151185029Spjd * This buffer is on the 2nd Level ARC. 5152185029Spjd */ 5153185029Spjd 5154286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5155275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5156275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 5157168404Spjd } else { 5158168404Spjd ASSERT(!"invalid arc state"); 5159168404Spjd } 5160168404Spjd} 5161168404Spjd 5162332785Smav/* 5163332785Smav * This routine is called by dbuf_hold() to update the arc_access() state 5164332785Smav * which otherwise would be skipped for entries in the dbuf cache. 
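 * (A dbuf cache hit returns the buffer without an arc_read(), so
 * without this hook such headers would never be promoted from the MRU
 * to the MFU state by repeated access.)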
5165332785Smav */ 5166332785Smavvoid 5167332785Smavarc_buf_access(arc_buf_t *buf) 5168332785Smav{ 5169332785Smav mutex_enter(&buf->b_evict_lock); 5170332785Smav arc_buf_hdr_t *hdr = buf->b_hdr; 5171332785Smav 5172332785Smav /* 5173332785Smav * Avoid taking the hash_lock when possible as an optimization. 5174332785Smav * The header must be checked again under the hash_lock in order 5175332785Smav * to handle the case where it is concurrently being released. 5176332785Smav */ 5177332785Smav if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { 5178332785Smav mutex_exit(&buf->b_evict_lock); 5179332785Smav ARCSTAT_BUMP(arcstat_access_skip); 5180332785Smav return; 5181332785Smav } 5182332785Smav 5183332785Smav kmutex_t *hash_lock = HDR_LOCK(hdr); 5184332785Smav mutex_enter(hash_lock); 5185332785Smav 5186332785Smav if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { 5187332785Smav mutex_exit(hash_lock); 5188332785Smav mutex_exit(&buf->b_evict_lock); 5189332785Smav ARCSTAT_BUMP(arcstat_access_skip); 5190332785Smav return; 5191332785Smav } 5192332785Smav 5193332785Smav mutex_exit(&buf->b_evict_lock); 5194332785Smav 5195332785Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5196332785Smav hdr->b_l1hdr.b_state == arc_mfu); 5197332785Smav 5198332785Smav DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5199332785Smav arc_access(hdr, hash_lock); 5200332785Smav mutex_exit(hash_lock); 5201332785Smav 5202332785Smav ARCSTAT_BUMP(arcstat_hits); 5203332785Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5204332785Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); 5205332785Smav} 5206332785Smav 5207339034Ssef/* a generic arc_read_done_func_t which you can use */ 5208168404Spjd/* ARGSUSED */ 5209168404Spjdvoid 5210339034Ssefarc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 5211339034Ssef arc_buf_t *buf, void *arg) 5212168404Spjd{ 5213339034Ssef if (buf == NULL) 5214339034Ssef return; 5215339034Ssef 5216339034Ssef bcopy(buf->b_data, arg, arc_buf_size(buf)); 5217307265Smav arc_buf_destroy(buf, arg); 5218168404Spjd} 5219168404Spjd 5220339034Ssef/* a generic arc_read_done_func_t */ 5221339034Ssef/* ARGSUSED */ 5222168404Spjdvoid 5223339034Ssefarc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 5224339034Ssef arc_buf_t *buf, void *arg) 5225168404Spjd{ 5226168404Spjd arc_buf_t **bufp = arg; 5227339034Ssef if (buf == NULL) { 5228339114Smav ASSERT(zio == NULL || zio->io_error != 0); 5229168404Spjd *bufp = NULL; 5230168404Spjd } else { 5231339114Smav ASSERT(zio == NULL || zio->io_error == 0); 5232168404Spjd *bufp = buf; 5233339114Smav ASSERT(buf->b_data != NULL); 5234168404Spjd } 5235168404Spjd} 5236168404Spjd 5237168404Spjdstatic void 5238307265Smavarc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) 5239307265Smav{ 5240307265Smav if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { 5241307265Smav ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); 5242307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 5243307265Smav } else { 5244307265Smav if (HDR_COMPRESSION_ENABLED(hdr)) { 5245307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, 5246307265Smav BP_GET_COMPRESS(bp)); 5247307265Smav } 5248307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); 5249307265Smav ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); 5250307265Smav } 5251307265Smav} 5252307265Smav 5253307265Smavstatic void 5254168404Spjdarc_read_done(zio_t *zio) 5255168404Spjd{ 5256307265Smav arc_buf_hdr_t *hdr = zio->io_private; 5257268075Sdelphij kmutex_t *hash_lock = NULL; 5258321535Smav 
arc_callback_t *callback_list; 5259321535Smav arc_callback_t *acb; 5260321535Smav boolean_t freeable = B_FALSE; 5261339114Smav boolean_t no_zio_error = (zio->io_error == 0); 5262168404Spjd 5263168404Spjd /* 5264168404Spjd * The hdr was inserted into hash-table and removed from lists 5265168404Spjd * prior to starting I/O. We should find this header, since 5266168404Spjd * it's in the hash table, and it should be legit since it's 5267168404Spjd * not possible to evict it during the I/O. The only possible 5268168404Spjd * reason for it not to be found is if we were freed during the 5269168404Spjd * read. 5270168404Spjd */ 5271268075Sdelphij if (HDR_IN_HASH_TABLE(hdr)) { 5272268075Sdelphij ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 5273268075Sdelphij ASSERT3U(hdr->b_dva.dva_word[0], ==, 5274268075Sdelphij BP_IDENTITY(zio->io_bp)->dva_word[0]); 5275268075Sdelphij ASSERT3U(hdr->b_dva.dva_word[1], ==, 5276268075Sdelphij BP_IDENTITY(zio->io_bp)->dva_word[1]); 5277168404Spjd 5278268075Sdelphij arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 5279268075Sdelphij &hash_lock); 5280168404Spjd 5281307265Smav ASSERT((found == hdr && 5282268075Sdelphij DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 5283268075Sdelphij (found == hdr && HDR_L2_READING(hdr))); 5284307265Smav ASSERT3P(hash_lock, !=, NULL); 5285268075Sdelphij } 5286268075Sdelphij 5287339114Smav if (no_zio_error) { 5288307265Smav /* byteswap if necessary */ 5289307265Smav if (BP_SHOULD_BYTESWAP(zio->io_bp)) { 5290307265Smav if (BP_GET_LEVEL(zio->io_bp) > 0) { 5291307265Smav hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; 5292307265Smav } else { 5293307265Smav hdr->b_l1hdr.b_byteswap = 5294307265Smav DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 5295307265Smav } 5296307265Smav } else { 5297307265Smav hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 5298307265Smav } 5299307265Smav } 5300307265Smav 5301307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); 5302286570Smav if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 5303307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); 5304206796Spjd 5305286570Smav callback_list = hdr->b_l1hdr.b_acb; 5306307265Smav ASSERT3P(callback_list, !=, NULL); 5307168404Spjd 5308339114Smav if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { 5309219089Spjd /* 5310219089Spjd * Only call arc_access on anonymous buffers. This is because 5311219089Spjd * if we've issued an I/O for an evicted buffer, we've already 5312219089Spjd * called arc_access (to prevent any simultaneous readers from 5313219089Spjd * getting confused). 5314219089Spjd */ 5315219089Spjd arc_access(hdr, hash_lock); 5316219089Spjd } 5317219089Spjd 5318321535Smav /* 5319321535Smav * If a read request has a callback (i.e. acb_done is not NULL), then we 5320321535Smav * make a buf containing the data according to the parameters which were 5321321535Smav * passed in. The implementation of arc_buf_alloc_impl() ensures that we 5322321535Smav * aren't needlessly decompressing the data multiple times. 5323321535Smav */ 5324321535Smav int callback_cnt = 0; 5325321535Smav for (acb = callback_list; acb != NULL; acb = acb->acb_next) { 5326321535Smav if (!acb->acb_done) 5327321535Smav continue; 5328321535Smav 5329321535Smav callback_cnt++; 5330321535Smav 5331339114Smav if (no_zio_error) { 5332339114Smav int error = arc_buf_alloc_impl(hdr, acb->acb_private, 5333339114Smav acb->acb_compressed, zio->io_error == 0, 5334339114Smav &acb->acb_buf); 5335339114Smav if (error != 0) { 5336339114Smav /* 5337339114Smav * Decompression failed. 
Set io_error 5338339114Smav * so that when we call acb_done (below), 5339339114Smav * we will indicate that the read failed. 5340339114Smav * Note that in the unusual case where one 5341339114Smav * callback is compressed and another 5342339114Smav * uncompressed, we will mark all of them 5343339114Smav * as failed, even though the uncompressed 5344339114Smav * one can't actually fail. In this case, 5345339114Smav * the hdr will not be anonymous, because 5346339114Smav * if there are multiple callbacks, it's 5347339114Smav * because multiple threads found the same 5348339114Smav * arc buf in the hash table. 5349339114Smav */ 5350339114Smav zio->io_error = error; 5351339114Smav } 5352339034Ssef } 5353339114Smav } 5354339114Smav /* 5355339114Smav * If there are multiple callbacks, we must have the hash lock, 5356339114Smav * because the only way for multiple threads to find this hdr is 5357339114Smav * in the hash table. This ensures that if there are multiple 5358339114Smav * callbacks, the hdr is not anonymous. If it were anonymous, 5359339114Smav * we couldn't use arc_buf_destroy() in the error case below. 5360339114Smav */ 5361339114Smav ASSERT(callback_cnt < 2 || hash_lock != NULL); 5362339034Ssef 5363286570Smav hdr->b_l1hdr.b_acb = NULL; 5364307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5365321535Smav if (callback_cnt == 0) { 5366307265Smav ASSERT(HDR_PREFETCH(hdr)); 5367307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 5368321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5369219089Spjd } 5370168404Spjd 5371286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 5372286570Smav callback_list != NULL); 5373168404Spjd 5374339114Smav if (no_zio_error) { 5375307265Smav arc_hdr_verify(hdr, zio->io_bp); 5376307265Smav } else { 5377307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 5378286570Smav if (hdr->b_l1hdr.b_state != arc_anon) 5379168404Spjd arc_change_state(arc_anon, hdr, hash_lock); 5380168404Spjd if (HDR_IN_HASH_TABLE(hdr)) 5381168404Spjd buf_hash_remove(hdr); 5382286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5383168404Spjd } 5384168404Spjd 5385168404Spjd /* 5386168404Spjd * Broadcast before we drop the hash_lock to avoid the possibility 5387168404Spjd * that the hdr (and hence the cv) might be freed before we get to 5388168404Spjd * the cv_broadcast(). 5389168404Spjd */ 5390286570Smav cv_broadcast(&hdr->b_l1hdr.b_cv); 5391168404Spjd 5392286570Smav if (hash_lock != NULL) { 5393168404Spjd mutex_exit(hash_lock); 5394168404Spjd } else { 5395168404Spjd /* 5396168404Spjd * This block was freed while we waited for the read to 5397168404Spjd * complete. It has been removed from the hash table and 5398168404Spjd * moved to the anonymous state (so that it won't show up 5399168404Spjd * in the cache). 5400168404Spjd */ 5401286570Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 5402286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5403168404Spjd } 5404168404Spjd 5405168404Spjd /* execute each callback and free its structure */ 5406168404Spjd while ((acb = callback_list) != NULL) { 5407339114Smav if (acb->acb_done != NULL) { 5408339114Smav if (zio->io_error != 0 && acb->acb_buf != NULL) { 5409339114Smav /* 5410339114Smav * If arc_buf_alloc_impl() fails during 5411339114Smav * decompression, the buf will still be 5412339114Smav * allocated, and needs to be freed here. 
5413339114Smav */ 5414339114Smav arc_buf_destroy(acb->acb_buf, acb->acb_private); 5415339114Smav acb->acb_buf = NULL; 5416339114Smav } 5417339034Ssef acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, 5418339034Ssef acb->acb_buf, acb->acb_private); 5419339034Ssef } 5420168404Spjd 5421168404Spjd if (acb->acb_zio_dummy != NULL) { 5422168404Spjd acb->acb_zio_dummy->io_error = zio->io_error; 5423168404Spjd zio_nowait(acb->acb_zio_dummy); 5424168404Spjd } 5425168404Spjd 5426168404Spjd callback_list = acb->acb_next; 5427168404Spjd kmem_free(acb, sizeof (arc_callback_t)); 5428168404Spjd } 5429168404Spjd 5430168404Spjd if (freeable) 5431168404Spjd arc_hdr_destroy(hdr); 5432168404Spjd} 5433168404Spjd 5434168404Spjd/* 5435286762Smav * "Read" the block at the specified DVA (in bp) via the 5436168404Spjd * cache. If the block is found in the cache, invoke the provided 5437168404Spjd * callback immediately and return. Note that the `zio' parameter 5438168404Spjd * in the callback will be NULL in this case, since no IO was 5439168404Spjd * required. If the block is not in the cache pass the read request 5440168404Spjd * on to the spa with a substitute callback function, so that the 5441168404Spjd * requested block will be added to the cache. 5442168404Spjd * 5443168404Spjd * If a read request arrives for a block that has a read in-progress, 5444168404Spjd * either wait for the in-progress read to complete (and return the 5445168404Spjd * results); or, if this is a read with a "done" func, add a record 5446168404Spjd * to the read to invoke the "done" func when the read completes, 5447168404Spjd * and return; or just return. 5448168404Spjd * 5449168404Spjd * arc_read_done() will invoke all the requested "done" functions 5450168404Spjd * for readers of this block. 5451168404Spjd */ 5452168404Spjdint 5453339034Ssefarc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, 5454275811Sdelphij void *private, zio_priority_t priority, int zio_flags, 5455275811Sdelphij arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 5456168404Spjd{ 5457268075Sdelphij arc_buf_hdr_t *hdr = NULL; 5458268075Sdelphij kmutex_t *hash_lock = NULL; 5459185029Spjd zio_t *rzio; 5460228103Smm uint64_t guid = spa_load_guid(spa); 5461321535Smav boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; 5462339034Ssef int rc = 0; 5463339034Ssef 5464268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp) || 5465268075Sdelphij BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 5466268075Sdelphij 5467168404Spjdtop: 5468268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 5469268075Sdelphij /* 5470268075Sdelphij * Embedded BP's have no DVA and require no I/O to "read". 5471268075Sdelphij * Create an anonymous arc buf to back it. 5472268075Sdelphij */ 5473268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 5474268075Sdelphij } 5475168404Spjd 5476321610Smav if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { 5477307265Smav arc_buf_t *buf = NULL; 5478275811Sdelphij *arc_flags |= ARC_FLAG_CACHED; 5479168404Spjd 5480168404Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 5481339034Ssef zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; 5482168404Spjd 5483339034Ssef ASSERT3P(head_zio, !=, NULL); 5484287702Sdelphij if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 5485287702Sdelphij priority == ZIO_PRIORITY_SYNC_READ) { 5486287702Sdelphij /* 5487339034Ssef * This is a sync read that needs to wait for 5488339034Ssef * an in-flight async read. Request that the 5489339034Ssef * zio have its priority upgraded. 
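 * (Without the upgrade, this now-synchronous reader would sit
 * behind the in-flight zio at its original, lower async priority.)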
5490287702Sdelphij */ 5491339034Ssef zio_change_priority(head_zio, priority); 5492339034Ssef DTRACE_PROBE1(arc__async__upgrade__sync, 5493287702Sdelphij arc_buf_hdr_t *, hdr); 5494339034Ssef ARCSTAT_BUMP(arcstat_async_upgrade_sync); 5495287702Sdelphij } 5496287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5497307265Smav arc_hdr_clear_flags(hdr, 5498307265Smav ARC_FLAG_PREDICTIVE_PREFETCH); 5499287702Sdelphij } 5500287702Sdelphij 5501275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) { 5502286570Smav cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 5503168404Spjd mutex_exit(hash_lock); 5504168404Spjd goto top; 5505168404Spjd } 5506275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5507168404Spjd 5508168404Spjd if (done) { 5509287702Sdelphij arc_callback_t *acb = NULL; 5510168404Spjd 5511168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), 5512168404Spjd KM_SLEEP); 5513168404Spjd acb->acb_done = done; 5514168404Spjd acb->acb_private = private; 5515321535Smav acb->acb_compressed = compressed_read; 5516168404Spjd if (pio != NULL) 5517168404Spjd acb->acb_zio_dummy = zio_null(pio, 5518209962Smm spa, NULL, NULL, NULL, zio_flags); 5519168404Spjd 5520307265Smav ASSERT3P(acb->acb_done, !=, NULL); 5521339034Ssef acb->acb_zio_head = head_zio; 5522286570Smav acb->acb_next = hdr->b_l1hdr.b_acb; 5523286570Smav hdr->b_l1hdr.b_acb = acb; 5524168404Spjd mutex_exit(hash_lock); 5525168404Spjd return (0); 5526168404Spjd } 5527168404Spjd mutex_exit(hash_lock); 5528168404Spjd return (0); 5529168404Spjd } 5530168404Spjd 5531286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5532286570Smav hdr->b_l1hdr.b_state == arc_mfu); 5533168404Spjd 5534168404Spjd if (done) { 5535287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5536287702Sdelphij /* 5537287702Sdelphij * This is a demand read which does not have to 5538287702Sdelphij * wait for i/o because we did a predictive 5539287702Sdelphij * prefetch i/o for it, which has completed. 5540287702Sdelphij */ 5541287702Sdelphij DTRACE_PROBE1( 5542287702Sdelphij arc__demand__hit__predictive__prefetch, 5543287702Sdelphij arc_buf_hdr_t *, hdr); 5544287702Sdelphij ARCSTAT_BUMP( 5545287702Sdelphij arcstat_demand_hit_predictive_prefetch); 5546307265Smav arc_hdr_clear_flags(hdr, 5547307265Smav ARC_FLAG_PREDICTIVE_PREFETCH); 5548287702Sdelphij } 5549339034Ssef 5550339034Ssef if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { 5551339034Ssef ARCSTAT_BUMP( 5552339034Ssef arcstat_demand_hit_prescient_prefetch); 5553339034Ssef arc_hdr_clear_flags(hdr, 5554339034Ssef ARC_FLAG_PRESCIENT_PREFETCH); 5555339034Ssef } 5556339034Ssef 5557307265Smav ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); 5558321535Smav /* Get a buf with the desired data in it. 
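 * arc_buf_alloc_impl() can fail here (e.g. if decompressing the
 * cached data fails), in which case the buf is destroyed below and
 * the done callback is invoked with a NULL buffer.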
*/ 5559339034Ssef rc = arc_buf_alloc_impl(hdr, private, 5560339034Ssef compressed_read, B_TRUE, &buf); 5561339034Ssef if (rc != 0) { 5562339034Ssef arc_buf_destroy(buf, private); 5563339034Ssef buf = NULL; 5564339034Ssef } 5565339034Ssef ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || 5566339034Ssef rc == 0 || rc != ENOENT); 5567275811Sdelphij } else if (*arc_flags & ARC_FLAG_PREFETCH && 5568286570Smav refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5569307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5570168404Spjd } 5571168404Spjd DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5572168404Spjd arc_access(hdr, hash_lock); 5573339034Ssef if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) 5574339034Ssef arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); 5575275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 5576307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5577168404Spjd mutex_exit(hash_lock); 5578168404Spjd ARCSTAT_BUMP(arcstat_hits); 5579286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5580286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5581168404Spjd data, metadata, hits); 5582168404Spjd 5583168404Spjd if (done) 5584339034Ssef done(NULL, zb, bp, buf, private); 5585168404Spjd } else { 5586307265Smav uint64_t lsize = BP_GET_LSIZE(bp); 5587307265Smav uint64_t psize = BP_GET_PSIZE(bp); 5588268075Sdelphij arc_callback_t *acb; 5589185029Spjd vdev_t *vd = NULL; 5590247187Smm uint64_t addr = 0; 5591208373Smm boolean_t devw = B_FALSE; 5592307265Smav uint64_t size; 5593168404Spjd 5594168404Spjd if (hdr == NULL) { 5595168404Spjd /* this block is not in the cache */ 5596268075Sdelphij arc_buf_hdr_t *exists = NULL; 5597168404Spjd arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 5598307265Smav hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 5599307265Smav BP_GET_COMPRESS(bp), type); 5600307265Smav 5601268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 5602268075Sdelphij hdr->b_dva = *BP_IDENTITY(bp); 5603268075Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 5604268075Sdelphij exists = buf_hash_insert(hdr, &hash_lock); 5605268075Sdelphij } 5606268075Sdelphij if (exists != NULL) { 5607168404Spjd /* somebody beat us to the hash insert */ 5608168404Spjd mutex_exit(hash_lock); 5609219089Spjd buf_discard_identity(hdr); 5610307265Smav arc_hdr_destroy(hdr); 5611168404Spjd goto top; /* restart the IO request */ 5612168404Spjd } 5613168404Spjd } else { 5614286570Smav /* 5615286570Smav * This block is in the ghost cache. If it was L2-only 5616286570Smav * (and thus didn't have an L1 hdr), we realloc the 5617286570Smav * header to add an L1 hdr. 5618286570Smav */ 5619286570Smav if (!HDR_HAS_L1HDR(hdr)) { 5620286570Smav hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 5621286570Smav hdr_full_cache); 5622286570Smav } 5623321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 5624286570Smav ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 5625168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5626286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5627286763Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 5628321535Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 5629168404Spjd 5630287702Sdelphij /* 5631307265Smav * This is a delicate dance that we play here. 5632307265Smav * This hdr is in the ghost list so we access it 5633307265Smav * to move it out of the ghost list before we 5634307265Smav * initiate the read. If it's a prefetch then 5635307265Smav * it won't have a callback so we'll remove the 5636307265Smav * reference that arc_buf_alloc_impl() created. 
We
5637307265Smav * do this after we've called arc_access() to
5638307265Smav * avoid hitting an assert in remove_reference().
5639287702Sdelphij */
5640349216Savg arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
5641219089Spjd arc_access(hdr, hash_lock);
5642349216Savg arc_hdr_alloc_pabd(hdr, B_FALSE);
5643168404Spjd }
5644321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
5645307265Smav size = arc_hdr_size(hdr);
5646168404Spjd
5647307265Smav /*
5648307265Smav * If compression is enabled on the hdr, then we will do
5649307265Smav * RAW I/O and will store the compressed data in the hdr's
5650307265Smav * data block. Otherwise, the hdr's data block will contain
5651307265Smav * the uncompressed data.
5652307265Smav */
5653307265Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
5654307265Smav zio_flags |= ZIO_FLAG_RAW;
5655307265Smav }
5656307265Smav
5657307265Smav if (*arc_flags & ARC_FLAG_PREFETCH)
5658307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5659339034Ssef if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
5660339034Ssef arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5661339034Ssef
5662307265Smav if (*arc_flags & ARC_FLAG_L2CACHE)
5663307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5664307265Smav if (BP_GET_LEVEL(bp) > 0)
5665307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
5666287702Sdelphij if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
5667307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
5668286570Smav ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
5669219089Spjd
5670168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
5671168404Spjd acb->acb_done = done;
5672168404Spjd acb->acb_private = private;
5673321535Smav acb->acb_compressed = compressed_read;
5674168404Spjd
5675307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
5676286570Smav hdr->b_l1hdr.b_acb = acb;
5677307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5678168404Spjd
5679286570Smav if (HDR_HAS_L2HDR(hdr) &&
5680286570Smav (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
5681286570Smav devw = hdr->b_l2hdr.b_dev->l2ad_writing;
5682286570Smav addr = hdr->b_l2hdr.b_daddr;
5683185029Spjd /*
5684332525Smav * Lock out L2ARC device removal.
5685185029Spjd */
5686185029Spjd if (vdev_is_dead(vd) ||
5687185029Spjd !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
5688185029Spjd vd = NULL;
5689185029Spjd }
5690185029Spjd
5691339034Ssef /*
5692339034Ssef * We count both async reads and scrub IOs as asynchronous so
5693339034Ssef * that both can be upgraded in the event of a cache hit while
5694339034Ssef * the read IO is still in-flight.
5695339034Ssef */
5696339034Ssef if (priority == ZIO_PRIORITY_ASYNC_READ ||
5697339034Ssef priority == ZIO_PRIORITY_SCRUB)
5698307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5699307265Smav else
5700307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5701307265Smav
5702251629Sdelphij /*
5703251629Sdelphij * At this point, we have a level 1 cache miss. Try again in
5704251629Sdelphij * L2ARC if possible.
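 * If the header still carries valid L2ARC metadata and the cache
 * device is healthy, the read is issued to the L2ARC device first;
 * on an L2ARC read error (or when no L2 copy exists) we fall
 * through to the zio_read() of the original block pointer below.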
5705251629Sdelphij */ 5706307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); 5707307265Smav 5708219089Spjd DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 5709307265Smav uint64_t, lsize, zbookmark_phys_t *, zb); 5710168404Spjd ARCSTAT_BUMP(arcstat_misses); 5711286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5712286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5713168404Spjd data, metadata, misses); 5714228392Spjd#ifdef _KERNEL 5715297633Strasz#ifdef RACCT 5716297633Strasz if (racct_enable) { 5717297633Strasz PROC_LOCK(curproc); 5718297633Strasz racct_add_force(curproc, RACCT_READBPS, size); 5719297633Strasz racct_add_force(curproc, RACCT_READIOPS, 1); 5720297633Strasz PROC_UNLOCK(curproc); 5721297633Strasz } 5722297633Strasz#endif /* RACCT */ 5723228392Spjd curthread->td_ru.ru_inblock++; 5724228392Spjd#endif 5725168404Spjd 5726208373Smm if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 5727185029Spjd /* 5728185029Spjd * Read from the L2ARC if the following are true: 5729185029Spjd * 1. The L2ARC vdev was previously cached. 5730185029Spjd * 2. This buffer still has L2ARC metadata. 5731185029Spjd * 3. This buffer isn't currently writing to the L2ARC. 5732185029Spjd * 4. The L2ARC entry wasn't evicted, which may 5733185029Spjd * also have invalidated the vdev. 5734208373Smm * 5. This isn't prefetch and l2arc_noprefetch is set. 5735185029Spjd */ 5736286570Smav if (HDR_HAS_L2HDR(hdr) && 5737208373Smm !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 5738208373Smm !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 5739185029Spjd l2arc_read_callback_t *cb; 5740321610Smav abd_t *abd; 5741321610Smav uint64_t asize; 5742185029Spjd 5743185029Spjd DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 5744185029Spjd ARCSTAT_BUMP(arcstat_l2_hits); 5745185029Spjd 5746185029Spjd cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 5747185029Spjd KM_SLEEP); 5748307265Smav cb->l2rcb_hdr = hdr; 5749185029Spjd cb->l2rcb_bp = *bp; 5750185029Spjd cb->l2rcb_zb = *zb; 5751185029Spjd cb->l2rcb_flags = zio_flags; 5752321610Smav 5753321610Smav asize = vdev_psize_to_asize(vd, size); 5754307265Smav if (asize != size) { 5755321610Smav abd = abd_alloc_for_io(asize, 5756321610Smav HDR_ISTYPE_METADATA(hdr)); 5757321610Smav cb->l2rcb_abd = abd; 5758297848Savg } else { 5759321610Smav abd = hdr->b_l1hdr.b_pabd; 5760297848Savg } 5761185029Spjd 5762247187Smm ASSERT(addr >= VDEV_LABEL_START_SIZE && 5763321610Smav addr + asize <= vd->vdev_psize - 5764247187Smm VDEV_LABEL_END_SIZE); 5765247187Smm 5766185029Spjd /* 5767185029Spjd * l2arc read. The SCL_L2ARC lock will be 5768185029Spjd * released by l2arc_read_done(). 5769251478Sdelphij * Issue a null zio if the underlying buffer 5770251478Sdelphij * was squashed to zero size by compression. 
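 * (The assertion below expects that a compressed-empty buffer
 * never actually reaches this path in the current code.)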
5771185029Spjd */ 5772307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), !=, 5773307265Smav ZIO_COMPRESS_EMPTY); 5774307265Smav rzio = zio_read_phys(pio, vd, addr, 5775321610Smav asize, abd, 5776307265Smav ZIO_CHECKSUM_OFF, 5777307265Smav l2arc_read_done, cb, priority, 5778307265Smav zio_flags | ZIO_FLAG_DONT_CACHE | 5779307265Smav ZIO_FLAG_CANFAIL | 5780307265Smav ZIO_FLAG_DONT_PROPAGATE | 5781307265Smav ZIO_FLAG_DONT_RETRY, B_FALSE); 5782339034Ssef acb->acb_zio_head = rzio; 5783339034Ssef 5784339034Ssef if (hash_lock != NULL) 5785339034Ssef mutex_exit(hash_lock); 5786339034Ssef 5787185029Spjd DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 5788185029Spjd zio_t *, rzio); 5789307265Smav ARCSTAT_INCR(arcstat_l2_read_bytes, size); 5790185029Spjd 5791275811Sdelphij if (*arc_flags & ARC_FLAG_NOWAIT) { 5792185029Spjd zio_nowait(rzio); 5793185029Spjd return (0); 5794185029Spjd } 5795185029Spjd 5796275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_WAIT); 5797185029Spjd if (zio_wait(rzio) == 0) 5798185029Spjd return (0); 5799185029Spjd 5800185029Spjd /* l2arc read error; goto zio_read() */ 5801339034Ssef if (hash_lock != NULL) 5802339034Ssef mutex_enter(hash_lock); 5803185029Spjd } else { 5804185029Spjd DTRACE_PROBE1(l2arc__miss, 5805185029Spjd arc_buf_hdr_t *, hdr); 5806185029Spjd ARCSTAT_BUMP(arcstat_l2_misses); 5807185029Spjd if (HDR_L2_WRITING(hdr)) 5808185029Spjd ARCSTAT_BUMP(arcstat_l2_rw_clash); 5809185029Spjd spa_config_exit(spa, SCL_L2ARC, vd); 5810185029Spjd } 5811208373Smm } else { 5812208373Smm if (vd != NULL) 5813208373Smm spa_config_exit(spa, SCL_L2ARC, vd); 5814208373Smm if (l2arc_ndev != 0) { 5815208373Smm DTRACE_PROBE1(l2arc__miss, 5816208373Smm arc_buf_hdr_t *, hdr); 5817208373Smm ARCSTAT_BUMP(arcstat_l2_misses); 5818208373Smm } 5819185029Spjd } 5820185029Spjd 5821321610Smav rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, 5822307265Smav arc_read_done, hdr, priority, zio_flags, zb); 5823339034Ssef acb->acb_zio_head = rzio; 5824168404Spjd 5825339034Ssef if (hash_lock != NULL) 5826339034Ssef mutex_exit(hash_lock); 5827339034Ssef 5828275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) 5829168404Spjd return (zio_wait(rzio)); 5830168404Spjd 5831275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5832168404Spjd zio_nowait(rzio); 5833168404Spjd } 5834168404Spjd return (0); 5835168404Spjd} 5836168404Spjd 5837168404Spjd/* 5838251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again. 5839251520Sdelphij */ 5840251520Sdelphijvoid 5841251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp) 5842251520Sdelphij{ 5843251520Sdelphij arc_buf_hdr_t *hdr; 5844251520Sdelphij kmutex_t *hash_lock; 5845251520Sdelphij uint64_t guid = spa_load_guid(spa); 5846251520Sdelphij 5847268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp)); 5848268075Sdelphij 5849268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 5850251520Sdelphij if (hdr == NULL) 5851251520Sdelphij return; 5852307265Smav 5853307265Smav /* 5854307265Smav * We might be trying to free a block that is still doing I/O 5855307265Smav * (i.e. prefetch) or has a reference (i.e. a dedup-ed, 5856307265Smav * dmu_sync-ed block). If this block is being prefetched, then it 5857307265Smav * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr 5858307265Smav * until the I/O completes. A block may also have a reference if it is 5859307265Smav * part of a dedup-ed, dmu_synced write. The dmu_sync() function would 5860307265Smav * have written the new block to its final resting place on disk but 5861307265Smav * without the dedup flag set. 
This would have left the hdr in the MRU 5862307265Smav * state and discoverable. When the txg finally syncs it detects that 5863307265Smav * the block was overridden in open context and issues an override I/O. 5864307265Smav * Since this is a dedup block, the override I/O will determine if the 5865307265Smav * block is already in the DDT. If so, then it will replace the io_bp 5866307265Smav * with the bp from the DDT and allow the I/O to finish. When the I/O 5867307265Smav * reaches the done callback, dbuf_write_override_done, it will 5868307265Smav * check to see if the io_bp and io_bp_override are identical. 5869307265Smav * If they are not, then it indicates that the bp was replaced with 5870307265Smav * the bp in the DDT and the override bp is freed. This allows 5871307265Smav * us to arrive here with a reference on a block that is being 5872307265Smav * freed. So if we have an I/O in progress, or a reference to 5873307265Smav * this hdr, then we don't destroy the hdr. 5874307265Smav */ 5875307265Smav if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && 5876307265Smav refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { 5877307265Smav arc_change_state(arc_anon, hdr, hash_lock); 5878307265Smav arc_hdr_destroy(hdr); 5879251520Sdelphij mutex_exit(hash_lock); 5880251520Sdelphij } else { 5881251520Sdelphij mutex_exit(hash_lock); 5882251520Sdelphij } 5883251520Sdelphij 5884251520Sdelphij} 5885251520Sdelphij 5886251520Sdelphij/* 5887251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer. This 5888251629Sdelphij * must be done after a read and prior to modifying the buffer contents. 5889168404Spjd * If the buffer has more than one reference, we must make 5890185029Spjd * a new hdr for the buffer. 5891168404Spjd */ 5892168404Spjdvoid 5893168404Spjdarc_release(arc_buf_t *buf, void *tag) 5894168404Spjd{ 5895286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 5896168404Spjd 5897219089Spjd /* 5898219089Spjd * It would be nice to assert that if it's DMU metadata (level > 5899219089Spjd * 0 || it's the dnode file), then it must be syncing context. 5900219089Spjd * But we don't know that information at this level. 5901219089Spjd */ 5902219089Spjd 5903219089Spjd mutex_enter(&buf->b_evict_lock); 5904286776Smav 5905286776Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5906286776Smav 5907286570Smav /* 5908286570Smav * We don't grab the hash lock prior to this check, because if 5909286570Smav * the buffer's header is in the arc_anon state, it won't be 5910286570Smav * linked into the hash table. 5911286570Smav */ 5912286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 5913286570Smav mutex_exit(&buf->b_evict_lock); 5914286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5915286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 5916286570Smav ASSERT(!HDR_HAS_L2HDR(hdr)); 5917307265Smav ASSERT(HDR_EMPTY(hdr)); 5918307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5919286570Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 5920286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 5921185029Spjd 5922307265Smav hdr->b_l1hdr.b_arc_access = 0; 5923168404Spjd 5924307265Smav /* 5925307265Smav * If the buf is being overridden then it may already 5926307265Smav * have a hdr that is not empty. 
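 * Discarding the identity below strips the stale dva/birth so the
 * hdr is fully anonymous again before the buf is thawed.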
5927307265Smav */ 5928307265Smav buf_discard_identity(hdr); 5929286570Smav arc_buf_thaw(buf); 5930286570Smav 5931286570Smav return; 5932168404Spjd } 5933168404Spjd 5934286570Smav kmutex_t *hash_lock = HDR_LOCK(hdr); 5935286570Smav mutex_enter(hash_lock); 5936286570Smav 5937286570Smav /* 5938286570Smav * This assignment is only valid as long as the hash_lock is 5939286570Smav * held, we must be careful not to reference state or the 5940286570Smav * b_state field after dropping the lock. 5941286570Smav */ 5942286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 5943286570Smav ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5944286570Smav ASSERT3P(state, !=, arc_anon); 5945286570Smav 5946286570Smav /* this buffer is not on any list */ 5947321535Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); 5948286570Smav 5949286570Smav if (HDR_HAS_L2HDR(hdr)) { 5950286570Smav mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5951286570Smav 5952286570Smav /* 5953286598Smav * We have to recheck this conditional again now that 5954286598Smav * we're holding the l2ad_mtx to prevent a race with 5955286598Smav * another thread which might be concurrently calling 5956286598Smav * l2arc_evict(). In that case, l2arc_evict() might have 5957286598Smav * destroyed the header's L2 portion as we were waiting 5958286598Smav * to acquire the l2ad_mtx. 5959286570Smav */ 5960286598Smav if (HDR_HAS_L2HDR(hdr)) { 5961290191Savg l2arc_trim(hdr); 5962286598Smav arc_hdr_l2hdr_destroy(hdr); 5963286598Smav } 5964286570Smav 5965286570Smav mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5966185029Spjd } 5967185029Spjd 5968168404Spjd /* 5969168404Spjd * Do we have more than one buf? 5970168404Spjd */ 5971307265Smav if (hdr->b_l1hdr.b_bufcnt > 1) { 5972168404Spjd arc_buf_hdr_t *nhdr; 5973209962Smm uint64_t spa = hdr->b_spa; 5974307265Smav uint64_t psize = HDR_GET_PSIZE(hdr); 5975307265Smav uint64_t lsize = HDR_GET_LSIZE(hdr); 5976307265Smav enum zio_compress compress = HDR_GET_COMPRESS(hdr); 5977286570Smav arc_buf_contents_t type = arc_buf_type(hdr); 5978307265Smav VERIFY3U(hdr->b_type, ==, type); 5979168404Spjd 5980286570Smav ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 5981307265Smav (void) remove_reference(hdr, hash_lock, tag); 5982307265Smav 5983321535Smav if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { 5984307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); 5985307265Smav ASSERT(ARC_BUF_LAST(buf)); 5986307265Smav } 5987307265Smav 5988168404Spjd /* 5989219089Spjd * Pull the data off of this hdr and attach it to 5990307265Smav * a new anonymous hdr. Also find the last buffer 5991307265Smav * in the hdr's buffer list. 5992168404Spjd */ 5993321535Smav arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 5994307265Smav ASSERT3P(lastbuf, !=, NULL); 5995168404Spjd 5996307265Smav /* 5997307265Smav * If the current arc_buf_t and the hdr are sharing their data 5998321535Smav * buffer, then we must stop sharing that block. 5999307265Smav */ 6000307265Smav if (arc_buf_is_shared(buf)) { 6001307265Smav VERIFY(!arc_buf_is_shared(lastbuf)); 6002307265Smav 6003307265Smav /* 6004307265Smav * First, sever the block sharing relationship between 6005321535Smav * buf and the arc_buf_hdr_t. 6006307265Smav */ 6007307265Smav arc_unshare_buf(hdr, buf); 6008321535Smav 6009321535Smav /* 6010321610Smav * Now we need to recreate the hdr's b_pabd. Since we 6011321535Smav * have lastbuf handy, we try to share with it, but if 6012321610Smav * we can't then we allocate a new b_pabd and copy the 6013321535Smav * data from buf into it. 
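 * Sharing with lastbuf avoids both the allocation and the copy,
 * so it is preferred whenever arc_can_share() permits it.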
6014321535Smav */ 6015321535Smav if (arc_can_share(hdr, lastbuf)) { 6016321535Smav arc_share_buf(hdr, lastbuf); 6017321535Smav } else { 6018349216Savg arc_hdr_alloc_pabd(hdr, B_TRUE); 6019321610Smav abd_copy_from_buf(hdr->b_l1hdr.b_pabd, 6020321610Smav buf->b_data, psize); 6021321535Smav } 6022307265Smav VERIFY3P(lastbuf->b_data, !=, NULL); 6023307265Smav } else if (HDR_SHARED_DATA(hdr)) { 6024321535Smav /* 6025321535Smav * Uncompressed shared buffers are always at the end 6026321535Smav * of the list. Compressed buffers don't have the 6027321535Smav * same requirements. This makes it hard to 6028321535Smav * simply assert that the lastbuf is shared so 6029321535Smav * we rely on the hdr's compression flags to determine 6030321535Smav * if we have a compressed, shared buffer. 6031321535Smav */ 6032321535Smav ASSERT(arc_buf_is_shared(lastbuf) || 6033321535Smav HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 6034321535Smav ASSERT(!ARC_BUF_SHARED(buf)); 6035307265Smav } 6036321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 6037286570Smav ASSERT3P(state, !=, arc_l2c_only); 6038286766Smav 6039307265Smav (void) refcount_remove_many(&state->arcs_size, 6040321535Smav arc_buf_size(buf), buf); 6041286766Smav 6042286570Smav if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 6043286570Smav ASSERT3P(state, !=, arc_l2c_only); 6044307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 6045321535Smav arc_buf_size(buf), buf); 6046168404Spjd } 6047242845Sdelphij 6048307265Smav hdr->b_l1hdr.b_bufcnt -= 1; 6049168404Spjd arc_cksum_verify(buf); 6050240133Smm#ifdef illumos 6051240133Smm arc_buf_unwatch(buf); 6052277300Ssmh#endif 6053168404Spjd 6054168404Spjd mutex_exit(hash_lock); 6055168404Spjd 6056307265Smav /* 6057321610Smav * Allocate a new hdr. The new hdr will contain a b_pabd 6058307265Smav * buffer which will be freed in arc_write(). 
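 * The released buf becomes the sole buffer of the new anonymous
 * hdr, and its usage is charged to arc_anon below.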
6059307265Smav */ 6060307265Smav nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); 6061307265Smav ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); 6062307265Smav ASSERT0(nhdr->b_l1hdr.b_bufcnt); 6063307265Smav ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); 6064307265Smav VERIFY3U(nhdr->b_type, ==, type); 6065307265Smav ASSERT(!HDR_SHARED_DATA(nhdr)); 6066286570Smav 6067286570Smav nhdr->b_l1hdr.b_buf = buf; 6068307265Smav nhdr->b_l1hdr.b_bufcnt = 1; 6069286570Smav (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 6070168404Spjd buf->b_hdr = nhdr; 6071307265Smav 6072219089Spjd mutex_exit(&buf->b_evict_lock); 6073307265Smav (void) refcount_add_many(&arc_anon->arcs_size, 6074321535Smav arc_buf_size(buf), buf); 6075168404Spjd } else { 6076219089Spjd mutex_exit(&buf->b_evict_lock); 6077286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 6078286763Smav /* protected by hash lock, or hdr is on arc_anon */ 6079286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 6080168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6081286570Smav arc_change_state(arc_anon, hdr, hash_lock); 6082286570Smav hdr->b_l1hdr.b_arc_access = 0; 6083286570Smav mutex_exit(hash_lock); 6084185029Spjd 6085219089Spjd buf_discard_identity(hdr); 6086168404Spjd arc_buf_thaw(buf); 6087168404Spjd } 6088168404Spjd} 6089168404Spjd 6090168404Spjdint 6091168404Spjdarc_released(arc_buf_t *buf) 6092168404Spjd{ 6093185029Spjd int released; 6094185029Spjd 6095219089Spjd mutex_enter(&buf->b_evict_lock); 6096286570Smav released = (buf->b_data != NULL && 6097286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 6098219089Spjd mutex_exit(&buf->b_evict_lock); 6099185029Spjd return (released); 6100168404Spjd} 6101168404Spjd 6102168404Spjd#ifdef ZFS_DEBUG 6103168404Spjdint 6104168404Spjdarc_referenced(arc_buf_t *buf) 6105168404Spjd{ 6106185029Spjd int referenced; 6107185029Spjd 6108219089Spjd mutex_enter(&buf->b_evict_lock); 6109286570Smav referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 6110219089Spjd mutex_exit(&buf->b_evict_lock); 6111185029Spjd return (referenced); 6112168404Spjd} 6113168404Spjd#endif 6114168404Spjd 6115168404Spjdstatic void 6116168404Spjdarc_write_ready(zio_t *zio) 6117168404Spjd{ 6118168404Spjd arc_write_callback_t *callback = zio->io_private; 6119168404Spjd arc_buf_t *buf = callback->awcb_buf; 6120185029Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 6121307265Smav uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); 6122168404Spjd 6123286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 6124286570Smav ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 6125307265Smav ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 6126185029Spjd 6127185029Spjd /* 6128307265Smav * If we're reexecuting this zio because the pool suspended, then 6129307265Smav * cleanup any state that was previously set the first time the 6130321535Smav * callback was invoked. 
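 * That state includes the checksum, any illumos watchpoint, and
 * the hdr's b_pabd (whether shared with the buf or owned); the
 * checksum and b_pabd are re-established later in this function.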
6131185029Spjd */ 6132307265Smav if (zio->io_flags & ZIO_FLAG_REEXECUTED) { 6133307265Smav arc_cksum_free(hdr); 6134307265Smav#ifdef illumos 6135307265Smav arc_buf_unwatch(buf); 6136307265Smav#endif 6137321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 6138307265Smav if (arc_buf_is_shared(buf)) { 6139307265Smav arc_unshare_buf(hdr, buf); 6140307265Smav } else { 6141321610Smav arc_hdr_free_pabd(hdr); 6142307265Smav } 6143185029Spjd } 6144168404Spjd } 6145321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6146307265Smav ASSERT(!HDR_SHARED_DATA(hdr)); 6147307265Smav ASSERT(!arc_buf_is_shared(buf)); 6148307265Smav 6149307265Smav callback->awcb_ready(zio, buf, callback->awcb_private); 6150307265Smav 6151307265Smav if (HDR_IO_IN_PROGRESS(hdr)) 6152307265Smav ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); 6153307265Smav 6154307265Smav arc_cksum_compute(buf); 6155307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6156307265Smav 6157307265Smav enum zio_compress compress; 6158307265Smav if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 6159307265Smav compress = ZIO_COMPRESS_OFF; 6160307265Smav } else { 6161307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); 6162307265Smav compress = BP_GET_COMPRESS(zio->io_bp); 6163307265Smav } 6164307265Smav HDR_SET_PSIZE(hdr, psize); 6165307265Smav arc_hdr_set_compress(hdr, compress); 6166307265Smav 6167321610Smav 6168307265Smav /* 6169321610Smav * Fill the hdr with data. If the hdr is compressed, the data we want 6170321610Smav * is available from the zio, otherwise we can take it from the buf. 6171321610Smav * 6172321610Smav * We might be able to share the buf's data with the hdr here. However, 6173321610Smav * doing so would cause the ARC to be full of linear ABDs if we write a 6174321610Smav * lot of shareable data. As a compromise, we check whether scattered 6175321610Smav * ABDs are allowed, and assume that if they are then the user wants 6176321610Smav * the ARC to be primarily filled with them regardless of the data being 6177321610Smav * written. Therefore, if they're allowed then we allocate one and copy 6178321610Smav * the data into it; otherwise, we share the data directly if we can. 6179307265Smav */ 6180321610Smav if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { 6181349216Savg arc_hdr_alloc_pabd(hdr, B_TRUE); 6182321610Smav 6183321610Smav /* 6184321610Smav * Ideally, we would always copy the io_abd into b_pabd, but the 6185321610Smav * user may have disabled compressed ARC, thus we must check the 6186321610Smav * hdr's compression setting rather than the io_bp's. 
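 * (With compressed ARC disabled the hdr stores uncompressed data
 * even when the bp is compressed, so the compressed io_abd would
 * be the wrong source to copy from.)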
6187321610Smav */ 6188321610Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 6189321610Smav ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, 6190321610Smav ZIO_COMPRESS_OFF); 6191321610Smav ASSERT3U(psize, >, 0); 6192321610Smav 6193321610Smav abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); 6194321610Smav } else { 6195321610Smav ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); 6196321610Smav 6197321610Smav abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, 6198321610Smav arc_buf_size(buf)); 6199321610Smav } 6200307265Smav } else { 6201321610Smav ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); 6202321535Smav ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); 6203307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 6204307265Smav 6205307265Smav arc_share_buf(hdr, buf); 6206307265Smav } 6207321610Smav 6208307265Smav arc_hdr_verify(hdr, zio->io_bp); 6209168404Spjd} 6210168404Spjd 6211304138Savgstatic void 6212304138Savgarc_write_children_ready(zio_t *zio) 6213304138Savg{ 6214304138Savg arc_write_callback_t *callback = zio->io_private; 6215304138Savg arc_buf_t *buf = callback->awcb_buf; 6216304138Savg 6217304138Savg callback->awcb_children_ready(zio, buf, callback->awcb_private); 6218304138Savg} 6219304138Savg 6220258632Savg/* 6221258632Savg * The SPA calls this callback for each physical write that happens on behalf 6222258632Savg * of a logical write. See the comment in dbuf_write_physdone() for details. 6223258632Savg */ 6224168404Spjdstatic void 6225258632Savgarc_write_physdone(zio_t *zio) 6226258632Savg{ 6227258632Savg arc_write_callback_t *cb = zio->io_private; 6228258632Savg if (cb->awcb_physdone != NULL) 6229258632Savg cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 6230258632Savg} 6231258632Savg 6232258632Savgstatic void 6233168404Spjdarc_write_done(zio_t *zio) 6234168404Spjd{ 6235168404Spjd arc_write_callback_t *callback = zio->io_private; 6236168404Spjd arc_buf_t *buf = callback->awcb_buf; 6237168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 6238168404Spjd 6239307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6240168404Spjd 6241219089Spjd if (zio->io_error == 0) { 6242307265Smav arc_hdr_verify(hdr, zio->io_bp); 6243307265Smav 6244268075Sdelphij if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 6245260150Sdelphij buf_discard_identity(hdr); 6246260150Sdelphij } else { 6247260150Sdelphij hdr->b_dva = *BP_IDENTITY(zio->io_bp); 6248260150Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 6249260150Sdelphij } 6250219089Spjd } else { 6251307265Smav ASSERT(HDR_EMPTY(hdr)); 6252219089Spjd } 6253219089Spjd 6254168404Spjd /* 6255268075Sdelphij * If the block to be written was all-zero or compressed enough to be 6256268075Sdelphij * embedded in the BP, no write was performed so there will be no 6257268075Sdelphij * dva/birth/checksum. The buffer must therefore remain anonymous 6258268075Sdelphij * (and uncached). 6259168404Spjd */ 6260307265Smav if (!HDR_EMPTY(hdr)) { 6261168404Spjd arc_buf_hdr_t *exists; 6262168404Spjd kmutex_t *hash_lock; 6263168404Spjd 6264321535Smav ASSERT3U(zio->io_error, ==, 0); 6265219089Spjd 6266168404Spjd arc_cksum_verify(buf); 6267168404Spjd 6268168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 6269286570Smav if (exists != NULL) { 6270168404Spjd /* 6271168404Spjd * This can only happen if we overwrite for 6272168404Spjd * sync-to-convergence, because we remove 6273168404Spjd * buffers from the hash table when we arc_free(). 
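 * The legitimate collision cases (sync-to-convergence rewrites,
 * nopwrite, and dedup) are each handled below; a mismatched block
 * pointer in the first two cases panics outright.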
6274168404Spjd */ 6275219089Spjd if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 6276219089Spjd if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6277219089Spjd panic("bad overwrite, hdr=%p exists=%p", 6278219089Spjd (void *)hdr, (void *)exists); 6279286570Smav ASSERT(refcount_is_zero( 6280286570Smav &exists->b_l1hdr.b_refcnt)); 6281219089Spjd arc_change_state(arc_anon, exists, hash_lock); 6282219089Spjd mutex_exit(hash_lock); 6283219089Spjd arc_hdr_destroy(exists); 6284219089Spjd exists = buf_hash_insert(hdr, &hash_lock); 6285219089Spjd ASSERT3P(exists, ==, NULL); 6286243524Smm } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 6287243524Smm /* nopwrite */ 6288243524Smm ASSERT(zio->io_prop.zp_nopwrite); 6289243524Smm if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6290243524Smm panic("bad nopwrite, hdr=%p exists=%p", 6291243524Smm (void *)hdr, (void *)exists); 6292219089Spjd } else { 6293219089Spjd /* Dedup */ 6294307265Smav ASSERT(hdr->b_l1hdr.b_bufcnt == 1); 6295286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 6296219089Spjd ASSERT(BP_GET_DEDUP(zio->io_bp)); 6297219089Spjd ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 6298219089Spjd } 6299168404Spjd } 6300307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6301185029Spjd /* if it's not anon, we are doing a scrub */ 6302286570Smav if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 6303185029Spjd arc_access(hdr, hash_lock); 6304168404Spjd mutex_exit(hash_lock); 6305168404Spjd } else { 6306307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6307168404Spjd } 6308168404Spjd 6309286570Smav ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 6310219089Spjd callback->awcb_done(zio, buf, callback->awcb_private); 6311168404Spjd 6312321610Smav abd_put(zio->io_abd); 6313168404Spjd kmem_free(callback, sizeof (arc_write_callback_t)); 6314168404Spjd} 6315168404Spjd 6316168404Spjdzio_t * 6317307265Smavarc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 6318339034Ssef boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, 6319339034Ssef arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, 6320339034Ssef arc_write_done_func_t *done, void *private, zio_priority_t priority, 6321268123Sdelphij int zio_flags, const zbookmark_phys_t *zb) 6322168404Spjd{ 6323168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 6324168404Spjd arc_write_callback_t *callback; 6325185029Spjd zio_t *zio; 6326321573Smav zio_prop_t localprop = *zp; 6327168404Spjd 6328307265Smav ASSERT3P(ready, !=, NULL); 6329307265Smav ASSERT3P(done, !=, NULL); 6330168404Spjd ASSERT(!HDR_IO_ERROR(hdr)); 6331286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6332307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6333307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 6334185029Spjd if (l2arc) 6335307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6336321535Smav if (ARC_BUF_COMPRESSED(buf)) { 6337321573Smav /* 6338321573Smav * We're writing a pre-compressed buffer. Make the 6339321573Smav * compression algorithm requested by the zio_prop_t match 6340321573Smav * the pre-compressed buffer's compression algorithm. 
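 * ZIO_FLAG_RAW is also set below so that the pipeline writes the
 * buffer's bytes as-is instead of compressing them a second time.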
6341321573Smav */ 6342321573Smav localprop.zp_compress = HDR_GET_COMPRESS(hdr); 6343321573Smav 6344321535Smav ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); 6345321535Smav zio_flags |= ZIO_FLAG_RAW; 6346321535Smav } 6347168404Spjd callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 6348168404Spjd callback->awcb_ready = ready; 6349304138Savg callback->awcb_children_ready = children_ready; 6350258632Savg callback->awcb_physdone = physdone; 6351168404Spjd callback->awcb_done = done; 6352168404Spjd callback->awcb_private = private; 6353168404Spjd callback->awcb_buf = buf; 6354168404Spjd 6355307265Smav /* 6356321610Smav * The hdr's b_pabd is now stale, free it now. A new data block 6357307265Smav * will be allocated when the zio pipeline calls arc_write_ready(). 6358307265Smav */ 6359321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 6360307265Smav /* 6361307265Smav * If the buf is currently sharing the data block with 6362307265Smav * the hdr then we need to break that relationship here. 6363307265Smav * The hdr will remain with a NULL data pointer and the 6364307265Smav * buf will take sole ownership of the block. 6365307265Smav */ 6366307265Smav if (arc_buf_is_shared(buf)) { 6367307265Smav arc_unshare_buf(hdr, buf); 6368307265Smav } else { 6369321610Smav arc_hdr_free_pabd(hdr); 6370307265Smav } 6371307265Smav VERIFY3P(buf->b_data, !=, NULL); 6372307265Smav arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); 6373307265Smav } 6374307265Smav ASSERT(!arc_buf_is_shared(buf)); 6375321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6376307265Smav 6377321610Smav zio = zio_write(pio, spa, txg, bp, 6378321610Smav abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), 6379321573Smav HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, 6380304138Savg (children_ready != NULL) ? arc_write_children_ready : NULL, 6381304138Savg arc_write_physdone, arc_write_done, callback, 6382258632Savg priority, zio_flags, zb); 6383185029Spjd 6384168404Spjd return (zio); 6385168404Spjd} 6386168404Spjd 6387185029Spjdstatic int 6388339141Smavarc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) 6389185029Spjd{ 6390185029Spjd#ifdef _KERNEL 6391272483Ssmh uint64_t available_memory = ptob(freemem); 6392185029Spjd 6393272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 6394185029Spjd available_memory = 6395272483Ssmh MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 6396185029Spjd#endif 6397258632Savg 6398272483Ssmh if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 6399185029Spjd return (0); 6400185029Spjd 6401339141Smav if (txg > spa->spa_lowmem_last_txg) { 6402339141Smav spa->spa_lowmem_last_txg = txg; 6403339141Smav spa->spa_lowmem_page_load = 0; 6404185029Spjd } 6405185029Spjd /* 6406185029Spjd * If we are in pageout, we know that memory is already tight, 6407185029Spjd * the arc is already going to be evicting, so we just want to 6408185029Spjd * continue to let page writes occur as quickly as possible. 
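 * Once the queued page load exceeds a quarter of the larger of
 * ptob(minfree) and the available memory, we return ERESTART below
 * so the caller backs off instead of deepening the shortage.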
6409185029Spjd */
6410185029Spjd if (curproc == pageproc) {
6411339141Smav if (spa->spa_lowmem_page_load >
6412339141Smav MAX(ptob(minfree), available_memory) / 4)
6413249195Smm return (SET_ERROR(ERESTART));
6414185029Spjd /* Note: reserve is inflated, so we deflate */
6415339141Smav atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
6416185029Spjd return (0);
6417339141Smav } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
6418185029Spjd /* memory is low, delay before restarting */
6419185029Spjd ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
6420249195Smm return (SET_ERROR(EAGAIN));
6421185029Spjd }
6422339141Smav spa->spa_lowmem_page_load = 0;
6423339141Smav#endif /* _KERNEL */
6424185029Spjd return (0);
6425185029Spjd}
6426185029Spjd
6427168404Spjdvoid
6428185029Spjdarc_tempreserve_clear(uint64_t reserve)
6429168404Spjd{
6430185029Spjd atomic_add_64(&arc_tempreserve, -reserve);
6431168404Spjd ASSERT((int64_t)arc_tempreserve >= 0);
6432168404Spjd}
6433168404Spjd
6434168404Spjdint
6435339141Smavarc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
6436168404Spjd{
6437185029Spjd int error;
6438209962Smm uint64_t anon_size;
6439185029Spjd
6440272483Ssmh if (reserve > arc_c/4 && !arc_no_grow) {
6441185029Spjd arc_c = MIN(arc_c_max, reserve * 4);
6442272483Ssmh DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
6443272483Ssmh }
6444185029Spjd if (reserve > arc_c)
6445249195Smm return (SET_ERROR(ENOMEM));
6446168404Spjd
6447168404Spjd /*
6448209962Smm * Don't count loaned bufs as in-flight dirty data to prevent long
6449209962Smm * network delays from blocking transactions that are ready to be
6450209962Smm * assigned to a txg.
6451209962Smm */
6452321535Smav
6453321535Smav /* assert that it has not wrapped around */
6454321535Smav ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
6455321535Smav
6456286766Smav anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
6457286766Smav arc_loaned_bytes), 0);
6458209962Smm
6459209962Smm /*
6460185029Spjd * Writes will almost always require additional memory allocations
6461251631Sdelphij * in order to compress/encrypt/etc the data. We therefore need to
6462185029Spjd * make sure that there is sufficient available memory for this.
6463185029Spjd */
6464339141Smav error = arc_memory_throttle(spa, reserve, txg);
6465258632Savg if (error != 0)
6466185029Spjd return (error);
6467185029Spjd
6468185029Spjd /*
6469168404Spjd * Throttle writes when the amount of dirty data in the cache
6470168404Spjd * gets too large. We try to keep the cache less than half full
6471168404Spjd * of dirty blocks so that our sync times don't grow too large.
6472339141Smav *
6473339141Smav * In the case of one pool being built on another pool, we want
6474339141Smav * to make sure we don't end up throttling the lower (backing)
6475339141Smav * pool when the upper pool is the majority contributor to dirty
6476339141Smav * data. To ensure we make forward progress during throttling, we
6477339141Smav * also check the current pool's net dirty data and only throttle
6478339141Smav * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
6479339141Smav * data in the cache.
6480339141Smav *
6481168404Spjd * Note: if two requests come in concurrently, we might let them
6482168404Spjd * both succeed, when one of them should fail. Not a huge deal.
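 *
 * To make the check below concrete: with the default tunables at
 * the time of this writing (zfs_arc_dirty_limit_percent = 50,
 * zfs_arc_anon_limit_percent = 25 and zfs_arc_pool_dirty_percent =
 * 20; see their definitions earlier in this file), we throttle only
 * when total dirty data exceeds half of arc_c, anonymous data alone
 * exceeds a quarter of arc_c, and this pool's own dirty data
 * exceeds a fifth of that anonymous total.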
6483168404Spjd */ 6484339141Smav uint64_t total_dirty = reserve + arc_tempreserve + anon_size; 6485339141Smav uint64_t spa_dirty_anon = spa_dirty_data(spa); 6486209962Smm 6487339141Smav if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && 6488339141Smav anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && 6489339141Smav spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { 6490307265Smav uint64_t meta_esize = 6491307265Smav refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6492307265Smav uint64_t data_esize = 6493307265Smav refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6494185029Spjd dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 6495185029Spjd "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 6496307265Smav arc_tempreserve >> 10, meta_esize >> 10, 6497307265Smav data_esize >> 10, reserve >> 10, arc_c >> 10); 6498249195Smm return (SET_ERROR(ERESTART)); 6499168404Spjd } 6500185029Spjd atomic_add_64(&arc_tempreserve, reserve); 6501168404Spjd return (0); 6502168404Spjd} 6503168404Spjd 6504286626Smavstatic void 6505286626Smavarc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 6506286626Smav kstat_named_t *evict_data, kstat_named_t *evict_metadata) 6507286626Smav{ 6508286766Smav size->value.ui64 = refcount_count(&state->arcs_size); 6509307265Smav evict_data->value.ui64 = 6510307265Smav refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); 6511307265Smav evict_metadata->value.ui64 = 6512307265Smav refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); 6513286626Smav} 6514286626Smav 6515286626Smavstatic int 6516286626Smavarc_kstat_update(kstat_t *ksp, int rw) 6517286626Smav{ 6518286626Smav arc_stats_t *as = ksp->ks_data; 6519286626Smav 6520286626Smav if (rw == KSTAT_WRITE) { 6521286626Smav return (EACCES); 6522286626Smav } else { 6523286626Smav arc_kstat_update_state(arc_anon, 6524286626Smav &as->arcstat_anon_size, 6525286626Smav &as->arcstat_anon_evictable_data, 6526286626Smav &as->arcstat_anon_evictable_metadata); 6527286626Smav arc_kstat_update_state(arc_mru, 6528286626Smav &as->arcstat_mru_size, 6529286626Smav &as->arcstat_mru_evictable_data, 6530286626Smav &as->arcstat_mru_evictable_metadata); 6531286626Smav arc_kstat_update_state(arc_mru_ghost, 6532286626Smav &as->arcstat_mru_ghost_size, 6533286626Smav &as->arcstat_mru_ghost_evictable_data, 6534286626Smav &as->arcstat_mru_ghost_evictable_metadata); 6535286626Smav arc_kstat_update_state(arc_mfu, 6536286626Smav &as->arcstat_mfu_size, 6537286626Smav &as->arcstat_mfu_evictable_data, 6538286626Smav &as->arcstat_mfu_evictable_metadata); 6539286626Smav arc_kstat_update_state(arc_mfu_ghost, 6540286626Smav &as->arcstat_mfu_ghost_size, 6541286626Smav &as->arcstat_mfu_ghost_evictable_data, 6542286626Smav &as->arcstat_mfu_ghost_evictable_metadata); 6543332540Smav 6544332540Smav ARCSTAT(arcstat_size) = aggsum_value(&arc_size); 6545332540Smav ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); 6546332540Smav ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); 6547332540Smav ARCSTAT(arcstat_metadata_size) = 6548332540Smav aggsum_value(&astat_metadata_size); 6549332540Smav ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); 6550332540Smav ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size); 6551332540Smav ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); 6552286626Smav } 6553286626Smav 6554286626Smav return (0); 6555286626Smav} 6556286626Smav 6557286763Smav/* 6558286763Smav * This function *must* return indices evenly distributed between all 
6559286763Smav * sublists of the multilist. This is needed due to how the ARC eviction 6560286763Smav * code is laid out; arc_evict_state() assumes ARC buffers are evenly 6561286763Smav * distributed between all sublists and uses this assumption when 6562286763Smav * deciding which sublist to evict from and how much to evict from it. 6563286763Smav */ 6564286763Smavunsigned int 6565286763Smavarc_state_multilist_index_func(multilist_t *ml, void *obj) 6566286763Smav{ 6567286763Smav arc_buf_hdr_t *hdr = obj; 6568286763Smav 6569286763Smav /* 6570286763Smav * We rely on b_dva to generate evenly distributed index 6571286763Smav * numbers using buf_hash below. So, as an added precaution, 6572286763Smav * let's make sure we never add empty buffers to the arc lists. 6573286763Smav */ 6574307265Smav ASSERT(!HDR_EMPTY(hdr)); 6575286763Smav 6576286763Smav /* 6577286763Smav * The assumption here, is the hash value for a given 6578286763Smav * arc_buf_hdr_t will remain constant throughout it's lifetime 6579286763Smav * (i.e. it's b_spa, b_dva, and b_birth fields don't change). 6580286763Smav * Thus, we don't need to store the header's sublist index 6581286763Smav * on insertion, as this index can be recalculated on removal. 6582286763Smav * 6583286763Smav * Also, the low order bits of the hash value are thought to be 6584286763Smav * distributed evenly. Otherwise, in the case that the multilist 6585286763Smav * has a power of two number of sublists, each sublists' usage 6586286763Smav * would not be evenly distributed. 6587286763Smav */ 6588286763Smav return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 6589286763Smav multilist_get_num_sublists(ml)); 6590286763Smav} 6591286763Smav 6592168404Spjd#ifdef _KERNEL 6593168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL; 6594168404Spjd 6595168404Spjdstatic void 6596168566Spjdarc_lowmem(void *arg __unused, int howto __unused) 6597168404Spjd{ 6598346686Smav int64_t free_memory, to_free; 6599168404Spjd 6600346686Smav arc_no_grow = B_TRUE; 6601346686Smav arc_warm = B_TRUE; 6602346686Smav arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 6603346686Smav free_memory = arc_available_memory(); 6604346686Smav to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); 6605346686Smav DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); 6606346686Smav arc_reduce_target_size(to_free); 6607241773Savg 6608346686Smav mutex_enter(&arc_adjust_lock); 6609346686Smav arc_adjust_needed = B_TRUE; 6610346686Smav zthr_wakeup(arc_adjust_zthr); 6611346686Smav 6612241773Savg /* 6613241773Savg * It is unsafe to block here in arbitrary threads, because we can come 6614241773Savg * here from ARC itself and may hold ARC locks and thus risk a deadlock 6615241773Savg * with ARC reclaim thread. 
6616241773Savg */ 6617286623Smav if (curproc == pageproc) 6618346686Smav (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock); 6619346686Smav mutex_exit(&arc_adjust_lock); 6620168404Spjd} 6621168404Spjd#endif 6622168404Spjd 6623307265Smavstatic void 6624307265Smavarc_state_init(void) 6625307265Smav{ 6626307265Smav arc_anon = &ARC_anon; 6627307265Smav arc_mru = &ARC_mru; 6628307265Smav arc_mru_ghost = &ARC_mru_ghost; 6629307265Smav arc_mfu = &ARC_mfu; 6630307265Smav arc_mfu_ghost = &ARC_mfu_ghost; 6631307265Smav arc_l2c_only = &ARC_l2c_only; 6632307265Smav 6633321553Smav arc_mru->arcs_list[ARC_BUFC_METADATA] = 6634321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6635307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6636321552Smav arc_state_multilist_index_func); 6637321553Smav arc_mru->arcs_list[ARC_BUFC_DATA] = 6638321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6639307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6640321552Smav arc_state_multilist_index_func); 6641321553Smav arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = 6642321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6643307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6644321552Smav arc_state_multilist_index_func); 6645321553Smav arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = 6646321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6647307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6648321552Smav arc_state_multilist_index_func); 6649321553Smav arc_mfu->arcs_list[ARC_BUFC_METADATA] = 6650321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6651307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6652321552Smav arc_state_multilist_index_func); 6653321553Smav arc_mfu->arcs_list[ARC_BUFC_DATA] = 6654321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6655307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6656321552Smav arc_state_multilist_index_func); 6657321553Smav arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = 6658321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6659307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6660321552Smav arc_state_multilist_index_func); 6661321553Smav arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = 6662321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6663307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6664321552Smav arc_state_multilist_index_func); 6665321553Smav arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = 6666321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6667307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6668321552Smav arc_state_multilist_index_func); 6669321553Smav arc_l2c_only->arcs_list[ARC_BUFC_DATA] = 6670321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6671307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6672321552Smav arc_state_multilist_index_func); 6673307265Smav 6674307265Smav refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6675307265Smav refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6676307265Smav refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6677307265Smav refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6678307265Smav refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6679307265Smav refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6680307265Smav refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6681307265Smav refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6682307265Smav refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6683307265Smav refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6684307265Smav 
	refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
	refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);

	refcount_create(&arc_anon->arcs_size);
	refcount_create(&arc_mru->arcs_size);
	refcount_create(&arc_mru_ghost->arcs_size);
	refcount_create(&arc_mfu->arcs_size);
	refcount_create(&arc_mfu_ghost->arcs_size);
	refcount_create(&arc_l2c_only->arcs_size);

	aggsum_init(&arc_meta_used, 0);
	aggsum_init(&arc_size, 0);
	aggsum_init(&astat_data_size, 0);
	aggsum_init(&astat_metadata_size, 0);
	aggsum_init(&astat_hdr_size, 0);
	aggsum_init(&astat_other_size, 0);
	aggsum_init(&astat_l2_hdr_size, 0);
}

static void
arc_state_fini(void)
{
	refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
	refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
	refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
	refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
	refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
	refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
	refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
	refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
	refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
	refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
	refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
	refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);

	refcount_destroy(&arc_anon->arcs_size);
	refcount_destroy(&arc_mru->arcs_size);
	refcount_destroy(&arc_mru_ghost->arcs_size);
	refcount_destroy(&arc_mfu->arcs_size);
	refcount_destroy(&arc_mfu_ghost->arcs_size);
	refcount_destroy(&arc_l2c_only->arcs_size);

	multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
	multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
	multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
	multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
	multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
}

uint64_t
arc_max_bytes(void)
{
	return (arc_c_max);
}

void
arc_init(void)
{
	int i, prefetch_tunable_set = 0;

	/*
	 * allmem is "all memory that we could possibly use".
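	 *
	 * As an illustrative sizing sketch (hypothetical machine, using
	 * the formulas below): with allmem = 16 GB the defaults become
	 * arc_c_min = MAX(16 GB / 32, arc_abs_min) = 512 MB and
	 * arc_c_max = MAX(16 GB * 5 / 8, 16 GB - 1 GB) = 15 GB.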
	 */
#ifdef illumos
#ifdef _KERNEL
	uint64_t allmem = ptob(physmem - swapfs_minfree);
#else
	uint64_t allmem = (physmem * PAGESIZE) / 2;
#endif
#else
	uint64_t allmem = kmem_size();
#endif
	mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);

	/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
	arc_c_min = MAX(allmem / 32, arc_abs_min);
	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
	if (allmem >= 1 << 30)
		arc_c_max = allmem - (1 << 30);
	else
		arc_c_max = arc_c_min;
	arc_c_max = MAX(allmem * 5 / 8, arc_c_max);

	/*
	 * In userland, there's only the memory pressure that we artificially
	 * create (see arc_available_memory()). Don't let arc_c get too
	 * small, because it can cause transactions to be larger than
	 * arc_c, causing arc_tempreserve_space() to fail.
	 */
#ifndef _KERNEL
	arc_c_min = arc_c_max / 2;
#endif

#ifdef _KERNEL
	/*
	 * Allow the tunables to override our calculations if they are
	 * reasonable.
	 */
	if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) {
		arc_c_max = zfs_arc_max;
		arc_c_min = MIN(arc_c_min, arc_c_max);
	}
	if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
		arc_c_min = zfs_arc_min;
#endif

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	/* limit meta-data to 1/4 of the arc capacity */
	arc_meta_limit = arc_c_max / 4;

#ifdef _KERNEL
	/*
	 * Metadata is stored in the kernel's heap. Don't let us
	 * use more than half the heap for the ARC.
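	 *
	 * E.g. (illustrative numbers): with arc_c_max = 15 GB the line
	 * above sets arc_meta_limit = 15 GB / 4 = 3.75 GB; with a 4 GB
	 * kernel heap the MIN() below then clamps it to 4 GB / 2 = 2 GB.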
	 */
	arc_meta_limit = MIN(arc_meta_limit,
	    vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
#endif

	/* Allow the tunable to override if it is reasonable */
	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
		arc_meta_limit = zfs_arc_meta_limit;

	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
		arc_c_min = arc_meta_limit / 2;

	if (zfs_arc_meta_min > 0) {
		arc_meta_min = zfs_arc_meta_min;
	} else {
		arc_meta_min = arc_c_min / 2;
	}

	if (zfs_arc_grow_retry > 0)
		arc_grow_retry = zfs_arc_grow_retry;

	if (zfs_arc_shrink_shift > 0)
		arc_shrink_shift = zfs_arc_shrink_shift;

	if (zfs_arc_no_grow_shift > 0)
		arc_no_grow_shift = zfs_arc_no_grow_shift;

	/*
	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
	 */
	if (arc_no_grow_shift >= arc_shrink_shift)
		arc_no_grow_shift = arc_shrink_shift - 1;

	if (zfs_arc_p_min_shift > 0)
		arc_p_min_shift = zfs_arc_p_min_shift;

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;
	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;
	zfs_arc_max = arc_c_max;

	arc_state_init();

	/*
	 * The arc must be "uninitialized", so that hdr_recl() (which is
	 * registered by buf_init()) will not access arc_reap_zthr before
	 * it is created.
	 */
	ASSERT(!arc_initialized);
	buf_init();

	arc_dnlc_evicts_thread_exit = FALSE;

	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

	if (arc_ksp != NULL) {
		arc_ksp->ks_data = &arc_stats;
		arc_ksp->ks_update = arc_kstat_update;
		kstat_install(arc_ksp);
	}

	arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check,
	    arc_adjust_cb, NULL, SEC2NSEC(1));
	arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
	    arc_reap_cb, NULL, SEC2NSEC(1));

#ifdef _KERNEL
	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
	    EVENTHANDLER_PRI_FIRST);
#endif

	(void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

	arc_initialized = B_TRUE;
	arc_warm = B_FALSE;

	/*
	 * Calculate maximum amount of dirty data per pool.
	 *
	 * If it has been set by /etc/system, take that.
	 * Otherwise, use a percentage of physical memory defined by
	 * zfs_dirty_data_max_percent (default 10%) with a cap at
	 * zfs_dirty_data_max_max (default 4GB).
	 */
	if (zfs_dirty_data_max == 0) {
		zfs_dirty_data_max = ptob(physmem) *
		    zfs_dirty_data_max_percent / 100;
		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
		    zfs_dirty_data_max_max);
	}

#ifdef _KERNEL
	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
		prefetch_tunable_set = 1;

#ifdef __i386__
	if (prefetch_tunable_set == 0) {
		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
		    "-- to enable,\n");
		printf("            add \"vfs.zfs.prefetch_disable=0\" "
		    "to /boot/loader.conf.\n");
		zfs_prefetch_disable = 1;
	}
#else
	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
	    prefetch_tunable_set == 0) {
		printf("ZFS NOTICE: Prefetch is disabled by default if less "
		    "than 4GB of RAM is present;\n"
		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
		    "to /boot/loader.conf.\n");
		zfs_prefetch_disable = 1;
	}
#endif
	/* Warn about ZFS memory and address space requirements. */
	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
		    "expect unstable behavior.\n");
	}
	if (allmem < 512 * (1 << 20)) {
		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
		    "expect unstable behavior.\n");
		printf("             Consider tuning vm.kmem_size and "
		    "vm.kmem_size_max\n");
		printf("             in /boot/loader.conf.\n");
	}
#endif
}

void
arc_fini(void)
{
#ifdef _KERNEL
	if (arc_event_lowmem != NULL)
		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif

	/* Use B_TRUE to ensure *all* buffers are evicted */
	arc_flush(NULL, B_TRUE);

	mutex_enter(&arc_dnlc_evicts_lock);
	arc_dnlc_evicts_thread_exit = TRUE;
	/*
	 * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
	 * to FALSE when it is finished exiting; we're waiting for that.
	 */
	while (arc_dnlc_evicts_thread_exit) {
		cv_signal(&arc_dnlc_evicts_cv);
		cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
	}
	mutex_exit(&arc_dnlc_evicts_lock);

	arc_initialized = B_FALSE;

	if (arc_ksp != NULL) {
		kstat_delete(arc_ksp);
		arc_ksp = NULL;
	}

	(void) zthr_cancel(arc_adjust_zthr);
	zthr_destroy(arc_adjust_zthr);

	mutex_destroy(&arc_dnlc_evicts_lock);
	cv_destroy(&arc_dnlc_evicts_cv);

	(void) zthr_cancel(arc_reap_zthr);
	zthr_destroy(arc_reap_zthr);

	mutex_destroy(&arc_adjust_lock);
	cv_destroy(&arc_adjust_waiters_cv);

	arc_state_fini();
	buf_fini();

	ASSERT0(arc_loaned_bytes);
}

/*
 * Level 2 ARC
 *
 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
 * It uses dedicated storage devices to hold cached data, which are populated
 * using large infrequent writes. The main role of this cache is to boost
 * the performance of random read workloads. The intended L2ARC devices
 * include short-stroked disks, solid state disks, and other media with
 * substantially faster read latency than disk.
 *
 *                 +-----------------------+
 *                 |         ARC           |
 *                 +-----------------------+
 *                    |         ^     ^
 *                    |         |     |
 *      l2arc_feed_thread()    arc_read()
 *                    |         |     |
 *                    |  l2arc read   |
 *                    V         |     |
 *               +---------------+    |
 *               |     L2ARC     |    |
 *               +---------------+    |
 *                   |    ^           |
 *          l2arc_write() |           |
 *                   |    |           |
 *                   V    |           |
 *                 +-------+      +-------+
 *                 | vdev  |      | vdev  |
 *                 | cache |      | cache |
 *                 +-------+      +-------+
 *                 +=========+     .-----.
 *                 :  L2ARC  :    |-_____-|
 *                 : devices :    | Disks |
 *                 +=========+    `-_____-'
 *
 * Read requests are satisfied from the following sources, in order:
 *
 *	1) ARC
 *	2) vdev cache of L2ARC devices
 *	3) L2ARC devices
 *	4) vdev cache of disks
 *	5) disks
 *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this there are some significant differences between
 * the L2ARC and traditional cache design:
 *
 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
 * the ARC behave as usual, freeing buffers and placing headers on ghost
 * lists.  The ARC does not send buffers to the L2ARC during eviction as
 * this would add inflated write latencies for all ARC memory pressure.
 *
 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
 * It does this by periodically scanning buffers from the eviction-end of
 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
 * not already there. It scans until a headroom of buffers is satisfied,
 * which itself is a buffer for ARC eviction. If a compressible buffer is
 * found during scanning and selected for writing to an L2ARC device, we
 * temporarily boost scanning headroom during the next scan cycle to make
 * sure we adapt to compression effects (which might significantly reduce
 * the data volume we write to L2ARC). The thread that does this is
 * l2arc_feed_thread(), illustrated below; example sizes are included to
 * provide a better sense of ratio than this diagram:
 *
 *	       head -->                        tail
 *	        +---------------------+----------+
 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
 *	        +---------------------+----------+   |   o L2ARC eligible
 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
 *	        +---------------------+----------+   |
 *	             15.9 Gbytes      ^ 32 Mbytes    |
 *	                           headroom          |
 *	                                      l2arc_feed_thread()
 *	                                             |
 *	                 l2arc write hand <--[oooo]--'
 *	                         |          8 Mbyte
 *	                         |         write max
 *	                         V
 *		  +==============================+
 *	L2ARC dev |####|#|###|###|    |####| ... |
 *		  +==============================+
 *	                     32 Gbytes
 *
 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
 * evicted, then the L2ARC has cached a buffer much sooner than it probably
 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
 * safe to say that this is an uncommon case, since buffers at the end of
 * the ARC lists have moved there due to inactivity.
 *
 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
 * then the L2ARC simply misses copying some buffers.  This serves as a
 * pressure valve to prevent heavy read workloads from both stalling the ARC
 * with waits and clogging the L2ARC with writes.  This also helps prevent
 * the potential for the L2ARC to churn if it attempts to cache content too
 * quickly, such as during backups of the entire pool.
 *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from the tail of
 * these lists as pictured, the l2arc_feed_thread() will search from the
 * list heads for eligible buffers, greatly increasing its chance of finding
 * them.
 *
 * The L2ARC device write speed is also boosted during this time so that
 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
 * there are no L2ARC reads, and no fear of degrading read performance
 * through increased writes.
 *
 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
 * the vdev queue can aggregate them into larger and fewer writes.  Each
 * device is written to in a rotor fashion, sweeping writes through
 * available space then repeating.
 *
 * 7. The L2ARC does not store dirty content.  It never needs to flush
 * write buffers back to disk based storage.
 *
 * 8. If an ARC buffer is written (and dirtied) which also exists in the
 * L2ARC, the now stale L2ARC buffer is immediately dropped.
 *
 * The performance of the L2ARC can be tweaked by a number of tunables, which
 * may be necessary for different workloads:
 *
 *	l2arc_write_max		max write bytes per interval
 *	l2arc_write_boost	extra write bytes during device warmup
 *	l2arc_noprefetch	skip caching prefetched buffers
 *	l2arc_headroom		number of max device writes to precache
 *	l2arc_headroom_boost	when we find compressed buffers during ARC
 *				scanning, we multiply headroom by this
 *				percentage factor for the next scan cycle,
 *				since more compressed buffers are likely to
 *				be present
 *	l2arc_feed_secs		seconds between L2ARC writing
 *
 * Tunables may be removed or added as future performance improvements are
 * integrated, and also may become zpool properties.
 *
 * There are three key functions that control how the L2ARC warms up:
 *
 *	l2arc_write_eligible()	check if a buffer is eligible to cache
 *	l2arc_write_size()	calculate how much to write
 *	l2arc_write_interval()	calculate sleep delay between writes
 *
 * These three functions determine what to write, how much, and how quickly
 * to send writes.
 */

static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
{
	/*
	 * A buffer is *not* eligible for the L2ARC if it:
	 * 1. belongs to a different spa.
	 * 2. is already cached on the L2ARC.
	 * 3. has an I/O in progress (it may be an incomplete read).
	 * 4. is flagged not eligible (zfs property).
	 */
	if (hdr->b_spa != spa_guid) {
		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
		return (B_FALSE);
	}
	if (HDR_HAS_L2HDR(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
		return (B_FALSE);
	}
	if (HDR_IO_IN_PROGRESS(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
		return (B_FALSE);
	}
	if (!HDR_L2CACHE(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static uint64_t
l2arc_write_size(void)
{
	uint64_t size;

	/*
	 * Make sure our globals have meaningful values in case the user
	 * altered them.
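	 *
	 * E.g. (illustrative, assuming the historical defaults of 8 MB
	 * for both l2arc_write_max and l2arc_write_boost): while the ARC
	 * is still cold (arc_warm == B_FALSE) this returns 16 MB per feed
	 * cycle, dropping to 8 MB once the ARC has started evicting.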
	 */
	size = l2arc_write_max;
	if (size == 0) {
		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
		    "be greater than zero, resetting it to the default (%d)",
		    L2ARC_WRITE_SIZE);
		size = l2arc_write_max = L2ARC_WRITE_SIZE;
	}

	if (arc_warm == B_FALSE)
		size += l2arc_write_boost;

	return (size);
}

static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
	clock_t interval, next, now;

	/*
	 * If the ARC lists are busy, increase our write rate; if the
	 * lists are stale, idle back.  This is achieved by checking
	 * how much we previously wrote - if it was more than half of
	 * what we wanted, schedule the next write much sooner.
	 */
	if (l2arc_feed_again && wrote > (wanted / 2))
		interval = (hz * l2arc_feed_min_ms) / 1000;
	else
		interval = hz * l2arc_feed_secs;

	now = ddi_get_lbolt();
	next = MAX(now, MIN(now + interval, began + interval));

	return (next);
}

/*
 * Cycle through L2ARC devices.  This is how L2ARC load balances.
 * If a device is returned, this also returns holding the spa config lock.
 */
static l2arc_dev_t *
l2arc_dev_get_next(void)
{
	l2arc_dev_t *first, *next = NULL;

	/*
	 * Lock out the removal of spas (spa_namespace_lock), then removal
	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
	 * both locks will be dropped and a spa config lock held instead.
	 */
	mutex_enter(&spa_namespace_lock);
	mutex_enter(&l2arc_dev_mtx);

	/* if there are no vdevs, there is nothing to do */
	if (l2arc_ndev == 0)
		goto out;

	first = NULL;
	next = l2arc_dev_last;
	do {
		/* loop around the list looking for a non-faulted vdev */
		if (next == NULL) {
			next = list_head(l2arc_dev_list);
		} else {
			next = list_next(l2arc_dev_list, next);
			if (next == NULL)
				next = list_head(l2arc_dev_list);
		}

		/* if we have come back to the start, bail out */
		if (first == NULL)
			first = next;
		else if (next == first)
			break;

	} while (vdev_is_dead(next->l2ad_vdev));

	/* if we were unable to find any usable vdevs, return NULL */
	if (vdev_is_dead(next->l2ad_vdev))
		next = NULL;

	l2arc_dev_last = next;

out:
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Grab the config lock to prevent the 'next' device from being
	 * removed while we are writing to it.
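	 *
	 * To illustrate the rotor (hypothetical setup): with three cache
	 * vdevs A, B and C, successive calls return A, B, C, A, ...; any
	 * vdev for which vdev_is_dead() is true is skipped, and NULL is
	 * returned only when every cache vdev is unusable.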
	 */
	if (next != NULL)
		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
	mutex_exit(&spa_namespace_lock);

	return (next);
}

/*
 * Free buffers that were tagged for destruction.
 */
static void
l2arc_do_free_on_write(void)
{
	list_t *buflist;
	l2arc_data_free_t *df, *df_prev;

	mutex_enter(&l2arc_free_on_write_mtx);
	buflist = l2arc_free_on_write;

	for (df = list_tail(buflist); df; df = df_prev) {
		df_prev = list_prev(buflist, df);
		ASSERT3P(df->l2df_abd, !=, NULL);
		abd_free(df->l2df_abd);
		list_remove(buflist, df);
		kmem_free(df, sizeof (l2arc_data_free_t));
	}

	mutex_exit(&l2arc_free_on_write_mtx);
}

/*
 * A write to a cache device has completed.  Update all headers to allow
 * reads from these buffers to begin.
 */
static void
l2arc_write_done(zio_t *zio)
{
	l2arc_write_callback_t *cb;
	l2arc_dev_t *dev;
	list_t *buflist;
	arc_buf_hdr_t *head, *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	int64_t bytes_dropped = 0;

	cb = zio->io_private;
	ASSERT3P(cb, !=, NULL);
	dev = cb->l2wcb_dev;
	ASSERT3P(dev, !=, NULL);
	head = cb->l2wcb_head;
	ASSERT3P(head, !=, NULL);
	buflist = &dev->l2ad_buflist;
	ASSERT3P(buflist, !=, NULL);
	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
	    l2arc_write_callback_t *, cb);

	if (zio->io_error != 0)
		ARCSTAT_BUMP(arcstat_l2_writes_error);

	/*
	 * All writes completed, or an error was hit.
	 */
top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  We must retry so we
			 * don't leave the ARC_FLAG_L2_WRITING bit set.
			 */
			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);

			/*
			 * We don't want to rescan the headers we've
			 * already marked as having been written out, so
			 * we reinsert the head node so we can pick up
			 * where we left off.
			 */
			list_remove(buflist, head);
			list_insert_after(buflist, hdr, head);

			mutex_exit(&dev->l2ad_mtx);

			/*
			 * We wait for the hash lock to become available
			 * to try and prevent busy waiting, and increase
			 * the chance we'll be able to acquire the lock
			 * the next time around.
			 */
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		/*
		 * We could not have been moved into the arc_l2c_only
		 * state while in-flight due to our ARC_FLAG_L2_WRITING
		 * bit being set. Let's just ensure that's being enforced.
		 */
		ASSERT(HDR_HAS_L1HDR(hdr));

		if (zio->io_error != 0) {
			/*
			 * Error - drop L2ARC entry.
			 */
			list_remove(buflist, hdr);
			l2arc_trim(hdr);
			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);

			ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
			ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));

			bytes_dropped += arc_hdr_size(hdr);
			(void) refcount_remove_many(&dev->l2ad_alloc,
			    arc_hdr_size(hdr), hdr);
		}

		/*
		 * Allow ARC to begin reads and ghost list evictions to
		 * this L2ARC entry.
		 */
		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);

		mutex_exit(hash_lock);
	}

	atomic_inc_64(&l2arc_writes_done);
	list_remove(buflist, head);
	ASSERT(!HDR_HAS_L1HDR(head));
	kmem_cache_free(hdr_l2only_cache, head);
	mutex_exit(&dev->l2ad_mtx);

	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);

	l2arc_do_free_on_write();

	kmem_free(cb, sizeof (l2arc_write_callback_t));
}

/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 */
static void
l2arc_read_done(zio_t *zio)
{
	l2arc_read_callback_t *cb;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	boolean_t valid_cksum;

	ASSERT3P(zio->io_vd, !=, NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	cb = zio->io_private;
	ASSERT3P(cb, !=, NULL);
	hdr = cb->l2rcb_hdr;
	ASSERT3P(hdr, !=, NULL);

	hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the data was read into a temporary buffer,
	 * move it and free the buffer.
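	 *
	 * (Illustrative case: a block whose in-core size is 5 KB but which
	 * was padded to an 8 KB allocated size on the cache device is read
	 * into a temporary 8 KB abd; only the first arc_hdr_size(hdr)
	 * bytes are copied back into b_pabd before that abd is freed.)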
	 */
	if (cb->l2rcb_abd != NULL) {
		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
		if (zio->io_error == 0) {
			abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
			    arc_hdr_size(hdr));
		}

		/*
		 * The following must be done regardless of whether
		 * there was an error:
		 * - free the temporary buffer
		 * - point zio to the real ARC buffer
		 * - set zio size accordingly
		 * These are required because zio is either re-used for
		 * an I/O of the block in the case of the error
		 * or the zio is passed to arc_read_done() and it
		 * needs real data.
		 */
		abd_free(cb->l2rcb_abd);
		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
		zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
	}

	ASSERT3P(zio->io_abd, !=, NULL);

	/*
	 * Check this survived the L2ARC journey.
	 */
	ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/

	valid_cksum = arc_cksum_is_equal(hdr, zio);
	if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
		mutex_exit(hash_lock);
		zio->io_private = hdr;
		arc_read_done(zio);
	} else {
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			zio->io_error = SET_ERROR(EIO);
		}
		if (!valid_cksum)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio = zio_read(pio, zio->io_spa, zio->io_bp,
			    hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
			    hdr, zio->io_priority, cb->l2rcb_flags,
			    &cb->l2rcb_zb);
			for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
			    acb != NULL; acb = acb->acb_next)
				acb->acb_zio_head = zio;
			mutex_exit(hash_lock);
			zio_nowait(zio);
		} else
			mutex_exit(hash_lock);
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}

/*
 * This is the list priority from which the L2ARC will search for pages to
 * cache.  This is used within loops (0..3) to cycle through lists in the
 * desired order.  This order can have a significant effect on cache
 * performance.
 *
 * Currently the metadata lists are hit first, MFU then MRU, followed by
 * the data lists.  This function returns a locked list, and also returns
 * the lock pointer.
 */
static multilist_sublist_t *
l2arc_sublist_lock(int list_num)
{
	multilist_t *ml = NULL;
	unsigned int idx;

	ASSERT(list_num >= 0 && list_num <= 3);

	switch (list_num) {
	case 0:
		ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
		break;
	case 1:
		ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
		break;
	case 2:
		ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
		break;
	case 3:
		ml = arc_mru->arcs_list[ARC_BUFC_DATA];
		break;
	}

	/*
	 * Return a randomly-selected sublist.  This is acceptable
	 * because the caller feeds only a little bit of data for each
	 * call (8MB).  Subsequent calls will result in different
	 * sublists being selected.
	 */
	idx = multilist_get_random_index(ml);
	return (multilist_sublist_lock(ml, idx));
}

/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	arc_buf_hdr_t *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;

	buflist = &dev->l2ad_buflist;

	if (!all && dev->l2ad_first) {
		/*
		 * This is the first sweep through the device.  There is
		 * nothing to evict.
		 */
		return;
	}

	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
		/*
		 * When nearing the end of the device, evict to the end
		 * before the device write hand jumps to the start.
		 */
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  Retry.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		/*
		 * A header can't be on this list if it doesn't have an
		 * L2 header.
		 */
		ASSERT(HDR_HAS_L2HDR(hdr));

		/* Ensure this header has finished being written. */
		ASSERT(!HDR_L2_WRITING(hdr));
		ASSERT(!HDR_L2_WRITE_HEAD(hdr));

		if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		if (!HDR_HAS_L1HDR(hdr)) {
			ASSERT(!HDR_L2_READING(hdr));
			/*
			 * This doesn't exist in the ARC.  Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_lsize.
			 */
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		} else {
			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(hdr)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
			}

			arc_hdr_l2hdr_destroy(hdr);
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&dev->l2ad_mtx);
}

/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 *
 * Returns the number of bytes actually written (which may be smaller than
 * the delta by which the device hand has changed due to alignment).
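 *
 * For example (illustrative numbers): a header with psize = 6 KB written
 * to a cache vdev using 8 KB sectors is zero-padded to asize = 8 KB; the
 * device hand then advances by 8 KB even though only 6 KB of payload
 * (counted in write_psize) were copied out of the ARC.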
 */
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
	arc_buf_hdr_t *hdr, *hdr_prev, *head;
	uint64_t write_asize, write_psize, write_lsize, headroom;
	boolean_t full;
	l2arc_write_callback_t *cb;
	zio_t *pio, *wzio;
	uint64_t guid = spa_load_guid(spa);
	int try;

	ASSERT3P(dev->l2ad_vdev, !=, NULL);

	pio = NULL;
	write_lsize = write_asize = write_psize = 0;
	full = B_FALSE;
	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);

	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
	/*
	 * Copy buffers for L2ARC writing.
	 */
	for (try = 0; try <= 3; try++) {
		multilist_sublist_t *mls = l2arc_sublist_lock(try);
		uint64_t passed_sz = 0;

		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);

		/*
		 * L2ARC fast warmup.
		 *
		 * Until the ARC is warm and starts to evict, read from the
		 * head of the ARC lists rather than the tail.
		 */
		if (arc_warm == B_FALSE)
			hdr = multilist_sublist_head(mls);
		else
			hdr = multilist_sublist_tail(mls);
		if (hdr == NULL)
			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);

		headroom = target_sz * l2arc_headroom;
		if (zfs_compressed_arc_enabled)
			headroom = (headroom * l2arc_headroom_boost) / 100;

		for (; hdr; hdr = hdr_prev) {
			kmutex_t *hash_lock;

			if (arc_warm == B_FALSE)
				hdr_prev = multilist_sublist_next(mls, hdr);
			else
				hdr_prev = multilist_sublist_prev(mls, hdr);
			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
			    HDR_GET_LSIZE(hdr));

			hash_lock = HDR_LOCK(hdr);
			if (!mutex_tryenter(hash_lock)) {
				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
				/*
				 * Skip this buffer rather than waiting.
				 */
				continue;
			}

			passed_sz += HDR_GET_LSIZE(hdr);
			if (passed_sz > headroom) {
				/*
				 * Searched too far.
				 */
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
				break;
			}

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
				continue;
			}

			/*
			 * We rely on the L1 portion of the header below, so
			 * it's invalid for this header to have been evicted out
			 * of the ghost cache, prior to being written out. The
			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));

			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			ASSERT3U(arc_hdr_size(hdr), >, 0);
			uint64_t psize = arc_hdr_size(hdr);
			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
			    psize);

			if ((write_asize + asize) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_full);
				break;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				mutex_enter(&dev->l2ad_mtx);
				list_insert_head(&dev->l2ad_buflist, head);
				mutex_exit(&dev->l2ad_mtx);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
				ARCSTAT_BUMP(arcstat_l2_write_pios);
			}

			hdr->b_l2hdr.b_dev = dev;
			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
			arc_hdr_set_flags(hdr,
			    ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);

			mutex_enter(&dev->l2ad_mtx);
			list_insert_head(&dev->l2ad_buflist, hdr);
			mutex_exit(&dev->l2ad_mtx);

			(void) refcount_add_many(&dev->l2ad_alloc, psize, hdr);

			/*
			 * Normally the L2ARC can use the hdr's data, but if
			 * we're sharing data between the hdr and one of its
			 * bufs, L2ARC needs its own copy of the data so that
			 * the ZIO below can't race with the buf consumer.
			 * Another case where we need to create a copy of the
			 * data is when the buffer size is not device-aligned
			 * and we need to pad the block to make it such.
			 * That also keeps the clock hand suitably aligned.
			 *
			 * To ensure that the copy will be available for the
			 * lifetime of the ZIO and be cleaned up afterwards, we
			 * add it to the l2arc_free_on_write queue.
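			 *
			 * Sketch of the two cases handled below (names as
			 * in this function): if the hdr's data is unshared
			 * and psize == asize, b_pabd is handed to the ZIO
			 * as-is; otherwise a private asize-sized abd is
			 * allocated, the psize bytes are copied in, the
			 * tail is zeroed, and the copy is queued for
			 * free-on-write.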
			 */
			abd_t *to_write;
			if (!HDR_SHARED_DATA(hdr) && psize == asize) {
				to_write = hdr->b_l1hdr.b_pabd;
			} else {
				to_write = abd_alloc_for_io(asize,
				    HDR_ISTYPE_METADATA(hdr));
				abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
				if (asize != psize) {
					abd_zero_off(to_write, psize,
					    asize - psize);
				}
				l2arc_free_abd_on_write(to_write, asize,
				    arc_buf_type(hdr));
			}
			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    hdr->b_l2hdr.b_daddr, asize, to_write,
			    ZIO_CHECKSUM_OFF, NULL, hdr,
			    ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			write_lsize += HDR_GET_LSIZE(hdr);
			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);

			write_psize += psize;
			write_asize += asize;
			dev->l2ad_hand += asize;

			mutex_exit(hash_lock);

			(void) zio_nowait(wzio);
		}

		multilist_sublist_unlock(mls);

		if (full == B_TRUE)
			break;
	}

	/* No buffers selected for writing? */
	if (pio == NULL) {
		ASSERT0(write_lsize);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		return (0);
	}

	ASSERT3U(write_psize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
	ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
	ARCSTAT_INCR(arcstat_l2_psize, write_psize);
	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_asize);
}

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
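 *
 * In outline, each feed cycle below does (illustrative sketch of the
 * actual calls):
 *
 *	dev = l2arc_dev_get_next();		rotor over cache vdevs
 *	size = l2arc_write_size();		write_max (+ boost while cold)
 *	l2arc_evict(dev, size, B_FALSE);	clear space ahead of the hand
 *	wrote = l2arc_write_buffers(spa, dev, size);
 *	next = l2arc_write_interval(begin, size, wrote);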
 */
/* ARGSUSED */
static void
l2arc_feed_thread(void *unused __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.  This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT3P(spa, !=, NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size);

		/*
		 * Calculate interval between writes.
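		 *
		 * E.g. (illustrative, assuming the defaults l2arc_feed_secs
		 * = 1 and l2arc_feed_min_ms = 200): if more than half of
		 * the wanted bytes were written, the next cycle starts
		 * about 200 ms after this one began instead of about 1 s.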
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC.  By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	vdev_ashift_optimize(vd);

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	refcount_create(&adddev->l2ad_alloc);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}

/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT3P(remdev, !=, NULL);

	/*
	 * Remove device from global list
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references.  L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	mutex_destroy(&remdev->l2ad_mtx);
	refcount_destroy(&remdev->l2ad_alloc);
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini();
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}
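
/*
 * Administration sketch (illustrative note, not part of this file's logic):
 * an L2ARC device is typically attached with "zpool add <pool> cache
 * <device>" and detached with "zpool remove <pool> <device>", which reach
 * this code via l2arc_add_vdev() and l2arc_remove_vdev(). On FreeBSD the
 * tunables described above are assumed to be exposed as vfs.zfs.* sysctls
 * (e.g. vfs.zfs.l2arc_write_max), settable in /boot/loader.conf.
 */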