/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory.  This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about.  Our cache is not so simple.  At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them.  Blocks are only evictable
 *    when there are no external references active.  This makes
 *    eviction far more problematic:  we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space.  In these circumstances we are unable to adjust the cache
 *    size.  To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss.  Our model has a variable sized cache.  It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 *    elements of the cache are therefore exactly the same size.  So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict.  In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes).  We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal ARC algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * ARC list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each ARC state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an ARC list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
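
/*
 * Editorial sketch (inside this comment, not compiled) of the
 * lock-ordering rule above.  With an ARC list (state) lock already
 * held, the hash lock may only be taken with mutex_tryenter(), and
 * the header must be skipped when the lock is contended; the eviction
 * path in this file follows this pattern (names here are only loosely
 * representative):
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);	// list lock is held
 *	if (mutex_tryenter(hash_lock)) {
 *		// safe to evict or move the header
 *		mutex_exit(hash_lock);
 *	} else {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);	// contended; skip
 *	}
 */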

/*
 * ARC operation:
 *
 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 * This structure can point either to a block that is still in the cache or to
 * one that is only accessible in an L2 ARC device, or it can provide
 * information about a block that was recently evicted.  If a block is
 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 * information to retrieve it from the L2ARC device.  This information is
 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t.  A block
 * that is in this state cannot access the data directly.
 *
 * Blocks that are actively being referenced or have not been evicted
 * are cached in the L1ARC.  The L1ARC (l1arc_buf_hdr_t) is a structure within
 * the arc_buf_hdr_t that will point to the data block in memory.  A block can
 * only be read by a consumer if it has an l1arc_buf_hdr_t.  The L1ARC
 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 *
 * The L1ARC's data pointer may or may not be uncompressed.  The ARC has the
 * ability to store the physical data (b_pabd) associated with the DVA of the
 * arc_buf_hdr_t.  Since the b_pabd is a copy of the on-disk physical block,
 * it will match its on-disk compression characteristics.  This behavior can be
 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE.  When the
 * compressed ARC functionality is disabled, the b_pabd will point to an
 * uncompressed version of the on-disk data.
 *
 * Data in the L1ARC is not accessed by consumers of the ARC directly.  Each
 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 * consumer.  The ARC will provide references to this data and will keep it
 * cached until it is no longer in use.  The ARC caches only the L1ARC's
 * physical data block and will evict any arc_buf_t that is no longer
 * referenced.  The amount of memory consumed by the arc_buf_ts' data
 * buffers can be seen via the "overhead_size" kstat.
 *
 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 * compressed form.  The typical case is that consumers will want uncompressed
 * data, and when that happens a new data buffer is allocated where the data is
 * decompressed for them to use.  Currently the only consumer who wants
 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 * exists on disk.  When this happens, the arc_buf_t's data buffer is shared
 * with the arc_buf_hdr_t.
 *
 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's.
 * The first one is owned by a compressed send consumer (and therefore
 * references the same compressed data buffer as the arc_buf_hdr_t) and the
 * second could be used by any other consumer (and has its own uncompressed
 * copy of the data buffer).
 *
 *   arc_buf_hdr_t
 *   +-----------+
 *   | fields    |
 *   | common to |
 *   | L1- and   |
 *   | L2ARC     |
 *   +-----------+
 *   | l2arc_buf_hdr_t
 *   |           |
 *   +-----------+
 *   | l1arc_buf_hdr_t
 *   |           |              arc_buf_t
 *   | b_buf     +------------>+-----------+      arc_buf_t
 *   | b_pabd    +-+           |b_next     +---->+-----------+
 *   +-----------+ |           |-----------|     |b_next     +-->NULL
 *                 |           |b_comp = T |     +-----------+
 *                 |           |b_data     +-+   |b_comp = F |
 *                 |           +-----------+ |   |b_data     +-+
 *                 +->+------+               |   +-----------+ |
 *        compressed  |      |               |                 |
 *           data     |      |<--------------+                 | uncompressed
 *                    +------+  compressed,                    |     data
 *                                shared                       +-->+------+
 *                                 data                            |      |
 *                                                                 |      |
 *                                                                 +------+
 *
 * When a consumer reads a block, the ARC must first look to see if the
 * arc_buf_hdr_t is cached.  If the hdr is cached then the ARC allocates a new
 * arc_buf_t and either copies uncompressed data into a new data buffer from an
 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 * hdr is compressed and the desired compression characteristics of the
 * arc_buf_t consumer.  If the arc_buf_t ends up sharing data with the
 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 * be anywhere in the hdr's list.
 *
 * The diagram below shows an example of an uncompressed ARC hdr that is
 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 * the last element in the buf list):
 *
 *                arc_buf_hdr_t
 *                +-----------+
 *                |           |
 *                |           |
 *                |           |
 *                +-----------+
 * l2arc_buf_hdr_t|           |
 *                |           |
 *                +-----------+
 * l1arc_buf_hdr_t|           |
 *                |           |                 arc_buf_t    (shared)
 *                | b_buf     +------------>+---------+      arc_buf_t
 *                |           |             |b_next   +---->+---------+
 *                | b_pabd    +-+           |---------|     |b_next   +-->NULL
 *                +-----------+ |           |         |     +---------+
 *                              |           |b_data   +-+   |         |
 *                              |           +---------+ |   |b_data   +-+
 *                              +->+------+             |   +---------+ |
 *                                 |      |             |               |
 *                   uncompressed  |      |             |               |
 *                       data      +------+             |               |
 *                                    ^                 +->+------+     |
 *                                    |    uncompressed    |      |     |
 *                                    |        data        |      |     |
 *                                    |                    +------+     |
 *                                    +---------------------------------+
 *
 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 * since the physical block is about to be rewritten.  The new data contents
 * will be contained in the arc_buf_t.  As the I/O pipeline performs the write,
 * it may compress the data before writing it to disk.  The ARC will be called
 * with the transformed data and will bcopy the transformed on-disk block into
 * a newly allocated b_pabd.  Writes are always done into buffers which have
 * either been loaned (and hence are new and don't have other readers) or
 * buffers which have been released (and hence have their own hdr, if there
 * were originally other readers of the buf's original hdr).  This ensures that
 * the ARC only needs to update a single buf and its hdr after a write occurs.
 *
 * When the L2ARC is in use, it will also take advantage of the b_pabd.  The
 * L2ARC will always write the contents of b_pabd to the L2ARC.  This means
 * that when compressed ARC is enabled that the L2ARC blocks are identical
 * to the on-disk block in the main data pool.  This provides a significant
 * advantage since the ARC can leverage the bp's checksum when reading from the
 * L2ARC to determine if the contents are valid.  However, if the compressed
 * ARC is disabled, then the L2ARC's block must be transformed to look
 * like the physical block in the main data pool before comparing the
 * checksum and determining its validity.
 */
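
/*
 * Editorial sketch (inside this comment, not compiled) of the read-hit
 * decision described above.  The real logic lives in the buf allocation
 * path (arc_buf_alloc_impl() in this file) and handles more cases:
 *
 *	if (consumer wants compressed && hdr's b_pabd is compressed)
 *		share b_pabd with the new arc_buf_t;	// "zfs send" path
 *	else if (hdr's b_pabd is compressed)
 *		decompress b_pabd into a fresh b_data buffer;
 *	else
 *		share or copy the uncompressed bytes;
 */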

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/multilist.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_lock;
static kcondvar_t	arc_reclaim_thread_cv;
static boolean_t	arc_reclaim_thread_exit;
static kcondvar_t	arc_reclaim_waiters_cv;

static kmutex_t		arc_dnlc_evicts_lock;
static kcondvar_t	arc_dnlc_evicts_cv;
static boolean_t	arc_dnlc_evicts_thread_exit;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist.  A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int		zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int			arc_no_grow_shift = 5;
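
/*
 * Editorial worked example of the formula above: with arc_c at 4 GB and
 * the default arc_no_grow_shift of 5, the ARC may grow only while at
 * least 4 GB >> 5 = 128 MB of memory remains free; below that threshold,
 * reading a new block evicts an equal-sized block instead.
 */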

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern boolean_t zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * log2 fraction of the zio arena to keep free.
 */
int arc_zio_arena_free_shift = 2;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_no_grow_shift = 0;
int zfs_arc_p_min_shift = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024;	/* 8KB */
u_int zfs_arc_free_target = 0;

/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;

boolean_t zfs_compressed_arc_enabled = B_TRUE;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);

#if defined(__FreeBSD__) && defined(_KERNEL)
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
    0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
    "log2(fraction of ARC which must be free to allow growing)");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
    &arc_grow_retry, 0,
    "Wait in seconds before considering growing ARC");
SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
    &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
420270759Ssmh */ 421270759SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, 422270759Ssmh CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), 423270759Ssmh sysctl_vfs_zfs_arc_free_target, "IU", 424270759Ssmh "Desired number of free pages below which ARC triggers reclaim"); 425168404Spjd 426270759Ssmhstatic int 427270759Ssmhsysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) 428270759Ssmh{ 429270759Ssmh u_int val; 430270759Ssmh int err; 431270759Ssmh 432270759Ssmh val = zfs_arc_free_target; 433270759Ssmh err = sysctl_handle_int(oidp, &val, 0, req); 434270759Ssmh if (err != 0 || req->newptr == NULL) 435270759Ssmh return (err); 436270759Ssmh 437272483Ssmh if (val < minfree) 438270759Ssmh return (EINVAL); 439272483Ssmh if (val > vm_cnt.v_page_count) 440270759Ssmh return (EINVAL); 441270759Ssmh 442270759Ssmh zfs_arc_free_target = val; 443270759Ssmh 444270759Ssmh return (0); 445270759Ssmh} 446275748Sdelphij 447275748Sdelphij/* 448275748Sdelphij * Must be declared here, before the definition of corresponding kstat 449275748Sdelphij * macro which uses the same names will confuse the compiler. 450275748Sdelphij */ 451275748SdelphijSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, 452275748Sdelphij CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 453275748Sdelphij sysctl_vfs_zfs_arc_meta_limit, "QU", 454275748Sdelphij "ARC metadata limit"); 455272483Ssmh#endif 456270759Ssmh 457168404Spjd/* 458185029Spjd * Note that buffers can be in one of 6 states: 459168404Spjd * ARC_anon - anonymous (discussed below) 460168404Spjd * ARC_mru - recently used, currently cached 461168404Spjd * ARC_mru_ghost - recentely used, no longer in cache 462168404Spjd * ARC_mfu - frequently used, currently cached 463168404Spjd * ARC_mfu_ghost - frequently used, no longer in cache 464185029Spjd * ARC_l2c_only - exists in L2ARC but not other states 465185029Spjd * When there are no active references to the buffer, they are 466185029Spjd * are linked onto a list in one of these arc states. These are 467185029Spjd * the only buffers that can be evicted or deleted. Within each 468185029Spjd * state there are multiple lists, one for meta-data and one for 469185029Spjd * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 470185029Spjd * etc.) is tracked separately so that it can be managed more 471185029Spjd * explicitly: favored over data, limited explicitly. 472168404Spjd * 473168404Spjd * Anonymous buffers are buffers that are not associated with 474168404Spjd * a DVA. These are buffers that hold dirty block copies 475168404Spjd * before they are written to stable storage. By definition, 476168404Spjd * they are "ref'd" and are considered part of arc_mru 477168404Spjd * that cannot be freed. Generally, they will aquire a DVA 478168404Spjd * as they are written and migrate onto the arc_mru list. 479185029Spjd * 480185029Spjd * The ARC_l2c_only state is for buffers that are in the second 481185029Spjd * level ARC but no longer in any of the ARC_m* lists. The second 482185029Spjd * level ARC itself may also contain buffers that are in any of 483185029Spjd * the ARC_m* states - meaning that a buffer can exist in two 484185029Spjd * places. The reason for the ARC_l2c_only state is to keep the 485185029Spjd * buffer header in the hash table, so that reads that hit the 486185029Spjd * second level ARC benefit from these fast lookups. 
487168404Spjd */ 488168404Spjd 489168404Spjdtypedef struct arc_state { 490286763Smav /* 491286763Smav * list of evictable buffers 492286763Smav */ 493321553Smav multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; 494286763Smav /* 495286763Smav * total amount of evictable data in this state 496286763Smav */ 497307265Smav refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; 498286763Smav /* 499286763Smav * total amount of data in this state; this includes: evictable, 500286763Smav * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 501286763Smav */ 502286766Smav refcount_t arcs_size; 503168404Spjd} arc_state_t; 504168404Spjd 505185029Spjd/* The 6 states: */ 506168404Spjdstatic arc_state_t ARC_anon; 507168404Spjdstatic arc_state_t ARC_mru; 508168404Spjdstatic arc_state_t ARC_mru_ghost; 509168404Spjdstatic arc_state_t ARC_mfu; 510168404Spjdstatic arc_state_t ARC_mfu_ghost; 511185029Spjdstatic arc_state_t ARC_l2c_only; 512168404Spjd 513168404Spjdtypedef struct arc_stats { 514168404Spjd kstat_named_t arcstat_hits; 515168404Spjd kstat_named_t arcstat_misses; 516168404Spjd kstat_named_t arcstat_demand_data_hits; 517168404Spjd kstat_named_t arcstat_demand_data_misses; 518168404Spjd kstat_named_t arcstat_demand_metadata_hits; 519168404Spjd kstat_named_t arcstat_demand_metadata_misses; 520168404Spjd kstat_named_t arcstat_prefetch_data_hits; 521168404Spjd kstat_named_t arcstat_prefetch_data_misses; 522168404Spjd kstat_named_t arcstat_prefetch_metadata_hits; 523168404Spjd kstat_named_t arcstat_prefetch_metadata_misses; 524168404Spjd kstat_named_t arcstat_mru_hits; 525168404Spjd kstat_named_t arcstat_mru_ghost_hits; 526168404Spjd kstat_named_t arcstat_mfu_hits; 527168404Spjd kstat_named_t arcstat_mfu_ghost_hits; 528205231Skmacy kstat_named_t arcstat_allocated; 529168404Spjd kstat_named_t arcstat_deleted; 530251629Sdelphij /* 531251629Sdelphij * Number of buffers that could not be evicted because the hash lock 532251629Sdelphij * was held by another thread. The lock may not necessarily be held 533251629Sdelphij * by something using the same buffer, since hash locks are shared 534251629Sdelphij * by multiple buffers. 535251629Sdelphij */ 536168404Spjd kstat_named_t arcstat_mutex_miss; 537251629Sdelphij /* 538251629Sdelphij * Number of buffers skipped because they have I/O in progress, are 539251629Sdelphij * indrect prefetch buffers that have not lived long enough, or are 540251629Sdelphij * not from the spa we're trying to evict from. 541251629Sdelphij */ 542168404Spjd kstat_named_t arcstat_evict_skip; 543286763Smav /* 544286763Smav * Number of times arc_evict_state() was unable to evict enough 545286763Smav * buffers to reach it's target amount. 546286763Smav */ 547286763Smav kstat_named_t arcstat_evict_not_enough; 548208373Smm kstat_named_t arcstat_evict_l2_cached; 549208373Smm kstat_named_t arcstat_evict_l2_eligible; 550208373Smm kstat_named_t arcstat_evict_l2_ineligible; 551286763Smav kstat_named_t arcstat_evict_l2_skip; 552168404Spjd kstat_named_t arcstat_hash_elements; 553168404Spjd kstat_named_t arcstat_hash_elements_max; 554168404Spjd kstat_named_t arcstat_hash_collisions; 555168404Spjd kstat_named_t arcstat_hash_chains; 556168404Spjd kstat_named_t arcstat_hash_chain_max; 557168404Spjd kstat_named_t arcstat_p; 558168404Spjd kstat_named_t arcstat_c; 559168404Spjd kstat_named_t arcstat_c_min; 560168404Spjd kstat_named_t arcstat_c_max; 561168404Spjd kstat_named_t arcstat_size; 562286574Smav /* 563321610Smav * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. 
564307265Smav * Note that the compressed bytes may match the uncompressed bytes 565307265Smav * if the block is either not compressed or compressed arc is disabled. 566307265Smav */ 567307265Smav kstat_named_t arcstat_compressed_size; 568307265Smav /* 569321610Smav * Uncompressed size of the data stored in b_pabd. If compressed 570307265Smav * arc is disabled then this value will be identical to the stat 571307265Smav * above. 572307265Smav */ 573307265Smav kstat_named_t arcstat_uncompressed_size; 574307265Smav /* 575307265Smav * Number of bytes stored in all the arc_buf_t's. This is classified 576307265Smav * as "overhead" since this data is typically short-lived and will 577307265Smav * be evicted from the arc when it becomes unreferenced unless the 578307265Smav * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level 579307265Smav * values have been set (see comment in dbuf.c for more information). 580307265Smav */ 581307265Smav kstat_named_t arcstat_overhead_size; 582307265Smav /* 583286574Smav * Number of bytes consumed by internal ARC structures necessary 584286574Smav * for tracking purposes; these structures are not actually 585286574Smav * backed by ARC buffers. This includes arc_buf_hdr_t structures 586286574Smav * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 587286574Smav * caches), and arc_buf_t structures (allocated via arc_buf_t 588286574Smav * cache). 589286574Smav */ 590185029Spjd kstat_named_t arcstat_hdr_size; 591286574Smav /* 592286574Smav * Number of bytes consumed by ARC buffers of type equal to 593286574Smav * ARC_BUFC_DATA. This is generally consumed by buffers backing 594286574Smav * on disk user data (e.g. plain file contents). 595286574Smav */ 596208373Smm kstat_named_t arcstat_data_size; 597286574Smav /* 598286574Smav * Number of bytes consumed by ARC buffers of type equal to 599286574Smav * ARC_BUFC_METADATA. This is generally consumed by buffers 600286574Smav * backing on disk data that is used for internal ZFS 601286574Smav * structures (e.g. ZAP, dnode, indirect blocks, etc). 602286574Smav */ 603286574Smav kstat_named_t arcstat_metadata_size; 604286574Smav /* 605286574Smav * Number of bytes consumed by various buffers and structures 606286574Smav * not actually backed with ARC buffers. This includes bonus 607286574Smav * buffers (allocated directly via zio_buf_* functions), 608286574Smav * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 609286574Smav * cache), and dnode_t structures (allocated via dnode_t cache). 610286574Smav */ 611208373Smm kstat_named_t arcstat_other_size; 612286574Smav /* 613286574Smav * Total number of bytes consumed by ARC buffers residing in the 614286574Smav * arc_anon state. This includes *all* buffers in the arc_anon 615286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 616286574Smav * are all included in this value. 617286574Smav */ 618286574Smav kstat_named_t arcstat_anon_size; 619286574Smav /* 620286574Smav * Number of bytes consumed by ARC buffers that meet the 621286574Smav * following criteria: backing buffers of type ARC_BUFC_DATA, 622286574Smav * residing in the arc_anon state, and are eligible for eviction 623286574Smav * (e.g. have no outstanding holds on the buffer). 
624286574Smav */ 625286574Smav kstat_named_t arcstat_anon_evictable_data; 626286574Smav /* 627286574Smav * Number of bytes consumed by ARC buffers that meet the 628286574Smav * following criteria: backing buffers of type ARC_BUFC_METADATA, 629286574Smav * residing in the arc_anon state, and are eligible for eviction 630286574Smav * (e.g. have no outstanding holds on the buffer). 631286574Smav */ 632286574Smav kstat_named_t arcstat_anon_evictable_metadata; 633286574Smav /* 634286574Smav * Total number of bytes consumed by ARC buffers residing in the 635286574Smav * arc_mru state. This includes *all* buffers in the arc_mru 636286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 637286574Smav * are all included in this value. 638286574Smav */ 639286574Smav kstat_named_t arcstat_mru_size; 640286574Smav /* 641286574Smav * Number of bytes consumed by ARC buffers that meet the 642286574Smav * following criteria: backing buffers of type ARC_BUFC_DATA, 643286574Smav * residing in the arc_mru state, and are eligible for eviction 644286574Smav * (e.g. have no outstanding holds on the buffer). 645286574Smav */ 646286574Smav kstat_named_t arcstat_mru_evictable_data; 647286574Smav /* 648286574Smav * Number of bytes consumed by ARC buffers that meet the 649286574Smav * following criteria: backing buffers of type ARC_BUFC_METADATA, 650286574Smav * residing in the arc_mru state, and are eligible for eviction 651286574Smav * (e.g. have no outstanding holds on the buffer). 652286574Smav */ 653286574Smav kstat_named_t arcstat_mru_evictable_metadata; 654286574Smav /* 655286574Smav * Total number of bytes that *would have been* consumed by ARC 656286574Smav * buffers in the arc_mru_ghost state. The key thing to note 657286574Smav * here, is the fact that this size doesn't actually indicate 658286574Smav * RAM consumption. The ghost lists only consist of headers and 659286574Smav * don't actually have ARC buffers linked off of these headers. 660286574Smav * Thus, *if* the headers had associated ARC buffers, these 661286574Smav * buffers *would have* consumed this number of bytes. 662286574Smav */ 663286574Smav kstat_named_t arcstat_mru_ghost_size; 664286574Smav /* 665286574Smav * Number of bytes that *would have been* consumed by ARC 666286574Smav * buffers that are eligible for eviction, of type 667286574Smav * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 668286574Smav */ 669286574Smav kstat_named_t arcstat_mru_ghost_evictable_data; 670286574Smav /* 671286574Smav * Number of bytes that *would have been* consumed by ARC 672286574Smav * buffers that are eligible for eviction, of type 673286574Smav * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 674286574Smav */ 675286574Smav kstat_named_t arcstat_mru_ghost_evictable_metadata; 676286574Smav /* 677286574Smav * Total number of bytes consumed by ARC buffers residing in the 678286574Smav * arc_mfu state. This includes *all* buffers in the arc_mfu 679286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 680286574Smav * are all included in this value. 681286574Smav */ 682286574Smav kstat_named_t arcstat_mfu_size; 683286574Smav /* 684286574Smav * Number of bytes consumed by ARC buffers that are eligible for 685286574Smav * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 686286574Smav * state. 
687286574Smav */ 688286574Smav kstat_named_t arcstat_mfu_evictable_data; 689286574Smav /* 690286574Smav * Number of bytes consumed by ARC buffers that are eligible for 691286574Smav * eviction, of type ARC_BUFC_METADATA, and reside in the 692286574Smav * arc_mfu state. 693286574Smav */ 694286574Smav kstat_named_t arcstat_mfu_evictable_metadata; 695286574Smav /* 696286574Smav * Total number of bytes that *would have been* consumed by ARC 697286574Smav * buffers in the arc_mfu_ghost state. See the comment above 698286574Smav * arcstat_mru_ghost_size for more details. 699286574Smav */ 700286574Smav kstat_named_t arcstat_mfu_ghost_size; 701286574Smav /* 702286574Smav * Number of bytes that *would have been* consumed by ARC 703286574Smav * buffers that are eligible for eviction, of type 704286574Smav * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 705286574Smav */ 706286574Smav kstat_named_t arcstat_mfu_ghost_evictable_data; 707286574Smav /* 708286574Smav * Number of bytes that *would have been* consumed by ARC 709286574Smav * buffers that are eligible for eviction, of type 710286574Smav * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 711286574Smav */ 712286574Smav kstat_named_t arcstat_mfu_ghost_evictable_metadata; 713185029Spjd kstat_named_t arcstat_l2_hits; 714185029Spjd kstat_named_t arcstat_l2_misses; 715185029Spjd kstat_named_t arcstat_l2_feeds; 716185029Spjd kstat_named_t arcstat_l2_rw_clash; 717208373Smm kstat_named_t arcstat_l2_read_bytes; 718208373Smm kstat_named_t arcstat_l2_write_bytes; 719185029Spjd kstat_named_t arcstat_l2_writes_sent; 720185029Spjd kstat_named_t arcstat_l2_writes_done; 721185029Spjd kstat_named_t arcstat_l2_writes_error; 722286763Smav kstat_named_t arcstat_l2_writes_lock_retry; 723185029Spjd kstat_named_t arcstat_l2_evict_lock_retry; 724185029Spjd kstat_named_t arcstat_l2_evict_reading; 725286570Smav kstat_named_t arcstat_l2_evict_l1cached; 726185029Spjd kstat_named_t arcstat_l2_free_on_write; 727185029Spjd kstat_named_t arcstat_l2_abort_lowmem; 728185029Spjd kstat_named_t arcstat_l2_cksum_bad; 729185029Spjd kstat_named_t arcstat_l2_io_error; 730323754Savg kstat_named_t arcstat_l2_lsize; 731323754Savg kstat_named_t arcstat_l2_psize; 732185029Spjd kstat_named_t arcstat_l2_hdr_size; 733205231Skmacy kstat_named_t arcstat_l2_write_trylock_fail; 734205231Skmacy kstat_named_t arcstat_l2_write_passed_headroom; 735205231Skmacy kstat_named_t arcstat_l2_write_spa_mismatch; 736206796Spjd kstat_named_t arcstat_l2_write_in_l2; 737205231Skmacy kstat_named_t arcstat_l2_write_hdr_io_in_progress; 738205231Skmacy kstat_named_t arcstat_l2_write_not_cacheable; 739205231Skmacy kstat_named_t arcstat_l2_write_full; 740205231Skmacy kstat_named_t arcstat_l2_write_buffer_iter; 741205231Skmacy kstat_named_t arcstat_l2_write_pios; 742205231Skmacy kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 743205231Skmacy kstat_named_t arcstat_l2_write_buffer_list_iter; 744205231Skmacy kstat_named_t arcstat_l2_write_buffer_list_null_iter; 745242845Sdelphij kstat_named_t arcstat_memory_throttle_count; 746275748Sdelphij kstat_named_t arcstat_meta_used; 747275748Sdelphij kstat_named_t arcstat_meta_limit; 748275748Sdelphij kstat_named_t arcstat_meta_max; 749275780Sdelphij kstat_named_t arcstat_meta_min; 750287702Sdelphij kstat_named_t arcstat_sync_wait_for_async; 751287702Sdelphij kstat_named_t arcstat_demand_hit_predictive_prefetch; 752168404Spjd} arc_stats_t; 753168404Spjd 754168404Spjdstatic arc_stats_t arc_stats = { 755168404Spjd { "hits", KSTAT_DATA_UINT64 }, 756168404Spjd { 
"misses", KSTAT_DATA_UINT64 }, 757168404Spjd { "demand_data_hits", KSTAT_DATA_UINT64 }, 758168404Spjd { "demand_data_misses", KSTAT_DATA_UINT64 }, 759168404Spjd { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 760168404Spjd { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 761168404Spjd { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 762168404Spjd { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 763168404Spjd { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 764168404Spjd { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 765168404Spjd { "mru_hits", KSTAT_DATA_UINT64 }, 766168404Spjd { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 767168404Spjd { "mfu_hits", KSTAT_DATA_UINT64 }, 768168404Spjd { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 769205231Skmacy { "allocated", KSTAT_DATA_UINT64 }, 770168404Spjd { "deleted", KSTAT_DATA_UINT64 }, 771168404Spjd { "mutex_miss", KSTAT_DATA_UINT64 }, 772168404Spjd { "evict_skip", KSTAT_DATA_UINT64 }, 773286763Smav { "evict_not_enough", KSTAT_DATA_UINT64 }, 774208373Smm { "evict_l2_cached", KSTAT_DATA_UINT64 }, 775208373Smm { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 776208373Smm { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 777286763Smav { "evict_l2_skip", KSTAT_DATA_UINT64 }, 778168404Spjd { "hash_elements", KSTAT_DATA_UINT64 }, 779168404Spjd { "hash_elements_max", KSTAT_DATA_UINT64 }, 780168404Spjd { "hash_collisions", KSTAT_DATA_UINT64 }, 781168404Spjd { "hash_chains", KSTAT_DATA_UINT64 }, 782168404Spjd { "hash_chain_max", KSTAT_DATA_UINT64 }, 783168404Spjd { "p", KSTAT_DATA_UINT64 }, 784168404Spjd { "c", KSTAT_DATA_UINT64 }, 785168404Spjd { "c_min", KSTAT_DATA_UINT64 }, 786168404Spjd { "c_max", KSTAT_DATA_UINT64 }, 787185029Spjd { "size", KSTAT_DATA_UINT64 }, 788307265Smav { "compressed_size", KSTAT_DATA_UINT64 }, 789307265Smav { "uncompressed_size", KSTAT_DATA_UINT64 }, 790307265Smav { "overhead_size", KSTAT_DATA_UINT64 }, 791185029Spjd { "hdr_size", KSTAT_DATA_UINT64 }, 792208373Smm { "data_size", KSTAT_DATA_UINT64 }, 793286574Smav { "metadata_size", KSTAT_DATA_UINT64 }, 794208373Smm { "other_size", KSTAT_DATA_UINT64 }, 795286574Smav { "anon_size", KSTAT_DATA_UINT64 }, 796286574Smav { "anon_evictable_data", KSTAT_DATA_UINT64 }, 797286574Smav { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 798286574Smav { "mru_size", KSTAT_DATA_UINT64 }, 799286574Smav { "mru_evictable_data", KSTAT_DATA_UINT64 }, 800286574Smav { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 801286574Smav { "mru_ghost_size", KSTAT_DATA_UINT64 }, 802286574Smav { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 803286574Smav { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 804286574Smav { "mfu_size", KSTAT_DATA_UINT64 }, 805286574Smav { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 806286574Smav { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 807286574Smav { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 808286574Smav { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 809286574Smav { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 810185029Spjd { "l2_hits", KSTAT_DATA_UINT64 }, 811185029Spjd { "l2_misses", KSTAT_DATA_UINT64 }, 812185029Spjd { "l2_feeds", KSTAT_DATA_UINT64 }, 813185029Spjd { "l2_rw_clash", KSTAT_DATA_UINT64 }, 814208373Smm { "l2_read_bytes", KSTAT_DATA_UINT64 }, 815208373Smm { "l2_write_bytes", KSTAT_DATA_UINT64 }, 816185029Spjd { "l2_writes_sent", KSTAT_DATA_UINT64 }, 817185029Spjd { "l2_writes_done", KSTAT_DATA_UINT64 }, 818185029Spjd { "l2_writes_error", KSTAT_DATA_UINT64 }, 819286763Smav { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, 820185029Spjd { "l2_evict_lock_retry", 
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_asize",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
	{ "l2_write_full",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
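
/*
 * Editorial note: ARCSTAT_MAX() is a lock-free maximum update.  It
 * re-reads the current value into 'm' and retries the compare-and-swap
 * until either the stored maximum is already >= 'val' or the CAS
 * succeeds, so concurrent updaters can never lose a larger value.
 */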

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
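
/*
 * Editorial example: a caller counting a hit might invoke
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * which, via token pasting, bumps exactly one of
 * arcstat_demand_data_hits, arcstat_demand_metadata_hits,
 * arcstat_prefetch_data_hits, or arcstat_prefetch_metadata_hits.
 */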

kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

/* compressed size of entire arc */
#define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
#define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
/* number of bytes in the arc from arc_buf_t's */
#define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	boolean_t		acb_compressed;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_children_ready;
	arc_done_func_t	*awcb_physdone;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache.  The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately to save a
 * couple words in pointers.  arc_hdr_realloc() is used to switch a header
 * between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;
#ifdef ZFS_DEBUG
	/*
	 * Used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;

	arc_buf_contents_t	b_type;
	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/*
	 * This field stores the size of the data buffer after
	 * compression, and is set in the arc's zio completion handlers.
	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
	 *
	 * While the block pointers can store up to 32MB in their psize
	 * field, we can only store up to 32MB minus 512B.  This is due
	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
	 * a field of zeros represents 512B in the bp).  We can't use a
	 * bias of 1 since we need to reserve a psize of zero, here, to
	 * represent holes and embedded blocks.
	 *
	 * This isn't a problem in practice, since the maximum size of a
	 * buffer is limited to 16MB, so we never need to store 32MB in
	 * this field.  Even in the upstream illumos code base, the
	 * maximum size of a buffer is limited to 16MB.
	 */
	uint16_t		b_psize;

	/*
	 * This field stores the size of the data buffer before
	 * compression, and cannot change once set.  It is in units
	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
	 */
	uint16_t		b_lsize;	/* immutable */
	uint64_t		b_spa;		/* immutable */

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
};

#if defined(__FreeBSD__) && defined(_KERNEL)
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val <= 0 || val > arc_c_max)
		return (EINVAL);

	arc_meta_limit = val;
	return (0);
}

static int
sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
{
	uint32_t val;
	int err;

	val = arc_no_grow_shift;
	err = sysctl_handle_32(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val >= arc_shrink_shift)
		return (EINVAL);

	arc_no_grow_shift = val;
	return (0);
}

static int
sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_max;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (zfs_arc_max == 0) {
		/* Loader tunable so blindly set */
		zfs_arc_max = val;
		return (0);
	}

	if (val < arc_abs_min || val > kmem_size())
		return (EINVAL);
	if (val < arc_c_min)
		return (EINVAL);
	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
		return (EINVAL);

	arc_c_max = val;

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	if (zfs_arc_meta_limit == 0) {
		/* limit meta-data to 1/4 of the arc capacity */
		arc_meta_limit = arc_c_max / 4;
	}

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;

	zfs_arc_max = arc_c;

	return (0);
}

static int
sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_min;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (zfs_arc_min == 0) {
		/* Loader tunable so blindly set */
		zfs_arc_min = val;
		return (0);
	}

	if (val < arc_abs_min || val > arc_c_max)
		return (EINVAL);

	arc_c_min = val;

	if (zfs_arc_meta_min == 0)
		arc_meta_min = arc_c_min / 2;

	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;

	return (0);
}
#endif
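
/*
 * Editorial usage note: with the handlers above compiled in, the ARC
 * bounds declared via SYSCTL_PROC can be inspected or resized at
 * runtime from userland, e.g.:
 *
 *	# sysctl vfs.zfs.arc_max		(read the current maximum)
 *	# sysctl vfs.zfs.arc_max=4294967296	(cap the ARC at 4 GB)
 *
 * A new value is rejected (EINVAL) if it falls below arc_abs_min (16MB),
 * exceeds kmem_size(), or would invert the min/max relationship.
 */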
1167302265Ssmh 1168302265Ssmh if (arc_c < arc_c_min) 1169302265Ssmh arc_c = arc_c_min; 1170302265Ssmh 1171302265Ssmh zfs_arc_min = arc_c_min; 1172302265Ssmh 1173302265Ssmh return (0); 1174302265Ssmh} 1175275748Sdelphij#endif 1176275748Sdelphij 1177168404Spjd#define GHOST_STATE(state) \ 1178185029Spjd ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 1179185029Spjd (state) == arc_l2c_only) 1180168404Spjd 1181275811Sdelphij#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 1182275811Sdelphij#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 1183275811Sdelphij#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 1184275811Sdelphij#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 1185307265Smav#define HDR_COMPRESSION_ENABLED(hdr) \ 1186307265Smav ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) 1187286570Smav 1188275811Sdelphij#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 1189275811Sdelphij#define HDR_L2_READING(hdr) \ 1190307265Smav (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 1191307265Smav ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 1192275811Sdelphij#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 1193275811Sdelphij#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 1194275811Sdelphij#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 1195307265Smav#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) 1196168404Spjd 1197286570Smav#define HDR_ISTYPE_METADATA(hdr) \ 1198307265Smav ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 1199286570Smav#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 1200286570Smav 1201286570Smav#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 1202286570Smav#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 1203286570Smav 1204307265Smav/* For storing compression mode in b_flags */ 1205307265Smav#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) 1206307265Smav 1207307265Smav#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ 1208307265Smav HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) 1209307265Smav#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ 1210307265Smav HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); 1211307265Smav 1212307265Smav#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) 1213321535Smav#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) 1214321535Smav#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) 1215307265Smav 1216168404Spjd/* 1217185029Spjd * Other sizes 1218185029Spjd */ 1219185029Spjd 1220286570Smav#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 1221286570Smav#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 1222185029Spjd 1223185029Spjd/* 1224168404Spjd * Hash table routines 1225168404Spjd */ 1226168404Spjd 1227205253Skmacy#define HT_LOCK_PAD CACHE_LINE_SIZE 1228168404Spjd 1229168404Spjdstruct ht_lock { 1230168404Spjd kmutex_t ht_lock; 1231168404Spjd#ifdef _KERNEL 1232168404Spjd unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 1233168404Spjd#endif 1234168404Spjd}; 1235168404Spjd 1236168404Spjd#define BUF_LOCKS 256 1237168404Spjdtypedef struct buf_hash_table { 1238168404Spjd uint64_t ht_mask; 1239168404Spjd arc_buf_hdr_t **ht_table; 1240205264Skmacy struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 1241168404Spjd} buf_hash_table_t; 1242168404Spjd 1243168404Spjdstatic buf_hash_table_t buf_hash_table; 1244168404Spjd 1245168404Spjd#define 
BUF_HASH_INDEX(spa, dva, birth) \
1246168404Spjd	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
1247168404Spjd#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
1248168404Spjd#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
1249219089Spjd#define	HDR_LOCK(hdr) \
1250219089Spjd	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
1251168404Spjd
1252168404Spjduint64_t zfs_crc64_table[256];
1253168404Spjd
1254185029Spjd/*
1255185029Spjd * Level 2 ARC
1256185029Spjd */
1257185029Spjd
1258272707Savg#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
1259251478Sdelphij#define	L2ARC_HEADROOM		2			/* num of writes */
1260251478Sdelphij/*
1261251478Sdelphij * If we discover during ARC scan any buffers to be compressed, we boost
1262251478Sdelphij * our headroom for the next scanning cycle by this percentage multiple.
1263251478Sdelphij */
1264251478Sdelphij#define	L2ARC_HEADROOM_BOOST	200
1265208373Smm#define	L2ARC_FEED_SECS		1		/* caching interval secs */
1266208373Smm#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
1267185029Spjd
1268185029Spjd#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
1269185029Spjd#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
1270185029Spjd
1271251631Sdelphij/* L2ARC Performance Tunables */
1272185029Spjduint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
1273185029Spjduint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
1274185029Spjduint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
1275251478Sdelphijuint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
1276185029Spjduint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
1277208373Smmuint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
1278219089Spjdboolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
1279208373Smmboolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
1280208373Smmboolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
1281185029Spjd
1282217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
1283205231Skmacy    &l2arc_write_max, 0, "max write size");
1284217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
1285205231Skmacy    &l2arc_write_boost, 0, "extra write during warmup");
1286217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
1287205231Skmacy    &l2arc_headroom, 0, "number of dev writes");
1288217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
1289205231Skmacy    &l2arc_feed_secs, 0, "interval seconds");
1290217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
1291208373Smm    &l2arc_feed_min_ms, 0, "min interval milliseconds");
1292205231Skmacy
1293205231SkmacySYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
1294205231Skmacy    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
1295208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
1296208373Smm    &l2arc_feed_again, 0, "turbo warmup");
1297208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
1298208373Smm    &l2arc_norw, 0, "no reads during writes");
1299205231Skmacy
1300217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
1301286770Smav    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
1302307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
1303307265Smav    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1304307265Smav    "size of metadata in anonymous state");
1305307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
1306307265Smav    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1307307265Smav    "size of data in anonymous state");
1308205231Skmacy
1309217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
1310286770Smav    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
1311307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
1312307265Smav    &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1313307265Smav    "size of metadata in mru state");
1314307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
1315307265Smav    &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1316307265Smav    "size of data in mru state");
1317205231Skmacy
1318217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
1319286770Smav    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
1320307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
1321307265Smav    &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1322205231Skmacy    "size of metadata in mru ghost state");
1323307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
1324307265Smav    &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1325205231Skmacy    "size of data in mru ghost state");
1326205231Skmacy
1327217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
1328286770Smav    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
1329307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
1330307265Smav    &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1331307265Smav    "size of metadata in mfu state");
1332307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
1333307265Smav    &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1334307265Smav    "size of data in mfu state");
1335205231Skmacy
1336217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
1337286770Smav    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
1338307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
1339307265Smav    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1340205231Skmacy    "size of metadata in mfu ghost state");
1341307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
1342307265Smav    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1343205231Skmacy    "size of data in mfu ghost state");
1344205231Skmacy
1345217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
1346286770Smav    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");
1347205231Skmacy
1348185029Spjd/*
1349185029Spjd * L2ARC Internals
1350185029Spjd */
1351286570Smavstruct l2arc_dev {
1352185029Spjd	vdev_t			*l2ad_vdev;	/* vdev */
1353185029Spjd	spa_t			*l2ad_spa;	/* spa */
1354185029Spjd	uint64_t		l2ad_hand;	/* next write location */
1355185029Spjd	uint64_t		l2ad_start;	/* first addr on device */
1356185029Spjd	uint64_t		l2ad_end;	/* last addr on device */
1357185029Spjd	boolean_t		l2ad_first;	/* first sweep through */
1358208373Smm	boolean_t		l2ad_writing;	/* currently writing */
1359286570Smav	kmutex_t		l2ad_mtx;	/* lock for buffer list */
1360286570Smav	list_t			l2ad_buflist;	/* buffer list */
1361185029Spjd	list_node_t		l2ad_node;	/* device list node */
1362286598Smav	refcount_t		l2ad_alloc;	/* allocated bytes */
1363286570Smav};
1364185029Spjd
1365185029Spjdstatic list_t L2ARC_dev_list;			/* device list */
1366185029Spjdstatic list_t *l2arc_dev_list;			/* device list pointer */
1367185029Spjdstatic kmutex_t
l2arc_dev_mtx; /* device list mutex */ 1368185029Spjdstatic l2arc_dev_t *l2arc_dev_last; /* last device used */ 1369185029Spjdstatic list_t L2ARC_free_on_write; /* free after write buf list */ 1370185029Spjdstatic list_t *l2arc_free_on_write; /* free after write list ptr */ 1371185029Spjdstatic kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1372185029Spjdstatic uint64_t l2arc_ndev; /* number of devices */ 1373185029Spjd 1374185029Spjdtypedef struct l2arc_read_callback { 1375321535Smav arc_buf_hdr_t *l2rcb_hdr; /* read header */ 1376251478Sdelphij blkptr_t l2rcb_bp; /* original blkptr */ 1377268123Sdelphij zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1378251478Sdelphij int l2rcb_flags; /* original flags */ 1379321613Smav abd_t *l2rcb_abd; /* temporary buffer */ 1380185029Spjd} l2arc_read_callback_t; 1381185029Spjd 1382185029Spjdtypedef struct l2arc_write_callback { 1383185029Spjd l2arc_dev_t *l2wcb_dev; /* device info */ 1384185029Spjd arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1385185029Spjd} l2arc_write_callback_t; 1386185029Spjd 1387185029Spjdtypedef struct l2arc_data_free { 1388185029Spjd /* protected by l2arc_free_on_write_mtx */ 1389321610Smav abd_t *l2df_abd; 1390185029Spjd size_t l2df_size; 1391307265Smav arc_buf_contents_t l2df_type; 1392185029Spjd list_node_t l2df_list_node; 1393185029Spjd} l2arc_data_free_t; 1394185029Spjd 1395185029Spjdstatic kmutex_t l2arc_feed_thr_lock; 1396185029Spjdstatic kcondvar_t l2arc_feed_thr_cv; 1397185029Spjdstatic uint8_t l2arc_thread_exit; 1398185029Spjd 1399321610Smavstatic abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); 1400307265Smavstatic void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); 1401321610Smavstatic void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); 1402321610Smavstatic void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); 1403307265Smavstatic void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); 1404321610Smavstatic void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); 1405321610Smavstatic void arc_hdr_free_pabd(arc_buf_hdr_t *); 1406321610Smavstatic void arc_hdr_alloc_pabd(arc_buf_hdr_t *); 1407275811Sdelphijstatic void arc_access(arc_buf_hdr_t *, kmutex_t *); 1408286763Smavstatic boolean_t arc_is_overflowing(); 1409275811Sdelphijstatic void arc_buf_watch(arc_buf_t *); 1410275811Sdelphij 1411286570Smavstatic arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1412286570Smavstatic uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1413307265Smavstatic inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1414307265Smavstatic inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1415286570Smav 1416275811Sdelphijstatic boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1417275811Sdelphijstatic void l2arc_read_done(zio_t *); 1418185029Spjd 1419290191Savgstatic void 1420290191Savgl2arc_trim(const arc_buf_hdr_t *hdr) 1421290191Savg{ 1422290191Savg l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1423290191Savg 1424290191Savg ASSERT(HDR_HAS_L2HDR(hdr)); 1425290191Savg ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 1426290191Savg 1427307265Smav if (HDR_GET_PSIZE(hdr) != 0) { 1428290191Savg trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 1429307265Smav HDR_GET_PSIZE(hdr), 0); 1430290191Savg } 1431290191Savg} 1432290191Savg 1433168404Spjdstatic uint64_t 1434209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1435168404Spjd{ 1436168404Spjd uint8_t *vdva = (uint8_t *)dva; 1437168404Spjd uint64_t crc = -1ULL; 
1438168404Spjd	int i;
1439168404Spjd
1440168404Spjd	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1441168404Spjd
1442168404Spjd	for (i = 0; i < sizeof (dva_t); i++)
1443168404Spjd		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1444168404Spjd
1445209962Smm	crc ^= (spa>>8) ^ birth;
1446168404Spjd
1447168404Spjd	return (crc);
1448168404Spjd}
1449168404Spjd
1450307265Smav#define	HDR_EMPTY(hdr)						\
1451307265Smav	((hdr)->b_dva.dva_word[0] == 0 &&			\
1452307265Smav	(hdr)->b_dva.dva_word[1] == 0)
1453168404Spjd
1454307265Smav#define	HDR_EQUAL(spa, dva, birth, hdr)				\
1455307265Smav	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
1456307265Smav	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
1457307265Smav	((hdr)->b_birth == birth) && ((hdr)->b_spa == spa))
1458168404Spjd
1459219089Spjdstatic void
1460219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr)
1461219089Spjd{
1462219089Spjd	hdr->b_dva.dva_word[0] = 0;
1463219089Spjd	hdr->b_dva.dva_word[1] = 0;
1464219089Spjd	hdr->b_birth = 0;
1465219089Spjd}
1466219089Spjd
1467168404Spjdstatic arc_buf_hdr_t *
1468268075Sdelphijbuf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1469168404Spjd{
1470268075Sdelphij	const dva_t *dva = BP_IDENTITY(bp);
1471268075Sdelphij	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1472168404Spjd	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1473168404Spjd	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1474275811Sdelphij	arc_buf_hdr_t *hdr;
1475168404Spjd
1476168404Spjd	mutex_enter(hash_lock);
1477275811Sdelphij	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1478275811Sdelphij	    hdr = hdr->b_hash_next) {
1479307265Smav		if (HDR_EQUAL(spa, dva, birth, hdr)) {
1480168404Spjd			*lockp = hash_lock;
1481275811Sdelphij			return (hdr);
1482168404Spjd		}
1483168404Spjd	}
1484168404Spjd	mutex_exit(hash_lock);
1485168404Spjd	*lockp = NULL;
1486168404Spjd	return (NULL);
1487168404Spjd}
1488168404Spjd
1489168404Spjd/*
1490168404Spjd * Insert an entry into the hash table.  If there is already an element
1491168404Spjd * equal to the passed hdr in the hash table, then the already existing
1492168404Spjd * element will be returned and the new element will not be inserted.
1493168404Spjd * Otherwise returns NULL.
1494286570Smav * If lockp == NULL, the caller is assumed to already hold the hash lock.
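 *
 * An illustrative sketch of the lookup-or-insert pattern this supports
 * (hypothetical caller, not code from this file):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {
 *		(lost the race: discard the new hdr and use "exists")
 *	}
 *	mutex_exit(hash_lock);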
1495168404Spjd */ 1496168404Spjdstatic arc_buf_hdr_t * 1497275811Sdelphijbuf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1498168404Spjd{ 1499275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1500168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1501275811Sdelphij arc_buf_hdr_t *fhdr; 1502168404Spjd uint32_t i; 1503168404Spjd 1504275811Sdelphij ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1505275811Sdelphij ASSERT(hdr->b_birth != 0); 1506275811Sdelphij ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1507286570Smav 1508286570Smav if (lockp != NULL) { 1509286570Smav *lockp = hash_lock; 1510286570Smav mutex_enter(hash_lock); 1511286570Smav } else { 1512286570Smav ASSERT(MUTEX_HELD(hash_lock)); 1513286570Smav } 1514286570Smav 1515275811Sdelphij for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1516275811Sdelphij fhdr = fhdr->b_hash_next, i++) { 1517307265Smav if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1518275811Sdelphij return (fhdr); 1519168404Spjd } 1520168404Spjd 1521275811Sdelphij hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1522275811Sdelphij buf_hash_table.ht_table[idx] = hdr; 1523307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1524168404Spjd 1525168404Spjd /* collect some hash table performance data */ 1526168404Spjd if (i > 0) { 1527168404Spjd ARCSTAT_BUMP(arcstat_hash_collisions); 1528168404Spjd if (i == 1) 1529168404Spjd ARCSTAT_BUMP(arcstat_hash_chains); 1530168404Spjd 1531168404Spjd ARCSTAT_MAX(arcstat_hash_chain_max, i); 1532168404Spjd } 1533168404Spjd 1534168404Spjd ARCSTAT_BUMP(arcstat_hash_elements); 1535168404Spjd ARCSTAT_MAXSTAT(arcstat_hash_elements); 1536168404Spjd 1537168404Spjd return (NULL); 1538168404Spjd} 1539168404Spjd 1540168404Spjdstatic void 1541275811Sdelphijbuf_hash_remove(arc_buf_hdr_t *hdr) 1542168404Spjd{ 1543275811Sdelphij arc_buf_hdr_t *fhdr, **hdrp; 1544275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1545168404Spjd 1546168404Spjd ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1547275811Sdelphij ASSERT(HDR_IN_HASH_TABLE(hdr)); 1548168404Spjd 1549275811Sdelphij hdrp = &buf_hash_table.ht_table[idx]; 1550275811Sdelphij while ((fhdr = *hdrp) != hdr) { 1551307265Smav ASSERT3P(fhdr, !=, NULL); 1552275811Sdelphij hdrp = &fhdr->b_hash_next; 1553168404Spjd } 1554275811Sdelphij *hdrp = hdr->b_hash_next; 1555275811Sdelphij hdr->b_hash_next = NULL; 1556307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1557168404Spjd 1558168404Spjd /* collect some hash table performance data */ 1559168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1560168404Spjd 1561168404Spjd if (buf_hash_table.ht_table[idx] && 1562168404Spjd buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1563168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1564168404Spjd} 1565168404Spjd 1566168404Spjd/* 1567168404Spjd * Global data structures and functions for the buf kmem cache. 
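 * Three kmem caches back this machinery (see buf_init() below):
 * hdr_full_cache for headers carrying L1 state, hdr_l2only_cache for the
 * smaller headers that only track an L2ARC location, and buf_cache for
 * arc_buf_t allocations.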
1568168404Spjd */ 1569286570Smavstatic kmem_cache_t *hdr_full_cache; 1570286570Smavstatic kmem_cache_t *hdr_l2only_cache; 1571168404Spjdstatic kmem_cache_t *buf_cache; 1572168404Spjd 1573168404Spjdstatic void 1574168404Spjdbuf_fini(void) 1575168404Spjd{ 1576168404Spjd int i; 1577168404Spjd 1578168404Spjd kmem_free(buf_hash_table.ht_table, 1579168404Spjd (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1580168404Spjd for (i = 0; i < BUF_LOCKS; i++) 1581168404Spjd mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1582286570Smav kmem_cache_destroy(hdr_full_cache); 1583286570Smav kmem_cache_destroy(hdr_l2only_cache); 1584168404Spjd kmem_cache_destroy(buf_cache); 1585168404Spjd} 1586168404Spjd 1587168404Spjd/* 1588168404Spjd * Constructor callback - called when the cache is empty 1589168404Spjd * and a new buf is requested. 1590168404Spjd */ 1591168404Spjd/* ARGSUSED */ 1592168404Spjdstatic int 1593286570Smavhdr_full_cons(void *vbuf, void *unused, int kmflag) 1594168404Spjd{ 1595275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1596168404Spjd 1597286570Smav bzero(hdr, HDR_FULL_SIZE); 1598286570Smav cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1599286570Smav refcount_create(&hdr->b_l1hdr.b_refcnt); 1600286570Smav mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1601286763Smav multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1602286570Smav arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1603185029Spjd 1604168404Spjd return (0); 1605168404Spjd} 1606168404Spjd 1607185029Spjd/* ARGSUSED */ 1608185029Spjdstatic int 1609286570Smavhdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1610286570Smav{ 1611286570Smav arc_buf_hdr_t *hdr = vbuf; 1612286570Smav 1613286570Smav bzero(hdr, HDR_L2ONLY_SIZE); 1614286570Smav arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1615286570Smav 1616286570Smav return (0); 1617286570Smav} 1618286570Smav 1619286570Smav/* ARGSUSED */ 1620286570Smavstatic int 1621185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag) 1622185029Spjd{ 1623185029Spjd arc_buf_t *buf = vbuf; 1624185029Spjd 1625185029Spjd bzero(buf, sizeof (arc_buf_t)); 1626219089Spjd mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1627208373Smm arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1628208373Smm 1629185029Spjd return (0); 1630185029Spjd} 1631185029Spjd 1632168404Spjd/* 1633168404Spjd * Destructor callback - called when a cached buf is 1634168404Spjd * no longer required. 
1635168404Spjd */ 1636168404Spjd/* ARGSUSED */ 1637168404Spjdstatic void 1638286570Smavhdr_full_dest(void *vbuf, void *unused) 1639168404Spjd{ 1640275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1641168404Spjd 1642307265Smav ASSERT(HDR_EMPTY(hdr)); 1643286570Smav cv_destroy(&hdr->b_l1hdr.b_cv); 1644286570Smav refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1645286570Smav mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1646286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1647286570Smav arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1648168404Spjd} 1649168404Spjd 1650185029Spjd/* ARGSUSED */ 1651185029Spjdstatic void 1652286570Smavhdr_l2only_dest(void *vbuf, void *unused) 1653286570Smav{ 1654286570Smav arc_buf_hdr_t *hdr = vbuf; 1655286570Smav 1656307265Smav ASSERT(HDR_EMPTY(hdr)); 1657286570Smav arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1658286570Smav} 1659286570Smav 1660286570Smav/* ARGSUSED */ 1661286570Smavstatic void 1662185029Spjdbuf_dest(void *vbuf, void *unused) 1663185029Spjd{ 1664185029Spjd arc_buf_t *buf = vbuf; 1665185029Spjd 1666219089Spjd mutex_destroy(&buf->b_evict_lock); 1667208373Smm arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1668185029Spjd} 1669185029Spjd 1670168404Spjd/* 1671168404Spjd * Reclaim callback -- invoked when memory is low. 1672168404Spjd */ 1673168404Spjd/* ARGSUSED */ 1674168404Spjdstatic void 1675168404Spjdhdr_recl(void *unused) 1676168404Spjd{ 1677168404Spjd dprintf("hdr_recl called\n"); 1678168404Spjd /* 1679168404Spjd * umem calls the reclaim func when we destroy the buf cache, 1680168404Spjd * which is after we do arc_fini(). 1681168404Spjd */ 1682168404Spjd if (!arc_dead) 1683286763Smav cv_signal(&arc_reclaim_thread_cv); 1684168404Spjd} 1685168404Spjd 1686168404Spjdstatic void 1687168404Spjdbuf_init(void) 1688168404Spjd{ 1689168404Spjd uint64_t *ct; 1690168404Spjd uint64_t hsize = 1ULL << 12; 1691168404Spjd int i, j; 1692168404Spjd 1693168404Spjd /* 1694168404Spjd * The hash table is big enough to fill all of physical memory 1695269230Sdelphij * with an average block size of zfs_arc_average_blocksize (default 8K). 1696269230Sdelphij * By default, the table will take up 1697269230Sdelphij * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
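	 *
	 * Worked example (assuming the default 8K average block size and
	 * 16 GB of physical memory): the loop below stops at
	 * hsize = 2^21, since 2^21 * 8K = 16 GB, so the table itself
	 * consumes 2^21 * 8 bytes = 16 MB, i.e. the advertised 1 MB
	 * per GB.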
1698168404Spjd */ 1699269230Sdelphij while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1700168404Spjd hsize <<= 1; 1701168404Spjdretry: 1702168404Spjd buf_hash_table.ht_mask = hsize - 1; 1703168404Spjd buf_hash_table.ht_table = 1704168404Spjd kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1705168404Spjd if (buf_hash_table.ht_table == NULL) { 1706168404Spjd ASSERT(hsize > (1ULL << 8)); 1707168404Spjd hsize >>= 1; 1708168404Spjd goto retry; 1709168404Spjd } 1710168404Spjd 1711286570Smav hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1712286570Smav 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1713286570Smav hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1714286570Smav HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1715286570Smav NULL, NULL, 0); 1716168404Spjd buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1717185029Spjd 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1718168404Spjd 1719168404Spjd for (i = 0; i < 256; i++) 1720168404Spjd for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1721168404Spjd *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1722168404Spjd 1723168404Spjd for (i = 0; i < BUF_LOCKS; i++) { 1724168404Spjd mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1725168404Spjd NULL, MUTEX_DEFAULT, NULL); 1726168404Spjd } 1727168404Spjd} 1728168404Spjd 1729321535Smav/* 1730321535Smav * This is the size that the buf occupies in memory. If the buf is compressed, 1731321535Smav * it will correspond to the compressed size. You should use this method of 1732321535Smav * getting the buf size unless you explicitly need the logical size. 1733321535Smav */ 1734321535Smavint32_t 1735321535Smavarc_buf_size(arc_buf_t *buf) 1736321535Smav{ 1737321535Smav return (ARC_BUF_COMPRESSED(buf) ? 1738321535Smav HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); 1739321535Smav} 1740321535Smav 1741321535Smavint32_t 1742321535Smavarc_buf_lsize(arc_buf_t *buf) 1743321535Smav{ 1744321535Smav return (HDR_GET_LSIZE(buf->b_hdr)); 1745321535Smav} 1746321535Smav 1747321535Smavenum zio_compress 1748321535Smavarc_get_compression(arc_buf_t *buf) 1749321535Smav{ 1750321535Smav return (ARC_BUF_COMPRESSED(buf) ? 1751321535Smav HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); 1752321535Smav} 1753321535Smav 1754307265Smav#define ARC_MINTIME (hz>>4) /* 62 ms */ 1755307265Smav 1756307265Smavstatic inline boolean_t 1757307265Smavarc_buf_is_shared(arc_buf_t *buf) 1758286570Smav{ 1759307265Smav boolean_t shared = (buf->b_data != NULL && 1760321610Smav buf->b_hdr->b_l1hdr.b_pabd != NULL && 1761321610Smav abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && 1762321610Smav buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); 1763307265Smav IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); 1764321535Smav IMPLY(shared, ARC_BUF_SHARED(buf)); 1765321535Smav IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); 1766321535Smav 1767321535Smav /* 1768321535Smav * It would be nice to assert arc_can_share() too, but the "hdr isn't 1769321535Smav * already being shared" requirement prevents us from doing that. 1770321535Smav */ 1771321535Smav 1772307265Smav return (shared); 1773307265Smav} 1774286570Smav 1775321535Smav/* 1776321535Smav * Free the checksum associated with this header. If there is no checksum, this 1777321535Smav * is a no-op. 
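 * The b_freeze_cksum field is protected by b_freeze_lock, which is taken
 * and released inside this function.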
1778321535Smav */ 1779307265Smavstatic inline void 1780307265Smavarc_cksum_free(arc_buf_hdr_t *hdr) 1781307265Smav{ 1782307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1783307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1784307265Smav if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1785307265Smav kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); 1786307265Smav hdr->b_l1hdr.b_freeze_cksum = NULL; 1787286570Smav } 1788307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1789286570Smav} 1790286570Smav 1791321535Smav/* 1792321535Smav * Return true iff at least one of the bufs on hdr is not compressed. 1793321535Smav */ 1794321535Smavstatic boolean_t 1795321535Smavarc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) 1796321535Smav{ 1797321535Smav for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { 1798321535Smav if (!ARC_BUF_COMPRESSED(b)) { 1799321535Smav return (B_TRUE); 1800321535Smav } 1801321535Smav } 1802321535Smav return (B_FALSE); 1803321535Smav} 1804321535Smav 1805321535Smav/* 1806321535Smav * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data 1807321535Smav * matches the checksum that is stored in the hdr. If there is no checksum, 1808321535Smav * or if the buf is compressed, this is a no-op. 1809321535Smav */ 1810168404Spjdstatic void 1811168404Spjdarc_cksum_verify(arc_buf_t *buf) 1812168404Spjd{ 1813307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 1814168404Spjd zio_cksum_t zc; 1815168404Spjd 1816168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1817168404Spjd return; 1818168404Spjd 1819321535Smav if (ARC_BUF_COMPRESSED(buf)) { 1820321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 1821321535Smav arc_hdr_has_uncompressed_buf(hdr)); 1822321535Smav return; 1823321535Smav } 1824321535Smav 1825307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1826307265Smav 1827307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1828307265Smav if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { 1829307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1830168404Spjd return; 1831168404Spjd } 1832321535Smav 1833321535Smav fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); 1834307265Smav if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) 1835168404Spjd panic("buffer modified while frozen!"); 1836307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1837168404Spjd} 1838168404Spjd 1839307265Smavstatic boolean_t 1840307265Smavarc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) 1841185029Spjd{ 1842307265Smav enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); 1843307265Smav boolean_t valid_cksum; 1844185029Spjd 1845307265Smav ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); 1846307265Smav VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); 1847185029Spjd 1848307265Smav /* 1849307265Smav * We rely on the blkptr's checksum to determine if the block 1850307265Smav * is valid or not. When compressed arc is enabled, the l2arc 1851307265Smav * writes the block to the l2arc just as it appears in the pool. 1852307265Smav * This allows us to use the blkptr's checksum to validate the 1853307265Smav * data that we just read off of the l2arc without having to store 1854307265Smav * a separate checksum in the arc_buf_hdr_t. However, if compressed 1855307265Smav * arc is disabled, then the data written to the l2arc is always 1856307265Smav * uncompressed and won't match the block as it exists in the main 1857307265Smav * pool. When this is the case, we must first compress it if it is 1858307265Smav * compressed on the main pool before we can validate the checksum. 
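	 *
	 * In short (informal summary of the two cases):
	 *
	 *	compressed ARC on:	the L2ARC copy matches the on-pool
	 *				block; verify the blkptr checksum
	 *				directly.
	 *	compressed ARC off:	the L2ARC copy is stored
	 *				uncompressed; recompress it first,
	 *				then verify.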
1859307265Smav */ 1860307265Smav if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { 1861307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 1862307265Smav uint64_t lsize = HDR_GET_LSIZE(hdr); 1863307265Smav uint64_t csize; 1864307265Smav 1865329490Smav abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); 1866329490Smav csize = zio_compress_data(compress, zio->io_abd, 1867329490Smav abd_to_buf(cdata), lsize); 1868321610Smav 1869307265Smav ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); 1870307265Smav if (csize < HDR_GET_PSIZE(hdr)) { 1871307265Smav /* 1872307265Smav * Compressed blocks are always a multiple of the 1873307265Smav * smallest ashift in the pool. Ideally, we would 1874307265Smav * like to round up the csize to the next 1875307265Smav * spa_min_ashift but that value may have changed 1876307265Smav * since the block was last written. Instead, 1877307265Smav * we rely on the fact that the hdr's psize 1878307265Smav * was set to the psize of the block when it was 1879307265Smav * last written. We set the csize to that value 1880307265Smav * and zero out any part that should not contain 1881307265Smav * data. 1882307265Smav */ 1883329490Smav abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); 1884307265Smav csize = HDR_GET_PSIZE(hdr); 1885307265Smav } 1886329490Smav zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); 1887307265Smav } 1888307265Smav 1889307265Smav /* 1890307265Smav * Block pointers always store the checksum for the logical data. 1891307265Smav * If the block pointer has the gang bit set, then the checksum 1892307265Smav * it represents is for the reconstituted data and not for an 1893307265Smav * individual gang member. The zio pipeline, however, must be able to 1894307265Smav * determine the checksum of each of the gang constituents so it 1895307265Smav * treats the checksum comparison differently than what we need 1896307265Smav * for l2arc blocks. This prevents us from using the 1897307265Smav * zio_checksum_error() interface directly. Instead we must call the 1898307265Smav * zio_checksum_error_impl() so that we can ensure the checksum is 1899307265Smav * generated using the correct checksum algorithm and accounts for the 1900307265Smav * logical I/O size and not just a gang fragment. 1901307265Smav */ 1902307265Smav valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, 1903321610Smav BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, 1904307265Smav zio->io_offset, NULL) == 0); 1905307265Smav zio_pop_transforms(zio); 1906307265Smav return (valid_cksum); 1907185029Spjd} 1908185029Spjd 1909321535Smav/* 1910321535Smav * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a 1911321535Smav * checksum and attaches it to the buf's hdr so that we can ensure that the buf 1912321535Smav * isn't modified later on. If buf is compressed or there is already a checksum 1913321535Smav * on the hdr, this is a no-op (we only checksum uncompressed bufs). 
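 *
 * For example (hypothetical debugging setup), running with the
 * ZFS_DEBUG_MODIFY bit set in zfs_flags makes this function record a
 * fletcher-2 checksum for each uncompressed buf, which
 * arc_cksum_verify() later compares against the data to catch buffers
 * modified while frozen.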
1914321535Smav */ 1915168404Spjdstatic void 1916307265Smavarc_cksum_compute(arc_buf_t *buf) 1917168404Spjd{ 1918307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 1919307265Smav 1920307265Smav if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1921168404Spjd return; 1922168404Spjd 1923307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1924321535Smav 1925286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1926307265Smav if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1927321535Smav ASSERT(arc_hdr_has_uncompressed_buf(hdr)); 1928307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1929168404Spjd return; 1930321535Smav } else if (ARC_BUF_COMPRESSED(buf)) { 1931321535Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1932321535Smav return; 1933168404Spjd } 1934321535Smav 1935321535Smav ASSERT(!ARC_BUF_COMPRESSED(buf)); 1936307265Smav hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), 1937307265Smav KM_SLEEP); 1938321535Smav fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, 1939307265Smav hdr->b_l1hdr.b_freeze_cksum); 1940307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1941240133Smm#ifdef illumos 1942240133Smm arc_buf_watch(buf); 1943277300Ssmh#endif 1944168404Spjd} 1945168404Spjd 1946240133Smm#ifdef illumos 1947240133Smm#ifndef _KERNEL 1948240133Smmtypedef struct procctl { 1949240133Smm long cmd; 1950240133Smm prwatch_t prwatch; 1951240133Smm} procctl_t; 1952240133Smm#endif 1953240133Smm 1954240133Smm/* ARGSUSED */ 1955240133Smmstatic void 1956240133Smmarc_buf_unwatch(arc_buf_t *buf) 1957240133Smm{ 1958240133Smm#ifndef _KERNEL 1959240133Smm if (arc_watch) { 1960240133Smm int result; 1961240133Smm procctl_t ctl; 1962240133Smm ctl.cmd = PCWATCH; 1963240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1964240133Smm ctl.prwatch.pr_size = 0; 1965240133Smm ctl.prwatch.pr_wflags = 0; 1966240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1967240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1968240133Smm } 1969240133Smm#endif 1970240133Smm} 1971240133Smm 1972240133Smm/* ARGSUSED */ 1973240133Smmstatic void 1974240133Smmarc_buf_watch(arc_buf_t *buf) 1975240133Smm{ 1976240133Smm#ifndef _KERNEL 1977240133Smm if (arc_watch) { 1978240133Smm int result; 1979240133Smm procctl_t ctl; 1980240133Smm ctl.cmd = PCWATCH; 1981240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1982321535Smav ctl.prwatch.pr_size = arc_buf_size(buf); 1983240133Smm ctl.prwatch.pr_wflags = WA_WRITE; 1984240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1985240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1986240133Smm } 1987240133Smm#endif 1988240133Smm} 1989240133Smm#endif /* illumos */ 1990240133Smm 1991286570Smavstatic arc_buf_contents_t 1992286570Smavarc_buf_type(arc_buf_hdr_t *hdr) 1993286570Smav{ 1994307265Smav arc_buf_contents_t type; 1995286570Smav if (HDR_ISTYPE_METADATA(hdr)) { 1996307265Smav type = ARC_BUFC_METADATA; 1997286570Smav } else { 1998307265Smav type = ARC_BUFC_DATA; 1999286570Smav } 2000307265Smav VERIFY3U(hdr->b_type, ==, type); 2001307265Smav return (type); 2002286570Smav} 2003286570Smav 2004321535Smavboolean_t 2005321535Smavarc_is_metadata(arc_buf_t *buf) 2006321535Smav{ 2007321535Smav return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); 2008321535Smav} 2009321535Smav 2010286570Smavstatic uint32_t 2011286570Smavarc_bufc_to_flags(arc_buf_contents_t type) 2012286570Smav{ 2013286570Smav switch (type) { 2014286570Smav case ARC_BUFC_DATA: 2015286570Smav /* metadata field is 0 if buffer contains normal data */ 2016286570Smav return (0); 2017286570Smav case ARC_BUFC_METADATA: 2018286570Smav return 
(ARC_FLAG_BUFC_METADATA); 2019286570Smav default: 2020286570Smav break; 2021286570Smav } 2022286570Smav panic("undefined ARC buffer type!"); 2023286570Smav return ((uint32_t)-1); 2024286570Smav} 2025286570Smav 2026168404Spjdvoid 2027168404Spjdarc_buf_thaw(arc_buf_t *buf) 2028168404Spjd{ 2029307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2030307265Smav 2031321535Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2032321535Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2033321535Smav 2034321535Smav arc_cksum_verify(buf); 2035321535Smav 2036321535Smav /* 2037321535Smav * Compressed buffers do not manipulate the b_freeze_cksum or 2038321535Smav * allocate b_thawed. 2039321535Smav */ 2040321535Smav if (ARC_BUF_COMPRESSED(buf)) { 2041321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2042321535Smav arc_hdr_has_uncompressed_buf(hdr)); 2043321535Smav return; 2044185029Spjd } 2045168404Spjd 2046307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2047307265Smav arc_cksum_free(hdr); 2048219089Spjd 2049307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 2050286570Smav#ifdef ZFS_DEBUG 2051219089Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 2052307265Smav if (hdr->b_l1hdr.b_thawed != NULL) 2053307265Smav kmem_free(hdr->b_l1hdr.b_thawed, 1); 2054307265Smav hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 2055219089Spjd } 2056286570Smav#endif 2057219089Spjd 2058307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2059240133Smm 2060240133Smm#ifdef illumos 2061240133Smm arc_buf_unwatch(buf); 2062277300Ssmh#endif 2063168404Spjd} 2064168404Spjd 2065168404Spjdvoid 2066168404Spjdarc_buf_freeze(arc_buf_t *buf) 2067168404Spjd{ 2068307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2069219089Spjd kmutex_t *hash_lock; 2070219089Spjd 2071168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 2072168404Spjd return; 2073168404Spjd 2074321535Smav if (ARC_BUF_COMPRESSED(buf)) { 2075321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2076321535Smav arc_hdr_has_uncompressed_buf(hdr)); 2077321535Smav return; 2078321535Smav } 2079321535Smav 2080307265Smav hash_lock = HDR_LOCK(hdr); 2081219089Spjd mutex_enter(hash_lock); 2082219089Spjd 2083307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2084307265Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || 2085307265Smav hdr->b_l1hdr.b_state == arc_anon); 2086307265Smav arc_cksum_compute(buf); 2087219089Spjd mutex_exit(hash_lock); 2088168404Spjd} 2089168404Spjd 2090307265Smav/* 2091307265Smav * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, 2092307265Smav * the following functions should be used to ensure that the flags are 2093307265Smav * updated in a thread-safe way. When manipulating the flags either 2094307265Smav * the hash_lock must be held or the hdr must be undiscoverable. This 2095307265Smav * ensures that we're not racing with any other threads when updating 2096307265Smav * the flags. 
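 *
 * For example (hypothetical race if b_flags were written directly):
 *
 *	thread A:	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
 *	thread B:	hdr->b_flags &= ~ARC_FLAG_PREFETCH;
 *
 * Both updates are read-modify-write sequences, so without the hash
 * lock (or the hdr being undiscoverable) one thread's store could
 * silently undo the other's.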
2097307265Smav */
2098307265Smavstatic inline void
2099307265Smavarc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
2100307265Smav{
2101307265Smav	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2102307265Smav	hdr->b_flags |= flags;
2103307265Smav}
2104307265Smav
2105307265Smavstatic inline void
2106307265Smavarc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
2107307265Smav{
2108307265Smav	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2109307265Smav	hdr->b_flags &= ~flags;
2110307265Smav}
2111307265Smav
2112307265Smav/*
2113307265Smav * Setting the compression bits in the arc_buf_hdr_t's b_flags is
2114307265Smav * done in a special way since we have to clear and set bits
2115307265Smav * at the same time. Consumers that wish to set the compression bits
2116307265Smav * must use this function to ensure that the flags are updated in
2117307265Smav * a thread-safe manner.
2118307265Smav */
2119168404Spjdstatic void
2120307265Smavarc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
2121168404Spjd{
2122307265Smav	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2123307265Smav
2124307265Smav	/*
2125307265Smav	 * Holes and embedded blocks will always have a psize = 0 so
2126307265Smav	 * we ignore the compression of the blkptr and set the
2127307265Smav	 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
2128307265Smav	 * Holes and embedded blocks remain anonymous so we don't
2129307265Smav	 * want to uncompress them. Mark them as uncompressed.
2130307265Smav	 */
2131307265Smav	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
2132307265Smav		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2133307265Smav		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
2134307265Smav		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
2135307265Smav		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
2136307265Smav	} else {
2137307265Smav		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2138307265Smav		HDR_SET_COMPRESS(hdr, cmp);
2139307265Smav		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
2140307265Smav		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
2141307265Smav	}
2142307265Smav}
2143307265Smav
2144321535Smav/*
2145321535Smav * Looks for another buf on the same hdr which has the data decompressed,
2146321535Smav * copies from it, and returns true. If no such buf exists, returns false.
2147321535Smav */
2148321535Smavstatic boolean_t
2149321535Smavarc_buf_try_copy_decompressed_data(arc_buf_t *buf)
2150321535Smav{
2151321535Smav	arc_buf_hdr_t *hdr = buf->b_hdr;
2152321535Smav	boolean_t copied = B_FALSE;
2153321535Smav
2154321535Smav	ASSERT(HDR_HAS_L1HDR(hdr));
2155321535Smav	ASSERT3P(buf->b_data, !=, NULL);
2156321535Smav	ASSERT(!ARC_BUF_COMPRESSED(buf));
2157321535Smav
2158321535Smav	for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
2159321535Smav	    from = from->b_next) {
2160321535Smav		/* can't use our own data buffer */
2161321535Smav		if (from == buf) {
2162321535Smav			continue;
2163321535Smav		}
2164321535Smav
2165321535Smav		if (!ARC_BUF_COMPRESSED(from)) {
2166321535Smav			bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
2167321535Smav			copied = B_TRUE;
2168321535Smav			break;
2169321535Smav		}
2170321535Smav	}
2171321535Smav
2172321535Smav	/*
2173321535Smav	 * There were no decompressed bufs, so there should not be a
2174321535Smav	 * checksum on the hdr either.
2175321535Smav	 */
2176321535Smav	EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
2177321535Smav
2178321535Smav	return (copied);
2179321535Smav}
2180321535Smav
2181321535Smav/*
2182321535Smav * Given a buf that has a data buffer attached to it, this function will
2183321535Smav * efficiently fill the buf with data of the specified compression setting from
2184321535Smav * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
2185321535Smav * are already sharing a data buf, no copy is performed.
2186321535Smav *
2187321535Smav * If the buf is marked as compressed but uncompressed data was requested, this
2188321535Smav * will allocate a new data buffer for the buf, remove that flag, and fill the
2189321535Smav * buf with uncompressed data. You can't request a compressed buf on a hdr with
2190321535Smav * uncompressed data, and (since we haven't added support for it yet) if you
2191321535Smav * want compressed data your buf must already be marked as compressed and have
2192321535Smav * the correct-sized data buffer.
2193321535Smav */
2194307265Smavstatic int
2195321535Smavarc_buf_fill(arc_buf_t *buf, boolean_t compressed)
2196307265Smav{
2197307265Smav	arc_buf_hdr_t *hdr = buf->b_hdr;
2198321535Smav	boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
2199307265Smav	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
2200307265Smav
2201321535Smav	ASSERT3P(buf->b_data, !=, NULL);
2202321535Smav	IMPLY(compressed, hdr_compressed);
2203321535Smav	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
2204321535Smav
2205321535Smav	if (hdr_compressed == compressed) {
2206321535Smav		if (!arc_buf_is_shared(buf)) {
2207321610Smav			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
2208321535Smav			    arc_buf_size(buf));
2209321535Smav		}
2210321535Smav	} else {
2211321535Smav		ASSERT(hdr_compressed);
2212321535Smav		ASSERT(!compressed);
2213321535Smav		ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
2214321535Smav
2215307265Smav		/*
2216321535Smav		 * If the buf is sharing its data with the hdr, unlink it and
2217321535Smav		 * allocate a new data buffer for the buf.
2218307265Smav		 */
2219321535Smav		if (arc_buf_is_shared(buf)) {
2220321535Smav			ASSERT(ARC_BUF_COMPRESSED(buf));
2221321535Smav
2222321535Smav			/* We need to give the buf its own b_data */
2223321535Smav			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2224321535Smav			buf->b_data =
2225321535Smav			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2226321535Smav			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2227321535Smav
2228321535Smav			/* Previously overhead was 0; just add new overhead */
2229321535Smav			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2230321535Smav		} else if (ARC_BUF_COMPRESSED(buf)) {
2231321535Smav			/* We need to reallocate the buf's b_data */
2232321535Smav			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
2233321535Smav			    buf);
2234321535Smav			buf->b_data =
2235321535Smav			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2236321535Smav
2237321535Smav			/* We increased the size of b_data; update overhead */
2238321535Smav			ARCSTAT_INCR(arcstat_overhead_size,
2239321535Smav			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
2240307265Smav		}
2241321535Smav
2242321535Smav		/*
2243321535Smav		 * Regardless of the buf's previous compression settings, it
2244321535Smav		 * should not be compressed at the end of this function.
2245321535Smav */ 2246321535Smav buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 2247321535Smav 2248321535Smav /* 2249321535Smav * Try copying the data from another buf which already has a 2250321535Smav * decompressed version. If that's not possible, it's time to 2251321535Smav * bite the bullet and decompress the data from the hdr. 2252321535Smav */ 2253321535Smav if (arc_buf_try_copy_decompressed_data(buf)) { 2254321535Smav /* Skip byteswapping and checksumming (already done) */ 2255321535Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); 2256321535Smav return (0); 2257321535Smav } else { 2258321535Smav int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), 2259321610Smav hdr->b_l1hdr.b_pabd, buf->b_data, 2260321535Smav HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2261321535Smav 2262321535Smav /* 2263321535Smav * Absent hardware errors or software bugs, this should 2264321535Smav * be impossible, but log it anyway so we can debug it. 2265321535Smav */ 2266321535Smav if (error != 0) { 2267321535Smav zfs_dbgmsg( 2268321535Smav "hdr %p, compress %d, psize %d, lsize %d", 2269321535Smav hdr, HDR_GET_COMPRESS(hdr), 2270321535Smav HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2271321535Smav return (SET_ERROR(EIO)); 2272321535Smav } 2273321535Smav } 2274307265Smav } 2275321535Smav 2276321535Smav /* Byteswap the buf's data if necessary */ 2277307265Smav if (bswap != DMU_BSWAP_NUMFUNCS) { 2278307265Smav ASSERT(!HDR_SHARED_DATA(hdr)); 2279307265Smav ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); 2280307265Smav dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); 2281307265Smav } 2282321535Smav 2283321535Smav /* Compute the hdr's checksum if necessary */ 2284307265Smav arc_cksum_compute(buf); 2285321535Smav 2286307265Smav return (0); 2287307265Smav} 2288307265Smav 2289321535Smavint 2290321535Smavarc_decompress(arc_buf_t *buf) 2291321535Smav{ 2292321535Smav return (arc_buf_fill(buf, B_FALSE)); 2293321535Smav} 2294321535Smav 2295307265Smav/* 2296321610Smav * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 2297307265Smav */ 2298307265Smavstatic uint64_t 2299307265Smavarc_hdr_size(arc_buf_hdr_t *hdr) 2300307265Smav{ 2301307265Smav uint64_t size; 2302307265Smav 2303307265Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 2304307265Smav HDR_GET_PSIZE(hdr) > 0) { 2305307265Smav size = HDR_GET_PSIZE(hdr); 2306307265Smav } else { 2307307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); 2308307265Smav size = HDR_GET_LSIZE(hdr); 2309307265Smav } 2310307265Smav return (size); 2311307265Smav} 2312307265Smav 2313307265Smav/* 2314307265Smav * Increment the amount of evictable space in the arc_state_t's refcount. 2315307265Smav * We account for the space used by the hdr and the arc buf individually 2316307265Smav * so that we can add and remove them from the refcount individually. 
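 *
 * For example (illustrative numbers): an uncompressed hdr with a 128K
 * b_pabd and two unshared 128K bufs contributes three separate entries
 * to arcs_esize, 384K in total, each of which can later be removed
 * independently.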
2317307265Smav */ 2318307265Smavstatic void 2319307265Smavarc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) 2320307265Smav{ 2321307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 2322307265Smav 2323286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2324307265Smav 2325307265Smav if (GHOST_STATE(state)) { 2326307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 2327307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2328321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2329321535Smav (void) refcount_add_many(&state->arcs_esize[type], 2330321535Smav HDR_GET_LSIZE(hdr), hdr); 2331307265Smav return; 2332307265Smav } 2333307265Smav 2334307265Smav ASSERT(!GHOST_STATE(state)); 2335321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 2336307265Smav (void) refcount_add_many(&state->arcs_esize[type], 2337307265Smav arc_hdr_size(hdr), hdr); 2338307265Smav } 2339307265Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2340307265Smav buf = buf->b_next) { 2341321535Smav if (arc_buf_is_shared(buf)) 2342307265Smav continue; 2343321535Smav (void) refcount_add_many(&state->arcs_esize[type], 2344321535Smav arc_buf_size(buf), buf); 2345307265Smav } 2346307265Smav} 2347307265Smav 2348307265Smav/* 2349307265Smav * Decrement the amount of evictable space in the arc_state_t's refcount. 2350307265Smav * We account for the space used by the hdr and the arc buf individually 2351307265Smav * so that we can add and remove them from the refcount individually. 2352307265Smav */ 2353307265Smavstatic void 2354321535Smavarc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) 2355307265Smav{ 2356307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 2357307265Smav 2358307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2359307265Smav 2360307265Smav if (GHOST_STATE(state)) { 2361307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 2362307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2363321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2364307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2365321535Smav HDR_GET_LSIZE(hdr), hdr); 2366307265Smav return; 2367307265Smav } 2368307265Smav 2369307265Smav ASSERT(!GHOST_STATE(state)); 2370321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 2371307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2372307265Smav arc_hdr_size(hdr), hdr); 2373307265Smav } 2374307265Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2375307265Smav buf = buf->b_next) { 2376321535Smav if (arc_buf_is_shared(buf)) 2377307265Smav continue; 2378307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2379321535Smav arc_buf_size(buf), buf); 2380307265Smav } 2381307265Smav} 2382307265Smav 2383307265Smav/* 2384307265Smav * Add a reference to this hdr indicating that someone is actively 2385307265Smav * referencing that memory. When the refcount transitions from 0 to 1, 2386307265Smav * we remove it from the respective arc_state_t list to indicate that 2387307265Smav * it is not evictable. 
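 *
 * A sketch of the usual pairing (assumed caller discipline):
 *
 *	add_reference(hdr, tag);	refcount 0 -> 1; hdr leaves the
 *					state list and becomes unevictable
 *	... caller uses the buffer ...
 *	(void) remove_reference(hdr, hash_lock, tag);
 *					refcount 1 -> 0; hdr rejoins the
 *					state list, evictable again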
2388307265Smav */ 2389307265Smavstatic void 2390307265Smavadd_reference(arc_buf_hdr_t *hdr, void *tag) 2391307265Smav{ 2392307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2393307265Smav if (!MUTEX_HELD(HDR_LOCK(hdr))) { 2394307265Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 2395307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2396307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2397307265Smav } 2398307265Smav 2399286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 2400168404Spjd 2401286570Smav if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 2402286570Smav (state != arc_anon)) { 2403286570Smav /* We don't use the L2-only state list. */ 2404286570Smav if (state != arc_l2c_only) { 2405321553Smav multilist_remove(state->arcs_list[arc_buf_type(hdr)], 2406307265Smav hdr); 2407321535Smav arc_evictable_space_decrement(hdr, state); 2408168404Spjd } 2409185029Spjd /* remove the prefetch flag if we get a reference */ 2410307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 2411168404Spjd } 2412168404Spjd} 2413168404Spjd 2414307265Smav/* 2415307265Smav * Remove a reference from this hdr. When the reference transitions from 2416307265Smav * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's 2417307265Smav * list making it eligible for eviction. 2418307265Smav */ 2419168404Spjdstatic int 2420275811Sdelphijremove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 2421168404Spjd{ 2422168404Spjd int cnt; 2423286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 2424168404Spjd 2425286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2426168404Spjd ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 2427168404Spjd ASSERT(!GHOST_STATE(state)); 2428168404Spjd 2429286570Smav /* 2430286570Smav * arc_l2c_only counts as a ghost state so we don't need to explicitly 2431286570Smav * check to prevent usage of the arc_l2c_only list. 2432286570Smav */ 2433286570Smav if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 2434168404Spjd (state != arc_anon)) { 2435321553Smav multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); 2436307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 2437307265Smav arc_evictable_space_increment(hdr, state); 2438168404Spjd } 2439168404Spjd return (cnt); 2440168404Spjd} 2441168404Spjd 2442168404Spjd/* 2443286763Smav * Move the supplied buffer to the indicated state. The hash lock 2444168404Spjd * for the buffer must be held by the caller. 2445168404Spjd */ 2446168404Spjdstatic void 2447275811Sdelphijarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 2448275811Sdelphij kmutex_t *hash_lock) 2449168404Spjd{ 2450286570Smav arc_state_t *old_state; 2451286570Smav int64_t refcnt; 2452307265Smav uint32_t bufcnt; 2453307265Smav boolean_t update_old, update_new; 2454286570Smav arc_buf_contents_t buftype = arc_buf_type(hdr); 2455168404Spjd 2456286570Smav /* 2457286570Smav * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 2458286570Smav * in arc_read() when bringing a buffer out of the L2ARC. However, the 2459286570Smav * L1 hdr doesn't always exist when we change state to arc_anon before 2460286570Smav * destroying a header, in which case reallocating to add the L1 hdr is 2461286570Smav * pointless. 
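	 *
	 * A note on the bookkeeping below: update_old and update_new
	 * record whether the hdr has data (arc bufs or a b_pabd) charged
	 * to the old and new state's arcs_size refcounts, so the
	 * size-accounting blocks further down only run when there is
	 * something to move.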
/*
 * Move the supplied buffer to the indicated state. The hash lock
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
    kmutex_t *hash_lock)
{
	arc_state_t *old_state;
	int64_t refcnt;
	uint32_t bufcnt;
	boolean_t update_old, update_new;
	arc_buf_contents_t buftype = arc_buf_type(hdr);

	/*
	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
	 * in arc_read() when bringing a buffer out of the L2ARC. However, the
	 * L1 hdr doesn't always exist when we change state to arc_anon before
	 * destroying a header, in which case reallocating to add the L1 hdr is
	 * pointless.
	 */
	if (HDR_HAS_L1HDR(hdr)) {
		old_state = hdr->b_l1hdr.b_state;
		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
		bufcnt = hdr->b_l1hdr.b_bufcnt;
		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
	} else {
		old_state = arc_l2c_only;
		refcnt = 0;
		bufcnt = 0;
		update_old = B_FALSE;
	}
	update_new = update_old;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT3P(new_state, !=, old_state);
	ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
	ASSERT(old_state != arc_anon || bufcnt <= 1);

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon && old_state != arc_l2c_only) {
			ASSERT(HDR_HAS_L1HDR(hdr));
			multilist_remove(old_state->arcs_list[buftype], hdr);

			if (GHOST_STATE(old_state)) {
				ASSERT0(bufcnt);
				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
				update_old = B_TRUE;
			}
			arc_evictable_space_decrement(hdr, old_state);
		}
		if (new_state != arc_anon && new_state != arc_l2c_only) {

			/*
			 * An L1 header always exists here, since if we're
			 * moving to some L1-cached state (i.e. not l2c_only or
			 * anonymous), we realloc the header to add an L1hdr
			 * beforehand.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));
			multilist_insert(new_state->arcs_list[buftype], hdr);

			if (GHOST_STATE(new_state)) {
				ASSERT0(bufcnt);
				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
				update_new = B_TRUE;
			}
			arc_evictable_space_increment(hdr, new_state);
		}
	}

	ASSERT(!HDR_EMPTY(hdr));
	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
		buf_hash_remove(hdr);

	/* adjust state sizes (ignore arc_l2c_only) */

	if (update_new && new_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(new_state)) {
			ASSERT0(bufcnt);

			/*
			 * When moving a header to a ghost state, we first
			 * remove all arc buffers. Thus, we'll have a
			 * bufcnt of zero, and no arc buffer to use for
			 * the reference. As a result, we use the arc
			 * header pointer for the reference.
			 */
			(void) refcount_add_many(&new_state->arcs_size,
			    HDR_GET_LSIZE(hdr), hdr);
			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
		} else {
			uint32_t buffers = 0;

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must remove each of these references one
			 * at a time.
			 */
			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {
				ASSERT3U(bufcnt, !=, 0);
				buffers++;

				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * add to the refcount if the arc_buf_t is
				 * not shared.
				 */
				if (arc_buf_is_shared(buf))
					continue;

				(void) refcount_add_many(&new_state->arcs_size,
				    arc_buf_size(buf), buf);
			}
			ASSERT3U(bufcnt, ==, buffers);

			if (hdr->b_l1hdr.b_pabd != NULL) {
				(void) refcount_add_many(&new_state->arcs_size,
				    arc_hdr_size(hdr), hdr);
			} else {
				ASSERT(GHOST_STATE(old_state));
			}
		}
	}

	if (update_old && old_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(old_state)) {
			ASSERT0(bufcnt);
			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);

			/*
			 * When moving a header off of a ghost state,
			 * the header will not contain any arc buffers.
			 * We use the arc header pointer for the reference
			 * which is exactly what we did when we put the
			 * header on the ghost state.
			 */

			(void) refcount_remove_many(&old_state->arcs_size,
			    HDR_GET_LSIZE(hdr), hdr);
		} else {
			uint32_t buffers = 0;

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must remove each of these references one
			 * at a time.
			 */
			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {
				ASSERT3U(bufcnt, !=, 0);
				buffers++;

				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * add to the refcount if the arc_buf_t is
				 * not shared.
				 */
				if (arc_buf_is_shared(buf))
					continue;

				(void) refcount_remove_many(
				    &old_state->arcs_size, arc_buf_size(buf),
				    buf);
			}
			ASSERT3U(bufcnt, ==, buffers);
			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			(void) refcount_remove_many(
			    &old_state->arcs_size, arc_hdr_size(hdr), hdr);
		}
	}

	if (HDR_HAS_L1HDR(hdr))
		hdr->b_l1hdr.b_state = new_state;

	/*
	 * L2 headers should never be on the L2 state list since they don't
	 * have L1 headers allocated.
	 */
	ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
	    multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
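/*
 * Editor's note (illustrative sketch, not part of the original file):
 * a typical arc_change_state() caller holds the header's hash lock
 * across the transition, e.g. a hypothetical promotion of a header
 * into the MRU state:
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *
 *	mutex_enter(hash_lock);
 *	arc_change_state(arc_mru, hdr, hash_lock);
 *	mutex_exit(hash_lock);
 *
 * arc_change_state() then performs the multilist membership changes
 * and the arcs_size/arcs_esize accounting shown above.
 */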
void
arc_space_consume(uint64_t space, arc_space_type_t type)
{
	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);

	switch (type) {
	case ARC_SPACE_DATA:
		ARCSTAT_INCR(arcstat_data_size, space);
		break;
	case ARC_SPACE_META:
		ARCSTAT_INCR(arcstat_metadata_size, space);
		break;
	case ARC_SPACE_OTHER:
		ARCSTAT_INCR(arcstat_other_size, space);
		break;
	case ARC_SPACE_HDRS:
		ARCSTAT_INCR(arcstat_hdr_size, space);
		break;
	case ARC_SPACE_L2HDRS:
		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
		break;
	}

	if (type != ARC_SPACE_DATA)
		ARCSTAT_INCR(arcstat_meta_used, space);

	atomic_add_64(&arc_size, space);
}

void
arc_space_return(uint64_t space, arc_space_type_t type)
{
	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);

	switch (type) {
	case ARC_SPACE_DATA:
		ARCSTAT_INCR(arcstat_data_size, -space);
		break;
	case ARC_SPACE_META:
		ARCSTAT_INCR(arcstat_metadata_size, -space);
		break;
	case ARC_SPACE_OTHER:
		ARCSTAT_INCR(arcstat_other_size, -space);
		break;
	case ARC_SPACE_HDRS:
		ARCSTAT_INCR(arcstat_hdr_size, -space);
		break;
	case ARC_SPACE_L2HDRS:
		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
		break;
	}

	if (type != ARC_SPACE_DATA) {
		ASSERT(arc_meta_used >= space);
		if (arc_meta_max < arc_meta_used)
			arc_meta_max = arc_meta_used;
		ARCSTAT_INCR(arcstat_meta_used, -space);
	}

	ASSERT(arc_size >= space);
	atomic_add_64(&arc_size, -space);
}

/*
 * Given a hdr and a buf, returns whether that buf can share its b_data buffer
 * with the hdr's b_pabd.
 */
static boolean_t
arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	/*
	 * The criteria for sharing a hdr's data are:
	 * 1. the hdr's compression matches the buf's compression
	 * 2. the hdr doesn't need to be byteswapped
	 * 3. the hdr isn't already being shared
	 * 4. the buf is either compressed or it is the last buf in the hdr
	 *    list
	 *
	 * Criterion #4 maintains the invariant that shared uncompressed
	 * bufs must be the final buf in the hdr's b_buf list. Reading this,
	 * you might ask, "if a compressed buf is allocated first, won't that
	 * be the last thing in the list?", but in that case it's impossible
	 * to create a shared uncompressed buf anyway (because the hdr must
	 * be compressed to have the compressed buf). You might also think
	 * that #3 is sufficient to make this guarantee, however it's
	 * possible (specifically in the rare L2ARC write race mentioned in
	 * arc_buf_alloc_impl()) there will be an existing uncompressed buf
	 * that is sharable, but wasn't at the time of its allocation. Rather
	 * than allow a new shared uncompressed buf to be created and then
	 * shuffle the list around to make it the last element, this simply
	 * disallows sharing if the new buf isn't the first to be added.
	 */
	ASSERT3P(buf->b_hdr, ==, hdr);
	boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
	boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
	return (buf_compressed == hdr_compressed &&
	    hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
	    !HDR_SHARED_DATA(hdr) &&
	    (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
}
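/*
 * Editor's note (illustrative sketch, not part of the original file):
 * a concrete reading of the four criteria above. Suppose a hdr holds
 * an LZ4-compressed block that needs no byteswap and is not yet
 * sharing its data:
 *
 *	HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_LZ4
 *	hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS
 *	!HDR_SHARED_DATA(hdr)
 *
 * A compressed buf (ARC_BUF_COMPRESSED(buf) != 0) then satisfies
 * criteria #1-#4 and may share the hdr's b_pabd, while an
 * uncompressed buf fails criterion #1 (buf_compressed is B_FALSE but
 * hdr_compressed is B_TRUE) and must get its own b_data allocation.
 */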
/*
 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
 * copy was made successfully, or an error code otherwise.
 */
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
    boolean_t fill, arc_buf_t **ret)
{
	arc_buf_t *buf;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
	    hdr->b_type == ARC_BUFC_METADATA);
	ASSERT3P(ret, !=, NULL);
	ASSERT3P(*ret, ==, NULL);

	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_next = hdr->b_l1hdr.b_buf;
	buf->b_flags = 0;

	add_reference(hdr, tag);

	/*
	 * We're about to change the hdr's b_flags. We must either
	 * hold the hash_lock or be undiscoverable.
	 */
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	/*
	 * Only honor requests for compressed bufs if the hdr is actually
	 * compressed.
	 */
	if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;

	/*
	 * If the hdr's data can be shared then we share the data buffer and
	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
	 * buffer to store the buf's data.
	 *
	 * There are two additional restrictions here because we're sharing
	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
	 * actively involved in an L2ARC write, because if this buf is used by
	 * an arc_write() then the hdr's data buffer will be released when the
	 * write completes, even though the L2ARC write might still be using
	 * it. Second, the hdr's ABD must be linear so that the buf's user
	 * doesn't need to be ABD-aware.
	 */
	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
	    abd_is_linear(hdr->b_l1hdr.b_pabd);

	/* Set up b_data and sharing */
	if (can_share) {
		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
		buf->b_flags |= ARC_BUF_FLAG_SHARED;
		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	} else {
		buf->b_data =
		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
	}
	VERIFY3P(buf->b_data, !=, NULL);

	hdr->b_l1hdr.b_buf = buf;
	hdr->b_l1hdr.b_bufcnt += 1;

	/*
	 * If the user wants the data from the hdr, we need to either copy or
	 * decompress the data.
	 */
	if (fill) {
		return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
	}

	return (0);
}

static char *arc_onloan_tag = "onloan";

static inline void
arc_loaned_bytes_update(int64_t delta)
{
	atomic_add_64(&arc_loaned_bytes, delta);

	/* assert that it did not wrap around */
	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
/*
 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
 * flight data by arc_tempreserve_space() until they are "returned". Loaned
 * buffers must be returned to the arc before they can be used by the DMU or
 * freed.
 */
arc_buf_t *
arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
{
	arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
	    is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);

	arc_loaned_bytes_update(size);

	return (buf);
}

arc_buf_t *
arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
    enum zio_compress compression_type)
{
	arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
	    psize, lsize, compression_type);

	arc_loaned_bytes_update(psize);

	return (buf);
}

/*
 * Return a loaned arc buffer to the arc.
 */
void
arc_return_buf(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT3P(buf->b_data, !=, NULL);
	ASSERT(HDR_HAS_L1HDR(hdr));
	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);

	arc_loaned_bytes_update(-arc_buf_size(buf));
}

/* Detach an arc_buf from a dbuf (tag) */
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT3P(buf->b_data, !=, NULL);
	ASSERT(HDR_HAS_L1HDR(hdr));
	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);

	arc_loaned_bytes_update(arc_buf_size(buf));
}
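/*
 * Editor's note (illustrative sketch, not part of the original file):
 * how a consumer such as the DMU might drive the loan interface. The
 * buffer is anonymous while on loan, and returning it simply swaps
 * the on-loan tag for the caller's tag:
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, B_FALSE, size);
 *	... fill buf->b_data with caller-generated contents ...
 *	arc_return_buf(buf, FTAG);
 *		(arc_loaned_bytes shrinks by arc_buf_size(buf))
 *
 * arc_loan_inuse_buf() is the inverse of arc_return_buf(): it turns a
 * held buffer back into a loaned one by swapping the tags the other
 * way.
 */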
static void
l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
{
	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);

	df->l2df_abd = abd;
	df->l2df_size = size;
	df->l2df_type = type;
	mutex_enter(&l2arc_free_on_write_mtx);
	list_insert_head(l2arc_free_on_write, df);
	mutex_exit(&l2arc_free_on_write_mtx);
}

static void
arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;
	arc_buf_contents_t type = arc_buf_type(hdr);
	uint64_t size = arc_hdr_size(hdr);

	/* protected by hash lock, if in the hash table */
	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		ASSERT(state != arc_anon && state != arc_l2c_only);

		(void) refcount_remove_many(&state->arcs_esize[type],
		    size, hdr);
	}
	(void) refcount_remove_many(&state->arcs_size, size, hdr);
	if (type == ARC_BUFC_METADATA) {
		arc_space_return(size, ARC_SPACE_META);
	} else {
		ASSERT(type == ARC_BUFC_DATA);
		arc_space_return(size, ARC_SPACE_DATA);
	}

	l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
}

/*
 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
 * data buffer, we transfer the refcount ownership to the hdr and update
 * the appropriate kstats.
 */
static void
arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;

	ASSERT(arc_can_share(hdr, buf));
	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	/*
	 * Start sharing the data buffer. We transfer the
	 * refcount ownership to the hdr since it always owns
	 * the refcount whenever an arc_buf_t is shared.
	 */
	refcount_transfer_ownership(&state->arcs_size, buf, hdr);
	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
	    HDR_ISTYPE_METADATA(hdr));
	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	buf->b_flags |= ARC_BUF_FLAG_SHARED;

	/*
	 * Since we've transferred ownership to the hdr we need
	 * to increment its compressed and uncompressed kstats and
	 * decrement the overhead size.
	 */
	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
}

static void
arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;

	ASSERT(arc_buf_is_shared(buf));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	/*
	 * We are no longer sharing this buffer so we need
	 * to transfer its ownership to the rightful owner.
	 */
	refcount_transfer_ownership(&state->arcs_size, hdr, buf);
	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
	abd_put(hdr->b_l1hdr.b_pabd);
	hdr->b_l1hdr.b_pabd = NULL;
	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;

	/*
	 * Since the buffer is no longer shared between
	 * the arc buf and the hdr, count it as overhead.
	 */
	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
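/*
 * Editor's note (illustrative sketch, not part of the original file):
 * arc_share_buf() and arc_unshare_buf() are exact inverses. The
 * invariants on either side of the pair can be summarized as:
 *
 *	before arc_share_buf(hdr, buf):
 *		hdr->b_l1hdr.b_pabd == NULL; buf owns the arcs_size
 *		reference; the buf's bytes count as "overhead"
 *	after arc_share_buf(hdr, buf):
 *		hdr->b_l1hdr.b_pabd wraps buf->b_data (via
 *		abd_get_from_buf()); HDR_SHARED_DATA(hdr) and
 *		ARC_BUF_FLAG_SHARED are set; the hdr owns the
 *		arcs_size reference
 *
 * arc_unshare_buf() restores the "before" picture, which is why its
 * kstat updates are the mirror image of arc_share_buf()'s.
 */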
/*
 * Remove an arc_buf_t from the hdr's buf list and return the last
 * arc_buf_t on the list. If no buffers remain on the list then return
 * NULL.
 */
static arc_buf_t *
arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
	arc_buf_t *lastbuf = NULL;

	/*
	 * Remove the buf from the hdr list and locate the last
	 * remaining buffer on the list.
	 */
	while (*bufp != NULL) {
		if (*bufp == buf)
			*bufp = buf->b_next;

		/*
		 * If we've removed a buffer in the middle of
		 * the list then update the lastbuf and update
		 * bufp.
		 */
		if (*bufp != NULL) {
			lastbuf = *bufp;
			bufp = &(*bufp)->b_next;
		}
	}
	buf->b_next = NULL;
	ASSERT3P(lastbuf, !=, buf);
	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));

	return (lastbuf);
}
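/*
 * Editor's note (illustrative sketch, not part of the original file):
 * the loop above is the classic pointer-to-pointer unlink idiom for a
 * singly linked list. In isolation, with hypothetical names node_t
 * and victim, it looks like:
 *
 *	node_t **pp = &head;
 *	while (*pp != NULL) {
 *		if (*pp == victim)
 *			*pp = victim->next;	(unlink, no prev needed)
 *		if (*pp != NULL)
 *			pp = &(*pp)->next;	(advance)
 *	}
 *
 * Walking all the way to the end, instead of stopping at the victim,
 * is what lets arc_buf_remove() also report the last surviving buf.
 */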
/*
 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
 * list and free it.
 */
static void
arc_buf_destroy_impl(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/*
	 * Free up the data associated with the buf but only if we're not
	 * sharing this with the hdr. If we are sharing it with the hdr, the
	 * hdr is responsible for doing the free.
	 */
	if (buf->b_data != NULL) {
		/*
		 * We're about to change the hdr's b_flags. We must either
		 * hold the hash_lock or be undiscoverable.
		 */
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		arc_cksum_verify(buf);
#ifdef illumos
		arc_buf_unwatch(buf);
#endif

		if (arc_buf_is_shared(buf)) {
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
		} else {
			uint64_t size = arc_buf_size(buf);
			arc_free_data_buf(hdr, buf->b_data, size, buf);
			ARCSTAT_INCR(arcstat_overhead_size, -size);
		}
		buf->b_data = NULL;

		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
		hdr->b_l1hdr.b_bufcnt -= 1;
	}

	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);

	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
		/*
		 * If the current arc_buf_t is sharing its data buffer with the
		 * hdr, then reassign the hdr's b_pabd to share it with the new
		 * buffer at the end of the list. The shared buffer is always
		 * the last one on the hdr's buffer list.
		 *
		 * There is an equivalent case for compressed bufs, but since
		 * they aren't guaranteed to be the last buf in the list and
		 * that is an exceedingly rare case, we just allow that space
		 * to be wasted temporarily.
		 */
		if (lastbuf != NULL) {
			/* Only one buf can be shared at once */
			VERIFY(!arc_buf_is_shared(lastbuf));
			/* hdr is uncompressed so can't have compressed buf */
			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));

			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			arc_hdr_free_pabd(hdr);

			/*
			 * We must setup a new shared block between the
			 * last buffer and the hdr. The data would have
			 * been allocated by the arc buf so we need to transfer
			 * ownership to the hdr since it's now being shared.
			 */
			arc_share_buf(hdr, lastbuf);
		}
	} else if (HDR_SHARED_DATA(hdr)) {
		/*
		 * Uncompressed shared buffers are always at the end
		 * of the list. Compressed buffers don't have the
		 * same requirements. This makes it hard to
		 * simply assert that the lastbuf is shared so
		 * we rely on the hdr's compression flags to determine
		 * if we have a compressed, shared buffer.
		 */
		ASSERT3P(lastbuf, !=, NULL);
		ASSERT(arc_buf_is_shared(lastbuf) ||
		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
	}

	/*
	 * Free the checksum if we're removing the last uncompressed buf from
	 * this hdr.
	 */
	if (!arc_hdr_has_uncompressed_buf(hdr)) {
		arc_cksum_free(hdr);
	}

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}
static void
arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
{
	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(!HDR_SHARED_DATA(hdr));

	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
	hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}

static void
arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

	/*
	 * If the hdr is currently being written to the l2arc then
	 * we defer freeing the data by adding it to the l2arc_free_on_write
	 * list. The l2arc will free the data once it's finished
	 * writing it to the l2arc device.
	 */
	if (HDR_L2_WRITING(hdr)) {
		arc_hdr_free_on_write(hdr);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else {
		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
		    arc_hdr_size(hdr), hdr);
	}
	hdr->b_l1hdr.b_pabd = NULL;
	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;

	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
}
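/*
 * Editor's note (illustrative sketch, not part of the original file):
 * the HDR_L2_WRITING() branch above is a deferred-free pattern. Its
 * general shape, with hypothetical helper names, is:
 *
 *	if (still_referenced_by_async_io(obj))
 *		defer_free_to_io_completion(obj);
 *	else
 *		free_now(obj);
 *
 * Here l2arc_free_abd_on_write() implements the "defer" half by
 * queueing the ABD on l2arc_free_on_write, which the L2ARC write
 * completion path later drains.
 */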
static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
    enum zio_compress compression_type, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;

	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);

	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
	ASSERT(HDR_EMPTY(hdr));
	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
	HDR_SET_PSIZE(hdr, psize);
	HDR_SET_LSIZE(hdr, lsize);
	hdr->b_spa = spa;
	hdr->b_type = type;
	hdr->b_flags = 0;
	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
	arc_hdr_set_compress(hdr, compression_type);

	hdr->b_l1hdr.b_state = arc_anon;
	hdr->b_l1hdr.b_arc_access = 0;
	hdr->b_l1hdr.b_bufcnt = 0;
	hdr->b_l1hdr.b_buf = NULL;

	/*
	 * Allocate the hdr's buffer. This will contain either
	 * the compressed or uncompressed data depending on the block
	 * it references and compressed arc enablement.
	 */
	arc_hdr_alloc_pabd(hdr);
	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));

	return (hdr);
}
/*
 * Transition between the two allocation states for the arc_buf_hdr struct.
 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 * version is used when a cache buffer is only in the L2ARC in order to reduce
 * memory usage.
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;

		/* Verify previous threads set to NULL before freeing */
		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
	} else {
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
		ASSERT0(hdr->b_l1hdr.b_bufcnt);
		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

		/*
		 * If we've reached here, we must have been called from
		 * arc_evict_hdr(), as such we should have already been
		 * removed from any ghost list we were previously on
		 * (which protects us from racing with arc_evict_state),
		 * thus no locking is needed during this check.
		 */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		/*
		 * A buffer must not be moved into the arc_l2c_only
		 * state if it's not finished being written out to the
		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
		 * might be accessed, even though it was removed.
		 */
		VERIFY(!HDR_L2_WRITING(hdr));
		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);

#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif

		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
	(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);

	buf_discard_identity(hdr);
	kmem_cache_free(old, hdr);

	return (nhdr);
}
/*
 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
 * The buf is returned thawed since we expect the consumer to modify it.
 */
arc_buf_t *
arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
	    ZIO_COMPRESS_OFF, type);
	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));

	arc_buf_t *buf = NULL;
	VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
	arc_buf_thaw(buf);

	return (buf);
}

/*
 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use
 * this for bufs containing metadata.
 */
arc_buf_t *
arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
    enum zio_compress compression_type)
{
	ASSERT3U(lsize, >, 0);
	ASSERT3U(lsize, >=, psize);
	ASSERT(compression_type > ZIO_COMPRESS_OFF);
	ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);

	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
	    compression_type, ARC_BUFC_DATA);
	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));

	arc_buf_t *buf = NULL;
	VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
	arc_buf_thaw(buf);
	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

	if (!arc_buf_is_shared(buf)) {
		/*
		 * To ensure that the hdr has the correct data in it if we call
		 * arc_decompress() on this buf before it's been written to
		 * disk, it's easiest if we just set up sharing between the
		 * buf and the hdr.
		 */
		ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
		arc_hdr_free_pabd(hdr);
		arc_share_buf(hdr, buf);
	}

	return (buf);
}
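/*
 * Editor's note (illustrative sketch, not part of the original file):
 * arc_alloc_buf() is what consumers call to obtain a fresh anonymous
 * buffer, e.g. a 16K metadata buffer:
 *
 *	arc_buf_t *buf = arc_alloc_buf(spa, FTAG, ARC_BUFC_METADATA,
 *	    16 * 1024);
 *	... fill buf->b_data ...
 *	arc_buf_destroy(buf, FTAG);
 *
 * For a block whose contents are already compressed (psize <= lsize),
 * the compressed variant is used instead:
 *
 *	arc_buf_t *cbuf = arc_alloc_compressed_buf(spa, FTAG,
 *	    psize, lsize, ZIO_COMPRESS_LZ4);
 */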
static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
	l2arc_dev_t *dev = l2hdr->b_dev;
	uint64_t psize = arc_hdr_size(hdr);

	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
	ASSERT(HDR_HAS_L2HDR(hdr));

	list_remove(&dev->l2ad_buflist, hdr);

	ARCSTAT_INCR(arcstat_l2_psize, -psize);
	ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));

	vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);

	(void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
}

static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
		    hdr->b_l1hdr.b_bufcnt > 0);
		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
	}
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (!HDR_EMPTY(hdr))
		buf_discard_identity(hdr);

	if (HDR_HAS_L2HDR(hdr)) {
		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);

		if (!buflist_held)
			mutex_enter(&dev->l2ad_mtx);

		/*
		 * Even though we checked this conditional above, we
		 * need to check this again now that we have the
		 * l2ad_mtx. This is because we could be racing with
		 * another thread calling l2arc_evict() which might have
		 * destroyed this header's L2 portion as we were waiting
		 * to acquire the l2ad_mtx. If that happens, we don't
		 * want to re-destroy the header's L2 portion.
		 */
		if (HDR_HAS_L2HDR(hdr)) {
			l2arc_trim(hdr);
			arc_hdr_l2hdr_destroy(hdr);
		}

		if (!buflist_held)
			mutex_exit(&dev->l2ad_mtx);
	}

	if (HDR_HAS_L1HDR(hdr)) {
		arc_cksum_free(hdr);

		while (hdr->b_l1hdr.b_buf != NULL)
			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);

#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif

		if (hdr->b_l1hdr.b_pabd != NULL) {
			arc_hdr_free_pabd(hdr);
		}
	}

	ASSERT3P(hdr->b_hash_next, ==, NULL);
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
		kmem_cache_free(hdr_full_cache, hdr);
	} else {
		kmem_cache_free(hdr_l2only_cache, hdr);
	}
}
void
arc_buf_destroy(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	if (hdr->b_l1hdr.b_state == arc_anon) {
		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		VERIFY0(remove_reference(hdr, NULL, tag));
		arc_hdr_destroy(hdr);
		return;
	}

	mutex_enter(hash_lock);
	ASSERT3P(hdr, ==, buf->b_hdr);
	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
	ASSERT3P(buf->b_data, !=, NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	arc_buf_destroy_impl(buf);
	mutex_exit(hash_lock);
}

/*
 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
 * state of the header is dependent on its state prior to entering this
 * function. The following transitions are possible:
 *
 *    - arc_mru -> arc_mru_ghost
 *    - arc_mfu -> arc_mfu_ghost
 *    - arc_mru_ghost -> arc_l2c_only
 *    - arc_mru_ghost -> deleted
 *    - arc_mfu_ghost -> arc_l2c_only
 *    - arc_mfu_ghost -> deleted
 */
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
	arc_state_t *evicted_state, *state;
	int64_t bytes_evicted = 0;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(HDR_HAS_L1HDR(hdr));

	state = hdr->b_l1hdr.b_state;
	if (GHOST_STATE(state)) {
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);

		/*
		 * l2arc_write_buffers() relies on a header's L1 portion
		 * (i.e. its b_pabd field) during its write phase.
		 * Thus, we cannot push a header onto the arc_l2c_only
		 * state (removing its L1 piece) until the header is
		 * done being written to the l2arc.
		 */
		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
			ARCSTAT_BUMP(arcstat_evict_l2_skip);
			return (bytes_evicted);
		}

		ARCSTAT_BUMP(arcstat_deleted);
		bytes_evicted += HDR_GET_LSIZE(hdr);

		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);

		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
		if (HDR_HAS_L2HDR(hdr)) {
			/*
			 * This buffer is cached on the 2nd Level ARC;
			 * don't destroy the header.
			 */
			arc_change_state(arc_l2c_only, hdr, hash_lock);
			/*
			 * dropping from L1+L2 cached to L2-only,
			 * realloc to remove the L1 header.
			 */
			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
			    hdr_l2only_cache);
		} else {
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		}
		return (bytes_evicted);
	}

	ASSERT(state == arc_mru || state == arc_mfu);
	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

	/* prefetch buffers have a minimum lifespan */
	if (HDR_IO_IN_PROGRESS(hdr) ||
	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
	    arc_min_prefetch_lifespan)) {
		ARCSTAT_BUMP(arcstat_evict_skip);
		return (bytes_evicted);
	}

	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
	while (hdr->b_l1hdr.b_buf) {
		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
		if (!mutex_tryenter(&buf->b_evict_lock)) {
			ARCSTAT_BUMP(arcstat_mutex_miss);
			break;
		}
		if (buf->b_data != NULL)
			bytes_evicted += HDR_GET_LSIZE(hdr);
		mutex_exit(&buf->b_evict_lock);
		arc_buf_destroy_impl(buf);
	}

	if (HDR_HAS_L2HDR(hdr)) {
		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
	} else {
		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
			ARCSTAT_INCR(arcstat_evict_l2_eligible,
			    HDR_GET_LSIZE(hdr));
		} else {
			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
			    HDR_GET_LSIZE(hdr));
		}
	}

	if (hdr->b_l1hdr.b_bufcnt == 0) {
		arc_cksum_free(hdr);

		bytes_evicted += arc_hdr_size(hdr);

		/*
		 * If this hdr is being evicted and has a compressed
		 * buffer then we discard it here before we change states.
		 * This ensures that the accounting is updated correctly
		 * in arc_free_data_impl().
		 */
		arc_hdr_free_pabd(hdr);

		arc_change_state(evicted_state, hdr, hash_lock);
		ASSERT(HDR_IN_HASH_TABLE(hdr));
		arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
	}

	return (bytes_evicted);
}
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
    uint64_t spa, int64_t bytes)
{
	multilist_sublist_t *mls;
	uint64_t bytes_evicted = 0;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	int evict_count = 0;

	ASSERT3P(marker, !=, NULL);
	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);

	mls = multilist_sublist_lock(ml, idx);

	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
	    hdr = multilist_sublist_prev(mls, marker)) {
		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
		    (evict_count >= zfs_arc_evict_batch_limit))
			break;

		/*
		 * To keep our iteration location, move the marker
		 * forward. Since we're not holding hdr's hash lock, we
		 * must be very careful and not remove 'hdr' from the
		 * sublist. Otherwise, other consumers might mistake the
		 * 'hdr' as not being on a sublist when they call the
		 * multilist_link_active() function (they all rely on
		 * the hash lock protecting concurrent insertions and
		 * removals). multilist_sublist_move_forward() was
		 * specifically implemented to ensure this is the case
		 * (only 'marker' will be removed and re-inserted).
		 */
		multilist_sublist_move_forward(mls, marker);

		/*
		 * The only case where the b_spa field should ever be
		 * zero, is the marker headers inserted by
		 * arc_evict_state(). It's possible for multiple threads
		 * to be calling arc_evict_state() concurrently (e.g.
		 * dsl_pool_close() and zio_inject_fault()), so we must
		 * skip any markers we see from these other threads.
		 */
		if (hdr->b_spa == 0)
			continue;

		/* we're only interested in evicting buffers of a certain spa */
		if (spa != 0 && hdr->b_spa != spa) {
			ARCSTAT_BUMP(arcstat_evict_skip);
			continue;
		}

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We aren't calling this function from any code path
		 * that would already be holding a hash lock, so we're
		 * asserting on this assumption to be defensive in case
		 * this ever changes. Without this check, it would be
		 * possible to incorrectly increment arcstat_mutex_miss
		 * below (e.g. if the code changed such that we called
		 * this function with a hash lock held).
		 */
		ASSERT(!MUTEX_HELD(hash_lock));

		if (mutex_tryenter(hash_lock)) {
			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
			mutex_exit(hash_lock);

			bytes_evicted += evicted;

			/*
			 * If evicted is zero, arc_evict_hdr() must have
			 * decided to skip this header, don't increment
			 * evict_count in this case.
			 */
			if (evicted != 0)
				evict_count++;

			/*
			 * If arc_size isn't overflowing, signal any
			 * threads that might happen to be waiting.
			 *
			 * For each header evicted, we wake up a single
			 * thread. If we used cv_broadcast, we could
			 * wake up "too many" threads causing arc_size
			 * to significantly overflow arc_c; since
			 * arc_get_data_impl() doesn't check for overflow
			 * when it's woken up (it doesn't because it's
			 * possible for the ARC to be overflowing while
			 * full of un-evictable buffers, and the
			 * function should proceed in this case).
			 *
			 * If threads are left sleeping, due to not
			 * using cv_broadcast, they will be woken up
			 * just before arc_reclaim_thread() sleeps.
			 */
			mutex_enter(&arc_reclaim_lock);
			if (!arc_is_overflowing())
				cv_signal(&arc_reclaim_waiters_cv);
			mutex_exit(&arc_reclaim_lock);
		} else {
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
	}

	multilist_sublist_unlock(mls);

	return (bytes_evicted);
}
/*
 * Evict buffers from the given arc state, until we've removed the
 * specified number of bytes. Move the removed buffers to the
 * appropriate evict state.
 *
 * This function makes a "best effort". It skips over any buffers
 * it can't get a hash_lock on, and so, may not catch all candidates.
 * It may also return without evicting as much space as requested.
 *
 * If bytes is specified using the special value ARC_EVICT_ALL, this
 * will evict all available (i.e. unlocked and evictable) buffers from
 * the given arc state; which is used by arc_flush().
 */
static uint64_t
arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
    arc_buf_contents_t type)
{
	uint64_t total_evicted = 0;
	multilist_t *ml = state->arcs_list[type];
	int num_sublists;
	arc_buf_hdr_t **markers;

	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);

	num_sublists = multilist_get_num_sublists(ml);

	/*
	 * If we've tried to evict from each sublist, made some
	 * progress, but still have not hit the target number of bytes
	 * to evict, we want to keep trying. The markers allow us to
	 * pick up where we left off for each individual sublist, rather
	 * than starting from the tail each time.
	 */
	markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
	for (int i = 0; i < num_sublists; i++) {
		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);

		/*
		 * A b_spa of 0 is used to indicate that this header is
		 * a marker. This fact is used in arc_adjust_type() and
		 * arc_evict_state_impl().
		 */
		markers[i]->b_spa = 0;

		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
		multilist_sublist_insert_tail(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}

	/*
	 * While we haven't hit our target number of bytes to evict, or
	 * we're evicting all available buffers.
	 */
	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
		/*
		 * Start eviction using a randomly selected sublist,
		 * this is to try and evenly balance eviction across all
		 * sublists. Always starting at the same sublist
		 * (e.g. index 0) would cause evictions to favor certain
		 * sublists over others.
		 */
		int sublist_idx = multilist_get_random_index(ml);
		uint64_t scan_evicted = 0;

		for (int i = 0; i < num_sublists; i++) {
			uint64_t bytes_remaining;
			uint64_t bytes_evicted;

			if (bytes == ARC_EVICT_ALL)
				bytes_remaining = ARC_EVICT_ALL;
			else if (total_evicted < bytes)
				bytes_remaining = bytes - total_evicted;
			else
				break;

			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
			    markers[sublist_idx], spa, bytes_remaining);

			scan_evicted += bytes_evicted;
			total_evicted += bytes_evicted;

			/* we've reached the end, wrap to the beginning */
			if (++sublist_idx >= num_sublists)
				sublist_idx = 0;
		}

		/*
		 * If we didn't evict anything during this scan, we have
		 * no reason to believe we'll evict more during another
		 * scan, so break the loop.
		 */
		if (scan_evicted == 0) {
			/* This isn't possible, let's make that obvious */
			ASSERT3S(bytes, !=, 0);

			/*
			 * When bytes is ARC_EVICT_ALL, the only way to
			 * break the loop is when scan_evicted is zero.
			 * In that case, we actually have evicted enough,
			 * so we don't want to increment the kstat.
			 */
			if (bytes != ARC_EVICT_ALL) {
				ASSERT3S(total_evicted, <, bytes);
				ARCSTAT_BUMP(arcstat_evict_not_enough);
			}

			break;
		}
	}

	for (int i = 0; i < num_sublists; i++) {
		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
		multilist_sublist_remove(mls, markers[i]);
		multilist_sublist_unlock(mls);

		kmem_cache_free(hdr_full_cache, markers[i]);
	}
	kmem_free(markers, sizeof (*markers) * num_sublists);

	return (total_evicted);
}
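/*
 * Editor's note (illustrative sketch, not part of the original file):
 * the marker scheme above generalizes to any list that can change
 * while it is scanned without holding its lock continuously:
 *
 *	insert marker at the tail;
 *	while (work remains) {
 *		item = element just before marker;
 *		move marker one step forward;	(keeps our place)
 *		process item, possibly dropping and retaking locks;
 *	}
 *	remove marker;
 *
 * Only the marker itself is ever removed and re-inserted, so
 * concurrent scanners, each with their own marker, never disturb one
 * another's position.
 */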
/*
 * Flush all "evictable" data of the given type from the arc state
 * specified. This will not evict any "active" buffers (i.e. referenced).
 *
 * When 'retry' is set to B_FALSE, the function will make a single pass
 * over the state and evict any buffers that it can. Since it doesn't
 * continually retry the eviction, it might end up leaving some buffers
 * in the ARC due to lock misses.
 *
 * When 'retry' is set to B_TRUE, the function will continually retry the
 * eviction until *all* evictable buffers have been removed from the
 * state. As a result, if concurrent insertions into the state are
 * allowed (e.g. if the ARC isn't shutting down), this function might
 * wind up in an infinite loop, continually trying to evict buffers.
 */
static uint64_t
arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
    boolean_t retry)
{
	uint64_t evicted = 0;

	while (refcount_count(&state->arcs_esize[type]) != 0) {
		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);

		if (!retry)
			break;
	}

	return (evicted);
}

/*
 * Evict the specified number of bytes from the state specified,
 * restricting eviction to the spa and type given. This function
 * prevents us from trying to evict more from a state's list than
 * is "evictable", and to skip evicting altogether when passed a
 * negative value for "bytes". In contrast, arc_evict_state() will
 * evict everything it can, when passed a negative value for "bytes".
 */
static uint64_t
arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
    arc_buf_contents_t type)
{
	int64_t delta;

	if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
		delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
		return (arc_evict_state(state, spa, delta, type));
	}

	return (0);
}
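/*
 * Editor's note (illustrative sketch, not part of the original file):
 * a worked example of arc_adjust_impl()'s clamping. If a caller asks
 * to evict bytes = 64M of ARC_BUFC_DATA from arc_mru, but only 10M of
 * that list is currently evictable (per arcs_esize[ARC_BUFC_DATA]),
 * then delta = MIN(10M, 64M) = 10M and at most 10M is requested from
 * arc_evict_state(). A zero or negative "bytes", meaning the target
 * is already met, skips eviction entirely and returns 0.
 */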
3896286763Smav */ 3897286763Smavstatic uint64_t 3898286763Smavarc_adjust_meta(void) 3899286763Smav{ 3900286763Smav uint64_t total_evicted = 0; 3901286763Smav int64_t target; 3902286763Smav 3903286763Smav /* 3904286763Smav * If we're over the meta limit, we want to evict enough 3905286763Smav * metadata to get back under the meta limit. We don't want to 3906286763Smav * evict so much that we drop the MRU below arc_p, though. If 3907286763Smav * we're over the meta limit more than we're over arc_p, we 3908286763Smav * evict some from the MRU here, and some from the MFU below. 3909286763Smav */ 3910286763Smav target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3911286766Smav (int64_t)(refcount_count(&arc_anon->arcs_size) + 3912286766Smav refcount_count(&arc_mru->arcs_size) - arc_p)); 3913286763Smav 3914286763Smav total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3915286763Smav 3916286763Smav /* 3917286763Smav * Similar to the above, we want to evict enough bytes to get us 3918286763Smav * below the meta limit, but not so much as to drop us below the 3919321535Smav * space allotted to the MFU (which is defined as arc_c - arc_p). 3920286763Smav */ 3921286763Smav target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3922286766Smav (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); 3923286763Smav 3924286763Smav total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3925286763Smav 3926286763Smav return (total_evicted); 3927286763Smav} 3928286763Smav 3929286763Smav/* 3930286763Smav * Return the type of the oldest buffer in the given arc state 3931286763Smav * 3932286763Smav * This function will select a random sublist of type ARC_BUFC_DATA and 3933286763Smav * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 3934286763Smav * is compared, and the type which contains the "older" buffer will be 3935286763Smav * returned. 3936286763Smav */ 3937286763Smavstatic arc_buf_contents_t 3938286763Smavarc_adjust_type(arc_state_t *state) 3939286763Smav{ 3940321553Smav multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; 3941321553Smav multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; 3942286763Smav int data_idx = multilist_get_random_index(data_ml); 3943286763Smav int meta_idx = multilist_get_random_index(meta_ml); 3944286763Smav multilist_sublist_t *data_mls; 3945286763Smav multilist_sublist_t *meta_mls; 3946286763Smav arc_buf_contents_t type; 3947286763Smav arc_buf_hdr_t *data_hdr; 3948286763Smav arc_buf_hdr_t *meta_hdr; 3949286763Smav 3950286763Smav /* 3951286763Smav * We keep the sublist lock until we're finished, to prevent 3952286763Smav * the headers from being destroyed via arc_evict_state(). 3953286763Smav */ 3954286763Smav data_mls = multilist_sublist_lock(data_ml, data_idx); 3955286763Smav meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 3956286763Smav 3957286763Smav /* 3958286763Smav * These two loops are to ensure we skip any markers that 3959286763Smav * might be at the tail of the lists due to arc_evict_state(). 
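 *
 * A sublist mid-eviction might look like this (tail at the right,
 * one marker per concurrent arc_evict_state() caller):
 *
 *	head ... [hdr C] [hdr B] [marker] [hdr A] [marker] tail
 *
 * so the walks below step from the tail past any b_spa == 0 markers
 * until they reach hdr A, the oldest real header.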
3960286763Smav */ 3961286763Smav 3962286763Smav for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 3963286763Smav data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 3964286763Smav if (data_hdr->b_spa != 0) 3965286763Smav break; 3966286763Smav } 3967286763Smav 3968286763Smav for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 3969286763Smav meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 3970286763Smav if (meta_hdr->b_spa != 0) 3971286763Smav break; 3972286763Smav } 3973286763Smav 3974286763Smav if (data_hdr == NULL && meta_hdr == NULL) { 3975286763Smav type = ARC_BUFC_DATA; 3976286763Smav } else if (data_hdr == NULL) { 3977286763Smav ASSERT3P(meta_hdr, !=, NULL); 3978286763Smav type = ARC_BUFC_METADATA; 3979286763Smav } else if (meta_hdr == NULL) { 3980286763Smav ASSERT3P(data_hdr, !=, NULL); 3981286763Smav type = ARC_BUFC_DATA; 3982286763Smav } else { 3983286763Smav ASSERT3P(data_hdr, !=, NULL); 3984286763Smav ASSERT3P(meta_hdr, !=, NULL); 3985286763Smav 3986286763Smav /* The headers can't be on the sublist without an L1 header */ 3987286763Smav ASSERT(HDR_HAS_L1HDR(data_hdr)); 3988286763Smav ASSERT(HDR_HAS_L1HDR(meta_hdr)); 3989286763Smav 3990286763Smav if (data_hdr->b_l1hdr.b_arc_access < 3991286763Smav meta_hdr->b_l1hdr.b_arc_access) { 3992286763Smav type = ARC_BUFC_DATA; 3993286763Smav } else { 3994286763Smav type = ARC_BUFC_METADATA; 3995286763Smav } 3996286763Smav } 3997286763Smav 3998286763Smav multilist_sublist_unlock(meta_mls); 3999286763Smav multilist_sublist_unlock(data_mls); 4000286763Smav 4001286763Smav return (type); 4002286763Smav} 4003286763Smav 4004286763Smav/* 4005286763Smav * Evict buffers from the cache, such that arc_size is capped by arc_c. 4006286763Smav */ 4007286763Smavstatic uint64_t 4008168404Spjdarc_adjust(void) 4009168404Spjd{ 4010286763Smav uint64_t total_evicted = 0; 4011286763Smav uint64_t bytes; 4012286763Smav int64_t target; 4013168404Spjd 4014208373Smm /* 4015286763Smav * If we're over arc_meta_limit, we want to correct that before 4016286763Smav * potentially evicting data buffers below. 4017286763Smav */ 4018286763Smav total_evicted += arc_adjust_meta(); 4019286763Smav 4020286763Smav /* 4021208373Smm * Adjust MRU size 4022286763Smav * 4023286763Smav * If we're over the target cache size, we want to evict enough 4024286763Smav * from the list to get back to our target size. We don't want 4025286763Smav * to evict too much from the MRU, such that it drops below 4026286763Smav * arc_p. So, if we're over our target cache size more than 4027286763Smav * the MRU is over arc_p, we'll evict enough to get back to 4028286763Smav * arc_p here, and then evict more from the MFU below. 4029208373Smm */ 4030286763Smav target = MIN((int64_t)(arc_size - arc_c), 4031286766Smav (int64_t)(refcount_count(&arc_anon->arcs_size) + 4032286766Smav refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); 4033208373Smm 4034286763Smav /* 4035286763Smav * If we're below arc_meta_min, always prefer to evict data. 4036286763Smav * Otherwise, try to satisfy the requested number of bytes to 4037286763Smav * evict from the type which contains older buffers; in an 4038286763Smav * effort to keep newer buffers in the cache regardless of their 4039286763Smav * type. If we cannot satisfy the number of bytes from this 4040286763Smav * type, spill over into the next type. 
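 *
 * For instance (hypothetical numbers): with a 64MB target and
 * arc_adjust_type() reporting metadata as the older type, if only
 * 40MB of metadata turns out to be evictable, the remaining 24MB
 * of the target spills over and is requested from data buffers.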
4041286763Smav	 */ 4042286763Smav	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 4043286763Smav	    arc_meta_used > arc_meta_min) { 4044286763Smav		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4045286763Smav		total_evicted += bytes; 4046168404Spjd 4047286763Smav		/* 4048286763Smav		 * If we couldn't evict our target number of bytes from 4049286763Smav		 * metadata, we try to get the rest from data. 4050286763Smav		 */ 4051286763Smav		target -= bytes; 4052286763Smav 4053286763Smav		total_evicted += 4054286763Smav		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4055286763Smav	} else { 4056286763Smav		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4057286763Smav		total_evicted += bytes; 4058286763Smav 4059286763Smav		/* 4060286763Smav		 * If we couldn't evict our target number of bytes from 4061286763Smav		 * data, we try to get the rest from metadata. 4062286763Smav		 */ 4063286763Smav		target -= bytes; 4064286763Smav 4065286763Smav		total_evicted += 4066286763Smav		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4067185029Spjd	} 4068185029Spjd 4069208373Smm	/* 4070208373Smm	 * Adjust MFU size 4071286763Smav	 * 4072286763Smav	 * Now that we've tried to evict enough from the MRU to get its 4073286763Smav	 * size back to arc_p, if we're still above the target cache 4074286763Smav	 * size, we evict the rest from the MFU. 4075208373Smm	 */ 4076286763Smav	target = arc_size - arc_c; 4077168404Spjd 4078286764Smav	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 4079286763Smav	    arc_meta_used > arc_meta_min) { 4080286763Smav		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4081286763Smav		total_evicted += bytes; 4082208373Smm 4083286763Smav		/* 4084286763Smav		 * If we couldn't evict our target number of bytes from 4085286763Smav		 * metadata, we try to get the rest from data. 4086286763Smav		 */ 4087286763Smav		target -= bytes; 4088168404Spjd 4089286763Smav		total_evicted += 4090286763Smav		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4091286763Smav	} else { 4092286763Smav		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4093286763Smav		total_evicted += bytes; 4094286763Smav 4095286763Smav		/* 4096286763Smav		 * If we couldn't evict our target number of bytes from 4097286763Smav		 * data, we try to get the rest from metadata. 4098286763Smav		 */ 4099286763Smav		target -= bytes; 4100286763Smav 4101286763Smav		total_evicted += 4102286763Smav		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4103208373Smm	} 4104168404Spjd 4105208373Smm	/* 4106208373Smm	 * Adjust ghost lists 4107286763Smav	 * 4108286763Smav	 * In addition to the above, the ARC also defines target values 4109286763Smav	 * for the ghost lists. The sum of the mru list and mru ghost 4110286763Smav	 * list should never exceed the target size of the cache, and 4111286763Smav	 * the sum of the mru list, mfu list, mru ghost list, and mfu 4112286763Smav	 * ghost list should never exceed twice the target size of the 4113286763Smav	 * cache. The following logic enforces these limits on the ghost 4114286763Smav	 * caches, and evicts from them as needed. 
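 *
 * Stated as the invariants the two targets below enforce:
 *
 *	mru + mru_ghost <= arc_c
 *	mru + mfu + mru_ghost + mfu_ghost <= 2 * arc_c
 *
 * For example, with arc_c = 1GB, an mru of 700MB and an mru_ghost of
 * 400MB, the first target asks the mru ghost list to shed 100MB.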
4115208373Smm */ 4116286766Smav target = refcount_count(&arc_mru->arcs_size) + 4117286766Smav refcount_count(&arc_mru_ghost->arcs_size) - arc_c; 4118168404Spjd 4119286763Smav bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); 4120286763Smav total_evicted += bytes; 4121168404Spjd 4122286763Smav target -= bytes; 4123185029Spjd 4124286763Smav total_evicted += 4125286763Smav arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); 4126208373Smm 4127286763Smav /* 4128286763Smav * We assume the sum of the mru list and mfu list is less than 4129286763Smav * or equal to arc_c (we enforced this above), which means we 4130286763Smav * can use the simpler of the two equations below: 4131286763Smav * 4132286763Smav * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c 4133286763Smav * mru ghost + mfu ghost <= arc_c 4134286763Smav */ 4135286766Smav target = refcount_count(&arc_mru_ghost->arcs_size) + 4136286766Smav refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; 4137286763Smav 4138286763Smav bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); 4139286763Smav total_evicted += bytes; 4140286763Smav 4141286763Smav target -= bytes; 4142286763Smav 4143286763Smav total_evicted += 4144286763Smav arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); 4145286763Smav 4146286763Smav return (total_evicted); 4147168404Spjd} 4148168404Spjd 4149168404Spjdvoid 4150286763Smavarc_flush(spa_t *spa, boolean_t retry) 4151168404Spjd{ 4152209962Smm uint64_t guid = 0; 4153209962Smm 4154286763Smav /* 4155307265Smav * If retry is B_TRUE, a spa must not be specified since we have 4156286763Smav * no good way to determine if all of a spa's buffers have been 4157286763Smav * evicted from an arc state. 4158286763Smav */ 4159286763Smav ASSERT(!retry || spa == 0); 4160286763Smav 4161286570Smav if (spa != NULL) 4162228103Smm guid = spa_load_guid(spa); 4163209962Smm 4164286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 4165286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 4166168404Spjd 4167286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 4168286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 4169168404Spjd 4170286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 4171286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 4172286763Smav 4173286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 4174286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 4175168404Spjd} 4176168404Spjd 4177168404Spjdvoid 4178286625Smavarc_shrink(int64_t to_free) 4179168404Spjd{ 4180168404Spjd if (arc_c > arc_c_min) { 4181272483Ssmh DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 4182272483Ssmh arc_c_min, uint64_t, arc_p, uint64_t, to_free); 4183168404Spjd if (arc_c > arc_c_min + to_free) 4184168404Spjd atomic_add_64(&arc_c, -to_free); 4185168404Spjd else 4186168404Spjd arc_c = arc_c_min; 4187168404Spjd 4188168404Spjd atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 4189168404Spjd if (arc_c > arc_size) 4190168404Spjd arc_c = MAX(arc_size, arc_c_min); 4191168404Spjd if (arc_p > arc_c) 4192168404Spjd arc_p = (arc_c >> 1); 4193272483Ssmh 4194272483Ssmh DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 4195272483Ssmh arc_p); 4196272483Ssmh 4197168404Spjd ASSERT(arc_c >= arc_c_min); 4198168404Spjd ASSERT((int64_t)arc_p >= 0); 4199168404Spjd } 4200168404Spjd 4201270759Ssmh if (arc_size > arc_c) { 4202270759Ssmh 
DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 4203270759Ssmh uint64_t, arc_c); 4204286763Smav (void) arc_adjust(); 4205270759Ssmh } 4206168404Spjd} 4207168404Spjd 4208286625Smavtypedef enum free_memory_reason_t { 4209286625Smav FMR_UNKNOWN, 4210286625Smav FMR_NEEDFREE, 4211286625Smav FMR_LOTSFREE, 4212286625Smav FMR_SWAPFS_MINFREE, 4213286625Smav FMR_PAGES_PP_MAXIMUM, 4214286625Smav FMR_HEAP_ARENA, 4215286625Smav FMR_ZIO_ARENA, 4216286625Smav FMR_ZIO_FRAG, 4217286625Smav} free_memory_reason_t; 4218286625Smav 4219286625Smavint64_t last_free_memory; 4220286625Smavfree_memory_reason_t last_free_reason; 4221286625Smav 4222286625Smav/* 4223286625Smav * Additional reserve of pages for pp_reserve. 4224286625Smav */ 4225286625Smavint64_t arc_pages_pp_reserve = 64; 4226286625Smav 4227286625Smav/* 4228286625Smav * Additional reserve of pages for swapfs. 4229286625Smav */ 4230286625Smavint64_t arc_swapfs_reserve = 64; 4231286625Smav 4232286625Smav/* 4233286625Smav * Return the amount of memory that can be consumed before reclaim will be 4234286625Smav * needed. Positive if there is sufficient free memory, negative indicates 4235286625Smav * the amount of memory that needs to be freed up. 4236286625Smav */ 4237286625Smavstatic int64_t 4238286625Smavarc_available_memory(void) 4239168404Spjd{ 4240286625Smav int64_t lowest = INT64_MAX; 4241286625Smav int64_t n; 4242286625Smav free_memory_reason_t r = FMR_UNKNOWN; 4243168404Spjd 4244168404Spjd#ifdef _KERNEL 4245330061Savg#ifdef __FreeBSD__ 4246191902Skmacy /* 4247212780Savg * Cooperate with pagedaemon when it's time for it to scan 4248212780Savg * and reclaim some pages. 4249191902Skmacy */ 4250286655Smav n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 4251286625Smav if (n < lowest) { 4252286625Smav lowest = n; 4253286625Smav r = FMR_LOTSFREE; 4254270759Ssmh } 4255191902Skmacy 4256330061Savg#else 4257330061Savg if (needfree > 0) { 4258330061Savg n = PAGESIZE * (-needfree); 4259330061Savg if (n < lowest) { 4260330061Savg lowest = n; 4261330061Savg r = FMR_NEEDFREE; 4262330061Savg } 4263330061Savg } 4264330061Savg 4265168404Spjd /* 4266185029Spjd * check that we're out of range of the pageout scanner. It starts to 4267185029Spjd * schedule paging if freemem is less than lotsfree and needfree. 4268185029Spjd * lotsfree is the high-water mark for pageout, and needfree is the 4269185029Spjd * number of needed free pages. We add extra pages here to make sure 4270185029Spjd * the scanner doesn't start up while we're freeing memory. 4271185029Spjd */ 4272286625Smav n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 4273286625Smav if (n < lowest) { 4274286625Smav lowest = n; 4275286625Smav r = FMR_LOTSFREE; 4276286625Smav } 4277185029Spjd 4278185029Spjd /* 4279168404Spjd * check to make sure that swapfs has enough space so that anon 4280185029Spjd * reservations can still succeed. anon_resvmem() checks that the 4281168404Spjd * availrmem is greater than swapfs_minfree, and the number of reserved 4282168404Spjd * swap pages. We also add a bit of extra here just to prevent 4283168404Spjd * circumstances from getting really dire. 
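 *
 * As with the other checks, a deficit is expressed in bytes; e.g.
 * (purely illustrative numbers) availrmem at 10000 pages against a
 * combined swapfs_minfree + swapfs_reserve + desfree +
 * arc_swapfs_reserve of 12000 pages yields n = -2000 * PAGESIZE,
 * and FMR_SWAPFS_MINFREE is reported if that remains the lowest.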
4284168404Spjd	 */ 4285286625Smav	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 4286286625Smav	    desfree - arc_swapfs_reserve); 4287286625Smav	if (n < lowest) { 4288286625Smav		lowest = n; 4289286625Smav		r = FMR_SWAPFS_MINFREE; 4290286625Smav	} 4291168404Spjd 4292286625Smav 4293168404Spjd	/* 4294272483Ssmh	 * Check that we have enough availrmem that memory locking (e.g., via 4295272483Ssmh	 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 4296272483Ssmh	 * stores the number of pages that cannot be locked; when availrmem 4297272483Ssmh	 * drops below pages_pp_maximum, page locking mechanisms such as 4298272483Ssmh	 * page_pp_lock() will fail.) 4299272483Ssmh	 */ 4300286625Smav	n = PAGESIZE * (availrmem - pages_pp_maximum - 4301286625Smav	    arc_pages_pp_reserve); 4302286625Smav	if (n < lowest) { 4303286625Smav		lowest = n; 4304286625Smav		r = FMR_PAGES_PP_MAXIMUM; 4305286625Smav	} 4306272483Ssmh 4307330061Savg#endif /* __FreeBSD__ */ 4308272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4309272483Ssmh	/* 4310168404Spjd	 * If we're on an i386 platform, it's possible that we'll exhaust the 4311168404Spjd	 * kernel heap space before we ever run out of available physical 4312168404Spjd	 * memory. Most checks of the size of the heap_area compare against 4313168404Spjd	 * tune.t_minarmem, which is the minimum available real memory that we 4314168404Spjd	 * can have in the system. However, this is generally fixed at 25 pages 4315168404Spjd	 * which is so low that it's useless. In this comparison, we seek to 4316168404Spjd	 * calculate the total heap size, and reclaim if more than 3/4ths of the 4317185029Spjd	 * heap is allocated. (Or, in the calculation, if less than 1/4th is 4318168404Spjd	 * free) 4319168404Spjd	 */ 4320286655Smav	n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 4321286628Smav	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 4322286625Smav	if (n < lowest) { 4323286625Smav		lowest = n; 4324286625Smav		r = FMR_HEAP_ARENA; 4325270861Ssmh	} 4326281026Smav#define zio_arena	NULL 4327281026Smav#else 4328281026Smav#define zio_arena	heap_arena 4329270861Ssmh#endif 4330281026Smav 4331272483Ssmh	/* 4332272483Ssmh	 * If zio data pages are being allocated out of a separate heap segment, 4333272483Ssmh	 * then enforce that the size of available vmem for this arena remains 4334331383Smav	 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. 4335272483Ssmh	 * 4336331383Smav	 * Note that reducing the arc_zio_arena_free_shift keeps more virtual 4337331383Smav	 * memory (in the zio_arena) free, which can avoid memory 4338331383Smav	 * fragmentation issues. 4339272483Ssmh	 */ 4340286625Smav	if (zio_arena != NULL) { 4341286655Smav		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 4342331383Smav		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4343331383Smav		    arc_zio_arena_free_shift); 4344286625Smav		if (n < lowest) { 4345286625Smav			lowest = n; 4346286625Smav			r = FMR_ZIO_ARENA; 4347286625Smav		} 4348286625Smav	} 4349281026Smav 4350281026Smav	/* 4351281026Smav	 * The above limits know nothing about the real level of KVA fragmentation. 4352281026Smav	 * Start aggressive reclamation if too little sequential KVA is left. 4353281026Smav	 */ 4354286625Smav	if (lowest > 0) { 4355317470Ssmh		n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ? 
4356286655Smav -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 4357286655Smav INT64_MAX; 4358286625Smav if (n < lowest) { 4359286625Smav lowest = n; 4360286625Smav r = FMR_ZIO_FRAG; 4361286625Smav } 4362281109Smav } 4363281026Smav 4364272483Ssmh#else /* _KERNEL */ 4365286625Smav /* Every 100 calls, free a small amount */ 4366168404Spjd if (spa_get_random(100) == 0) 4367286625Smav lowest = -1024; 4368272483Ssmh#endif /* _KERNEL */ 4369270759Ssmh 4370286625Smav last_free_memory = lowest; 4371286625Smav last_free_reason = r; 4372286625Smav DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 4373286625Smav return (lowest); 4374168404Spjd} 4375168404Spjd 4376286625Smav 4377286625Smav/* 4378286625Smav * Determine if the system is under memory pressure and is asking 4379307265Smav * to reclaim memory. A return value of B_TRUE indicates that the system 4380286625Smav * is under memory pressure and that the arc should adjust accordingly. 4381286625Smav */ 4382286625Smavstatic boolean_t 4383286625Smavarc_reclaim_needed(void) 4384286625Smav{ 4385286625Smav return (arc_available_memory() < 0); 4386286625Smav} 4387286625Smav 4388208454Spjdextern kmem_cache_t *zio_buf_cache[]; 4389208454Spjdextern kmem_cache_t *zio_data_buf_cache[]; 4390272527Sdelphijextern kmem_cache_t *range_seg_cache; 4391321610Smavextern kmem_cache_t *abd_chunk_cache; 4392208454Spjd 4393278040Ssmhstatic __noinline void 4394286625Smavarc_kmem_reap_now(void) 4395168404Spjd{ 4396168404Spjd size_t i; 4397168404Spjd kmem_cache_t *prev_cache = NULL; 4398168404Spjd kmem_cache_t *prev_data_cache = NULL; 4399168404Spjd 4400272483Ssmh DTRACE_PROBE(arc__kmem_reap_start); 4401168404Spjd#ifdef _KERNEL 4402185029Spjd if (arc_meta_used >= arc_meta_limit) { 4403185029Spjd /* 4404185029Spjd * We are exceeding our meta-data cache limit. 4405185029Spjd * Purge some DNLC entries to release holds on meta-data. 4406185029Spjd */ 4407185029Spjd dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 4408185029Spjd } 4409168404Spjd#if defined(__i386) 4410168404Spjd /* 4411168404Spjd * Reclaim unused memory from all kmem caches. 4412168404Spjd */ 4413168404Spjd kmem_reap(); 4414168404Spjd#endif 4415168404Spjd#endif 4416168404Spjd 4417168404Spjd for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 4418168404Spjd if (zio_buf_cache[i] != prev_cache) { 4419168404Spjd prev_cache = zio_buf_cache[i]; 4420168404Spjd kmem_cache_reap_now(zio_buf_cache[i]); 4421168404Spjd } 4422168404Spjd if (zio_data_buf_cache[i] != prev_data_cache) { 4423168404Spjd prev_data_cache = zio_data_buf_cache[i]; 4424168404Spjd kmem_cache_reap_now(zio_data_buf_cache[i]); 4425168404Spjd } 4426168404Spjd } 4427321610Smav kmem_cache_reap_now(abd_chunk_cache); 4428168404Spjd kmem_cache_reap_now(buf_cache); 4429286570Smav kmem_cache_reap_now(hdr_full_cache); 4430286570Smav kmem_cache_reap_now(hdr_l2only_cache); 4431272506Sdelphij kmem_cache_reap_now(range_seg_cache); 4432272483Ssmh 4433277300Ssmh#ifdef illumos 4434286625Smav if (zio_arena != NULL) { 4435286625Smav /* 4436286625Smav * Ask the vmem arena to reclaim unused memory from its 4437286625Smav * quantum caches. 4438286625Smav */ 4439272483Ssmh vmem_qcache_reap(zio_arena); 4440286625Smav } 4441272483Ssmh#endif 4442272483Ssmh DTRACE_PROBE(arc__kmem_reap_end); 4443168404Spjd} 4444168404Spjd 4445286763Smav/* 4446321610Smav * Threads can block in arc_get_data_impl() waiting for this thread to evict 4447286763Smav * enough data and signal them to proceed. 
When this happens, the threads in 4448321610Smav * arc_get_data_impl() are sleeping while holding the hash lock for their 4449286763Smav * particular arc header. Thus, we must be careful to never sleep on a 4450286763Smav * hash lock in this thread. This is to prevent the following deadlock: 4451286763Smav * 4452321610Smav * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", 4453286763Smav * waiting for the reclaim thread to signal it. 4454286763Smav * 4455286763Smav * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 4456286763Smav * fails, and goes to sleep forever. 4457286763Smav * 4458286763Smav * This possible deadlock is avoided by always acquiring a hash lock 4459286763Smav * using mutex_tryenter() from arc_reclaim_thread(). 4460286763Smav */ 4461168404Spjdstatic void 4462168404Spjdarc_reclaim_thread(void *dummy __unused) 4463168404Spjd{ 4464296530Smav hrtime_t growtime = 0; 4465168404Spjd callb_cpr_t cpr; 4466168404Spjd 4467286763Smav CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 4468168404Spjd 4469286763Smav mutex_enter(&arc_reclaim_lock); 4470286763Smav while (!arc_reclaim_thread_exit) { 4471286763Smav uint64_t evicted = 0; 4472286763Smav 4473307265Smav /* 4474307265Smav * This is necessary in order for the mdb ::arc dcmd to 4475307265Smav * show up to date information. Since the ::arc command 4476307265Smav * does not call the kstat's update function, without 4477307265Smav * this call, the command may show stale stats for the 4478307265Smav * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 4479307265Smav * with this change, the data might be up to 1 second 4480307265Smav * out of date; but that should suffice. The arc_state_t 4481307265Smav * structures can be queried directly if more accurate 4482307265Smav * information is needed. 4483307265Smav */ 4484307265Smav if (arc_ksp != NULL) 4485307265Smav arc_ksp->ks_update(arc_ksp, KSTAT_READ); 4486307265Smav 4487286763Smav mutex_exit(&arc_reclaim_lock); 4488286763Smav 4489314873Sjpaetzel /* 4490314873Sjpaetzel * We call arc_adjust() before (possibly) calling 4491314873Sjpaetzel * arc_kmem_reap_now(), so that we can wake up 4492321610Smav * arc_get_data_impl() sooner. 4493314873Sjpaetzel */ 4494314873Sjpaetzel evicted = arc_adjust(); 4495314873Sjpaetzel 4496314873Sjpaetzel int64_t free_memory = arc_available_memory(); 4497286625Smav if (free_memory < 0) { 4498168404Spjd 4499286625Smav arc_no_grow = B_TRUE; 4500286625Smav arc_warm = B_TRUE; 4501168404Spjd 4502286625Smav /* 4503286625Smav * Wait at least zfs_grow_retry (default 60) seconds 4504286625Smav * before considering growing. 4505286625Smav */ 4506296530Smav growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 4507168404Spjd 4508286625Smav arc_kmem_reap_now(); 4509286625Smav 4510286625Smav /* 4511286625Smav * If we are still low on memory, shrink the ARC 4512286625Smav * so that we have arc_shrink_min free space. 
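 *
 * (Hypothetical numbers: with arc_c = 8GB and the default
 * arc_shrink_shift of 7, the headroom goal is 64MB; if
 * arc_available_memory() still reports -16MB, to_free below
 * comes out at 80MB.)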
4513286625Smav */ 4514286625Smav free_memory = arc_available_memory(); 4515286625Smav 4516286625Smav int64_t to_free = 4517286625Smav (arc_c >> arc_shrink_shift) - free_memory; 4518286625Smav if (to_free > 0) { 4519330061Savg#ifdef _KERNEL 4520330061Savg#ifdef illumos 4521330061Savg to_free = MAX(to_free, ptob(needfree)); 4522330061Savg#endif 4523330061Savg#endif 4524286625Smav arc_shrink(to_free); 4525168404Spjd } 4526286625Smav } else if (free_memory < arc_c >> arc_no_grow_shift) { 4527286625Smav arc_no_grow = B_TRUE; 4528296530Smav } else if (gethrtime() >= growtime) { 4529286625Smav arc_no_grow = B_FALSE; 4530168404Spjd } 4531168404Spjd 4532286763Smav mutex_enter(&arc_reclaim_lock); 4533168404Spjd 4534286763Smav /* 4535286763Smav * If evicted is zero, we couldn't evict anything via 4536286763Smav * arc_adjust(). This could be due to hash lock 4537286763Smav * collisions, but more likely due to the majority of 4538286763Smav * arc buffers being unevictable. Therefore, even if 4539286763Smav * arc_size is above arc_c, another pass is unlikely to 4540286763Smav * be helpful and could potentially cause us to enter an 4541286763Smav * infinite loop. 4542286763Smav */ 4543286763Smav if (arc_size <= arc_c || evicted == 0) { 4544286763Smav /* 4545286763Smav * We're either no longer overflowing, or we 4546286763Smav * can't evict anything more, so we should wake 4547286763Smav * up any threads before we go to sleep. 4548286763Smav */ 4549286763Smav cv_broadcast(&arc_reclaim_waiters_cv); 4550168404Spjd 4551286763Smav /* 4552286763Smav * Block until signaled, or after one second (we 4553286763Smav * might need to perform arc_kmem_reap_now() 4554286763Smav * even if we aren't being signalled) 4555286763Smav */ 4556286763Smav CALLB_CPR_SAFE_BEGIN(&cpr); 4557296530Smav (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 4558296530Smav &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 4559286763Smav CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 4560286763Smav } 4561286763Smav } 4562286763Smav 4563307265Smav arc_reclaim_thread_exit = B_FALSE; 4564286763Smav cv_broadcast(&arc_reclaim_thread_cv); 4565286763Smav CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 4566286763Smav thread_exit(); 4567286763Smav} 4568286763Smav 4569301997Skibstatic u_int arc_dnlc_evicts_arg; 4570301997Skibextern struct vfsops zfs_vfsops; 4571301997Skib 4572301997Skibstatic void 4573301997Skibarc_dnlc_evicts_thread(void *dummy __unused) 4574301997Skib{ 4575301997Skib callb_cpr_t cpr; 4576301997Skib u_int percent; 4577301997Skib 4578301997Skib CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); 4579301997Skib 4580301997Skib mutex_enter(&arc_dnlc_evicts_lock); 4581301997Skib while (!arc_dnlc_evicts_thread_exit) { 4582301997Skib CALLB_CPR_SAFE_BEGIN(&cpr); 4583301997Skib (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 4584301997Skib CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); 4585301997Skib if (arc_dnlc_evicts_arg != 0) { 4586301997Skib percent = arc_dnlc_evicts_arg; 4587301997Skib mutex_exit(&arc_dnlc_evicts_lock); 4588301997Skib#ifdef _KERNEL 4589301997Skib vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); 4590301997Skib#endif 4591301997Skib mutex_enter(&arc_dnlc_evicts_lock); 4592301997Skib /* 4593301997Skib * Clear our token only after vnlru_free() 4594301997Skib * pass is done, to avoid false queueing of 4595301997Skib * the requests. 
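 *
 * The producer side of this handshake is dnlc_reduce_cache()
 * below; for example, arc_kmem_reap_now() asks for a prune of
 * arc_reduce_dnlc_percent (3% by default) of the vnodes via
 *
 *	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
 *
 * and any request that arrives while the token is still set is
 * simply absorbed into the pass already underway.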
4596301997Skib			 */ 4597301997Skib			arc_dnlc_evicts_arg = 0; 4598301997Skib		} 4599301997Skib	} 4600301997Skib	arc_dnlc_evicts_thread_exit = FALSE; 4601301997Skib	cv_broadcast(&arc_dnlc_evicts_cv); 4602301997Skib	CALLB_CPR_EXIT(&cpr); 4603301997Skib	thread_exit(); 4604301997Skib} 4605301997Skib 4606301997Skibvoid 4607301997Skibdnlc_reduce_cache(void *arg) 4608301997Skib{ 4609301997Skib	u_int percent; 4610301997Skib 4611302012Skib	percent = (u_int)(uintptr_t)arg; 4612301997Skib	mutex_enter(&arc_dnlc_evicts_lock); 4613301997Skib	if (arc_dnlc_evicts_arg == 0) { 4614301997Skib		arc_dnlc_evicts_arg = percent; 4615301997Skib		cv_broadcast(&arc_dnlc_evicts_cv); 4616301997Skib	} 4617301997Skib	mutex_exit(&arc_dnlc_evicts_lock); 4618301997Skib} 4619301997Skib 4620168404Spjd/* 4621168404Spjd * Adapt arc info given the number of bytes we are trying to add and 4622168404Spjd * the state that we are coming from. This function is only called 4623168404Spjd * when we are adding new content to the cache. 4624168404Spjd */ 4625168404Spjdstatic void 4626168404Spjdarc_adapt(int bytes, arc_state_t *state) 4627168404Spjd{ 4628168404Spjd	int mult; 4629208373Smm	uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 4630286766Smav	int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 4631286766Smav	int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 4632168404Spjd 4633185029Spjd	if (state == arc_l2c_only) 4634185029Spjd		return; 4635185029Spjd 4636168404Spjd	ASSERT(bytes > 0); 4637168404Spjd	/* 4638168404Spjd	 * Adapt the target size of the MRU list: 4639168404Spjd	 *	- if we just hit in the MRU ghost list, then increase 4640168404Spjd	 *	  the target size of the MRU list. 4641168404Spjd	 *	- if we just hit in the MFU ghost list, then increase 4642168404Spjd	 *	  the target size of the MFU list by decreasing the 4643168404Spjd	 *	  target size of the MRU list. 4644168404Spjd	 */ 4645168404Spjd	if (state == arc_mru_ghost) { 4646286766Smav		mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 4647209275Smm		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 4648168404Spjd 4649208373Smm		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 4650168404Spjd	} else if (state == arc_mfu_ghost) { 4651208373Smm		uint64_t delta; 4652208373Smm 4653286766Smav		mult = (mfug_size >= mrug_size) ? 
1 : (mrug_size / mfug_size); 4654209275Smm mult = MIN(mult, 10); 4655168404Spjd 4656208373Smm delta = MIN(bytes * mult, arc_p); 4657208373Smm arc_p = MAX(arc_p_min, arc_p - delta); 4658168404Spjd } 4659168404Spjd ASSERT((int64_t)arc_p >= 0); 4660168404Spjd 4661168404Spjd if (arc_reclaim_needed()) { 4662286763Smav cv_signal(&arc_reclaim_thread_cv); 4663168404Spjd return; 4664168404Spjd } 4665168404Spjd 4666168404Spjd if (arc_no_grow) 4667168404Spjd return; 4668168404Spjd 4669168404Spjd if (arc_c >= arc_c_max) 4670168404Spjd return; 4671168404Spjd 4672168404Spjd /* 4673168404Spjd * If we're within (2 * maxblocksize) bytes of the target 4674168404Spjd * cache size, increment the target cache size 4675168404Spjd */ 4676168404Spjd if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 4677272483Ssmh DTRACE_PROBE1(arc__inc_adapt, int, bytes); 4678168404Spjd atomic_add_64(&arc_c, (int64_t)bytes); 4679168404Spjd if (arc_c > arc_c_max) 4680168404Spjd arc_c = arc_c_max; 4681168404Spjd else if (state == arc_anon) 4682168404Spjd atomic_add_64(&arc_p, (int64_t)bytes); 4683168404Spjd if (arc_p > arc_c) 4684168404Spjd arc_p = arc_c; 4685168404Spjd } 4686168404Spjd ASSERT((int64_t)arc_p >= 0); 4687168404Spjd} 4688168404Spjd 4689168404Spjd/* 4690286763Smav * Check if arc_size has grown past our upper threshold, determined by 4691286763Smav * zfs_arc_overflow_shift. 4692168404Spjd */ 4693286763Smavstatic boolean_t 4694286763Smavarc_is_overflowing(void) 4695168404Spjd{ 4696286763Smav /* Always allow at least one block of overflow */ 4697286763Smav uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 4698286763Smav arc_c >> zfs_arc_overflow_shift); 4699185029Spjd 4700286763Smav return (arc_size >= arc_c + overflow); 4701168404Spjd} 4702168404Spjd 4703321610Smavstatic abd_t * 4704321610Smavarc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4705321610Smav{ 4706321610Smav arc_buf_contents_t type = arc_buf_type(hdr); 4707321610Smav 4708321610Smav arc_get_data_impl(hdr, size, tag); 4709321610Smav if (type == ARC_BUFC_METADATA) { 4710321610Smav return (abd_alloc(size, B_TRUE)); 4711321610Smav } else { 4712321610Smav ASSERT(type == ARC_BUFC_DATA); 4713321610Smav return (abd_alloc(size, B_FALSE)); 4714321610Smav } 4715321610Smav} 4716321610Smav 4717321610Smavstatic void * 4718321610Smavarc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4719321610Smav{ 4720321610Smav arc_buf_contents_t type = arc_buf_type(hdr); 4721321610Smav 4722321610Smav arc_get_data_impl(hdr, size, tag); 4723321610Smav if (type == ARC_BUFC_METADATA) { 4724321610Smav return (zio_buf_alloc(size)); 4725321610Smav } else { 4726321610Smav ASSERT(type == ARC_BUFC_DATA); 4727321610Smav return (zio_data_buf_alloc(size)); 4728321610Smav } 4729321610Smav} 4730321610Smav 4731168404Spjd/* 4732307265Smav * Allocate a block and return it to the caller. If we are hitting the 4733307265Smav * hard limit for the cache size, we must sleep, waiting for the eviction 4734307265Smav * thread to catch up. If we're past the target size but below the hard 4735307265Smav * limit, we'll only signal the reclaim thread and continue on. 
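 *
 * "Hard limit" here means arc_is_overflowing(): arc_size beyond
 * arc_c + MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift).
 * For example, with arc_c = 4GB and the default shift of 8, that
 * is a 16MB allowance, so callers only sleep once the ARC is more
 * than 16MB past its target size.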
4736168404Spjd */ 4737321610Smavstatic void 4738321610Smavarc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4739168404Spjd{ 4740321610Smav	arc_state_t *state = hdr->b_l1hdr.b_state; 4741321610Smav	arc_buf_contents_t type = arc_buf_type(hdr); 4742168404Spjd 4743168404Spjd	arc_adapt(size, state); 4744168404Spjd 4745168404Spjd	/* 4746286763Smav	 * If arc_size is currently overflowing, and has grown past our 4747286763Smav	 * upper limit, we must be adding data faster than the evict 4748286763Smav	 * thread can evict. Thus, to ensure we don't compound the 4749286763Smav	 * problem by adding more data and forcing arc_size to grow even 4750286763Smav	 * further past its target size, we halt and wait for the 4751286763Smav	 * eviction thread to catch up. 4752286763Smav	 * 4753286763Smav	 * It's also possible that the reclaim thread is unable to evict 4754286763Smav	 * enough buffers to get arc_size below the overflow limit (e.g. 4755286763Smav	 * due to buffers being un-evictable, or hash lock collisions). 4756286763Smav	 * In this case, we want to proceed regardless of whether we're 4757286763Smav	 * overflowing; thus we don't use a while loop here. 4758168404Spjd	 */ 4759286763Smav	if (arc_is_overflowing()) { 4760286763Smav		mutex_enter(&arc_reclaim_lock); 4761286763Smav 4762286763Smav		/* 4763286763Smav		 * Now that we've acquired the lock, we may no longer be 4764286763Smav		 * over the overflow limit; let's check. 4765286763Smav		 * 4766286763Smav		 * We're ignoring the case of spurious wake ups. If that 4767286763Smav		 * were to happen, it'd let this thread consume an ARC 4768286763Smav		 * buffer before it should have (i.e. before we're under 4769286763Smav		 * the overflow limit and were signalled by the reclaim 4770286763Smav		 * thread). As long as that is a rare occurrence, it 4771286763Smav		 * shouldn't cause any harm. 4772286763Smav		 */ 4773286763Smav		if (arc_is_overflowing()) { 4774286763Smav			cv_signal(&arc_reclaim_thread_cv); 4775286763Smav			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 4776168404Spjd		} 4777286763Smav 4778286763Smav		mutex_exit(&arc_reclaim_lock); 4779168404Spjd	} 4780168404Spjd 4781307265Smav	VERIFY3U(hdr->b_type, ==, type); 4782286763Smav	if (type == ARC_BUFC_METADATA) { 4783286763Smav		arc_space_consume(size, ARC_SPACE_META); 4784168404Spjd	} else { 4785286763Smav		arc_space_consume(size, ARC_SPACE_DATA); 4786168404Spjd	} 4787286763Smav 4788168404Spjd	/* 4789168404Spjd	 * Update the state size. Note that ghost states have a 4790168404Spjd	 * "ghost size" and so don't need to be updated. 4791168404Spjd	 */ 4792307265Smav	if (!GHOST_STATE(state)) { 4793168404Spjd 4794307265Smav		(void) refcount_add_many(&state->arcs_size, size, tag); 4795286763Smav 4796286763Smav		/* 4797286763Smav		 * If this is reached via arc_read, the link is 4798286763Smav		 * protected by the hash lock. If reached via 4799286763Smav		 * arc_buf_alloc, the header should not be accessed by 4800286763Smav		 * any other thread. And, if reached via arc_read_done, 4801286763Smav		 * the hash lock will protect it if it's found in the 4802286763Smav		 * hash table; otherwise no other thread should be 4803286763Smav		 * trying to [add|remove]_reference it. 
4804286763Smav */ 4805286763Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4806286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4807307265Smav (void) refcount_add_many(&state->arcs_esize[type], 4808307265Smav size, tag); 4809168404Spjd } 4810307265Smav 4811168404Spjd /* 4812168404Spjd * If we are growing the cache, and we are adding anonymous 4813168404Spjd * data, and we have outgrown arc_p, update arc_p 4814168404Spjd */ 4815286570Smav if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 4816286766Smav (refcount_count(&arc_anon->arcs_size) + 4817286766Smav refcount_count(&arc_mru->arcs_size) > arc_p)) 4818168404Spjd arc_p = MIN(arc_c, arc_p + size); 4819168404Spjd } 4820205231Skmacy ARCSTAT_BUMP(arcstat_allocated); 4821168404Spjd} 4822168404Spjd 4823321610Smavstatic void 4824321610Smavarc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) 4825321610Smav{ 4826321610Smav arc_free_data_impl(hdr, size, tag); 4827321610Smav abd_free(abd); 4828321610Smav} 4829321610Smav 4830321610Smavstatic void 4831321610Smavarc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) 4832321610Smav{ 4833321610Smav arc_buf_contents_t type = arc_buf_type(hdr); 4834321610Smav 4835321610Smav arc_free_data_impl(hdr, size, tag); 4836321610Smav if (type == ARC_BUFC_METADATA) { 4837321610Smav zio_buf_free(buf, size); 4838321610Smav } else { 4839321610Smav ASSERT(type == ARC_BUFC_DATA); 4840321610Smav zio_data_buf_free(buf, size); 4841321610Smav } 4842321610Smav} 4843321610Smav 4844168404Spjd/* 4845307265Smav * Free the arc data buffer. 4846307265Smav */ 4847307265Smavstatic void 4848321610Smavarc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4849307265Smav{ 4850307265Smav arc_state_t *state = hdr->b_l1hdr.b_state; 4851307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 4852307265Smav 4853307265Smav /* protected by hash lock, if in the hash table */ 4854307265Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4855307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4856307265Smav ASSERT(state != arc_anon && state != arc_l2c_only); 4857307265Smav 4858307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 4859307265Smav size, tag); 4860307265Smav } 4861307265Smav (void) refcount_remove_many(&state->arcs_size, size, tag); 4862307265Smav 4863307265Smav VERIFY3U(hdr->b_type, ==, type); 4864307265Smav if (type == ARC_BUFC_METADATA) { 4865307265Smav arc_space_return(size, ARC_SPACE_META); 4866307265Smav } else { 4867307265Smav ASSERT(type == ARC_BUFC_DATA); 4868307265Smav arc_space_return(size, ARC_SPACE_DATA); 4869307265Smav } 4870307265Smav} 4871307265Smav 4872307265Smav/* 4873168404Spjd * This routine is called whenever a buffer is accessed. 4874168404Spjd * NOTE: the hash lock is dropped in this function. 4875168404Spjd */ 4876168404Spjdstatic void 4877275811Sdelphijarc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4878168404Spjd{ 4879219089Spjd clock_t now; 4880219089Spjd 4881168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 4882286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 4883168404Spjd 4884286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 4885168404Spjd /* 4886168404Spjd * This buffer is not in the cache, and does not 4887168404Spjd * appear in our "ghost" list. Add the new buffer 4888168404Spjd * to the MRU state. 
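 *
 * (For orientation, the transitions handled in this function are
 * roughly:
 *
 *	anon      -> mru	mru_ghost -> mfu (mru if prefetch)
 *	mru       -> mfu	mfu_ghost -> mfu (mru if prefetch)
 *	l2c_only  -> mfu	mfu       -> mfu
 *
 * where mru -> mfu requires a second access beyond ARC_MINTIME.)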
4889168404Spjd */ 4890168404Spjd 4891286570Smav ASSERT0(hdr->b_l1hdr.b_arc_access); 4892286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4893275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4894275811Sdelphij arc_change_state(arc_mru, hdr, hash_lock); 4895168404Spjd 4896286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru) { 4897219089Spjd now = ddi_get_lbolt(); 4898219089Spjd 4899168404Spjd /* 4900168404Spjd * If this buffer is here because of a prefetch, then either: 4901168404Spjd * - clear the flag if this is a "referencing" read 4902168404Spjd * (any subsequent access will bump this into the MFU state). 4903168404Spjd * or 4904168404Spjd * - move the buffer to the head of the list if this is 4905168404Spjd * another prefetch (to make it less likely to be evicted). 4906168404Spjd */ 4907286570Smav if (HDR_PREFETCH(hdr)) { 4908286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4909286763Smav /* link protected by hash lock */ 4910286763Smav ASSERT(multilist_link_active( 4911286570Smav &hdr->b_l1hdr.b_arc_node)); 4912168404Spjd } else { 4913307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 4914168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 4915168404Spjd } 4916286570Smav hdr->b_l1hdr.b_arc_access = now; 4917168404Spjd return; 4918168404Spjd } 4919168404Spjd 4920168404Spjd /* 4921168404Spjd * This buffer has been "accessed" only once so far, 4922168404Spjd * but it is still in the cache. Move it to the MFU 4923168404Spjd * state. 4924168404Spjd */ 4925286570Smav if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 4926168404Spjd /* 4927168404Spjd * More than 125ms have passed since we 4928168404Spjd * instantiated this buffer. Move it to the 4929168404Spjd * most frequently used state. 4930168404Spjd */ 4931286570Smav hdr->b_l1hdr.b_arc_access = now; 4932275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4933275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 4934168404Spjd } 4935168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 4936286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 4937168404Spjd arc_state_t *new_state; 4938168404Spjd /* 4939168404Spjd * This buffer has been "accessed" recently, but 4940168404Spjd * was evicted from the cache. Move it to the 4941168404Spjd * MFU state. 4942168404Spjd */ 4943168404Spjd 4944286570Smav if (HDR_PREFETCH(hdr)) { 4945168404Spjd new_state = arc_mru; 4946286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 4947307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 4948275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4949168404Spjd } else { 4950168404Spjd new_state = arc_mfu; 4951275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4952168404Spjd } 4953168404Spjd 4954286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4955275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 4956168404Spjd 4957168404Spjd ARCSTAT_BUMP(arcstat_mru_ghost_hits); 4958286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu) { 4959168404Spjd /* 4960168404Spjd * This buffer has been accessed more than once and is 4961168404Spjd * still in the cache. Keep it in the MFU state. 4962168404Spjd * 4963168404Spjd * NOTE: an add_reference() that occurred when we did 4964168404Spjd * the arc_read() will have kicked this off the list. 4965168404Spjd * If it was a prefetch, we will explicitly move it to 4966168404Spjd * the head of the list now. 
4967168404Spjd */ 4968286570Smav if ((HDR_PREFETCH(hdr)) != 0) { 4969286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4970286763Smav /* link protected by hash_lock */ 4971286763Smav ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4972168404Spjd } 4973168404Spjd ARCSTAT_BUMP(arcstat_mfu_hits); 4974286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4975286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 4976168404Spjd arc_state_t *new_state = arc_mfu; 4977168404Spjd /* 4978168404Spjd * This buffer has been accessed more than once but has 4979168404Spjd * been evicted from the cache. Move it back to the 4980168404Spjd * MFU state. 4981168404Spjd */ 4982168404Spjd 4983286570Smav if (HDR_PREFETCH(hdr)) { 4984168404Spjd /* 4985168404Spjd * This is a prefetch access... 4986168404Spjd * move this block back to the MRU state. 4987168404Spjd */ 4988286570Smav ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 4989168404Spjd new_state = arc_mru; 4990168404Spjd } 4991168404Spjd 4992286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4993275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4994275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 4995168404Spjd 4996168404Spjd ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 4997286570Smav } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 4998185029Spjd /* 4999185029Spjd * This buffer is on the 2nd Level ARC. 5000185029Spjd */ 5001185029Spjd 5002286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5003275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5004275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 5005168404Spjd } else { 5006168404Spjd ASSERT(!"invalid arc state"); 5007168404Spjd } 5008168404Spjd} 5009168404Spjd 5010168404Spjd/* a generic arc_done_func_t which you can use */ 5011168404Spjd/* ARGSUSED */ 5012168404Spjdvoid 5013168404Spjdarc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 5014168404Spjd{ 5015219089Spjd if (zio == NULL || zio->io_error == 0) 5016321535Smav bcopy(buf->b_data, arg, arc_buf_size(buf)); 5017307265Smav arc_buf_destroy(buf, arg); 5018168404Spjd} 5019168404Spjd 5020185029Spjd/* a generic arc_done_func_t */ 5021168404Spjdvoid 5022168404Spjdarc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 5023168404Spjd{ 5024168404Spjd arc_buf_t **bufp = arg; 5025168404Spjd if (zio && zio->io_error) { 5026307265Smav arc_buf_destroy(buf, arg); 5027168404Spjd *bufp = NULL; 5028168404Spjd } else { 5029168404Spjd *bufp = buf; 5030219089Spjd ASSERT(buf->b_data); 5031168404Spjd } 5032168404Spjd} 5033168404Spjd 5034168404Spjdstatic void 5035307265Smavarc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) 5036307265Smav{ 5037307265Smav if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { 5038307265Smav ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); 5039307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 5040307265Smav } else { 5041307265Smav if (HDR_COMPRESSION_ENABLED(hdr)) { 5042307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, 5043307265Smav BP_GET_COMPRESS(bp)); 5044307265Smav } 5045307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); 5046307265Smav ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); 5047307265Smav } 5048307265Smav} 5049307265Smav 5050307265Smavstatic void 5051168404Spjdarc_read_done(zio_t *zio) 5052168404Spjd{ 5053307265Smav arc_buf_hdr_t *hdr = zio->io_private; 5054268075Sdelphij kmutex_t *hash_lock = NULL; 5055321535Smav arc_callback_t *callback_list; 5056321535Smav arc_callback_t *acb; 5057321535Smav boolean_t freeable = B_FALSE; 5058321535Smav 
boolean_t no_zio_error = (zio->io_error == 0); 5059168404Spjd 5060168404Spjd	/* 5061168404Spjd	 * The hdr was inserted into the hash table and removed from lists 5062168404Spjd	 * prior to starting I/O. We should find this header, since 5063168404Spjd	 * it's in the hash table, and it should be legit since it's 5064168404Spjd	 * not possible to evict it during the I/O. The only possible 5065168404Spjd	 * reason for it not to be found is if it was freed during the 5066168404Spjd	 * read. 5067168404Spjd	 */ 5068268075Sdelphij	if (HDR_IN_HASH_TABLE(hdr)) { 5069268075Sdelphij		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 5070268075Sdelphij		ASSERT3U(hdr->b_dva.dva_word[0], ==, 5071268075Sdelphij		    BP_IDENTITY(zio->io_bp)->dva_word[0]); 5072268075Sdelphij		ASSERT3U(hdr->b_dva.dva_word[1], ==, 5073268075Sdelphij		    BP_IDENTITY(zio->io_bp)->dva_word[1]); 5074168404Spjd 5075268075Sdelphij		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 5076268075Sdelphij		    &hash_lock); 5077168404Spjd 5078307265Smav		ASSERT((found == hdr && 5079268075Sdelphij		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 5080268075Sdelphij		    (found == hdr && HDR_L2_READING(hdr))); 5081307265Smav		ASSERT3P(hash_lock, !=, NULL); 5082268075Sdelphij	} 5083268075Sdelphij 5084321535Smav	if (no_zio_error) { 5085307265Smav		/* byteswap if necessary */ 5086307265Smav		if (BP_SHOULD_BYTESWAP(zio->io_bp)) { 5087307265Smav			if (BP_GET_LEVEL(zio->io_bp) > 0) { 5088307265Smav				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; 5089307265Smav			} else { 5090307265Smav				hdr->b_l1hdr.b_byteswap = 5091307265Smav				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 5092307265Smav			} 5093307265Smav		} else { 5094307265Smav			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 5095307265Smav		} 5096307265Smav	} 5097307265Smav 5098307265Smav	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); 5099286570Smav	if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 5100307265Smav		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); 5101206796Spjd 5102286570Smav	callback_list = hdr->b_l1hdr.b_acb; 5103307265Smav	ASSERT3P(callback_list, !=, NULL); 5104168404Spjd 5105321535Smav	if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { 5106219089Spjd		/* 5107219089Spjd		 * Only call arc_access on anonymous buffers. This is because 5108219089Spjd		 * if we've issued an I/O for an evicted buffer, we've already 5109219089Spjd		 * called arc_access (to prevent any simultaneous readers from 5110219089Spjd		 * getting confused). 5111219089Spjd		 */ 5112219089Spjd		arc_access(hdr, hash_lock); 5113219089Spjd	} 5114219089Spjd 5115321535Smav	/* 5116321535Smav	 * If a read request has a callback (i.e. acb_done is not NULL), then we 5117321535Smav	 * make a buf containing the data according to the parameters which were 5118321535Smav	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we 5119321535Smav	 * aren't needlessly decompressing the data multiple times. 
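 *
 * For example, if one waiter requested a compressed buf
 * (acb_compressed) while another wants it uncompressed, the loop
 * below can satisfy the first by sharing the compressed data and
 * only performs the decompression once for the second.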
5120321535Smav */ 5121321535Smav int callback_cnt = 0; 5122321535Smav for (acb = callback_list; acb != NULL; acb = acb->acb_next) { 5123321535Smav if (!acb->acb_done) 5124321535Smav continue; 5125321535Smav 5126321535Smav /* This is a demand read since prefetches don't use callbacks */ 5127321535Smav callback_cnt++; 5128321535Smav 5129321535Smav int error = arc_buf_alloc_impl(hdr, acb->acb_private, 5130321535Smav acb->acb_compressed, no_zio_error, &acb->acb_buf); 5131321535Smav if (no_zio_error) { 5132321535Smav zio->io_error = error; 5133168404Spjd } 5134168404Spjd } 5135286570Smav hdr->b_l1hdr.b_acb = NULL; 5136307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5137321535Smav if (callback_cnt == 0) { 5138307265Smav ASSERT(HDR_PREFETCH(hdr)); 5139307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 5140321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5141219089Spjd } 5142168404Spjd 5143286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 5144286570Smav callback_list != NULL); 5145168404Spjd 5146321535Smav if (no_zio_error) { 5147307265Smav arc_hdr_verify(hdr, zio->io_bp); 5148307265Smav } else { 5149307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 5150286570Smav if (hdr->b_l1hdr.b_state != arc_anon) 5151168404Spjd arc_change_state(arc_anon, hdr, hash_lock); 5152168404Spjd if (HDR_IN_HASH_TABLE(hdr)) 5153168404Spjd buf_hash_remove(hdr); 5154286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5155168404Spjd } 5156168404Spjd 5157168404Spjd /* 5158168404Spjd * Broadcast before we drop the hash_lock to avoid the possibility 5159168404Spjd * that the hdr (and hence the cv) might be freed before we get to 5160168404Spjd * the cv_broadcast(). 5161168404Spjd */ 5162286570Smav cv_broadcast(&hdr->b_l1hdr.b_cv); 5163168404Spjd 5164286570Smav if (hash_lock != NULL) { 5165168404Spjd mutex_exit(hash_lock); 5166168404Spjd } else { 5167168404Spjd /* 5168168404Spjd * This block was freed while we waited for the read to 5169168404Spjd * complete. It has been removed from the hash table and 5170168404Spjd * moved to the anonymous state (so that it won't show up 5171168404Spjd * in the cache). 5172168404Spjd */ 5173286570Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 5174286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5175168404Spjd } 5176168404Spjd 5177168404Spjd /* execute each callback and free its structure */ 5178168404Spjd while ((acb = callback_list) != NULL) { 5179168404Spjd if (acb->acb_done) 5180168404Spjd acb->acb_done(zio, acb->acb_buf, acb->acb_private); 5181168404Spjd 5182168404Spjd if (acb->acb_zio_dummy != NULL) { 5183168404Spjd acb->acb_zio_dummy->io_error = zio->io_error; 5184168404Spjd zio_nowait(acb->acb_zio_dummy); 5185168404Spjd } 5186168404Spjd 5187168404Spjd callback_list = acb->acb_next; 5188168404Spjd kmem_free(acb, sizeof (arc_callback_t)); 5189168404Spjd } 5190168404Spjd 5191168404Spjd if (freeable) 5192168404Spjd arc_hdr_destroy(hdr); 5193168404Spjd} 5194168404Spjd 5195168404Spjd/* 5196286762Smav * "Read" the block at the specified DVA (in bp) via the 5197168404Spjd * cache. If the block is found in the cache, invoke the provided 5198168404Spjd * callback immediately and return. Note that the `zio' parameter 5199168404Spjd * in the callback will be NULL in this case, since no IO was 5200168404Spjd * required. If the block is not in the cache pass the read request 5201168404Spjd * on to the spa with a substitute callback function, so that the 5202168404Spjd * requested block will be added to the cache. 
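 *
 * A typical synchronous caller (a sketch modeled on how the DMU
 * uses this interface) looks roughly like:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * where arc_getbuf_func (defined earlier in this file) stores the
 * resulting buf in abuf, which the caller eventually releases with
 * arc_buf_destroy().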
5203168404Spjd * 5204168404Spjd * If a read request arrives for a block that has a read in-progress, 5205168404Spjd * either wait for the in-progress read to complete (and return the 5206168404Spjd * results); or, if this is a read with a "done" func, add a record 5207168404Spjd * to the read to invoke the "done" func when the read completes, 5208168404Spjd * and return; or just return. 5209168404Spjd * 5210168404Spjd * arc_read_done() will invoke all the requested "done" functions 5211168404Spjd * for readers of this block. 5212168404Spjd */ 5213168404Spjdint 5214246666Smmarc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 5215275811Sdelphij void *private, zio_priority_t priority, int zio_flags, 5216275811Sdelphij arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 5217168404Spjd{ 5218268075Sdelphij arc_buf_hdr_t *hdr = NULL; 5219268075Sdelphij kmutex_t *hash_lock = NULL; 5220185029Spjd zio_t *rzio; 5221228103Smm uint64_t guid = spa_load_guid(spa); 5222321535Smav boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; 5223168404Spjd 5224268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp) || 5225268075Sdelphij BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 5226268075Sdelphij 5227168404Spjdtop: 5228268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 5229268075Sdelphij /* 5230268075Sdelphij * Embedded BP's have no DVA and require no I/O to "read". 5231268075Sdelphij * Create an anonymous arc buf to back it. 5232268075Sdelphij */ 5233268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 5234268075Sdelphij } 5235168404Spjd 5236321610Smav if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { 5237307265Smav arc_buf_t *buf = NULL; 5238275811Sdelphij *arc_flags |= ARC_FLAG_CACHED; 5239168404Spjd 5240168404Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 5241168404Spjd 5242287702Sdelphij if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 5243287702Sdelphij priority == ZIO_PRIORITY_SYNC_READ) { 5244287702Sdelphij /* 5245287702Sdelphij * This sync read must wait for an 5246287702Sdelphij * in-progress async read (e.g. a predictive 5247287702Sdelphij * prefetch). Async reads are queued 5248287702Sdelphij * separately at the vdev_queue layer, so 5249287702Sdelphij * this is a form of priority inversion. 5250287702Sdelphij * Ideally, we would "inherit" the demand 5251287702Sdelphij * i/o's priority by moving the i/o from 5252287702Sdelphij * the async queue to the synchronous queue, 5253287702Sdelphij * but there is currently no mechanism to do 5254287702Sdelphij * so. Track this so that we can evaluate 5255287702Sdelphij * the magnitude of this potential performance 5256287702Sdelphij * problem. 5257287702Sdelphij * 5258287702Sdelphij * Note that if the prefetch i/o is already 5259287702Sdelphij * active (has been issued to the device), 5260287702Sdelphij * the prefetch improved performance, because 5261287702Sdelphij * we issued it sooner than we would have 5262287702Sdelphij * without the prefetch. 
5263287702Sdelphij */ 5264287702Sdelphij DTRACE_PROBE1(arc__sync__wait__for__async, 5265287702Sdelphij arc_buf_hdr_t *, hdr); 5266287702Sdelphij ARCSTAT_BUMP(arcstat_sync_wait_for_async); 5267287702Sdelphij } 5268287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5269307265Smav arc_hdr_clear_flags(hdr, 5270307265Smav ARC_FLAG_PREDICTIVE_PREFETCH); 5271287702Sdelphij } 5272287702Sdelphij 5273275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) { 5274286570Smav cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 5275168404Spjd mutex_exit(hash_lock); 5276168404Spjd goto top; 5277168404Spjd } 5278275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5279168404Spjd 5280168404Spjd if (done) { 5281287702Sdelphij arc_callback_t *acb = NULL; 5282168404Spjd 5283168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), 5284168404Spjd KM_SLEEP); 5285168404Spjd acb->acb_done = done; 5286168404Spjd acb->acb_private = private; 5287321535Smav acb->acb_compressed = compressed_read; 5288168404Spjd if (pio != NULL) 5289168404Spjd acb->acb_zio_dummy = zio_null(pio, 5290209962Smm spa, NULL, NULL, NULL, zio_flags); 5291168404Spjd 5292307265Smav ASSERT3P(acb->acb_done, !=, NULL); 5293286570Smav acb->acb_next = hdr->b_l1hdr.b_acb; 5294286570Smav hdr->b_l1hdr.b_acb = acb; 5295168404Spjd mutex_exit(hash_lock); 5296168404Spjd return (0); 5297168404Spjd } 5298168404Spjd mutex_exit(hash_lock); 5299168404Spjd return (0); 5300168404Spjd } 5301168404Spjd 5302286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5303286570Smav hdr->b_l1hdr.b_state == arc_mfu); 5304168404Spjd 5305168404Spjd if (done) { 5306287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5307287702Sdelphij /* 5308287702Sdelphij * This is a demand read which does not have to 5309287702Sdelphij * wait for i/o because we did a predictive 5310287702Sdelphij * prefetch i/o for it, which has completed. 5311287702Sdelphij */ 5312287702Sdelphij DTRACE_PROBE1( 5313287702Sdelphij arc__demand__hit__predictive__prefetch, 5314287702Sdelphij arc_buf_hdr_t *, hdr); 5315287702Sdelphij ARCSTAT_BUMP( 5316287702Sdelphij arcstat_demand_hit_predictive_prefetch); 5317307265Smav arc_hdr_clear_flags(hdr, 5318307265Smav ARC_FLAG_PREDICTIVE_PREFETCH); 5319287702Sdelphij } 5320307265Smav ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); 5321307265Smav 5322321535Smav /* Get a buf with the desired data in it. 
*/ 5323321535Smav VERIFY0(arc_buf_alloc_impl(hdr, private, 5324321535Smav compressed_read, B_TRUE, &buf)); 5325275811Sdelphij } else if (*arc_flags & ARC_FLAG_PREFETCH && 5326286570Smav refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5327307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5328168404Spjd } 5329168404Spjd DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5330168404Spjd arc_access(hdr, hash_lock); 5331275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 5332307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5333168404Spjd mutex_exit(hash_lock); 5334168404Spjd ARCSTAT_BUMP(arcstat_hits); 5335286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5336286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5337168404Spjd data, metadata, hits); 5338168404Spjd 5339168404Spjd if (done) 5340168404Spjd done(NULL, buf, private); 5341168404Spjd } else { 5342307265Smav uint64_t lsize = BP_GET_LSIZE(bp); 5343307265Smav uint64_t psize = BP_GET_PSIZE(bp); 5344268075Sdelphij arc_callback_t *acb; 5345185029Spjd vdev_t *vd = NULL; 5346247187Smm uint64_t addr = 0; 5347208373Smm boolean_t devw = B_FALSE; 5348307265Smav uint64_t size; 5349168404Spjd 5350168404Spjd if (hdr == NULL) { 5351168404Spjd /* this block is not in the cache */ 5352268075Sdelphij arc_buf_hdr_t *exists = NULL; 5353168404Spjd arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 5354307265Smav hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 5355307265Smav BP_GET_COMPRESS(bp), type); 5356307265Smav 5357268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 5358268075Sdelphij hdr->b_dva = *BP_IDENTITY(bp); 5359268075Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 5360268075Sdelphij exists = buf_hash_insert(hdr, &hash_lock); 5361268075Sdelphij } 5362268075Sdelphij if (exists != NULL) { 5363168404Spjd /* somebody beat us to the hash insert */ 5364168404Spjd mutex_exit(hash_lock); 5365219089Spjd buf_discard_identity(hdr); 5366307265Smav arc_hdr_destroy(hdr); 5367168404Spjd goto top; /* restart the IO request */ 5368168404Spjd } 5369168404Spjd } else { 5370286570Smav /* 5371286570Smav * This block is in the ghost cache. If it was L2-only 5372286570Smav * (and thus didn't have an L1 hdr), we realloc the 5373286570Smav * header to add an L1 hdr. 5374286570Smav */ 5375286570Smav if (!HDR_HAS_L1HDR(hdr)) { 5376286570Smav hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 5377286570Smav hdr_full_cache); 5378286570Smav } 5379321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 5380286570Smav ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 5381168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5382286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5383286763Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 5384321535Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 5385168404Spjd 5386287702Sdelphij /* 5387307265Smav * This is a delicate dance that we play here. 5388307265Smav * This hdr is in the ghost list so we access it 5389307265Smav * to move it out of the ghost list before we 5390307265Smav * initiate the read. If it's a prefetch then 5391307265Smav * it won't have a callback so we'll remove the 5392307265Smav * reference that arc_buf_alloc_impl() created. We 5393307265Smav * do this after we've called arc_access() to 5394307265Smav * avoid hitting an assert in remove_reference(). 
5395287702Sdelphij */ 5396219089Spjd arc_access(hdr, hash_lock); 5397321610Smav arc_hdr_alloc_pabd(hdr); 5398168404Spjd } 5399321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5400307265Smav size = arc_hdr_size(hdr); 5401168404Spjd 5402307265Smav /* 5403307265Smav * If compression is enabled on the hdr, then we will do 5404307265Smav * RAW I/O and store the compressed data in the hdr's 5405307265Smav * data block. Otherwise, the hdr's data block will contain 5406307265Smav * the uncompressed data. 5407307265Smav */ 5408307265Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 5409307265Smav zio_flags |= ZIO_FLAG_RAW; 5410307265Smav } 5411307265Smav 5412307265Smav if (*arc_flags & ARC_FLAG_PREFETCH) 5413307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5414307265Smav if (*arc_flags & ARC_FLAG_L2CACHE) 5415307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5416307265Smav if (BP_GET_LEVEL(bp) > 0) 5417307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); 5418287702Sdelphij if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 5419307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); 5420286570Smav ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 5421219089Spjd 5422168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 5423168404Spjd acb->acb_done = done; 5424168404Spjd acb->acb_private = private; 5425321535Smav acb->acb_compressed = compressed_read; 5426168404Spjd 5427307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 5428286570Smav hdr->b_l1hdr.b_acb = acb; 5429307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5430168404Spjd 5431286570Smav if (HDR_HAS_L2HDR(hdr) && 5432286570Smav (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 5433286570Smav devw = hdr->b_l2hdr.b_dev->l2ad_writing; 5434286570Smav addr = hdr->b_l2hdr.b_daddr; 5435185029Spjd /* 5436185029Spjd * Lock out device removal. 5437185029Spjd */ 5438185029Spjd if (vdev_is_dead(vd) || 5439185029Spjd !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 5440185029Spjd vd = NULL; 5441185029Spjd } 5442185029Spjd 5443307265Smav if (priority == ZIO_PRIORITY_ASYNC_READ) 5444307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 5445307265Smav else 5446307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 5447307265Smav 5448268075Sdelphij if (hash_lock != NULL) 5449268075Sdelphij mutex_exit(hash_lock); 5450168404Spjd 5451251629Sdelphij /* 5452251629Sdelphij * At this point, we have a level 1 cache miss. Try again in 5453251629Sdelphij * L2ARC if possible. 5454251629Sdelphij */ 5455307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); 5456307265Smav 5457219089Spjd DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 5458307265Smav uint64_t, lsize, zbookmark_phys_t *, zb); 5459168404Spjd ARCSTAT_BUMP(arcstat_misses); 5460286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5461286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5462168404Spjd data, metadata, misses); 5463228392Spjd#ifdef _KERNEL 5464297633Strasz#ifdef RACCT 5465297633Strasz if (racct_enable) { 5466297633Strasz PROC_LOCK(curproc); 5467297633Strasz racct_add_force(curproc, RACCT_READBPS, size); 5468297633Strasz racct_add_force(curproc, RACCT_READIOPS, 1); 5469297633Strasz PROC_UNLOCK(curproc); 5470297633Strasz } 5471297633Strasz#endif /* RACCT */ 5472228392Spjd curthread->td_ru.ru_inblock++; 5473228392Spjd#endif 5474168404Spjd 5475208373Smm if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 5476185029Spjd /* 5477185029Spjd * Read from the L2ARC if the following are true: 5478185029Spjd * 1. 
The L2ARC vdev was previously cached. 5479185029Spjd * 2. This buffer still has L2ARC metadata. 5480185029Spjd * 3. This buffer isn't currently writing to the L2ARC. 5481185029Spjd * 4. The L2ARC entry wasn't evicted, which may 5482185029Spjd * also have invalidated the vdev. 5483208373Smm * 5. This isn't a prefetch, or l2arc_noprefetch is not set. 5484185029Spjd */ 5485286570Smav if (HDR_HAS_L2HDR(hdr) && 5486208373Smm !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 5487208373Smm !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 5488185029Spjd l2arc_read_callback_t *cb; 5489321610Smav abd_t *abd; 5490321610Smav uint64_t asize; 5491185029Spjd 5492185029Spjd DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 5493185029Spjd ARCSTAT_BUMP(arcstat_l2_hits); 5494185029Spjd 5495185029Spjd cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 5496185029Spjd KM_SLEEP); 5497307265Smav cb->l2rcb_hdr = hdr; 5498185029Spjd cb->l2rcb_bp = *bp; 5499185029Spjd cb->l2rcb_zb = *zb; 5500185029Spjd cb->l2rcb_flags = zio_flags; 5501321610Smav 5502321610Smav asize = vdev_psize_to_asize(vd, size); 5503307265Smav if (asize != size) { 5504321610Smav abd = abd_alloc_for_io(asize, 5505321610Smav HDR_ISTYPE_METADATA(hdr)); 5506321610Smav cb->l2rcb_abd = abd; 5507297848Savg } else { 5508321610Smav abd = hdr->b_l1hdr.b_pabd; 5509297848Savg } 5510185029Spjd 5511247187Smm ASSERT(addr >= VDEV_LABEL_START_SIZE && 5512321610Smav addr + asize <= vd->vdev_psize - 5513247187Smm VDEV_LABEL_END_SIZE); 5514247187Smm 5515185029Spjd /* 5516185029Spjd * l2arc read. The SCL_L2ARC lock will be 5517185029Spjd * released by l2arc_read_done(). 5518251478Sdelphij * Issue a null zio if the underlying buffer 5519251478Sdelphij * was squashed to zero size by compression. 5520185029Spjd */ 5521307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), !=, 5522307265Smav ZIO_COMPRESS_EMPTY); 5523307265Smav rzio = zio_read_phys(pio, vd, addr, 5524321610Smav asize, abd, 5525307265Smav ZIO_CHECKSUM_OFF, 5526307265Smav l2arc_read_done, cb, priority, 5527307265Smav zio_flags | ZIO_FLAG_DONT_CACHE | 5528307265Smav ZIO_FLAG_CANFAIL | 5529307265Smav ZIO_FLAG_DONT_PROPAGATE | 5530307265Smav ZIO_FLAG_DONT_RETRY, B_FALSE); 5531185029Spjd DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 5532185029Spjd zio_t *, rzio); 5533307265Smav ARCSTAT_INCR(arcstat_l2_read_bytes, size); 5534185029Spjd 5535275811Sdelphij if (*arc_flags & ARC_FLAG_NOWAIT) { 5536185029Spjd zio_nowait(rzio); 5537185029Spjd return (0); 5538185029Spjd } 5539185029Spjd 5540275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_WAIT); 5541185029Spjd if (zio_wait(rzio) == 0) 5542185029Spjd return (0); 5543185029Spjd 5544185029Spjd /* l2arc read error; goto zio_read() */ 5545185029Spjd } else { 5546185029Spjd DTRACE_PROBE1(l2arc__miss, 5547185029Spjd arc_buf_hdr_t *, hdr); 5548185029Spjd ARCSTAT_BUMP(arcstat_l2_misses); 5549185029Spjd if (HDR_L2_WRITING(hdr)) 5550185029Spjd ARCSTAT_BUMP(arcstat_l2_rw_clash); 5551185029Spjd spa_config_exit(spa, SCL_L2ARC, vd); 5552185029Spjd } 5553208373Smm } else { 5554208373Smm if (vd != NULL) 5555208373Smm spa_config_exit(spa, SCL_L2ARC, vd); 5556208373Smm if (l2arc_ndev != 0) { 5557208373Smm DTRACE_PROBE1(l2arc__miss, 5558208373Smm arc_buf_hdr_t *, hdr); 5559208373Smm ARCSTAT_BUMP(arcstat_l2_misses); 5560208373Smm } 5561185029Spjd } 5562185029Spjd 5563321610Smav rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, 5564307265Smav arc_read_done, hdr, priority, zio_flags, zb); 5565168404Spjd 5566275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) 5567168404Spjd return (zio_wait(rzio)); 5568168404Spjd 
5569275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5570168404Spjd zio_nowait(rzio); 5571168404Spjd } 5572168404Spjd return (0); 5573168404Spjd} 5574168404Spjd 5575168404Spjd/* 5576251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again. 5577251520Sdelphij */ 5578251520Sdelphijvoid 5579251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp) 5580251520Sdelphij{ 5581251520Sdelphij arc_buf_hdr_t *hdr; 5582251520Sdelphij kmutex_t *hash_lock; 5583251520Sdelphij uint64_t guid = spa_load_guid(spa); 5584251520Sdelphij 5585268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp)); 5586268075Sdelphij 5587268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 5588251520Sdelphij if (hdr == NULL) 5589251520Sdelphij return; 5590307265Smav 5591307265Smav /* 5592307265Smav * We might be trying to free a block that is still doing I/O 5593307265Smav * (i.e. prefetch) or has a reference (i.e. a dedup-ed, 5594307265Smav * dmu_sync-ed block). If this block is being prefetched, then it 5595307265Smav * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr 5596307265Smav * until the I/O completes. A block may also have a reference if it is 5597307265Smav * part of a dedup-ed, dmu_synced write. The dmu_sync() function would 5598307265Smav * have written the new block to its final resting place on disk but 5599307265Smav * without the dedup flag set. This would have left the hdr in the MRU 5600307265Smav * state and discoverable. When the txg finally syncs it detects that 5601307265Smav * the block was overridden in open context and issues an override I/O. 5602307265Smav * Since this is a dedup block, the override I/O will determine if the 5603307265Smav * block is already in the DDT. If so, then it will replace the io_bp 5604307265Smav * with the bp from the DDT and allow the I/O to finish. When the I/O 5605307265Smav * reaches the done callback, dbuf_write_override_done, it will 5606307265Smav * check to see if the io_bp and io_bp_override are identical. 5607307265Smav * If they are not, then it indicates that the bp was replaced with 5608307265Smav * the bp in the DDT and the override bp is freed. This allows 5609307265Smav * us to arrive here with a reference on a block that is being 5610307265Smav * freed. So if we have an I/O in progress, or a reference to 5611307265Smav * this hdr, then we don't destroy the hdr. 5612307265Smav */ 5613307265Smav if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && 5614307265Smav refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { 5615307265Smav arc_change_state(arc_anon, hdr, hash_lock); 5616307265Smav arc_hdr_destroy(hdr); 5617251520Sdelphij mutex_exit(hash_lock); 5618251520Sdelphij } else { 5619251520Sdelphij mutex_exit(hash_lock); 5620251520Sdelphij } 5621251520Sdelphij 5622251520Sdelphij} 5623251520Sdelphij 5624251520Sdelphij/* 5625251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer. This 5626251629Sdelphij * must be done after a read and prior to modifying the buffer contents. 5627168404Spjd * If the buffer has more than one reference, we must make 5628185029Spjd * a new hdr for the buffer. 5629168404Spjd */ 5630168404Spjdvoid 5631168404Spjdarc_release(arc_buf_t *buf, void *tag) 5632168404Spjd{ 5633286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 5634168404Spjd 5635219089Spjd /* 5636219089Spjd * It would be nice to assert that if it's DMU metadata (level > 5637219089Spjd * 0 || it's the dnode file), then it must be syncing context. 5638219089Spjd * But we don't know that information at this level. 
5639219089Spjd */ 5640219089Spjd 5641219089Spjd mutex_enter(&buf->b_evict_lock); 5642286776Smav 5643286776Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5644286776Smav 5645286570Smav /* 5646286570Smav * We don't grab the hash lock prior to this check, because if 5647286570Smav * the buffer's header is in the arc_anon state, it won't be 5648286570Smav * linked into the hash table. 5649286570Smav */ 5650286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 5651286570Smav mutex_exit(&buf->b_evict_lock); 5652286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5653286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 5654286570Smav ASSERT(!HDR_HAS_L2HDR(hdr)); 5655307265Smav ASSERT(HDR_EMPTY(hdr)); 5656307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5657286570Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 5658286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 5659185029Spjd 5660307265Smav hdr->b_l1hdr.b_arc_access = 0; 5661168404Spjd 5662307265Smav /* 5663307265Smav * If the buf is being overridden then it may already 5664307265Smav * have a hdr that is not empty. 5665307265Smav */ 5666307265Smav buf_discard_identity(hdr); 5667286570Smav arc_buf_thaw(buf); 5668286570Smav 5669286570Smav return; 5670168404Spjd } 5671168404Spjd 5672286570Smav kmutex_t *hash_lock = HDR_LOCK(hdr); 5673286570Smav mutex_enter(hash_lock); 5674286570Smav 5675286570Smav /* 5676286570Smav * This assignment is only valid as long as the hash_lock is 5677286570Smav * held, we must be careful not to reference state or the 5678286570Smav * b_state field after dropping the lock. 5679286570Smav */ 5680286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 5681286570Smav ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5682286570Smav ASSERT3P(state, !=, arc_anon); 5683286570Smav 5684286570Smav /* this buffer is not on any list */ 5685321535Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); 5686286570Smav 5687286570Smav if (HDR_HAS_L2HDR(hdr)) { 5688286570Smav mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5689286570Smav 5690286570Smav /* 5691286598Smav * We have to recheck this conditional again now that 5692286598Smav * we're holding the l2ad_mtx to prevent a race with 5693286598Smav * another thread which might be concurrently calling 5694286598Smav * l2arc_evict(). In that case, l2arc_evict() might have 5695286598Smav * destroyed the header's L2 portion as we were waiting 5696286598Smav * to acquire the l2ad_mtx. 5697286570Smav */ 5698286598Smav if (HDR_HAS_L2HDR(hdr)) { 5699290191Savg l2arc_trim(hdr); 5700286598Smav arc_hdr_l2hdr_destroy(hdr); 5701286598Smav } 5702286570Smav 5703286570Smav mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5704185029Spjd } 5705185029Spjd 5706168404Spjd /* 5707168404Spjd * Do we have more than one buf? 
5708168404Spjd */ 5709307265Smav if (hdr->b_l1hdr.b_bufcnt > 1) { 5710168404Spjd arc_buf_hdr_t *nhdr; 5711209962Smm uint64_t spa = hdr->b_spa; 5712307265Smav uint64_t psize = HDR_GET_PSIZE(hdr); 5713307265Smav uint64_t lsize = HDR_GET_LSIZE(hdr); 5714307265Smav enum zio_compress compress = HDR_GET_COMPRESS(hdr); 5715286570Smav arc_buf_contents_t type = arc_buf_type(hdr); 5716307265Smav VERIFY3U(hdr->b_type, ==, type); 5717168404Spjd 5718286570Smav ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 5719307265Smav (void) remove_reference(hdr, hash_lock, tag); 5720307265Smav 5721321535Smav if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { 5722307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); 5723307265Smav ASSERT(ARC_BUF_LAST(buf)); 5724307265Smav } 5725307265Smav 5726168404Spjd /* 5727219089Spjd * Pull the data off of this hdr and attach it to 5728307265Smav * a new anonymous hdr. Also find the last buffer 5729307265Smav * in the hdr's buffer list. 5730168404Spjd */ 5731321535Smav arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 5732307265Smav ASSERT3P(lastbuf, !=, NULL); 5733168404Spjd 5734307265Smav /* 5735307265Smav * If the current arc_buf_t and the hdr are sharing their data 5736321535Smav * buffer, then we must stop sharing that block. 5737307265Smav */ 5738307265Smav if (arc_buf_is_shared(buf)) { 5739307265Smav VERIFY(!arc_buf_is_shared(lastbuf)); 5740307265Smav 5741307265Smav /* 5742307265Smav * First, sever the block sharing relationship between 5743321535Smav * buf and the arc_buf_hdr_t. 5744307265Smav */ 5745307265Smav arc_unshare_buf(hdr, buf); 5746321535Smav 5747321535Smav /* 5748321610Smav * Now we need to recreate the hdr's b_pabd. Since we 5749321535Smav * have lastbuf handy, we try to share with it, but if 5750321610Smav * we can't then we allocate a new b_pabd and copy the 5751321535Smav * data from buf into it. 5752321535Smav */ 5753321535Smav if (arc_can_share(hdr, lastbuf)) { 5754321535Smav arc_share_buf(hdr, lastbuf); 5755321535Smav } else { 5756321610Smav arc_hdr_alloc_pabd(hdr); 5757321610Smav abd_copy_from_buf(hdr->b_l1hdr.b_pabd, 5758321610Smav buf->b_data, psize); 5759321535Smav } 5760307265Smav VERIFY3P(lastbuf->b_data, !=, NULL); 5761307265Smav } else if (HDR_SHARED_DATA(hdr)) { 5762321535Smav /* 5763321535Smav * Uncompressed shared buffers are always at the end 5764321535Smav * of the list. Compressed buffers don't have the 5765321535Smav * same requirements. This makes it hard to 5766321535Smav * simply assert that the lastbuf is shared so 5767321535Smav * we rely on the hdr's compression flags to determine 5768321535Smav * if we have a compressed, shared buffer. 
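 *
 * As a sketch (list layout illustrative):
 *	uncompressed hdr: b_buf -> [buf] -> ... -> [shared buf]
 *	compressed hdr:   the buf sharing b_pabd may sit anywhere
 * hence the assert below only requires that lastbuf is shared or
 * that the hdr is compressed.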
5769321535Smav */ 5770321535Smav ASSERT(arc_buf_is_shared(lastbuf) || 5771321535Smav HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 5772321535Smav ASSERT(!ARC_BUF_SHARED(buf)); 5773307265Smav } 5774321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5775286570Smav ASSERT3P(state, !=, arc_l2c_only); 5776286766Smav 5777307265Smav (void) refcount_remove_many(&state->arcs_size, 5778321535Smav arc_buf_size(buf), buf); 5779286766Smav 5780286570Smav if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 5781286570Smav ASSERT3P(state, !=, arc_l2c_only); 5782307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 5783321535Smav arc_buf_size(buf), buf); 5784168404Spjd } 5785242845Sdelphij 5786307265Smav hdr->b_l1hdr.b_bufcnt -= 1; 5787168404Spjd arc_cksum_verify(buf); 5788240133Smm#ifdef illumos 5789240133Smm arc_buf_unwatch(buf); 5790277300Ssmh#endif 5791168404Spjd 5792168404Spjd mutex_exit(hash_lock); 5793168404Spjd 5794307265Smav /* 5795321610Smav * Allocate a new hdr. The new hdr will contain a b_pabd 5796307265Smav * buffer which will be freed in arc_write(). 5797307265Smav */ 5798307265Smav nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); 5799307265Smav ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); 5800307265Smav ASSERT0(nhdr->b_l1hdr.b_bufcnt); 5801307265Smav ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); 5802307265Smav VERIFY3U(nhdr->b_type, ==, type); 5803307265Smav ASSERT(!HDR_SHARED_DATA(nhdr)); 5804286570Smav 5805286570Smav nhdr->b_l1hdr.b_buf = buf; 5806307265Smav nhdr->b_l1hdr.b_bufcnt = 1; 5807286570Smav (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 5808168404Spjd buf->b_hdr = nhdr; 5809307265Smav 5810219089Spjd mutex_exit(&buf->b_evict_lock); 5811307265Smav (void) refcount_add_many(&arc_anon->arcs_size, 5812321535Smav arc_buf_size(buf), buf); 5813168404Spjd } else { 5814219089Spjd mutex_exit(&buf->b_evict_lock); 5815286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 5816286763Smav /* protected by hash lock, or hdr is on arc_anon */ 5817286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 5818168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5819286570Smav arc_change_state(arc_anon, hdr, hash_lock); 5820286570Smav hdr->b_l1hdr.b_arc_access = 0; 5821286570Smav mutex_exit(hash_lock); 5822185029Spjd 5823219089Spjd buf_discard_identity(hdr); 5824168404Spjd arc_buf_thaw(buf); 5825168404Spjd } 5826168404Spjd} 5827168404Spjd 5828168404Spjdint 5829168404Spjdarc_released(arc_buf_t *buf) 5830168404Spjd{ 5831185029Spjd int released; 5832185029Spjd 5833219089Spjd mutex_enter(&buf->b_evict_lock); 5834286570Smav released = (buf->b_data != NULL && 5835286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 5836219089Spjd mutex_exit(&buf->b_evict_lock); 5837185029Spjd return (released); 5838168404Spjd} 5839168404Spjd 5840168404Spjd#ifdef ZFS_DEBUG 5841168404Spjdint 5842168404Spjdarc_referenced(arc_buf_t *buf) 5843168404Spjd{ 5844185029Spjd int referenced; 5845185029Spjd 5846219089Spjd mutex_enter(&buf->b_evict_lock); 5847286570Smav referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 5848219089Spjd mutex_exit(&buf->b_evict_lock); 5849185029Spjd return (referenced); 5850168404Spjd} 5851168404Spjd#endif 5852168404Spjd 5853168404Spjdstatic void 5854168404Spjdarc_write_ready(zio_t *zio) 5855168404Spjd{ 5856168404Spjd arc_write_callback_t *callback = zio->io_private; 5857168404Spjd arc_buf_t *buf = callback->awcb_buf; 5858185029Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5859307265Smav uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 
0 : BP_GET_PSIZE(zio->io_bp); 5860168404Spjd 5861286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5862286570Smav ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 5863307265Smav ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 5864185029Spjd 5865185029Spjd /* 5866307265Smav * If we're reexecuting this zio because the pool suspended, then 5867307265Smav * cleanup any state that was previously set the first time the 5868321535Smav * callback was invoked. 5869185029Spjd */ 5870307265Smav if (zio->io_flags & ZIO_FLAG_REEXECUTED) { 5871307265Smav arc_cksum_free(hdr); 5872307265Smav#ifdef illumos 5873307265Smav arc_buf_unwatch(buf); 5874307265Smav#endif 5875321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 5876307265Smav if (arc_buf_is_shared(buf)) { 5877307265Smav arc_unshare_buf(hdr, buf); 5878307265Smav } else { 5879321610Smav arc_hdr_free_pabd(hdr); 5880307265Smav } 5881185029Spjd } 5882168404Spjd } 5883321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 5884307265Smav ASSERT(!HDR_SHARED_DATA(hdr)); 5885307265Smav ASSERT(!arc_buf_is_shared(buf)); 5886307265Smav 5887307265Smav callback->awcb_ready(zio, buf, callback->awcb_private); 5888307265Smav 5889307265Smav if (HDR_IO_IN_PROGRESS(hdr)) 5890307265Smav ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); 5891307265Smav 5892307265Smav arc_cksum_compute(buf); 5893307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5894307265Smav 5895307265Smav enum zio_compress compress; 5896307265Smav if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5897307265Smav compress = ZIO_COMPRESS_OFF; 5898307265Smav } else { 5899307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); 5900307265Smav compress = BP_GET_COMPRESS(zio->io_bp); 5901307265Smav } 5902307265Smav HDR_SET_PSIZE(hdr, psize); 5903307265Smav arc_hdr_set_compress(hdr, compress); 5904307265Smav 5905321610Smav 5906307265Smav /* 5907321610Smav * Fill the hdr with data. If the hdr is compressed, the data we want 5908321610Smav * is available from the zio, otherwise we can take it from the buf. 5909321610Smav * 5910321610Smav * We might be able to share the buf's data with the hdr here. However, 5911321610Smav * doing so would cause the ARC to be full of linear ABDs if we write a 5912321610Smav * lot of shareable data. As a compromise, we check whether scattered 5913321610Smav * ABDs are allowed, and assume that if they are then the user wants 5914321610Smav * the ARC to be primarily filled with them regardless of the data being 5915321610Smav * written. Therefore, if they're allowed then we allocate one and copy 5916321610Smav * the data into it; otherwise, we share the data directly if we can. 5917307265Smav */ 5918321610Smav if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { 5919321610Smav arc_hdr_alloc_pabd(hdr); 5920321610Smav 5921321610Smav /* 5922321610Smav * Ideally, we would always copy the io_abd into b_pabd, but the 5923321610Smav * user may have disabled compressed ARC, thus we must check the 5924321610Smav * hdr's compression setting rather than the io_bp's. 
5925321610Smav */ 5926321610Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 5927321610Smav ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, 5928321610Smav ZIO_COMPRESS_OFF); 5929321610Smav ASSERT3U(psize, >, 0); 5930321610Smav 5931321610Smav abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); 5932321610Smav } else { 5933321610Smav ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); 5934321610Smav 5935321610Smav abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, 5936321610Smav arc_buf_size(buf)); 5937321610Smav } 5938307265Smav } else { 5939321610Smav ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); 5940321535Smav ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); 5941307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5942307265Smav 5943307265Smav arc_share_buf(hdr, buf); 5944307265Smav } 5945321610Smav 5946307265Smav arc_hdr_verify(hdr, zio->io_bp); 5947168404Spjd} 5948168404Spjd 5949304138Savgstatic void 5950304138Savgarc_write_children_ready(zio_t *zio) 5951304138Savg{ 5952304138Savg arc_write_callback_t *callback = zio->io_private; 5953304138Savg arc_buf_t *buf = callback->awcb_buf; 5954304138Savg 5955304138Savg callback->awcb_children_ready(zio, buf, callback->awcb_private); 5956304138Savg} 5957304138Savg 5958258632Savg/* 5959258632Savg * The SPA calls this callback for each physical write that happens on behalf 5960258632Savg * of a logical write. See the comment in dbuf_write_physdone() for details. 5961258632Savg */ 5962168404Spjdstatic void 5963258632Savgarc_write_physdone(zio_t *zio) 5964258632Savg{ 5965258632Savg arc_write_callback_t *cb = zio->io_private; 5966258632Savg if (cb->awcb_physdone != NULL) 5967258632Savg cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 5968258632Savg} 5969258632Savg 5970258632Savgstatic void 5971168404Spjdarc_write_done(zio_t *zio) 5972168404Spjd{ 5973168404Spjd arc_write_callback_t *callback = zio->io_private; 5974168404Spjd arc_buf_t *buf = callback->awcb_buf; 5975168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5976168404Spjd 5977307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 5978168404Spjd 5979219089Spjd if (zio->io_error == 0) { 5980307265Smav arc_hdr_verify(hdr, zio->io_bp); 5981307265Smav 5982268075Sdelphij if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5983260150Sdelphij buf_discard_identity(hdr); 5984260150Sdelphij } else { 5985260150Sdelphij hdr->b_dva = *BP_IDENTITY(zio->io_bp); 5986260150Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 5987260150Sdelphij } 5988219089Spjd } else { 5989307265Smav ASSERT(HDR_EMPTY(hdr)); 5990219089Spjd } 5991219089Spjd 5992168404Spjd /* 5993268075Sdelphij * If the block to be written was all-zero or compressed enough to be 5994268075Sdelphij * embedded in the BP, no write was performed so there will be no 5995268075Sdelphij * dva/birth/checksum. The buffer must therefore remain anonymous 5996268075Sdelphij * (and uncached). 5997168404Spjd */ 5998307265Smav if (!HDR_EMPTY(hdr)) { 5999168404Spjd arc_buf_hdr_t *exists; 6000168404Spjd kmutex_t *hash_lock; 6001168404Spjd 6002321535Smav ASSERT3U(zio->io_error, ==, 0); 6003219089Spjd 6004168404Spjd arc_cksum_verify(buf); 6005168404Spjd 6006168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 6007286570Smav if (exists != NULL) { 6008168404Spjd /* 6009168404Spjd * This can only happen if we overwrite for 6010168404Spjd * sync-to-convergence, because we remove 6011168404Spjd * buffers from the hash table when we arc_free(). 
6012168404Spjd */ 6013219089Spjd if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 6014219089Spjd if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6015219089Spjd panic("bad overwrite, hdr=%p exists=%p", 6016219089Spjd (void *)hdr, (void *)exists); 6017286570Smav ASSERT(refcount_is_zero( 6018286570Smav &exists->b_l1hdr.b_refcnt)); 6019219089Spjd arc_change_state(arc_anon, exists, hash_lock); 6020219089Spjd mutex_exit(hash_lock); 6021219089Spjd arc_hdr_destroy(exists); 6022219089Spjd exists = buf_hash_insert(hdr, &hash_lock); 6023219089Spjd ASSERT3P(exists, ==, NULL); 6024243524Smm } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 6025243524Smm /* nopwrite */ 6026243524Smm ASSERT(zio->io_prop.zp_nopwrite); 6027243524Smm if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6028243524Smm panic("bad nopwrite, hdr=%p exists=%p", 6029243524Smm (void *)hdr, (void *)exists); 6030219089Spjd } else { 6031219089Spjd /* Dedup */ 6032307265Smav ASSERT(hdr->b_l1hdr.b_bufcnt == 1); 6033286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 6034219089Spjd ASSERT(BP_GET_DEDUP(zio->io_bp)); 6035219089Spjd ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 6036219089Spjd } 6037168404Spjd } 6038307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6039185029Spjd /* if it's not anon, we are doing a scrub */ 6040286570Smav if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 6041185029Spjd arc_access(hdr, hash_lock); 6042168404Spjd mutex_exit(hash_lock); 6043168404Spjd } else { 6044307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6045168404Spjd } 6046168404Spjd 6047286570Smav ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 6048219089Spjd callback->awcb_done(zio, buf, callback->awcb_private); 6049168404Spjd 6050321610Smav abd_put(zio->io_abd); 6051168404Spjd kmem_free(callback, sizeof (arc_write_callback_t)); 6052168404Spjd} 6053168404Spjd 6054168404Spjdzio_t * 6055307265Smavarc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 6056307265Smav boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, 6057304138Savg arc_done_func_t *children_ready, arc_done_func_t *physdone, 6058258632Savg arc_done_func_t *done, void *private, zio_priority_t priority, 6059268123Sdelphij int zio_flags, const zbookmark_phys_t *zb) 6060168404Spjd{ 6061168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 6062168404Spjd arc_write_callback_t *callback; 6063185029Spjd zio_t *zio; 6064321573Smav zio_prop_t localprop = *zp; 6065168404Spjd 6066307265Smav ASSERT3P(ready, !=, NULL); 6067307265Smav ASSERT3P(done, !=, NULL); 6068168404Spjd ASSERT(!HDR_IO_ERROR(hdr)); 6069286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6070307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6071307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 6072185029Spjd if (l2arc) 6073307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6074321535Smav if (ARC_BUF_COMPRESSED(buf)) { 6075321573Smav /* 6076321573Smav * We're writing a pre-compressed buffer. Make the 6077321573Smav * compression algorithm requested by the zio_prop_t match 6078321573Smav * the pre-compressed buffer's compression algorithm. 
6079321573Smav */ 6080321573Smav localprop.zp_compress = HDR_GET_COMPRESS(hdr); 6081321573Smav 6082321535Smav ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); 6083321535Smav zio_flags |= ZIO_FLAG_RAW; 6084321535Smav } 6085168404Spjd callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 6086168404Spjd callback->awcb_ready = ready; 6087304138Savg callback->awcb_children_ready = children_ready; 6088258632Savg callback->awcb_physdone = physdone; 6089168404Spjd callback->awcb_done = done; 6090168404Spjd callback->awcb_private = private; 6091168404Spjd callback->awcb_buf = buf; 6092168404Spjd 6093307265Smav /* 6094321610Smav * The hdr's b_pabd is now stale; free it now. A new data block 6095307265Smav * will be allocated when the zio pipeline calls arc_write_ready(). 6096307265Smav */ 6097321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 6098307265Smav /* 6099307265Smav * If the buf is currently sharing the data block with 6100307265Smav * the hdr then we need to break that relationship here. 6101307265Smav * The hdr will remain with a NULL data pointer and the 6102307265Smav * buf will take sole ownership of the block. 6103307265Smav */ 6104307265Smav if (arc_buf_is_shared(buf)) { 6105307265Smav arc_unshare_buf(hdr, buf); 6106307265Smav } else { 6107321610Smav arc_hdr_free_pabd(hdr); 6108307265Smav } 6109307265Smav VERIFY3P(buf->b_data, !=, NULL); 6110307265Smav arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); 6111307265Smav } 6112307265Smav ASSERT(!arc_buf_is_shared(buf)); 6113321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6114307265Smav 6115321610Smav zio = zio_write(pio, spa, txg, bp, 6116321610Smav abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), 6117321573Smav HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, 6118304138Savg (children_ready != NULL) ? arc_write_children_ready : NULL, 6119304138Savg arc_write_physdone, arc_write_done, callback, 6120258632Savg priority, zio_flags, zb); 6121185029Spjd 6122168404Spjd return (zio); 6123168404Spjd} 6124168404Spjd 6125185029Spjdstatic int 6126258632Savgarc_memory_throttle(uint64_t reserve, uint64_t txg) 6127185029Spjd{ 6128185029Spjd#ifdef _KERNEL 6129272483Ssmh uint64_t available_memory = ptob(freemem); 6130185029Spjd static uint64_t page_load = 0; 6131185029Spjd static uint64_t last_txg = 0; 6132185029Spjd 6133272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 6134185029Spjd available_memory = 6135272483Ssmh MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 6136185029Spjd#endif 6137258632Savg 6138272483Ssmh if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 6139185029Spjd return (0); 6140185029Spjd 6141185029Spjd if (txg > last_txg) { 6142185029Spjd last_txg = txg; 6143185029Spjd page_load = 0; 6144185029Spjd } 6145185029Spjd /* 6146185029Spjd * If we are in pageout, we know that memory is already tight 6147185029Spjd * and the arc is already going to be evicting, so we just want to 6148185029Spjd * continue to let page writes occur as quickly as possible. 
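 *
 * As a worked example (illustrative numbers): with ptob(minfree)
 * at 64MB and available_memory at 1GB, pageout may accumulate up
 * to MAX(64MB, 1GB) / 4 = 256MB of page_load, charged below at
 * reserve / 8 per call, within one txg before ERESTART is
 * returned.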
6149185029Spjd */ 6150185029Spjd if (curproc == pageproc) { 6151272483Ssmh if (page_load > MAX(ptob(minfree), available_memory) / 4) 6152249195Smm return (SET_ERROR(ERESTART)); 6153185029Spjd /* Note: reserve is inflated, so we deflate */ 6154185029Spjd page_load += reserve / 8; 6155185029Spjd return (0); 6156185029Spjd } else if (page_load > 0 && arc_reclaim_needed()) { 6157185029Spjd /* memory is low, delay before restarting */ 6158185029Spjd ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 6159249195Smm return (SET_ERROR(EAGAIN)); 6160185029Spjd } 6161185029Spjd page_load = 0; 6162185029Spjd#endif 6163185029Spjd return (0); 6164185029Spjd} 6165185029Spjd 6166168404Spjdvoid 6167185029Spjdarc_tempreserve_clear(uint64_t reserve) 6168168404Spjd{ 6169185029Spjd atomic_add_64(&arc_tempreserve, -reserve); 6170168404Spjd ASSERT((int64_t)arc_tempreserve >= 0); 6171168404Spjd} 6172168404Spjd 6173168404Spjdint 6174185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg) 6175168404Spjd{ 6176185029Spjd int error; 6177209962Smm uint64_t anon_size; 6178185029Spjd 6179272483Ssmh if (reserve > arc_c/4 && !arc_no_grow) { 6180185029Spjd arc_c = MIN(arc_c_max, reserve * 4); 6181272483Ssmh DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 6182272483Ssmh } 6183185029Spjd if (reserve > arc_c) 6184249195Smm return (SET_ERROR(ENOMEM)); 6185168404Spjd 6186168404Spjd /* 6187209962Smm * Don't count loaned bufs as in flight dirty data to prevent long 6188209962Smm * network delays from blocking transactions that are ready to be 6189209962Smm * assigned to a txg. 6190209962Smm */ 6191321535Smav 6192321535Smav /* assert that it has not wrapped around */ 6193321535Smav ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 6194321535Smav 6195286766Smav anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 6196286766Smav arc_loaned_bytes), 0); 6197209962Smm 6198209962Smm /* 6199185029Spjd * Writes will, almost always, require additional memory allocations 6200251631Sdelphij * in order to compress/encrypt/etc the data. We therefore need to 6201185029Spjd * make sure that there is sufficient available memory for this. 6202185029Spjd */ 6203258632Savg error = arc_memory_throttle(reserve, txg); 6204258632Savg if (error != 0) 6205185029Spjd return (error); 6206185029Spjd 6207185029Spjd /* 6208168404Spjd * Throttle writes when the amount of dirty data in the cache 6209168404Spjd * gets too large. We try to keep the cache less than half full 6210168404Spjd * of dirty blocks so that our sync times don't grow too large. 6211168404Spjd * Note: if two requests come in concurrently, we might let them 6212168404Spjd * both succeed, when one of them should fail. Not a huge deal. 
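 *
 * For example (illustrative numbers): with arc_c at 4GB, the
 * check below starts failing reservations with ERESTART once
 * reserve + arc_tempreserve + anon_size exceeds 2GB (arc_c / 2)
 * while anon_size alone exceeds 1GB (arc_c / 4).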
6213168404Spjd */ 6214209962Smm 6215209962Smm if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 6216209962Smm anon_size > arc_c / 4) { 6217307265Smav uint64_t meta_esize = 6218307265Smav refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6219307265Smav uint64_t data_esize = 6220307265Smav refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6221185029Spjd dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 6222185029Spjd "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 6223307265Smav arc_tempreserve >> 10, meta_esize >> 10, 6224307265Smav data_esize >> 10, reserve >> 10, arc_c >> 10); 6225249195Smm return (SET_ERROR(ERESTART)); 6226168404Spjd } 6227185029Spjd atomic_add_64(&arc_tempreserve, reserve); 6228168404Spjd return (0); 6229168404Spjd} 6230168404Spjd 6231286626Smavstatic void 6232286626Smavarc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 6233286626Smav kstat_named_t *evict_data, kstat_named_t *evict_metadata) 6234286626Smav{ 6235286766Smav size->value.ui64 = refcount_count(&state->arcs_size); 6236307265Smav evict_data->value.ui64 = 6237307265Smav refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); 6238307265Smav evict_metadata->value.ui64 = 6239307265Smav refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); 6240286626Smav} 6241286626Smav 6242286626Smavstatic int 6243286626Smavarc_kstat_update(kstat_t *ksp, int rw) 6244286626Smav{ 6245286626Smav arc_stats_t *as = ksp->ks_data; 6246286626Smav 6247286626Smav if (rw == KSTAT_WRITE) { 6248286626Smav return (EACCES); 6249286626Smav } else { 6250286626Smav arc_kstat_update_state(arc_anon, 6251286626Smav &as->arcstat_anon_size, 6252286626Smav &as->arcstat_anon_evictable_data, 6253286626Smav &as->arcstat_anon_evictable_metadata); 6254286626Smav arc_kstat_update_state(arc_mru, 6255286626Smav &as->arcstat_mru_size, 6256286626Smav &as->arcstat_mru_evictable_data, 6257286626Smav &as->arcstat_mru_evictable_metadata); 6258286626Smav arc_kstat_update_state(arc_mru_ghost, 6259286626Smav &as->arcstat_mru_ghost_size, 6260286626Smav &as->arcstat_mru_ghost_evictable_data, 6261286626Smav &as->arcstat_mru_ghost_evictable_metadata); 6262286626Smav arc_kstat_update_state(arc_mfu, 6263286626Smav &as->arcstat_mfu_size, 6264286626Smav &as->arcstat_mfu_evictable_data, 6265286626Smav &as->arcstat_mfu_evictable_metadata); 6266286626Smav arc_kstat_update_state(arc_mfu_ghost, 6267286626Smav &as->arcstat_mfu_ghost_size, 6268286626Smav &as->arcstat_mfu_ghost_evictable_data, 6269286626Smav &as->arcstat_mfu_ghost_evictable_metadata); 6270286626Smav } 6271286626Smav 6272286626Smav return (0); 6273286626Smav} 6274286626Smav 6275286763Smav/* 6276286763Smav * This function *must* return indices evenly distributed between all 6277286763Smav * sublists of the multilist. This is needed due to how the ARC eviction 6278286763Smav * code is laid out; arc_evict_state() assumes ARC buffers are evenly 6279286763Smav * distributed between all sublists and uses this assumption when 6280286763Smav * deciding which sublist to evict from and how much to evict from it. 6281286763Smav */ 6282286763Smavunsigned int 6283286763Smavarc_state_multilist_index_func(multilist_t *ml, void *obj) 6284286763Smav{ 6285286763Smav arc_buf_hdr_t *hdr = obj; 6286286763Smav 6287286763Smav /* 6288286763Smav * We rely on b_dva to generate evenly distributed index 6289286763Smav * numbers using buf_hash below. So, as an added precaution, 6290286763Smav * let's make sure we never add empty buffers to the arc lists. 
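 *
 * The mapping computed below is simply (sublist count illustrative):
 *	index = buf_hash(spa, dva, birth) % num_sublists
 * so with, e.g., 8 sublists a uniform hash spreads roughly 1/8th
 * of the headers onto each sublist, which is what
 * arc_evict_state() depends on.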
6291286763Smav */ 6292307265Smav ASSERT(!HDR_EMPTY(hdr)); 6293286763Smav 6294286763Smav /* 6295286763Smav * The assumption here, is the hash value for a given 6296286763Smav * arc_buf_hdr_t will remain constant throughout it's lifetime 6297286763Smav * (i.e. it's b_spa, b_dva, and b_birth fields don't change). 6298286763Smav * Thus, we don't need to store the header's sublist index 6299286763Smav * on insertion, as this index can be recalculated on removal. 6300286763Smav * 6301286763Smav * Also, the low order bits of the hash value are thought to be 6302286763Smav * distributed evenly. Otherwise, in the case that the multilist 6303286763Smav * has a power of two number of sublists, each sublists' usage 6304286763Smav * would not be evenly distributed. 6305286763Smav */ 6306286763Smav return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 6307286763Smav multilist_get_num_sublists(ml)); 6308286763Smav} 6309286763Smav 6310168404Spjd#ifdef _KERNEL 6311168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL; 6312168404Spjd 6313168404Spjdstatic void 6314168566Spjdarc_lowmem(void *arg __unused, int howto __unused) 6315168404Spjd{ 6316168404Spjd 6317286763Smav mutex_enter(&arc_reclaim_lock); 6318326619Sbapt DTRACE_PROBE1(arc__needfree, int64_t, ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE); 6319286763Smav cv_signal(&arc_reclaim_thread_cv); 6320241773Savg 6321241773Savg /* 6322241773Savg * It is unsafe to block here in arbitrary threads, because we can come 6323241773Savg * here from ARC itself and may hold ARC locks and thus risk a deadlock 6324241773Savg * with ARC reclaim thread. 6325241773Savg */ 6326286623Smav if (curproc == pageproc) 6327286763Smav (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 6328286763Smav mutex_exit(&arc_reclaim_lock); 6329168404Spjd} 6330168404Spjd#endif 6331168404Spjd 6332307265Smavstatic void 6333307265Smavarc_state_init(void) 6334307265Smav{ 6335307265Smav arc_anon = &ARC_anon; 6336307265Smav arc_mru = &ARC_mru; 6337307265Smav arc_mru_ghost = &ARC_mru_ghost; 6338307265Smav arc_mfu = &ARC_mfu; 6339307265Smav arc_mfu_ghost = &ARC_mfu_ghost; 6340307265Smav arc_l2c_only = &ARC_l2c_only; 6341307265Smav 6342321553Smav arc_mru->arcs_list[ARC_BUFC_METADATA] = 6343321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6344307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6345321552Smav arc_state_multilist_index_func); 6346321553Smav arc_mru->arcs_list[ARC_BUFC_DATA] = 6347321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6348307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6349321552Smav arc_state_multilist_index_func); 6350321553Smav arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = 6351321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6352307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6353321552Smav arc_state_multilist_index_func); 6354321553Smav arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = 6355321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6356307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6357321552Smav arc_state_multilist_index_func); 6358321553Smav arc_mfu->arcs_list[ARC_BUFC_METADATA] = 6359321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6360307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6361321552Smav arc_state_multilist_index_func); 6362321553Smav arc_mfu->arcs_list[ARC_BUFC_DATA] = 6363321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6364307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6365321552Smav arc_state_multilist_index_func); 6366321553Smav 
arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = 6367321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6368307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6369321552Smav arc_state_multilist_index_func); 6370321553Smav arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = 6371321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6372307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6373321552Smav arc_state_multilist_index_func); 6374321553Smav arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = 6375321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6376307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6377321552Smav arc_state_multilist_index_func); 6378321553Smav arc_l2c_only->arcs_list[ARC_BUFC_DATA] = 6379321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6380307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6381321552Smav arc_state_multilist_index_func); 6382307265Smav 6383307265Smav refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6384307265Smav refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6385307265Smav refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6386307265Smav refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6387307265Smav refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6388307265Smav refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6389307265Smav refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6390307265Smav refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6391307265Smav refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6392307265Smav refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6393307265Smav refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6394307265Smav refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6395307265Smav 6396307265Smav refcount_create(&arc_anon->arcs_size); 6397307265Smav refcount_create(&arc_mru->arcs_size); 6398307265Smav refcount_create(&arc_mru_ghost->arcs_size); 6399307265Smav refcount_create(&arc_mfu->arcs_size); 6400307265Smav refcount_create(&arc_mfu_ghost->arcs_size); 6401307265Smav refcount_create(&arc_l2c_only->arcs_size); 6402307265Smav} 6403307265Smav 6404307265Smavstatic void 6405307265Smavarc_state_fini(void) 6406307265Smav{ 6407307265Smav refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6408307265Smav refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6409307265Smav refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6410307265Smav refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6411307265Smav refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6412307265Smav refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6413307265Smav refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6414307265Smav refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6415307265Smav refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6416307265Smav refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6417307265Smav refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6418307265Smav refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6419307265Smav 6420307265Smav refcount_destroy(&arc_anon->arcs_size); 6421307265Smav refcount_destroy(&arc_mru->arcs_size); 6422307265Smav refcount_destroy(&arc_mru_ghost->arcs_size); 6423307265Smav refcount_destroy(&arc_mfu->arcs_size); 6424307265Smav refcount_destroy(&arc_mfu_ghost->arcs_size); 6425307265Smav refcount_destroy(&arc_l2c_only->arcs_size); 6426307265Smav 6427321553Smav 
multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); 6428321553Smav multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 6429321553Smav multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); 6430321553Smav multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 6431321553Smav multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); 6432321553Smav multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 6433321553Smav multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); 6434321553Smav multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 6435307265Smav} 6436307265Smav 6437307265Smavuint64_t 6438307265Smavarc_max_bytes(void) 6439307265Smav{ 6440307265Smav return (arc_c_max); 6441307265Smav} 6442307265Smav 6443168404Spjdvoid 6444168404Spjdarc_init(void) 6445168404Spjd{ 6446219089Spjd int i, prefetch_tunable_set = 0; 6447205231Skmacy 6448321562Smav /* 6449321562Smav * allmem is "all memory that we could possibly use". 6450321562Smav */ 6451321562Smav#ifdef illumos 6452321562Smav#ifdef _KERNEL 6453321562Smav uint64_t allmem = ptob(physmem - swapfs_minfree); 6454321562Smav#else 6455321562Smav uint64_t allmem = (physmem * PAGESIZE) / 2; 6456321562Smav#endif 6457321562Smav#else 6458321562Smav uint64_t allmem = kmem_size(); 6459321562Smav#endif 6460321562Smav 6461321562Smav 6462286763Smav mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 6463286763Smav cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); 6464286763Smav cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); 6465168404Spjd 6466301997Skib mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); 6467301997Skib cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); 6468301997Skib 6469168404Spjd /* Convert seconds to clock ticks */ 6470168404Spjd arc_min_prefetch_lifespan = 1 * hz; 6471168404Spjd 6472302265Ssmh /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ 6473321562Smav arc_c_min = MAX(allmem / 32, arc_abs_min); 6474321562Smav /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ 6475321562Smav if (allmem >= 1 << 30) 6476321562Smav arc_c_max = allmem - (1 << 30); 6477168404Spjd else 6478168404Spjd arc_c_max = arc_c_min; 6479321562Smav arc_c_max = MAX(allmem * 5 / 8, arc_c_max); 6480219089Spjd 6481289305Smav /* 6482289305Smav * In userland, there's only the memory pressure that we artificially 6483289305Smav * create (see arc_available_memory()). Don't let arc_c get too 6484289305Smav * small, because it can cause transactions to be larger than 6485289305Smav * arc_c, causing arc_tempreserve_space() to fail. 6486289305Smav */ 6487289305Smav#ifndef _KERNEL 6488289305Smav arc_c_min = arc_c_max / 2; 6489289305Smav#endif 6490289305Smav 6491168481Spjd#ifdef _KERNEL 6492168404Spjd /* 6493168404Spjd * Allow the tunables to override our calculations if they are 6494302265Ssmh * reasonable. 
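 *
 * E.g. (an illustrative FreeBSD setting): with 32GB of RAM,
 * putting
 *	vfs.zfs.arc_max="8G"
 * in /boot/loader.conf makes arc_c_max 8GB below, and arc_c_min
 * is clamped down to it if the computed minimum was larger.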
6495168404Spjd */ 6496321562Smav if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { 6497168404Spjd arc_c_max = zfs_arc_max; 6498307297Smav arc_c_min = MIN(arc_c_min, arc_c_max); 6499307297Smav } 6500302265Ssmh if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) 6501168404Spjd arc_c_min = zfs_arc_min; 6502168481Spjd#endif 6503219089Spjd 6504168404Spjd arc_c = arc_c_max; 6505168404Spjd arc_p = (arc_c >> 1); 6506307265Smav arc_size = 0; 6507168404Spjd 6508185029Spjd /* limit meta-data to 1/4 of the arc capacity */ 6509185029Spjd arc_meta_limit = arc_c_max / 4; 6510185029Spjd 6511321563Smav#ifdef _KERNEL 6512321563Smav /* 6513321563Smav * Metadata is stored in the kernel's heap. Don't let us 6514321563Smav * use more than half the heap for the ARC. 6515321563Smav */ 6516321563Smav arc_meta_limit = MIN(arc_meta_limit, 6517321563Smav vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); 6518321563Smav#endif 6519321563Smav 6520185029Spjd /* Allow the tunable to override if it is reasonable */ 6521185029Spjd if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 6522185029Spjd arc_meta_limit = zfs_arc_meta_limit; 6523185029Spjd 6524185029Spjd if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 6525185029Spjd arc_c_min = arc_meta_limit / 2; 6526185029Spjd 6527275780Sdelphij if (zfs_arc_meta_min > 0) { 6528275780Sdelphij arc_meta_min = zfs_arc_meta_min; 6529275780Sdelphij } else { 6530275780Sdelphij arc_meta_min = arc_c_min / 2; 6531275780Sdelphij } 6532275780Sdelphij 6533208373Smm if (zfs_arc_grow_retry > 0) 6534208373Smm arc_grow_retry = zfs_arc_grow_retry; 6535208373Smm 6536208373Smm if (zfs_arc_shrink_shift > 0) 6537208373Smm arc_shrink_shift = zfs_arc_shrink_shift; 6538208373Smm 6539323667Sbapt if (zfs_arc_no_grow_shift > 0) 6540323667Sbapt arc_no_grow_shift = zfs_arc_no_grow_shift; 6541286625Smav /* 6542286625Smav * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
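 *
 * E.g. (illustrative): with arc_shrink_shift at 7, a configured
 * arc_no_grow_shift of 7 or more is pulled back to 6 by the check
 * below, keeping c >> arc_no_grow_shift strictly larger than
 * c >> arc_shrink_shift.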
6543286625Smav */ 6544286625Smav if (arc_no_grow_shift >= arc_shrink_shift) 6545286625Smav arc_no_grow_shift = arc_shrink_shift - 1; 6546286625Smav 6547208373Smm if (zfs_arc_p_min_shift > 0) 6548208373Smm arc_p_min_shift = zfs_arc_p_min_shift; 6549208373Smm 6550168404Spjd /* if kmem_flags are set, lets try to use less memory */ 6551168404Spjd if (kmem_debugging()) 6552168404Spjd arc_c = arc_c / 2; 6553168404Spjd if (arc_c < arc_c_min) 6554168404Spjd arc_c = arc_c_min; 6555168404Spjd 6556168473Spjd zfs_arc_min = arc_c_min; 6557168473Spjd zfs_arc_max = arc_c_max; 6558168473Spjd 6559307265Smav arc_state_init(); 6560168404Spjd buf_init(); 6561168404Spjd 6562307265Smav arc_reclaim_thread_exit = B_FALSE; 6563301997Skib arc_dnlc_evicts_thread_exit = FALSE; 6564168404Spjd 6565168404Spjd arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 6566168404Spjd sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 6567168404Spjd 6568168404Spjd if (arc_ksp != NULL) { 6569168404Spjd arc_ksp->ks_data = &arc_stats; 6570286574Smav arc_ksp->ks_update = arc_kstat_update; 6571168404Spjd kstat_install(arc_ksp); 6572168404Spjd } 6573168404Spjd 6574168404Spjd (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 6575168404Spjd TS_RUN, minclsyspri); 6576168404Spjd 6577168404Spjd#ifdef _KERNEL 6578168566Spjd arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 6579168404Spjd EVENTHANDLER_PRI_FIRST); 6580168404Spjd#endif 6581168404Spjd 6582301997Skib (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, 6583301997Skib TS_RUN, minclsyspri); 6584301997Skib 6585307265Smav arc_dead = B_FALSE; 6586185029Spjd arc_warm = B_FALSE; 6587168566Spjd 6588258632Savg /* 6589258632Savg * Calculate maximum amount of dirty data per pool. 6590258632Savg * 6591258632Savg * If it has been set by /etc/system, take that. 6592258632Savg * Otherwise, use a percentage of physical memory defined by 6593258632Savg * zfs_dirty_data_max_percent (default 10%) with a cap at 6594258632Savg * zfs_dirty_data_max_max (default 4GB). 6595258632Savg */ 6596258632Savg if (zfs_dirty_data_max == 0) { 6597258632Savg zfs_dirty_data_max = ptob(physmem) * 6598258632Savg zfs_dirty_data_max_percent / 100; 6599258632Savg zfs_dirty_data_max = MIN(zfs_dirty_data_max, 6600258632Savg zfs_dirty_data_max_max); 6601258632Savg } 6602185029Spjd 6603168566Spjd#ifdef _KERNEL 6604194043Skmacy if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 6605193953Skmacy prefetch_tunable_set = 1; 6606206796Spjd 6607193878Skmacy#ifdef __i386__ 6608193953Skmacy if (prefetch_tunable_set == 0) { 6609196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 6610196863Strasz "-- to enable,\n"); 6611196863Strasz printf(" add \"vfs.zfs.prefetch_disable=0\" " 6612196863Strasz "to /boot/loader.conf.\n"); 6613219089Spjd zfs_prefetch_disable = 1; 6614193878Skmacy } 6615206796Spjd#else 6616193878Skmacy if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 6617193953Skmacy prefetch_tunable_set == 0) { 6618196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default if less " 6619196941Strasz "than 4GB of RAM is present;\n" 6620196863Strasz " to enable, add \"vfs.zfs.prefetch_disable=0\" " 6621196863Strasz "to /boot/loader.conf.\n"); 6622219089Spjd zfs_prefetch_disable = 1; 6623193878Skmacy } 6624206796Spjd#endif 6625175633Spjd /* Warn about ZFS memory and address space requirements. 
	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
		    "expect unstable behavior.\n");
	}
	if (allmem < 512 * (1 << 20)) {
		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
		    "expect unstable behavior.\n");
		printf("             Consider tuning vm.kmem_size and "
		    "vm.kmem_size_max\n");
		printf("             in /boot/loader.conf.\n");
	}
#endif
}

void
arc_fini(void)
{
#ifdef _KERNEL
	if (arc_event_lowmem != NULL)
		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif

	mutex_enter(&arc_reclaim_lock);
	arc_reclaim_thread_exit = B_TRUE;
	/*
	 * The reclaim thread will set arc_reclaim_thread_exit back to
	 * B_FALSE when it is finished exiting; we're waiting for that.
	 */
	while (arc_reclaim_thread_exit) {
		cv_signal(&arc_reclaim_thread_cv);
		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
	}
	mutex_exit(&arc_reclaim_lock);

	/* Use B_TRUE to ensure *all* buffers are evicted */
	arc_flush(NULL, B_TRUE);

	mutex_enter(&arc_dnlc_evicts_lock);
	arc_dnlc_evicts_thread_exit = TRUE;
	/*
	 * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
	 * back to FALSE when it is finished exiting; we're waiting for that.
	 */
	while (arc_dnlc_evicts_thread_exit) {
		cv_signal(&arc_dnlc_evicts_cv);
		cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
	}
	mutex_exit(&arc_dnlc_evicts_lock);

	arc_dead = B_TRUE;

	if (arc_ksp != NULL) {
		kstat_delete(arc_ksp);
		arc_ksp = NULL;
	}

	mutex_destroy(&arc_reclaim_lock);
	cv_destroy(&arc_reclaim_thread_cv);
	cv_destroy(&arc_reclaim_waiters_cv);

	mutex_destroy(&arc_dnlc_evicts_lock);
	cv_destroy(&arc_dnlc_evicts_cv);

	arc_state_fini();
	buf_fini();

	ASSERT0(arc_loaned_bytes);
}
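
/*
 * Illustrative sketch (an assumption based on the handshake above, not
 * code from this file): the worker thread clears its own exit flag just
 * before terminating, which is what allows arc_fini() to loop on
 * cv_wait() until the flag reads B_FALSE again.  The reclaim thread's
 * shutdown path presumably ends along these lines:
 *
 *	arc_reclaim_thread_exit = B_FALSE;	// wake arc_fini()
 *	cv_broadcast(&arc_reclaim_thread_cv);
 *	CALLB_CPR_EXIT(&cpr);			// drops arc_reclaim_lock
 *	thread_exit();
 *
 * The same pattern is used verbatim by l2arc_feed_thread() at the bottom
 * of this file.
 */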

/*
 * Level 2 ARC
 *
 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
 * It uses dedicated storage devices to hold cached data, which are populated
 * using large infrequent writes.  The main role of this cache is to boost
 * the performance of random read workloads.  The intended L2ARC devices
 * include short-stroked disks, solid state disks, and other media with
 * substantially faster read latency than disk.
 *
 *	+-----------------------+
 *	|         ARC           |
 *	+-----------------------+
 *	   |         ^     ^
 *	   |         |     |
 *	l2arc_feed_thread()    arc_read()
 *	   |         |     |
 *	   |  l2arc read   |
 *	   V         |     |
 *	+---------------+  |
 *	|     L2ARC     |  |
 *	+---------------+  |
 *	   |    ^          |
 *	l2arc_write()      |
 *	   |               |
 *	   V               |
 *	+-------+      +-------+
 *	| vdev  |      | vdev  |
 *	| cache |      | cache |
 *	+-------+      +-------+
 *	+=========+     .-----.
 *	:  L2ARC  :    |-_____-|
 *	: devices :    | Disks |
 *	+=========+    `-_____-'
 *
 * Read requests are satisfied from the following sources, in order:
 *
 *	1) ARC
 *	2) vdev cache of L2ARC devices
 *	3) L2ARC devices
 *	4) vdev cache of disks
 *	5) disks
 *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this there are some significant differences between
 * the L2ARC and traditional cache design:
 *
 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
 * the ARC behave as usual, freeing buffers and placing headers on ghost
 * lists.  The ARC does not send buffers to the L2ARC during eviction as
 * this would add inflated write latencies for all ARC memory pressure.
 *
 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
 * It does this by periodically scanning buffers from the eviction-end of
 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
 * not already there.  It scans until a headroom of buffers is satisfied,
 * which itself is a buffer for ARC eviction.  If a compressible buffer is
 * found during scanning and selected for writing to an L2ARC device, we
 * temporarily boost scanning headroom during the next scan cycle to make
 * sure we adapt to compression effects (which might significantly reduce
 * the data volume we write to L2ARC).  The thread that does this is
 * l2arc_feed_thread(), illustrated below; example sizes are included to
 * provide a better sense of ratio than this diagram:
 *
 *	       head -->                        tail
 *	        +---------------------+----------+
 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
 *	        +---------------------+----------+   |   o L2ARC eligible
 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
 *	        +---------------------+----------+   |
 *	             15.9 Gbytes      ^ 32 Mbytes    |
 *	                           headroom          |
 *	                                      l2arc_feed_thread()
 *	                                             |
 *	                 l2arc write hand <--[oooo]--'
 *	                         |           8 Mbyte
 *	                         |          write max
 *	                         V
 *		  +==============================+
 *	L2ARC dev |####|#|###|###|    |####| ... |
 *	          +==============================+
 *	                   32 Gbytes
 *
 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
 * evicted, then the L2ARC has cached a buffer much sooner than it probably
 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
 * safe to say that this is an uncommon case, since buffers at the end of
 * the ARC lists have moved there due to inactivity.
 *
 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
 * then the L2ARC simply misses copying some buffers.  This serves as a
 * pressure valve to prevent heavy read workloads from both stalling the ARC
 * with waits and clogging the L2ARC with writes.  This also helps prevent
 * the potential for the L2ARC to churn if it attempts to cache content too
 * quickly, such as during backups of the entire pool.
 *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from tail of these
 * lists as pictured, the l2arc_feed_thread() will search from the list heads
 * for eligible buffers, greatly increasing its chance of finding them.
 *
 * The L2ARC device write speed is also boosted during this time so that
 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
 * there are no L2ARC reads, and no fear of degrading read performance
 * through increased writes.
 *
 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
 * the vdev queue can aggregate them into larger and fewer writes.  Each
 * device is written to in a rotor fashion, sweeping writes through
 * available space then repeating.
 *
 * 7. The L2ARC does not store dirty content.  It never needs to flush
 * write buffers back to disk based storage.
 *
 * 8. If an ARC buffer is written (and dirtied) which also exists in the
 * L2ARC, the now stale L2ARC buffer is immediately dropped.
 *
 * The performance of the L2ARC can be tweaked by a number of tunables, which
 * may be necessary for different workloads:
 *
 *	l2arc_write_max		max write bytes per interval
 *	l2arc_write_boost	extra write bytes during device warmup
 *	l2arc_noprefetch	skip caching prefetched buffers
 *	l2arc_headroom		number of max device writes to precache
 *	l2arc_headroom_boost	when we find compressed buffers during ARC
 *				scanning, we multiply headroom by this
 *				percentage factor for the next scan cycle,
 *				since more compressed buffers are likely to
 *				be present
 *	l2arc_feed_secs		seconds between L2ARC writing
 *
 * Tunables may be removed or added as future performance improvements are
 * integrated, and also may become zpool properties.
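 *
 * Illustrative example (not part of the original comment): assuming the
 * common defaults l2arc_write_max = 8MB, l2arc_write_boost = 8MB,
 * l2arc_headroom = 2 and l2arc_feed_secs = 1, each one-second feed cycle
 * scans up to 2 * 8MB = 16MB from the tail of each list it visits and
 * writes at most 8MB to the device (16MB while arc_warm is still
 * B_FALSE, since the boost is added on top of the write size).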
 *
 * There are three key functions that control how the L2ARC warms up:
 *
 *	l2arc_write_eligible()	check if a buffer is eligible to cache
 *	l2arc_write_size()	calculate how much to write
 *	l2arc_write_interval()	calculate sleep delay between writes
 *
 * These three functions determine what to write, how much, and how quickly
 * to send writes.
 */

static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
{
	/*
	 * A buffer is *not* eligible for the L2ARC if it:
	 * 1. belongs to a different spa.
	 * 2. is already cached on the L2ARC.
	 * 3. has an I/O in progress (it may be an incomplete read).
	 * 4. is flagged not eligible (zfs property).
	 */
	if (hdr->b_spa != spa_guid) {
		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
		return (B_FALSE);
	}
	if (HDR_HAS_L2HDR(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
		return (B_FALSE);
	}
	if (HDR_IO_IN_PROGRESS(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
		return (B_FALSE);
	}
	if (!HDR_L2CACHE(hdr)) {
		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static uint64_t
l2arc_write_size(void)
{
	uint64_t size;

	/*
	 * Make sure our globals have meaningful values in case the user
	 * altered them.
	 */
	size = l2arc_write_max;
	if (size == 0) {
		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
		    "be greater than zero, resetting it to the default (%d)",
		    L2ARC_WRITE_SIZE);
		size = l2arc_write_max = L2ARC_WRITE_SIZE;
	}

	if (arc_warm == B_FALSE)
		size += l2arc_write_boost;

	return (size);
}

static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
	clock_t interval, next, now;

	/*
	 * If the ARC lists are busy, increase our write rate; if the
	 * lists are stale, idle back.  This is achieved by checking
	 * how much we previously wrote - if it was more than half of
	 * what we wanted, schedule the next write much sooner.
	 */
	if (l2arc_feed_again && wrote > (wanted / 2))
		interval = (hz * l2arc_feed_min_ms) / 1000;
	else
		interval = hz * l2arc_feed_secs;

	now = ddi_get_lbolt();
	next = MAX(now, MIN(now + interval, began + interval));

	return (next);
}
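
/*
 * Worked example (illustrative; the defaults below are assumptions):
 * with l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200, a cycle that
 * wrote more than half of what it wanted reschedules after hz / 5 ticks
 * (200ms), otherwise after hz ticks (1s).  The
 * MAX(now, MIN(now + interval, began + interval)) clamp charges the
 * interval from when the write began, so a slow device write shortens
 * the following sleep, while the MAX() keeps the wakeup from landing in
 * the past.
 */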

/*
 * Cycle through L2ARC devices.  This is how L2ARC load balances.
 * If a device is returned, this also returns holding the spa config lock.
 */
static l2arc_dev_t *
l2arc_dev_get_next(void)
{
	l2arc_dev_t *first, *next = NULL;

	/*
	 * Lock out the removal of spas (spa_namespace_lock), then removal
	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
	 * both locks will be dropped and a spa config lock held instead.
	 */
	mutex_enter(&spa_namespace_lock);
	mutex_enter(&l2arc_dev_mtx);

	/* if there are no vdevs, there is nothing to do */
	if (l2arc_ndev == 0)
		goto out;

	first = NULL;
	next = l2arc_dev_last;
	do {
		/* loop around the list looking for a non-faulted vdev */
		if (next == NULL) {
			next = list_head(l2arc_dev_list);
		} else {
			next = list_next(l2arc_dev_list, next);
			if (next == NULL)
				next = list_head(l2arc_dev_list);
		}

		/* if we have come back to the start, bail out */
		if (first == NULL)
			first = next;
		else if (next == first)
			break;

	} while (vdev_is_dead(next->l2ad_vdev));

	/* if we were unable to find any usable vdevs, return NULL */
	if (vdev_is_dead(next->l2ad_vdev))
		next = NULL;

	l2arc_dev_last = next;

out:
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Grab the config lock to prevent the 'next' device from being
	 * removed while we are writing to it.
	 */
	if (next != NULL)
		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
	mutex_exit(&spa_namespace_lock);

	return (next);
}

/*
 * Free buffers that were tagged for destruction.
 */
static void
l2arc_do_free_on_write(void)
{
	list_t *buflist;
	l2arc_data_free_t *df, *df_prev;

	mutex_enter(&l2arc_free_on_write_mtx);
	buflist = l2arc_free_on_write;

	for (df = list_tail(buflist); df; df = df_prev) {
		df_prev = list_prev(buflist, df);
		ASSERT3P(df->l2df_abd, !=, NULL);
		abd_free(df->l2df_abd);
		list_remove(buflist, df);
		kmem_free(df, sizeof (l2arc_data_free_t));
	}

	mutex_exit(&l2arc_free_on_write_mtx);
}
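
/*
 * Illustrative sketch (an assumption; the field and function names are
 * inferred from their uses in this file): the producer side defers
 * freeing an abd until the device write that references it completes,
 * queueing it roughly like:
 *
 *	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
 *	df->l2df_abd = abd;
 *	mutex_enter(&l2arc_free_on_write_mtx);
 *	list_insert_head(l2arc_free_on_write, df);
 *	mutex_exit(&l2arc_free_on_write_mtx);
 *
 * l2arc_write_buffers() takes this path (via l2arc_free_abd_on_write())
 * for the padded copies it allocates, and l2arc_write_done() drains the
 * list via l2arc_do_free_on_write() once the write zio has completed.
 */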

/*
 * A write to a cache device has completed.  Update all headers to allow
 * reads from these buffers to begin.
 */
static void
l2arc_write_done(zio_t *zio)
{
	l2arc_write_callback_t *cb;
	l2arc_dev_t *dev;
	list_t *buflist;
	arc_buf_hdr_t *head, *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	int64_t bytes_dropped = 0;

	cb = zio->io_private;
	ASSERT3P(cb, !=, NULL);
	dev = cb->l2wcb_dev;
	ASSERT3P(dev, !=, NULL);
	head = cb->l2wcb_head;
	ASSERT3P(head, !=, NULL);
	buflist = &dev->l2ad_buflist;
	ASSERT3P(buflist, !=, NULL);
	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
	    l2arc_write_callback_t *, cb);

	if (zio->io_error != 0)
		ARCSTAT_BUMP(arcstat_l2_writes_error);

	/*
	 * All writes completed, or an error was hit.
	 */
top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  We must retry so we
			 * don't leave the ARC_FLAG_L2_WRITING bit set.
			 */
			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);

			/*
			 * We don't want to rescan the headers we've
			 * already marked as having been written out, so
			 * we reinsert the head node so we can pick up
			 * where we left off.
			 */
			list_remove(buflist, head);
			list_insert_after(buflist, hdr, head);

			mutex_exit(&dev->l2ad_mtx);

			/*
			 * We wait for the hash lock to become available
			 * to try and prevent busy waiting, and increase
			 * the chance we'll be able to acquire the lock
			 * the next time around.
			 */
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		/*
		 * We could not have been moved into the arc_l2c_only
		 * state while in-flight due to our ARC_FLAG_L2_WRITING
		 * bit being set.  Let's just ensure that's being enforced.
		 */
		ASSERT(HDR_HAS_L1HDR(hdr));

		if (zio->io_error != 0) {
			/*
			 * Error - drop L2ARC entry.
			 */
			list_remove(buflist, hdr);
			l2arc_trim(hdr);
			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);

			ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
			ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));

			bytes_dropped += arc_hdr_size(hdr);
			(void) refcount_remove_many(&dev->l2ad_alloc,
			    arc_hdr_size(hdr), hdr);
		}

		/*
		 * Allow ARC to begin reads and ghost list evictions to
		 * this L2ARC entry.
		 */
		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);

		mutex_exit(hash_lock);
	}

	atomic_inc_64(&l2arc_writes_done);
	list_remove(buflist, head);
	ASSERT(!HDR_HAS_L1HDR(head));
	kmem_cache_free(hdr_l2only_cache, head);
	mutex_exit(&dev->l2ad_mtx);

	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);

	l2arc_do_free_on_write();

	kmem_free(cb, sizeof (l2arc_write_callback_t));
}

/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 */
static void
l2arc_read_done(zio_t *zio)
{
	l2arc_read_callback_t *cb;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	boolean_t valid_cksum;

	ASSERT3P(zio->io_vd, !=, NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	cb = zio->io_private;
	ASSERT3P(cb, !=, NULL);
	hdr = cb->l2rcb_hdr;
	ASSERT3P(hdr, !=, NULL);

	hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the data was read into a temporary buffer,
	 * move it and free the buffer.
	 */
	if (cb->l2rcb_abd != NULL) {
		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
		if (zio->io_error == 0) {
			abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
			    arc_hdr_size(hdr));
		}

		/*
		 * The following must be done regardless of whether
		 * there was an error:
		 * - free the temporary buffer
		 * - point zio to the real ARC buffer
		 * - set zio size accordingly
		 * These are required because zio is either re-used for
		 * an I/O of the block in the case of the error
		 * or the zio is passed to arc_read_done() and it
		 * needs real data.
		 */
		abd_free(cb->l2rcb_abd);
		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
		zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
	}

	ASSERT3P(zio->io_abd, !=, NULL);

	/*
	 * Check this survived the L2ARC journey.
	 */
	ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/

	valid_cksum = arc_cksum_is_equal(hdr, zio);
	if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
		mutex_exit(hash_lock);
		zio->io_private = hdr;
		arc_read_done(zio);
	} else {
		mutex_exit(hash_lock);
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			zio->io_error = SET_ERROR(EIO);
		}
		if (!valid_cksum)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
			    hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
			    hdr, zio->io_priority, cb->l2rcb_flags,
			    &cb->l2rcb_zb));
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}

/*
 * This is the list priority from which the L2ARC will search for pages to
 * cache.  This is used within loops (0..3) to cycle through lists in the
 * desired order.  This order can have a significant effect on cache
 * performance.
 *
 * Currently the metadata lists are hit first, MFU then MRU, followed by
 * the data lists.  This function returns a locked sublist of the chosen
 * list, which also serves as its lock.
 */
static multilist_sublist_t *
l2arc_sublist_lock(int list_num)
{
	multilist_t *ml = NULL;
	unsigned int idx;

	ASSERT(list_num >= 0 && list_num <= 3);

	switch (list_num) {
	case 0:
		ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
		break;
	case 1:
		ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
		break;
	case 2:
		ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
		break;
	case 3:
		ml = arc_mru->arcs_list[ARC_BUFC_DATA];
		break;
	}

	/*
	 * Return a randomly-selected sublist.  This is acceptable
	 * because the caller feeds only a little bit of data for each
	 * call (8MB).  Subsequent calls will result in different
	 * sublists being selected.
	 */
	idx = multilist_get_random_index(ml);
	return (multilist_sublist_lock(ml, idx));
}
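
/*
 * Illustrative usage (a condensed sketch of the scan loop in
 * l2arc_write_buffers() below, not additional code):
 *
 *	for (int try = 0; try <= 3; try++) {
 *		multilist_sublist_t *mls = l2arc_sublist_lock(try);
 *		for (arc_buf_hdr_t *hdr = multilist_sublist_tail(mls);
 *		    hdr != NULL; hdr = multilist_sublist_prev(mls, hdr)) {
 *			... try-lock the hash lock, check
 *			    l2arc_write_eligible(), issue the write ...
 *		}
 *		multilist_sublist_unlock(mls);
 *	}
 */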

/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	arc_buf_hdr_t *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;

	buflist = &dev->l2ad_buflist;

	if (!all && dev->l2ad_first) {
		/*
		 * This is the first sweep through the device.  There is
		 * nothing to evict.
		 */
		return;
	}

	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
		/*
		 * When nearing the end of the device, evict to the end
		 * before the device write hand jumps to the start.
		 */
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  Retry.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		/*
		 * A header can't be on this list if it doesn't have an
		 * L2 header.
		 */
		ASSERT(HDR_HAS_L2HDR(hdr));

		/* Ensure this header has finished being written. */
		ASSERT(!HDR_L2_WRITING(hdr));
		ASSERT(!HDR_L2_WRITE_HEAD(hdr));

		if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		if (!HDR_HAS_L1HDR(hdr)) {
			ASSERT(!HDR_L2_READING(hdr));
			/*
			 * This doesn't exist in the ARC.  Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_lsize.
			 */
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		} else {
			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(hdr)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
			}

			arc_hdr_l2hdr_destroy(hdr);
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&dev->l2ad_mtx);
}

/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 *
 * Returns the number of bytes actually written (which may be smaller than
 * the delta by which the device hand has changed due to alignment).
 */
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
	arc_buf_hdr_t *hdr, *hdr_prev, *head;
	uint64_t write_asize, write_psize, write_lsize, headroom;
	boolean_t full;
	l2arc_write_callback_t *cb;
	zio_t *pio, *wzio;
	uint64_t guid = spa_load_guid(spa);
	int try;

	ASSERT3P(dev->l2ad_vdev, !=, NULL);

	pio = NULL;
	write_lsize = write_asize = write_psize = 0;
	full = B_FALSE;
	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);

	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
	/*
	 * Copy buffers for L2ARC writing.
	 */
	for (try = 0; try <= 3; try++) {
		multilist_sublist_t *mls = l2arc_sublist_lock(try);
		uint64_t passed_sz = 0;

		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);

		/*
		 * L2ARC fast warmup.
		 *
		 * Until the ARC is warm and starts to evict, read from the
		 * head of the ARC lists rather than the tail.
		 */
		if (arc_warm == B_FALSE)
			hdr = multilist_sublist_head(mls);
		else
			hdr = multilist_sublist_tail(mls);
		if (hdr == NULL)
			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);

		headroom = target_sz * l2arc_headroom;
		if (zfs_compressed_arc_enabled)
			headroom = (headroom * l2arc_headroom_boost) / 100;

		for (; hdr; hdr = hdr_prev) {
			kmutex_t *hash_lock;

			if (arc_warm == B_FALSE)
				hdr_prev = multilist_sublist_next(mls, hdr);
			else
				hdr_prev = multilist_sublist_prev(mls, hdr);
			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
			    HDR_GET_LSIZE(hdr));

			hash_lock = HDR_LOCK(hdr);
			if (!mutex_tryenter(hash_lock)) {
				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
				/*
				 * Skip this buffer rather than waiting.
				 */
				continue;
			}

			passed_sz += HDR_GET_LSIZE(hdr);
			if (passed_sz > headroom) {
				/*
				 * Searched too far.
				 */
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
				break;
			}

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
				continue;
			}

			/*
			 * We rely on the L1 portion of the header below, so
			 * it's invalid for this header to have been evicted out
			 * of the ghost cache, prior to being written out.  The
			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));

			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			ASSERT3U(arc_hdr_size(hdr), >, 0);
			uint64_t psize = arc_hdr_size(hdr);
			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
			    psize);

			if ((write_asize + asize) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_full);
				break;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				mutex_enter(&dev->l2ad_mtx);
				list_insert_head(&dev->l2ad_buflist, head);
				mutex_exit(&dev->l2ad_mtx);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
				ARCSTAT_BUMP(arcstat_l2_write_pios);
			}

			hdr->b_l2hdr.b_dev = dev;
			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
			arc_hdr_set_flags(hdr,
			    ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);

			mutex_enter(&dev->l2ad_mtx);
			list_insert_head(&dev->l2ad_buflist, hdr);
			mutex_exit(&dev->l2ad_mtx);

			(void) refcount_add_many(&dev->l2ad_alloc, psize, hdr);

			/*
			 * Normally the L2ARC can use the hdr's data, but if
			 * we're sharing data between the hdr and one of its
			 * bufs, L2ARC needs its own copy of the data so that
			 * the ZIO below can't race with the buf consumer.
			 * Another case where we need to create a copy of the
			 * data is when the buffer size is not device-aligned
			 * and we need to pad the block to make it such.
			 * That also keeps the clock hand suitably aligned.
			 *
			 * To ensure that the copy will be available for the
			 * lifetime of the ZIO and be cleaned up afterwards, we
			 * add it to the l2arc_free_on_write queue.
			 */
			abd_t *to_write;
			if (!HDR_SHARED_DATA(hdr) && psize == asize) {
				to_write = hdr->b_l1hdr.b_pabd;
			} else {
				to_write = abd_alloc_for_io(asize,
				    HDR_ISTYPE_METADATA(hdr));
				abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
				if (asize != psize) {
					abd_zero_off(to_write, psize,
					    asize - psize);
				}
				l2arc_free_abd_on_write(to_write, asize,
				    arc_buf_type(hdr));
			}
			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    hdr->b_l2hdr.b_daddr, asize, to_write,
			    ZIO_CHECKSUM_OFF, NULL, hdr,
			    ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			write_lsize += HDR_GET_LSIZE(hdr);
			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);

			write_psize += psize;
			write_asize += asize;
			dev->l2ad_hand += asize;

			mutex_exit(hash_lock);

			(void) zio_nowait(wzio);
		}

		multilist_sublist_unlock(mls);

		if (full == B_TRUE)
			break;
	}
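
	/*
	 * Accounting example (illustrative, not from the original source):
	 * take a 16K logical block whose in-core (possibly compressed)
	 * size psize is 5632 bytes on a vdev with an 8K allocation unit.
	 * vdev_psize_to_asize() rounds 5632 up to asize = 8192, the copy
	 * made above is zero-padded by 8192 - 5632 = 2560 bytes, and the
	 * loop adds 16384/5632/8192 to write_lsize/write_psize/write_asize
	 * respectively while the device hand advances by asize.
	 */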

	/* No buffers selected for writing? */
	if (pio == NULL) {
		ASSERT0(write_lsize);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		return (0);
	}

	ASSERT3U(write_psize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
	ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
	ARCSTAT_INCR(arcstat_l2_psize, write_psize);
	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_asize);
}

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.  This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT3P(spa, !=, NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC.  By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	vdev_ashift_optimize(vd);

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	refcount_create(&adddev->l2ad_alloc);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}

/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT3P(remdev, !=, NULL);

	/*
	 * Remove device from global list
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;	/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references.  L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	mutex_destroy(&remdev->l2ad_mtx);
	refcount_destroy(&remdev->l2ad_alloc);
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini();
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}