arc.c revision 332525
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23277826Sdelphij * Copyright (c) 2012, Joyent, Inc. All rights reserved. 24321552Smav * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 25260835Sdelphij * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. 26329490Smav * Copyright 2017 Nexenta Systems, Inc. All rights reserved. 27168404Spjd */ 28168404Spjd 29168404Spjd/* 30168404Spjd * DVA-based Adjustable Replacement Cache 31168404Spjd * 32168404Spjd * While much of the theory of operation used here is 33168404Spjd * based on the self-tuning, low overhead replacement cache 34168404Spjd * presented by Megiddo and Modha at FAST 2003, there are some 35168404Spjd * significant differences: 36168404Spjd * 37168404Spjd * 1. The Megiddo and Modha model assumes any page is evictable. 38168404Spjd * Pages in its cache cannot be "locked" into memory. 
 * This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).
We therefore choose a set of blocks to evict to make 67168404Spjd * space for a cache miss that approximates as closely as possible 68168404Spjd * the space used by the new block. 69168404Spjd * 70168404Spjd * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 71168404Spjd * by N. Megiddo & D. Modha, FAST 2003 72168404Spjd */ 73168404Spjd 74168404Spjd/* 75168404Spjd * The locking model: 76168404Spjd * 77168404Spjd * A new reference to a cache buffer can be obtained in two 78168404Spjd * ways: 1) via a hash table lookup using the DVA as a key, 79185029Spjd * or 2) via one of the ARC lists. The arc_read() interface 80321535Smav * uses method 1, while the internal ARC algorithms for 81251631Sdelphij * adjusting the cache use method 2. We therefore provide two 82168404Spjd * types of locks: 1) the hash table lock array, and 2) the 83321535Smav * ARC list locks. 84168404Spjd * 85286774Smav * Buffers do not have their own mutexes, rather they rely on the 86286774Smav * hash table mutexes for the bulk of their protection (i.e. most 87286774Smav * fields in the arc_buf_hdr_t are protected by these mutexes). 88168404Spjd * 89168404Spjd * buf_hash_find() returns the appropriate mutex (held) when it 90168404Spjd * locates the requested buffer in the hash table. It returns 91168404Spjd * NULL for the mutex if the buffer was not in the table. 92168404Spjd * 93168404Spjd * buf_hash_remove() expects the appropriate hash mutex to be 94168404Spjd * already held before it is invoked. 95168404Spjd * 96321535Smav * Each ARC state also has a mutex which is used to protect the 97168404Spjd * buffer list associated with the state. When attempting to 98321535Smav * obtain a hash table lock while holding an ARC list lock you 99168404Spjd * must use: mutex_tryenter() to avoid deadlock. Also note that 100168404Spjd * the active state mutex must be held before the ghost state mutex. 
101168404Spjd * 102168404Spjd * Note that the majority of the performance stats are manipulated 103168404Spjd * with atomic operations. 104185029Spjd * 105286570Smav * The L2ARC uses the l2ad_mtx on each vdev for the following: 106185029Spjd * 107185029Spjd * - L2ARC buflist creation 108185029Spjd * - L2ARC buflist eviction 109185029Spjd * - L2ARC write completion, which walks L2ARC buflists 110185029Spjd * - ARC header destruction, as it removes from L2ARC buflists 111185029Spjd * - ARC header release, as it removes from L2ARC buflists 112168404Spjd */ 113168404Spjd 114307265Smav/* 115307265Smav * ARC operation: 116307265Smav * 117307265Smav * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. 118307265Smav * This structure can point either to a block that is still in the cache or to 119307265Smav * one that is only accessible in an L2 ARC device, or it can provide 120307265Smav * information about a block that was recently evicted. If a block is 121307265Smav * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough 122307265Smav * information to retrieve it from the L2ARC device. This information is 123307265Smav * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block 124307265Smav * that is in this state cannot access the data directly. 125307265Smav * 126307265Smav * Blocks that are actively being referenced or have not been evicted 127307265Smav * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within 128307265Smav * the arc_buf_hdr_t that will point to the data block in memory. A block can 129307265Smav * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC 130321535Smav * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and 131321610Smav * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). 132321535Smav * 133321535Smav * The L1ARC's data pointer may or may not be uncompressed. 
The ARC has the 134321610Smav * ability to store the physical data (b_pabd) associated with the DVA of the 135321610Smav * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, 136321535Smav * it will match its on-disk compression characteristics. This behavior can be 137321535Smav * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the 138321610Smav * compressed ARC functionality is disabled, the b_pabd will point to an 139321535Smav * uncompressed version of the on-disk data. 140321535Smav * 141321535Smav * Data in the L1ARC is not accessed by consumers of the ARC directly. Each 142321535Smav * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. 143321535Smav * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC 144321535Smav * consumer. The ARC will provide references to this data and will keep it 145321535Smav * cached until it is no longer in use. The ARC caches only the L1ARC's physical 146321535Smav * data block and will evict any arc_buf_t that is no longer referenced. The 147321535Smav * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the 148307265Smav * "overhead_size" kstat. 149307265Smav * 150321535Smav * Depending on the consumer, an arc_buf_t can be requested in uncompressed or 151321535Smav * compressed form. The typical case is that consumers will want uncompressed 152321535Smav * data, and when that happens a new data buffer is allocated where the data is 153321535Smav * decompressed for them to use. Currently the only consumer who wants 154321535Smav * compressed arc_buf_t's is "zfs send", when it streams data exactly as it 155321535Smav * exists on disk. When this happens, the arc_buf_t's data buffer is shared 156321535Smav * with the arc_buf_hdr_t. 157307265Smav * 158321535Smav * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. 
 * The first one is owned by a compressed send consumer (and therefore
 * references the same compressed data buffer as the arc_buf_hdr_t) and the
 * second could be used by any other consumer (and has its own uncompressed
 * copy of the data buffer).
 *
 *   arc_buf_hdr_t
 *   +-----------+
 *   | fields    |
 *   | common to |
 *   | L1- and   |
 *   | L2ARC     |
 *   +-----------+
 *   | l2arc_buf_hdr_t
 *   |           |
 *   +-----------+
 *   | l1arc_buf_hdr_t
 *   |           |              arc_buf_t
 *   | b_buf     +------------>+-----------+      arc_buf_t
 *   | b_pabd    +-+           |b_next     +---->+-----------+
 *   +-----------+ |           |-----------|     |b_next     +-->NULL
 *                 |           |b_comp = T |     +-----------+
 *                 |           |b_data     +-+   |b_comp = F |
 *                 |           +-----------+ |   |b_data     +-+
 *                 +->+------+               |   +-----------+ |
 *      compressed    |      |               |                 |
 *         data       |      |<--------------+                 | uncompressed
 *                    +------+          compressed,            |     data
 *                                        shared               +-->+------+
 *                                         data                    |      |
 *                                                                 |      |
 *                                                                 +------+
 *
 * When a consumer reads a block, the ARC must first look to see if the
 * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
 * arc_buf_t and either copies uncompressed data into a new data buffer from an
 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 * hdr is compressed and the desired compression characteristics of the
 * arc_buf_t consumer.
 * If the arc_buf_t ends up sharing data with the arc_buf_hdr_t and both of
 * them are uncompressed then the arc_buf_t must be the last buffer in the
 * hdr's b_buf list; however, a shared compressed buf can be anywhere in the
 * hdr's list.
 *
 * The diagram below shows an example of an uncompressed ARC hdr that is
 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 * the last element in the buf list):
 *
 *                arc_buf_hdr_t
 *                +-----------+
 *                |           |
 *                |           |
 *                |           |
 *                +-----------+
 * l2arc_buf_hdr_t|           |
 *                |           |
 *                +-----------+
 * l1arc_buf_hdr_t|           |
 *                |           |              arc_buf_t       (shared)
 *                | b_buf     +------------>+---------+      arc_buf_t
 *                |           |             |b_next   +---->+---------+
 *                | b_pabd    +-+           |---------|     |b_next   +-->NULL
 *                +-----------+ |           |         |     +---------+
 *                              |           |b_data   +-+   |         |
 *                              |           +---------+ |   |b_data   +-+
 *                              +->+------+             |   +---------+ |
 *                                 |      |             |               |
 *                   uncompressed  |      |             |               |
 *                       data      +------+             |               |
 *                                    ^                 +->+------+     |
 *                                    |       uncompressed |      |     |
 *                                    |           data     |      |     |
 *                                    |                    +------+     |
 *                                    +---------------------------------+
 *
 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 * since the physical block is about to be rewritten. The new data contents
 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
 * it may compress the data before writing it to disk. The ARC will be called
 * with the transformed data and will bcopy the transformed on-disk block into
 * a newly allocated b_pabd.
Writes are always done into buffers which have 239321535Smav * either been loaned (and hence are new and don't have other readers) or 240321535Smav * buffers which have been released (and hence have their own hdr, if there 241321535Smav * were originally other readers of the buf's original hdr). This ensures that 242321535Smav * the ARC only needs to update a single buf and its hdr after a write occurs. 243307265Smav * 244321610Smav * When the L2ARC is in use, it will also take advantage of the b_pabd. The 245321610Smav * L2ARC will always write the contents of b_pabd to the L2ARC. This means 246321535Smav * that when compressed ARC is enabled that the L2ARC blocks are identical 247307265Smav * to the on-disk block in the main data pool. This provides a significant 248307265Smav * advantage since the ARC can leverage the bp's checksum when reading from the 249307265Smav * L2ARC to determine if the contents are valid. However, if the compressed 250321535Smav * ARC is disabled, then the L2ARC's block must be transformed to look 251307265Smav * like the physical block in the main data pool before comparing the 252307265Smav * checksum and determining its validity. 
253307265Smav */ 254307265Smav 255168404Spjd#include <sys/spa.h> 256168404Spjd#include <sys/zio.h> 257307265Smav#include <sys/spa_impl.h> 258251478Sdelphij#include <sys/zio_compress.h> 259307265Smav#include <sys/zio_checksum.h> 260168404Spjd#include <sys/zfs_context.h> 261168404Spjd#include <sys/arc.h> 262168404Spjd#include <sys/refcount.h> 263185029Spjd#include <sys/vdev.h> 264219089Spjd#include <sys/vdev_impl.h> 265258632Savg#include <sys/dsl_pool.h> 266321610Smav#include <sys/zio_checksum.h> 267286763Smav#include <sys/multilist.h> 268321610Smav#include <sys/abd.h> 269168404Spjd#ifdef _KERNEL 270168404Spjd#include <sys/dnlc.h> 271297633Strasz#include <sys/racct.h> 272168404Spjd#endif 273168404Spjd#include <sys/callb.h> 274168404Spjd#include <sys/kstat.h> 275248572Ssmh#include <sys/trim_map.h> 276219089Spjd#include <zfs_fletcher.h> 277168404Spjd#include <sys/sdt.h> 278168404Spjd 279272483Ssmh#include <machine/vmparam.h> 280191902Skmacy 281240133Smm#ifdef illumos 282240133Smm#ifndef _KERNEL 283240133Smm/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 284240133Smmboolean_t arc_watch = B_FALSE; 285240133Smmint arc_procfd; 286240133Smm#endif 287240133Smm#endif /* illumos */ 288240133Smm 289286763Smavstatic kmutex_t arc_reclaim_lock; 290286763Smavstatic kcondvar_t arc_reclaim_thread_cv; 291286763Smavstatic boolean_t arc_reclaim_thread_exit; 292286763Smavstatic kcondvar_t arc_reclaim_waiters_cv; 293168404Spjd 294301997Skibstatic kmutex_t arc_dnlc_evicts_lock; 295301997Skibstatic kcondvar_t arc_dnlc_evicts_cv; 296301997Skibstatic boolean_t arc_dnlc_evicts_thread_exit; 297301997Skib 298286625Smavuint_t arc_reduce_dnlc_percent = 3; 299168404Spjd 300258632Savg/* 301286763Smav * The number of headers to evict in arc_evict_state_impl() before 302286763Smav * dropping the sublist lock and evicting from another sublist. A lower 303286763Smav * value means we're more likely to evict the "correct" header (i.e. 
the 304286763Smav * oldest header in the arc state), but comes with higher overhead 305286763Smav * (i.e. more invocations of arc_evict_state_impl()). 306258632Savg */ 307286763Smavint zfs_arc_evict_batch_limit = 10; 308258632Savg 309168404Spjd/* number of seconds before growing cache again */ 310168404Spjdstatic int arc_grow_retry = 60; 311168404Spjd 312321610Smav/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ 313286763Smavint zfs_arc_overflow_shift = 8; 314286763Smav 315208373Smm/* shift of arc_c for calculating both min and max arc_p */ 316208373Smmstatic int arc_p_min_shift = 4; 317208373Smm 318208373Smm/* log2(fraction of arc to reclaim) */ 319286625Smavstatic int arc_shrink_shift = 7; 320208373Smm 321168404Spjd/* 322286625Smav * log2(fraction of ARC which must be free to allow growing). 323286625Smav * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, 324286625Smav * when reading a new block into the ARC, we will evict an equal-sized block 325286625Smav * from the ARC. 326286625Smav * 327286625Smav * This must be less than arc_shrink_shift, so that when we shrink the ARC, 328286625Smav * we will still not allow it to grow. 329286625Smav */ 330286625Smavint arc_no_grow_shift = 5; 331286625Smav 332286625Smav 333286625Smav/* 334168404Spjd * minimum lifespan of a prefetch block in clock ticks 335168404Spjd * (initialized in arc_init()) 336168404Spjd */ 337168404Spjdstatic int arc_min_prefetch_lifespan; 338168404Spjd 339258632Savg/* 340258632Savg * If this percent of memory is free, don't throttle. 341258632Savg */ 342258632Savgint arc_lotsfree_percent = 10; 343258632Savg 344208373Smmstatic int arc_dead; 345287702Sdelphijextern boolean_t zfs_prefetch_disable; 346168404Spjd 347168404Spjd/* 348185029Spjd * The arc has filled available memory and has now warmed up. 349185029Spjd */ 350185029Spjdstatic boolean_t arc_warm; 351185029Spjd 352286762Smav/* 353331383Smav * log2 fraction of the zio arena to keep free. 
354331383Smav */ 355331383Smavint arc_zio_arena_free_shift = 2; 356331383Smav 357331383Smav/* 358286762Smav * These tunables are for performance analysis. 359286762Smav */ 360185029Spjduint64_t zfs_arc_max; 361185029Spjduint64_t zfs_arc_min; 362185029Spjduint64_t zfs_arc_meta_limit = 0; 363275780Sdelphijuint64_t zfs_arc_meta_min = 0; 364208373Smmint zfs_arc_grow_retry = 0; 365208373Smmint zfs_arc_shrink_shift = 0; 366323667Sbaptint zfs_arc_no_grow_shift = 0; 367208373Smmint zfs_arc_p_min_shift = 0; 368269230Sdelphijuint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ 369272483Ssmhu_int zfs_arc_free_target = 0; 370185029Spjd 371302265Ssmh/* Absolute min for arc min / max is 16MB. */ 372302265Ssmhstatic uint64_t arc_abs_min = 16 << 20; 373302265Ssmh 374307265Smavboolean_t zfs_compressed_arc_enabled = B_TRUE; 375307265Smav 376270759Ssmhstatic int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); 377275748Sdelphijstatic int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); 378302265Ssmhstatic int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); 379302265Ssmhstatic int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); 380323667Sbaptstatic int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); 381270759Ssmh 382302265Ssmh#if defined(__FreeBSD__) && defined(_KERNEL) 383270759Ssmhstatic void 384270759Ssmharc_free_target_init(void *unused __unused) 385270759Ssmh{ 386270759Ssmh 387272483Ssmh zfs_arc_free_target = vm_pageout_wakeup_thresh; 388270759Ssmh} 389270759SsmhSYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, 390270759Ssmh arc_free_target_init, NULL); 391270759Ssmh 392185029SpjdTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 393275780SdelphijTUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); 394273026SdelphijTUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); 395323667SbaptTUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry); 396323667SbaptTUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift); 
397168473SpjdSYSCTL_DECL(_vfs_zfs); 398302265SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN, 399302265Ssmh 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); 400302265SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN, 401302265Ssmh 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); 402323667SbaptSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN, 403323667Sbapt 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", 404323667Sbapt "log2(fraction of ARC which must be free to allow growing)"); 405269230SdelphijSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, 406269230Sdelphij &zfs_arc_average_blocksize, 0, 407269230Sdelphij "ARC average blocksize"); 408273026SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, 409273026Sdelphij &arc_shrink_shift, 0, 410273026Sdelphij "log2(fraction of arc to reclaim)"); 411323667SbaptSYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW, 412323667Sbapt &arc_grow_retry, 0, 413323667Sbapt "Wait in seconds before considering growing ARC"); 414307265SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, 415307265Smav &zfs_compressed_arc_enabled, 0, "Enable compressed ARC"); 416273026Sdelphij 417270759Ssmh/* 418270759Ssmh * We don't have a tunable for arc_free_target due to the dependency on 419270759Ssmh * pagedaemon initialisation. 
420270759Ssmh */ 421270759SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, 422270759Ssmh CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), 423270759Ssmh sysctl_vfs_zfs_arc_free_target, "IU", 424270759Ssmh "Desired number of free pages below which ARC triggers reclaim"); 425168404Spjd 426270759Ssmhstatic int 427270759Ssmhsysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) 428270759Ssmh{ 429270759Ssmh u_int val; 430270759Ssmh int err; 431270759Ssmh 432270759Ssmh val = zfs_arc_free_target; 433270759Ssmh err = sysctl_handle_int(oidp, &val, 0, req); 434270759Ssmh if (err != 0 || req->newptr == NULL) 435270759Ssmh return (err); 436270759Ssmh 437272483Ssmh if (val < minfree) 438270759Ssmh return (EINVAL); 439272483Ssmh if (val > vm_cnt.v_page_count) 440270759Ssmh return (EINVAL); 441270759Ssmh 442270759Ssmh zfs_arc_free_target = val; 443270759Ssmh 444270759Ssmh return (0); 445270759Ssmh} 446275748Sdelphij 447275748Sdelphij/* 448275748Sdelphij * Must be declared here, before the definition of corresponding kstat 449275748Sdelphij * macro which uses the same names will confuse the compiler. 
450275748Sdelphij */ 451275748SdelphijSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, 452275748Sdelphij CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 453275748Sdelphij sysctl_vfs_zfs_arc_meta_limit, "QU", 454275748Sdelphij "ARC metadata limit"); 455272483Ssmh#endif 456270759Ssmh 457168404Spjd/* 458185029Spjd * Note that buffers can be in one of 6 states: 459168404Spjd * ARC_anon - anonymous (discussed below) 460168404Spjd * ARC_mru - recently used, currently cached 461168404Spjd * ARC_mru_ghost - recentely used, no longer in cache 462168404Spjd * ARC_mfu - frequently used, currently cached 463168404Spjd * ARC_mfu_ghost - frequently used, no longer in cache 464185029Spjd * ARC_l2c_only - exists in L2ARC but not other states 465185029Spjd * When there are no active references to the buffer, they are 466185029Spjd * are linked onto a list in one of these arc states. These are 467185029Spjd * the only buffers that can be evicted or deleted. Within each 468185029Spjd * state there are multiple lists, one for meta-data and one for 469185029Spjd * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 470185029Spjd * etc.) is tracked separately so that it can be managed more 471185029Spjd * explicitly: favored over data, limited explicitly. 472168404Spjd * 473168404Spjd * Anonymous buffers are buffers that are not associated with 474168404Spjd * a DVA. These are buffers that hold dirty block copies 475168404Spjd * before they are written to stable storage. By definition, 476168404Spjd * they are "ref'd" and are considered part of arc_mru 477168404Spjd * that cannot be freed. Generally, they will aquire a DVA 478168404Spjd * as they are written and migrate onto the arc_mru list. 479185029Spjd * 480185029Spjd * The ARC_l2c_only state is for buffers that are in the second 481185029Spjd * level ARC but no longer in any of the ARC_m* lists. 
The second 482185029Spjd * level ARC itself may also contain buffers that are in any of 483185029Spjd * the ARC_m* states - meaning that a buffer can exist in two 484185029Spjd * places. The reason for the ARC_l2c_only state is to keep the 485185029Spjd * buffer header in the hash table, so that reads that hit the 486185029Spjd * second level ARC benefit from these fast lookups. 487168404Spjd */ 488168404Spjd 489168404Spjdtypedef struct arc_state { 490286763Smav /* 491286763Smav * list of evictable buffers 492286763Smav */ 493321553Smav multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; 494286763Smav /* 495286763Smav * total amount of evictable data in this state 496286763Smav */ 497307265Smav refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; 498286763Smav /* 499286763Smav * total amount of data in this state; this includes: evictable, 500286763Smav * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 501286763Smav */ 502286766Smav refcount_t arcs_size; 503168404Spjd} arc_state_t; 504168404Spjd 505185029Spjd/* The 6 states: */ 506168404Spjdstatic arc_state_t ARC_anon; 507168404Spjdstatic arc_state_t ARC_mru; 508168404Spjdstatic arc_state_t ARC_mru_ghost; 509168404Spjdstatic arc_state_t ARC_mfu; 510168404Spjdstatic arc_state_t ARC_mfu_ghost; 511185029Spjdstatic arc_state_t ARC_l2c_only; 512168404Spjd 513168404Spjdtypedef struct arc_stats { 514168404Spjd kstat_named_t arcstat_hits; 515168404Spjd kstat_named_t arcstat_misses; 516168404Spjd kstat_named_t arcstat_demand_data_hits; 517168404Spjd kstat_named_t arcstat_demand_data_misses; 518168404Spjd kstat_named_t arcstat_demand_metadata_hits; 519168404Spjd kstat_named_t arcstat_demand_metadata_misses; 520168404Spjd kstat_named_t arcstat_prefetch_data_hits; 521168404Spjd kstat_named_t arcstat_prefetch_data_misses; 522168404Spjd kstat_named_t arcstat_prefetch_metadata_hits; 523168404Spjd kstat_named_t arcstat_prefetch_metadata_misses; 524168404Spjd kstat_named_t arcstat_mru_hits; 525168404Spjd kstat_named_t arcstat_mru_ghost_hits; 
526168404Spjd kstat_named_t arcstat_mfu_hits; 527168404Spjd kstat_named_t arcstat_mfu_ghost_hits; 528205231Skmacy kstat_named_t arcstat_allocated; 529168404Spjd kstat_named_t arcstat_deleted; 530251629Sdelphij /* 531251629Sdelphij * Number of buffers that could not be evicted because the hash lock 532251629Sdelphij * was held by another thread. The lock may not necessarily be held 533251629Sdelphij * by something using the same buffer, since hash locks are shared 534251629Sdelphij * by multiple buffers. 535251629Sdelphij */ 536168404Spjd kstat_named_t arcstat_mutex_miss; 537251629Sdelphij /* 538251629Sdelphij * Number of buffers skipped because they have I/O in progress, are 539251629Sdelphij * indrect prefetch buffers that have not lived long enough, or are 540251629Sdelphij * not from the spa we're trying to evict from. 541251629Sdelphij */ 542168404Spjd kstat_named_t arcstat_evict_skip; 543286763Smav /* 544286763Smav * Number of times arc_evict_state() was unable to evict enough 545286763Smav * buffers to reach it's target amount. 546286763Smav */ 547286763Smav kstat_named_t arcstat_evict_not_enough; 548208373Smm kstat_named_t arcstat_evict_l2_cached; 549208373Smm kstat_named_t arcstat_evict_l2_eligible; 550208373Smm kstat_named_t arcstat_evict_l2_ineligible; 551286763Smav kstat_named_t arcstat_evict_l2_skip; 552168404Spjd kstat_named_t arcstat_hash_elements; 553168404Spjd kstat_named_t arcstat_hash_elements_max; 554168404Spjd kstat_named_t arcstat_hash_collisions; 555168404Spjd kstat_named_t arcstat_hash_chains; 556168404Spjd kstat_named_t arcstat_hash_chain_max; 557168404Spjd kstat_named_t arcstat_p; 558168404Spjd kstat_named_t arcstat_c; 559168404Spjd kstat_named_t arcstat_c_min; 560168404Spjd kstat_named_t arcstat_c_max; 561168404Spjd kstat_named_t arcstat_size; 562286574Smav /* 563321610Smav * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. 
564307265Smav * Note that the compressed bytes may match the uncompressed bytes 565307265Smav * if the block is either not compressed or compressed arc is disabled. 566307265Smav */ 567307265Smav kstat_named_t arcstat_compressed_size; 568307265Smav /* 569321610Smav * Uncompressed size of the data stored in b_pabd. If compressed 570307265Smav * arc is disabled then this value will be identical to the stat 571307265Smav * above. 572307265Smav */ 573307265Smav kstat_named_t arcstat_uncompressed_size; 574307265Smav /* 575307265Smav * Number of bytes stored in all the arc_buf_t's. This is classified 576307265Smav * as "overhead" since this data is typically short-lived and will 577307265Smav * be evicted from the arc when it becomes unreferenced unless the 578307265Smav * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level 579307265Smav * values have been set (see comment in dbuf.c for more information). 580307265Smav */ 581307265Smav kstat_named_t arcstat_overhead_size; 582307265Smav /* 583286574Smav * Number of bytes consumed by internal ARC structures necessary 584286574Smav * for tracking purposes; these structures are not actually 585286574Smav * backed by ARC buffers. This includes arc_buf_hdr_t structures 586286574Smav * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 587286574Smav * caches), and arc_buf_t structures (allocated via arc_buf_t 588286574Smav * cache). 589286574Smav */ 590185029Spjd kstat_named_t arcstat_hdr_size; 591286574Smav /* 592286574Smav * Number of bytes consumed by ARC buffers of type equal to 593286574Smav * ARC_BUFC_DATA. This is generally consumed by buffers backing 594286574Smav * on disk user data (e.g. plain file contents). 595286574Smav */ 596208373Smm kstat_named_t arcstat_data_size; 597286574Smav /* 598286574Smav * Number of bytes consumed by ARC buffers of type equal to 599286574Smav * ARC_BUFC_METADATA. 
This is generally consumed by buffers 600286574Smav * backing on disk data that is used for internal ZFS 601286574Smav * structures (e.g. ZAP, dnode, indirect blocks, etc). 602286574Smav */ 603286574Smav kstat_named_t arcstat_metadata_size; 604286574Smav /* 605286574Smav * Number of bytes consumed by various buffers and structures 606286574Smav * not actually backed with ARC buffers. This includes bonus 607286574Smav * buffers (allocated directly via zio_buf_* functions), 608286574Smav * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 609286574Smav * cache), and dnode_t structures (allocated via dnode_t cache). 610286574Smav */ 611208373Smm kstat_named_t arcstat_other_size; 612286574Smav /* 613286574Smav * Total number of bytes consumed by ARC buffers residing in the 614286574Smav * arc_anon state. This includes *all* buffers in the arc_anon 615286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 616286574Smav * are all included in this value. 617286574Smav */ 618286574Smav kstat_named_t arcstat_anon_size; 619286574Smav /* 620286574Smav * Number of bytes consumed by ARC buffers that meet the 621286574Smav * following criteria: backing buffers of type ARC_BUFC_DATA, 622286574Smav * residing in the arc_anon state, and are eligible for eviction 623286574Smav * (e.g. have no outstanding holds on the buffer). 624286574Smav */ 625286574Smav kstat_named_t arcstat_anon_evictable_data; 626286574Smav /* 627286574Smav * Number of bytes consumed by ARC buffers that meet the 628286574Smav * following criteria: backing buffers of type ARC_BUFC_METADATA, 629286574Smav * residing in the arc_anon state, and are eligible for eviction 630286574Smav * (e.g. have no outstanding holds on the buffer). 631286574Smav */ 632286574Smav kstat_named_t arcstat_anon_evictable_metadata; 633286574Smav /* 634286574Smav * Total number of bytes consumed by ARC buffers residing in the 635286574Smav * arc_mru state. 
This includes *all* buffers in the arc_mru 636286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 637286574Smav * are all included in this value. 638286574Smav */ 639286574Smav kstat_named_t arcstat_mru_size; 640286574Smav /* 641286574Smav * Number of bytes consumed by ARC buffers that meet the 642286574Smav * following criteria: backing buffers of type ARC_BUFC_DATA, 643286574Smav * residing in the arc_mru state, and are eligible for eviction 644286574Smav * (e.g. have no outstanding holds on the buffer). 645286574Smav */ 646286574Smav kstat_named_t arcstat_mru_evictable_data; 647286574Smav /* 648286574Smav * Number of bytes consumed by ARC buffers that meet the 649286574Smav * following criteria: backing buffers of type ARC_BUFC_METADATA, 650286574Smav * residing in the arc_mru state, and are eligible for eviction 651286574Smav * (e.g. have no outstanding holds on the buffer). 652286574Smav */ 653286574Smav kstat_named_t arcstat_mru_evictable_metadata; 654286574Smav /* 655286574Smav * Total number of bytes that *would have been* consumed by ARC 656286574Smav * buffers in the arc_mru_ghost state. The key thing to note 657286574Smav * here, is the fact that this size doesn't actually indicate 658286574Smav * RAM consumption. The ghost lists only consist of headers and 659286574Smav * don't actually have ARC buffers linked off of these headers. 660286574Smav * Thus, *if* the headers had associated ARC buffers, these 661286574Smav * buffers *would have* consumed this number of bytes. 662286574Smav */ 663286574Smav kstat_named_t arcstat_mru_ghost_size; 664286574Smav /* 665286574Smav * Number of bytes that *would have been* consumed by ARC 666286574Smav * buffers that are eligible for eviction, of type 667286574Smav * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 
668286574Smav */ 669286574Smav kstat_named_t arcstat_mru_ghost_evictable_data; 670286574Smav /* 671286574Smav * Number of bytes that *would have been* consumed by ARC 672286574Smav * buffers that are eligible for eviction, of type 673286574Smav * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 674286574Smav */ 675286574Smav kstat_named_t arcstat_mru_ghost_evictable_metadata; 676286574Smav /* 677286574Smav * Total number of bytes consumed by ARC buffers residing in the 678286574Smav * arc_mfu state. This includes *all* buffers in the arc_mfu 679286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 680286574Smav * are all included in this value. 681286574Smav */ 682286574Smav kstat_named_t arcstat_mfu_size; 683286574Smav /* 684286574Smav * Number of bytes consumed by ARC buffers that are eligible for 685286574Smav * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 686286574Smav * state. 687286574Smav */ 688286574Smav kstat_named_t arcstat_mfu_evictable_data; 689286574Smav /* 690286574Smav * Number of bytes consumed by ARC buffers that are eligible for 691286574Smav * eviction, of type ARC_BUFC_METADATA, and reside in the 692286574Smav * arc_mfu state. 693286574Smav */ 694286574Smav kstat_named_t arcstat_mfu_evictable_metadata; 695286574Smav /* 696286574Smav * Total number of bytes that *would have been* consumed by ARC 697286574Smav * buffers in the arc_mfu_ghost state. See the comment above 698286574Smav * arcstat_mru_ghost_size for more details. 699286574Smav */ 700286574Smav kstat_named_t arcstat_mfu_ghost_size; 701286574Smav /* 702286574Smav * Number of bytes that *would have been* consumed by ARC 703286574Smav * buffers that are eligible for eviction, of type 704286574Smav * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 
705286574Smav */ 706286574Smav kstat_named_t arcstat_mfu_ghost_evictable_data; 707286574Smav /* 708286574Smav * Number of bytes that *would have been* consumed by ARC 709286574Smav * buffers that are eligible for eviction, of type 710286574Smav * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state. 711286574Smav */ 712286574Smav kstat_named_t arcstat_mfu_ghost_evictable_metadata; 713185029Spjd kstat_named_t arcstat_l2_hits; 714185029Spjd kstat_named_t arcstat_l2_misses; 715185029Spjd kstat_named_t arcstat_l2_feeds; 716185029Spjd kstat_named_t arcstat_l2_rw_clash; 717208373Smm kstat_named_t arcstat_l2_read_bytes; 718208373Smm kstat_named_t arcstat_l2_write_bytes; 719185029Spjd kstat_named_t arcstat_l2_writes_sent; 720185029Spjd kstat_named_t arcstat_l2_writes_done; 721185029Spjd kstat_named_t arcstat_l2_writes_error; 722286763Smav kstat_named_t arcstat_l2_writes_lock_retry; 723185029Spjd kstat_named_t arcstat_l2_evict_lock_retry; 724185029Spjd kstat_named_t arcstat_l2_evict_reading; 725286570Smav kstat_named_t arcstat_l2_evict_l1cached; 726185029Spjd kstat_named_t arcstat_l2_free_on_write; 727185029Spjd kstat_named_t arcstat_l2_abort_lowmem; 728185029Spjd kstat_named_t arcstat_l2_cksum_bad; 729185029Spjd kstat_named_t arcstat_l2_io_error; 730323754Savg kstat_named_t arcstat_l2_lsize; 731323754Savg kstat_named_t arcstat_l2_psize; 732185029Spjd kstat_named_t arcstat_l2_hdr_size; 733205231Skmacy kstat_named_t arcstat_l2_write_trylock_fail; 734205231Skmacy kstat_named_t arcstat_l2_write_passed_headroom; 735205231Skmacy kstat_named_t arcstat_l2_write_spa_mismatch; 736206796Spjd kstat_named_t arcstat_l2_write_in_l2; 737205231Skmacy kstat_named_t arcstat_l2_write_hdr_io_in_progress; 738205231Skmacy kstat_named_t arcstat_l2_write_not_cacheable; 739205231Skmacy kstat_named_t arcstat_l2_write_full; 740205231Skmacy kstat_named_t arcstat_l2_write_buffer_iter; 741205231Skmacy kstat_named_t arcstat_l2_write_pios; 742205231Skmacy kstat_named_t
arcstat_l2_write_buffer_bytes_scanned; 743205231Skmacy kstat_named_t arcstat_l2_write_buffer_list_iter; 744205231Skmacy kstat_named_t arcstat_l2_write_buffer_list_null_iter; 745242845Sdelphij kstat_named_t arcstat_memory_throttle_count; 746275748Sdelphij kstat_named_t arcstat_meta_used; 747275748Sdelphij kstat_named_t arcstat_meta_limit; 748275748Sdelphij kstat_named_t arcstat_meta_max; 749275780Sdelphij kstat_named_t arcstat_meta_min; 750287702Sdelphij kstat_named_t arcstat_sync_wait_for_async; 751287702Sdelphij kstat_named_t arcstat_demand_hit_predictive_prefetch; 752168404Spjd} arc_stats_t; 753168404Spjd 754168404Spjdstatic arc_stats_t arc_stats = { 755168404Spjd { "hits", KSTAT_DATA_UINT64 }, 756168404Spjd { "misses", KSTAT_DATA_UINT64 }, 757168404Spjd { "demand_data_hits", KSTAT_DATA_UINT64 }, 758168404Spjd { "demand_data_misses", KSTAT_DATA_UINT64 }, 759168404Spjd { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 760168404Spjd { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 761168404Spjd { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 762168404Spjd { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 763168404Spjd { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 764168404Spjd { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 765168404Spjd { "mru_hits", KSTAT_DATA_UINT64 }, 766168404Spjd { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 767168404Spjd { "mfu_hits", KSTAT_DATA_UINT64 }, 768168404Spjd { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 769205231Skmacy { "allocated", KSTAT_DATA_UINT64 }, 770168404Spjd { "deleted", KSTAT_DATA_UINT64 }, 771168404Spjd { "mutex_miss", KSTAT_DATA_UINT64 }, 772168404Spjd { "evict_skip", KSTAT_DATA_UINT64 }, 773286763Smav { "evict_not_enough", KSTAT_DATA_UINT64 }, 774208373Smm { "evict_l2_cached", KSTAT_DATA_UINT64 }, 775208373Smm { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 776208373Smm { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 777286763Smav { "evict_l2_skip", KSTAT_DATA_UINT64 }, 778168404Spjd { "hash_elements", KSTAT_DATA_UINT64 }, 
779168404Spjd { "hash_elements_max", KSTAT_DATA_UINT64 }, 780168404Spjd { "hash_collisions", KSTAT_DATA_UINT64 }, 781168404Spjd { "hash_chains", KSTAT_DATA_UINT64 }, 782168404Spjd { "hash_chain_max", KSTAT_DATA_UINT64 }, 783168404Spjd { "p", KSTAT_DATA_UINT64 }, 784168404Spjd { "c", KSTAT_DATA_UINT64 }, 785168404Spjd { "c_min", KSTAT_DATA_UINT64 }, 786168404Spjd { "c_max", KSTAT_DATA_UINT64 }, 787185029Spjd { "size", KSTAT_DATA_UINT64 }, 788307265Smav { "compressed_size", KSTAT_DATA_UINT64 }, 789307265Smav { "uncompressed_size", KSTAT_DATA_UINT64 }, 790307265Smav { "overhead_size", KSTAT_DATA_UINT64 }, 791185029Spjd { "hdr_size", KSTAT_DATA_UINT64 }, 792208373Smm { "data_size", KSTAT_DATA_UINT64 }, 793286574Smav { "metadata_size", KSTAT_DATA_UINT64 }, 794208373Smm { "other_size", KSTAT_DATA_UINT64 }, 795286574Smav { "anon_size", KSTAT_DATA_UINT64 }, 796286574Smav { "anon_evictable_data", KSTAT_DATA_UINT64 }, 797286574Smav { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 798286574Smav { "mru_size", KSTAT_DATA_UINT64 }, 799286574Smav { "mru_evictable_data", KSTAT_DATA_UINT64 }, 800286574Smav { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 801286574Smav { "mru_ghost_size", KSTAT_DATA_UINT64 }, 802286574Smav { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 803286574Smav { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 804286574Smav { "mfu_size", KSTAT_DATA_UINT64 }, 805286574Smav { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 806286574Smav { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 807286574Smav { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 808286574Smav { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 809286574Smav { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 810185029Spjd { "l2_hits", KSTAT_DATA_UINT64 }, 811185029Spjd { "l2_misses", KSTAT_DATA_UINT64 }, 812185029Spjd { "l2_feeds", KSTAT_DATA_UINT64 }, 813185029Spjd { "l2_rw_clash", KSTAT_DATA_UINT64 }, 814208373Smm { "l2_read_bytes", KSTAT_DATA_UINT64 }, 815208373Smm { "l2_write_bytes", 
KSTAT_DATA_UINT64 }, 816185029Spjd { "l2_writes_sent", KSTAT_DATA_UINT64 }, 817185029Spjd { "l2_writes_done", KSTAT_DATA_UINT64 }, 818185029Spjd { "l2_writes_error", KSTAT_DATA_UINT64 }, 819286763Smav { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, 820185029Spjd { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 821185029Spjd { "l2_evict_reading", KSTAT_DATA_UINT64 }, 822286570Smav { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 823185029Spjd { "l2_free_on_write", KSTAT_DATA_UINT64 }, 824185029Spjd { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 825185029Spjd { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 826185029Spjd { "l2_io_error", KSTAT_DATA_UINT64 }, 827185029Spjd { "l2_size", KSTAT_DATA_UINT64 }, 828251478Sdelphij { "l2_asize", KSTAT_DATA_UINT64 }, 829185029Spjd { "l2_hdr_size", KSTAT_DATA_UINT64 }, 830206796Spjd { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 831206796Spjd { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 832206796Spjd { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 833206796Spjd { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 834206796Spjd { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 835206796Spjd { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 836206796Spjd { "l2_write_full", KSTAT_DATA_UINT64 }, 837206796Spjd { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 838206796Spjd { "l2_write_pios", KSTAT_DATA_UINT64 }, 839206796Spjd { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 840206796Spjd { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 841242845Sdelphij { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 842242845Sdelphij { "memory_throttle_count", KSTAT_DATA_UINT64 }, 843275748Sdelphij { "arc_meta_used", KSTAT_DATA_UINT64 }, 844275748Sdelphij { "arc_meta_limit", KSTAT_DATA_UINT64 }, 845275780Sdelphij { "arc_meta_max", KSTAT_DATA_UINT64 }, 846287702Sdelphij { "arc_meta_min", KSTAT_DATA_UINT64 }, 847287702Sdelphij { "sync_wait_for_async", KSTAT_DATA_UINT64 }, 848287702Sdelphij { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, 849168404Spjd}; 
850168404Spjd 851168404Spjd#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 852168404Spjd 853168404Spjd#define ARCSTAT_INCR(stat, val) \ 854251631Sdelphij atomic_add_64(&arc_stats.stat.value.ui64, (val)) 855168404Spjd 856206796Spjd#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 857168404Spjd#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 858168404Spjd 859168404Spjd#define ARCSTAT_MAX(stat, val) { \ 860168404Spjd uint64_t m; \ 861168404Spjd while ((val) > (m = arc_stats.stat.value.ui64) && \ 862168404Spjd (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 863168404Spjd continue; \ 864168404Spjd} 865168404Spjd 866168404Spjd#define ARCSTAT_MAXSTAT(stat) \ 867168404Spjd ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 868168404Spjd 869168404Spjd/* 870168404Spjd * We define a macro to allow ARC hits/misses to be easily broken down by 871168404Spjd * two separate conditions, giving a total of four different subtypes for 872168404Spjd * each of hits and misses (so eight statistics total). 
873168404Spjd */ 874168404Spjd#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 875168404Spjd if (cond1) { \ 876168404Spjd if (cond2) { \ 877168404Spjd ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 878168404Spjd } else { \ 879168404Spjd ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 880168404Spjd } \ 881168404Spjd } else { \ 882168404Spjd if (cond2) { \ 883168404Spjd ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 884168404Spjd } else { \ 885168404Spjd ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 886168404Spjd } \ 887168404Spjd } 888168404Spjd 889168404Spjdkstat_t *arc_ksp; 890206796Spjdstatic arc_state_t *arc_anon; 891168404Spjdstatic arc_state_t *arc_mru; 892168404Spjdstatic arc_state_t *arc_mru_ghost; 893168404Spjdstatic arc_state_t *arc_mfu; 894168404Spjdstatic arc_state_t *arc_mfu_ghost; 895185029Spjdstatic arc_state_t *arc_l2c_only; 896168404Spjd 897168404Spjd/* 898168404Spjd * There are several ARC variables that are critical to export as kstats -- 899168404Spjd * but we don't want to have to grovel around in the kstat whenever we wish to 900168404Spjd * manipulate them. For these variables, we therefore define them to be in 901168404Spjd * terms of the statistic variable. This assures that we are not introducing 902168404Spjd * the possibility of inconsistency by having shadow copies of the variables, 903168404Spjd * while still allowing the code to be readable. 
904168404Spjd */ 905168404Spjd#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 906168404Spjd#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 907168404Spjd#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 908168404Spjd#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 909168404Spjd#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 910275748Sdelphij#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 911275780Sdelphij#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 912275748Sdelphij#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 913275748Sdelphij#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 914168404Spjd 915307265Smav/* compressed size of entire arc */ 916307265Smav#define arc_compressed_size ARCSTAT(arcstat_compressed_size) 917307265Smav/* uncompressed size of entire arc */ 918307265Smav#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) 919307265Smav/* number of bytes in the arc from arc_buf_t's */ 920307265Smav#define arc_overhead_size ARCSTAT(arcstat_overhead_size) 921251478Sdelphij 922168404Spjdstatic int arc_no_grow; /* Don't try to grow cache size */ 923168404Spjdstatic uint64_t arc_tempreserve; 924209962Smmstatic uint64_t arc_loaned_bytes; 925168404Spjd 926168404Spjdtypedef struct arc_callback arc_callback_t; 927168404Spjd 928168404Spjdstruct arc_callback { 929168404Spjd void *acb_private; 930168404Spjd arc_done_func_t *acb_done; 931168404Spjd arc_buf_t *acb_buf; 932321535Smav boolean_t acb_compressed; 933168404Spjd zio_t *acb_zio_dummy; 934168404Spjd arc_callback_t *acb_next; 935168404Spjd}; 936168404Spjd 937168404Spjdtypedef struct arc_write_callback arc_write_callback_t; 938168404Spjd 939168404Spjdstruct arc_write_callback { 940168404Spjd void *awcb_private; 941168404Spjd arc_done_func_t *awcb_ready; 942304138Savg arc_done_func_t *awcb_children_ready; 943258632Savg 
arc_done_func_t *awcb_physdone; 944168404Spjd arc_done_func_t *awcb_done; 945168404Spjd arc_buf_t *awcb_buf; 946168404Spjd}; 947168404Spjd 948286570Smav/* 949286570Smav * ARC buffers are separated into multiple structs as a memory saving measure: 950286570Smav * - Common fields struct, always defined, and embedded within it: 951286570Smav * - L2-only fields, always allocated but undefined when not in L2ARC 952286570Smav * - L1-only fields, only allocated when in L1ARC 953286570Smav * 954286570Smav * Buffer in L1 Buffer only in L2 955286570Smav * +------------------------+ +------------------------+ 956286570Smav * | arc_buf_hdr_t | | arc_buf_hdr_t | 957286570Smav * | | | | 958286570Smav * | | | | 959286570Smav * | | | | 960286570Smav * +------------------------+ +------------------------+ 961286570Smav * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 962286570Smav * | (undefined if L1-only) | | | 963286570Smav * +------------------------+ +------------------------+ 964286570Smav * | l1arc_buf_hdr_t | 965286570Smav * | | 966286570Smav * | | 967286570Smav * | | 968286570Smav * | | 969286570Smav * +------------------------+ 970286570Smav * 971286570Smav * Because it's possible for the L2ARC to become extremely large, we can wind 972286570Smav * up eating a lot of memory in L2ARC buffer headers, so the size of a header 973286570Smav * is minimized by only allocating the fields necessary for an L1-cached buffer 974286570Smav * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and 975286570Smav * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple 976286570Smav * words in pointers. arc_hdr_realloc() is used to switch a header between 977286570Smav * these two allocation states. 
978286570Smav */ 979286570Smavtypedef struct l1arc_buf_hdr { 980168404Spjd kmutex_t b_freeze_lock; 981307265Smav zio_cksum_t *b_freeze_cksum; 982286570Smav#ifdef ZFS_DEBUG 983286570Smav /* 984321535Smav * Used for debugging with kmem_flags - by allocating and freeing 985286570Smav * b_thawed when the buffer is thawed, we get a record of the stack 986286570Smav * trace that thawed it. 987286570Smav */ 988219089Spjd void *b_thawed; 989286570Smav#endif 990168404Spjd 991168404Spjd arc_buf_t *b_buf; 992307265Smav uint32_t b_bufcnt; 993286570Smav /* for waiting on writes to complete */ 994168404Spjd kcondvar_t b_cv; 995307265Smav uint8_t b_byteswap; 996168404Spjd 997168404Spjd /* protected by arc state mutex */ 998168404Spjd arc_state_t *b_state; 999286763Smav multilist_node_t b_arc_node; 1000168404Spjd 1001168404Spjd /* updated atomically */ 1002168404Spjd clock_t b_arc_access; 1003168404Spjd 1004168404Spjd /* self protecting */ 1005168404Spjd refcount_t b_refcnt; 1006185029Spjd 1007286570Smav arc_callback_t *b_acb; 1008321610Smav abd_t *b_pabd; 1009286570Smav} l1arc_buf_hdr_t; 1010286570Smav 1011286570Smavtypedef struct l2arc_dev l2arc_dev_t; 1012286570Smav 1013286570Smavtypedef struct l2arc_buf_hdr { 1014286570Smav /* protected by arc_buf_hdr mutex */ 1015286570Smav l2arc_dev_t *b_dev; /* L2ARC device */ 1016286570Smav uint64_t b_daddr; /* disk address, offset byte */ 1017286570Smav 1018185029Spjd list_node_t b_l2node; 1019286570Smav} l2arc_buf_hdr_t; 1020286570Smav 1021286570Smavstruct arc_buf_hdr { 1022286570Smav /* protected by hash lock */ 1023286570Smav dva_t b_dva; 1024286570Smav uint64_t b_birth; 1025286570Smav 1026307265Smav arc_buf_contents_t b_type; 1027286570Smav arc_buf_hdr_t *b_hash_next; 1028286570Smav arc_flags_t b_flags; 1029286570Smav 1030307265Smav /* 1031307265Smav * This field stores the size of the data buffer after 1032307265Smav * compression, and is set in the arc's zio completion handlers. 
1033307265Smav * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). 1034307265Smav * 1035307265Smav * While the block pointers can store up to 32MB in their psize 1036307265Smav * field, we can only store up to 32MB minus 512B. This is due 1037307265Smav * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. 1038307265Smav * a field of zeros represents 512B in the bp). We can't use a 1039307265Smav * bias of 1 since we need to reserve a psize of zero, here, to 1040307265Smav * represent holes and embedded blocks. 1041307265Smav * 1042307265Smav * This isn't a problem in practice, since the maximum size of a 1043307265Smav * buffer is limited to 16MB, so we never need to store 32MB in 1044307265Smav * this field. Even in the upstream illumos code base, the 1045307265Smav * maximum size of a buffer is limited to 16MB. 1046307265Smav */ 1047307265Smav uint16_t b_psize; 1048286570Smav 1049307265Smav /* 1050307265Smav * This field stores the size of the data buffer before 1051307265Smav * compression, and cannot change once set. It is in units 1052307265Smav * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) 1053307265Smav */ 1054307265Smav uint16_t b_lsize; /* immutable */ 1055307265Smav uint64_t b_spa; /* immutable */ 1056307265Smav 1057286570Smav /* L2ARC fields. Undefined when not in L2ARC. */ 1058286570Smav l2arc_buf_hdr_t b_l2hdr; 1059286570Smav /* L1ARC fields. 
Undefined when in l2arc_only state */ 1060286570Smav l1arc_buf_hdr_t b_l1hdr; 1061168404Spjd}; 1062168404Spjd 1063302265Ssmh#if defined(__FreeBSD__) && defined(_KERNEL) 1064275748Sdelphijstatic int 1065275748Sdelphijsysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 1066275748Sdelphij{ 1067275748Sdelphij uint64_t val; 1068275748Sdelphij int err; 1069275748Sdelphij 1070275748Sdelphij val = arc_meta_limit; 1071275748Sdelphij err = sysctl_handle_64(oidp, &val, 0, req); 1072275748Sdelphij if (err != 0 || req->newptr == NULL) 1073275748Sdelphij return (err); 1074275748Sdelphij 1075275748Sdelphij if (val <= 0 || val > arc_c_max) 1076275748Sdelphij return (EINVAL); 1077275748Sdelphij 1078275748Sdelphij arc_meta_limit = val; 1079275748Sdelphij return (0); 1080275748Sdelphij} 1081302265Ssmh 1082302265Ssmhstatic int 1083323667Sbaptsysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) 1084323667Sbapt{ 1085323667Sbapt uint32_t val; 1086323667Sbapt int err; 1087323667Sbapt 1088323667Sbapt val = arc_no_grow_shift; 1089323667Sbapt err = sysctl_handle_32(oidp, &val, 0, req); 1090323667Sbapt if (err != 0 || req->newptr == NULL) 1091323667Sbapt return (err); 1092323667Sbapt 1093323667Sbapt if (val >= arc_shrink_shift) 1094323667Sbapt return (EINVAL); 1095323667Sbapt 1096323667Sbapt arc_no_grow_shift = val; 1097323667Sbapt return (0); 1098323667Sbapt} 1099323667Sbapt 1100323667Sbaptstatic int 1101302265Ssmhsysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) 1102302265Ssmh{ 1103302265Ssmh uint64_t val; 1104302265Ssmh int err; 1105302265Ssmh 1106302265Ssmh val = zfs_arc_max; 1107302265Ssmh err = sysctl_handle_64(oidp, &val, 0, req); 1108302265Ssmh if (err != 0 || req->newptr == NULL) 1109302265Ssmh return (err); 1110302265Ssmh 1111302382Ssmh if (zfs_arc_max == 0) { 1112302382Ssmh /* Loader tunable so blindly set */ 1113302382Ssmh zfs_arc_max = val; 1114302382Ssmh return (0); 1115302382Ssmh } 1116302382Ssmh 1117302265Ssmh if (val < arc_abs_min || val > kmem_size()) 1118302265Ssmh return 
(EINVAL); 1119302265Ssmh if (val < arc_c_min) 1120302265Ssmh return (EINVAL); 1121302265Ssmh if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) 1122302265Ssmh return (EINVAL); 1123302265Ssmh 1124302265Ssmh arc_c_max = val; 1125302265Ssmh 1126302265Ssmh arc_c = arc_c_max; 1127302265Ssmh arc_p = (arc_c >> 1); 1128302265Ssmh 1129302265Ssmh if (zfs_arc_meta_limit == 0) { 1130302265Ssmh /* limit meta-data to 1/4 of the arc capacity */ 1131302265Ssmh arc_meta_limit = arc_c_max / 4; 1132302265Ssmh } 1133302265Ssmh 1134302265Ssmh /* if kmem_flags are set, lets try to use less memory */ 1135302265Ssmh if (kmem_debugging()) 1136302265Ssmh arc_c = arc_c / 2; 1137302265Ssmh 1138302265Ssmh zfs_arc_max = arc_c; 1139302265Ssmh 1140302265Ssmh return (0); 1141302265Ssmh} 1142302265Ssmh 1143302265Ssmhstatic int 1144302265Ssmhsysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) 1145302265Ssmh{ 1146302265Ssmh uint64_t val; 1147302265Ssmh int err; 1148302265Ssmh 1149302265Ssmh val = zfs_arc_min; 1150302265Ssmh err = sysctl_handle_64(oidp, &val, 0, req); 1151302265Ssmh if (err != 0 || req->newptr == NULL) 1152302265Ssmh return (err); 1153302265Ssmh 1154302382Ssmh if (zfs_arc_min == 0) { 1155302382Ssmh /* Loader tunable so blindly set */ 1156302382Ssmh zfs_arc_min = val; 1157302382Ssmh return (0); 1158302382Ssmh } 1159302382Ssmh 1160302265Ssmh if (val < arc_abs_min || val > arc_c_max) 1161302265Ssmh return (EINVAL); 1162302265Ssmh 1163302265Ssmh arc_c_min = val; 1164302265Ssmh 1165302265Ssmh if (zfs_arc_meta_min == 0) 1166302265Ssmh arc_meta_min = arc_c_min / 2; 1167302265Ssmh 1168302265Ssmh if (arc_c < arc_c_min) 1169302265Ssmh arc_c = arc_c_min; 1170302265Ssmh 1171302265Ssmh zfs_arc_min = arc_c_min; 1172302265Ssmh 1173302265Ssmh return (0); 1174302265Ssmh} 1175275748Sdelphij#endif 1176275748Sdelphij 1177168404Spjd#define GHOST_STATE(state) \ 1178185029Spjd ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 1179185029Spjd (state) == arc_l2c_only) 1180168404Spjd 
1181275811Sdelphij#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 1182275811Sdelphij#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 1183275811Sdelphij#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 1184275811Sdelphij#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 1185307265Smav#define HDR_COMPRESSION_ENABLED(hdr) \ 1186307265Smav ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) 1187286570Smav 1188275811Sdelphij#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 1189275811Sdelphij#define HDR_L2_READING(hdr) \ 1190307265Smav (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 1191307265Smav ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 1192275811Sdelphij#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 1193275811Sdelphij#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 1194275811Sdelphij#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 1195307265Smav#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) 1196168404Spjd 1197286570Smav#define HDR_ISTYPE_METADATA(hdr) \ 1198307265Smav ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 1199286570Smav#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 1200286570Smav 1201286570Smav#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 1202286570Smav#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 1203286570Smav 1204307265Smav/* For storing compression mode in b_flags */ 1205307265Smav#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) 1206307265Smav 1207307265Smav#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ 1208307265Smav HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) 1209307265Smav#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ 1210307265Smav HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); 1211307265Smav 1212307265Smav#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) 1213321535Smav#define ARC_BUF_SHARED(buf) 
((buf)->b_flags & ARC_BUF_FLAG_SHARED) 1214321535Smav#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) 1215307265Smav 1216168404Spjd/* 1217185029Spjd * Other sizes 1218185029Spjd */ 1219185029Spjd 1220286570Smav#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 1221286570Smav#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 1222185029Spjd 1223185029Spjd/* 1224168404Spjd * Hash table routines 1225168404Spjd */ 1226168404Spjd 1227205253Skmacy#define HT_LOCK_PAD CACHE_LINE_SIZE 1228168404Spjd 1229168404Spjdstruct ht_lock { 1230168404Spjd kmutex_t ht_lock; 1231168404Spjd#ifdef _KERNEL 1232168404Spjd unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 1233168404Spjd#endif 1234168404Spjd}; 1235168404Spjd 1236168404Spjd#define BUF_LOCKS 256 1237168404Spjdtypedef struct buf_hash_table { 1238168404Spjd uint64_t ht_mask; 1239168404Spjd arc_buf_hdr_t **ht_table; 1240205264Skmacy struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 1241168404Spjd} buf_hash_table_t; 1242168404Spjd 1243168404Spjdstatic buf_hash_table_t buf_hash_table; 1244168404Spjd 1245168404Spjd#define BUF_HASH_INDEX(spa, dva, birth) \ 1246168404Spjd (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 1247168404Spjd#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 1248168404Spjd#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 1249219089Spjd#define HDR_LOCK(hdr) \ 1250219089Spjd (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 1251168404Spjd 1252168404Spjduint64_t zfs_crc64_table[256]; 1253168404Spjd 1254185029Spjd/* 1255185029Spjd * Level 2 ARC 1256185029Spjd */ 1257185029Spjd 1258272707Savg#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 1259251478Sdelphij#define L2ARC_HEADROOM 2 /* num of writes */ 1260251478Sdelphij/* 1261251478Sdelphij * If we discover during ARC scan any buffers to be compressed, we boost 1262251478Sdelphij * our headroom for the next scanning 
cycle by this percentage multiple. 1263251478Sdelphij */ 1264251478Sdelphij#define L2ARC_HEADROOM_BOOST 200 1265208373Smm#define L2ARC_FEED_SECS 1 /* caching interval secs */ 1266208373Smm#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 1267185029Spjd 1268185029Spjd#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 1269185029Spjd#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 1270185029Spjd 1271251631Sdelphij/* L2ARC Performance Tunables */ 1272185029Spjduint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 1273185029Spjduint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 1274185029Spjduint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 1275251478Sdelphijuint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 1276185029Spjduint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 1277208373Smmuint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 1278219089Spjdboolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 1279208373Smmboolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 1280208373Smmboolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 1281185029Spjd 1282217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 1283205231Skmacy &l2arc_write_max, 0, "max write size"); 1284217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 1285205231Skmacy &l2arc_write_boost, 0, "extra write during warmup"); 1286217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 1287205231Skmacy &l2arc_headroom, 0, "number of dev writes"); 1288217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 1289205231Skmacy &l2arc_feed_secs, 0, "interval seconds"); 1290217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 1291208373Smm &l2arc_feed_min_ms, 0, "min interval milliseconds"); 1292205231Skmacy 1293205231SkmacySYSCTL_INT(_vfs_zfs, OID_AUTO, 
l2arc_noprefetch, CTLFLAG_RW, 1294205231Skmacy &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 1295208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 1296208373Smm &l2arc_feed_again, 0, "turbo warmup"); 1297208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 1298208373Smm &l2arc_norw, 0, "no reads during writes"); 1299205231Skmacy 1300217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 1301286770Smav &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); 1302307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, 1303307265Smav &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1304307265Smav "size of anonymous state"); 1305307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, 1306307265Smav &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1307307265Smav "size of anonymous state"); 1308205231Skmacy 1309217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1310286770Smav &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); 1311307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, 1312307265Smav &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1313307265Smav "size of metadata in mru state"); 1314307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, 1315307265Smav &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1316307265Smav "size of data in mru state"); 1317205231Skmacy 1318217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1319286770Smav &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); 1320307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, 1321307265Smav &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1322205231Skmacy "size of metadata in mru ghost state"); 1323307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, 1324307265Smav &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1325205231Skmacy "size of data in mru 
ghost state"); 1326205231Skmacy 1327217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1328286770Smav &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); 1329307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, 1330307265Smav &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1331307265Smav "size of metadata in mfu state"); 1332307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, 1333307265Smav &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1334307265Smav "size of data in mfu state"); 1335205231Skmacy 1336217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1337286770Smav &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); 1338307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, 1339307265Smav &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1340205231Skmacy "size of metadata in mfu ghost state"); 1341307265SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, 1342307265Smav &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1343205231Skmacy "size of data in mfu ghost state"); 1344205231Skmacy 1345217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1346286770Smav &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); 1347205231Skmacy 1348185029Spjd/* 1349185029Spjd * L2ARC Internals 1350185029Spjd */ 1351286570Smavstruct l2arc_dev { 1352185029Spjd vdev_t *l2ad_vdev; /* vdev */ 1353185029Spjd spa_t *l2ad_spa; /* spa */ 1354185029Spjd uint64_t l2ad_hand; /* next write location */ 1355185029Spjd uint64_t l2ad_start; /* first addr on device */ 1356185029Spjd uint64_t l2ad_end; /* last addr on device */ 1357185029Spjd boolean_t l2ad_first; /* first sweep through */ 1358208373Smm boolean_t l2ad_writing; /* currently writing */ 1359286570Smav kmutex_t l2ad_mtx; /* lock for buffer list */ 1360286570Smav list_t l2ad_buflist; /* buffer list */ 1361185029Spjd list_node_t l2ad_node; /* device list node */ 
1362286598Smav refcount_t l2ad_alloc; /* allocated bytes */ 1363286570Smav}; 1364185029Spjd 1365185029Spjdstatic list_t L2ARC_dev_list; /* device list */ 1366185029Spjdstatic list_t *l2arc_dev_list; /* device list pointer */ 1367185029Spjdstatic kmutex_t l2arc_dev_mtx; /* device list mutex */ 1368185029Spjdstatic l2arc_dev_t *l2arc_dev_last; /* last device used */ 1369185029Spjdstatic list_t L2ARC_free_on_write; /* free after write buf list */ 1370185029Spjdstatic list_t *l2arc_free_on_write; /* free after write list ptr */ 1371185029Spjdstatic kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1372185029Spjdstatic uint64_t l2arc_ndev; /* number of devices */ 1373185029Spjd 1374185029Spjdtypedef struct l2arc_read_callback { 1375321535Smav arc_buf_hdr_t *l2rcb_hdr; /* read header */ 1376251478Sdelphij blkptr_t l2rcb_bp; /* original blkptr */ 1377268123Sdelphij zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1378251478Sdelphij int l2rcb_flags; /* original flags */ 1379321613Smav abd_t *l2rcb_abd; /* temporary buffer */ 1380185029Spjd} l2arc_read_callback_t; 1381185029Spjd 1382185029Spjdtypedef struct l2arc_write_callback { 1383185029Spjd l2arc_dev_t *l2wcb_dev; /* device info */ 1384185029Spjd arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1385185029Spjd} l2arc_write_callback_t; 1386185029Spjd 1387185029Spjdtypedef struct l2arc_data_free { 1388185029Spjd /* protected by l2arc_free_on_write_mtx */ 1389321610Smav abd_t *l2df_abd; 1390185029Spjd size_t l2df_size; 1391307265Smav arc_buf_contents_t l2df_type; 1392185029Spjd list_node_t l2df_list_node; 1393185029Spjd} l2arc_data_free_t; 1394185029Spjd 1395185029Spjdstatic kmutex_t l2arc_feed_thr_lock; 1396185029Spjdstatic kcondvar_t l2arc_feed_thr_cv; 1397185029Spjdstatic uint8_t l2arc_thread_exit; 1398185029Spjd 1399321610Smavstatic abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); 1400307265Smavstatic void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); 1401321610Smavstatic void 
arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); 1402321610Smavstatic void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); 1403307265Smavstatic void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); 1404321610Smavstatic void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); 1405321610Smavstatic void arc_hdr_free_pabd(arc_buf_hdr_t *); 1406321610Smavstatic void arc_hdr_alloc_pabd(arc_buf_hdr_t *); 1407275811Sdelphijstatic void arc_access(arc_buf_hdr_t *, kmutex_t *); 1408286763Smavstatic boolean_t arc_is_overflowing(); 1409275811Sdelphijstatic void arc_buf_watch(arc_buf_t *); 1410275811Sdelphij 1411286570Smavstatic arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1412286570Smavstatic uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1413307265Smavstatic inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1414307265Smavstatic inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1415286570Smav 1416275811Sdelphijstatic boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1417275811Sdelphijstatic void l2arc_read_done(zio_t *); 1418185029Spjd 1419290191Savgstatic void 1420290191Savgl2arc_trim(const arc_buf_hdr_t *hdr) 1421290191Savg{ 1422290191Savg l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1423290191Savg 1424290191Savg ASSERT(HDR_HAS_L2HDR(hdr)); 1425290191Savg ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 1426290191Savg 1427307265Smav if (HDR_GET_PSIZE(hdr) != 0) { 1428290191Savg trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 1429307265Smav HDR_GET_PSIZE(hdr), 0); 1430290191Savg } 1431290191Savg} 1432290191Savg 1433168404Spjdstatic uint64_t 1434209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1435168404Spjd{ 1436168404Spjd uint8_t *vdva = (uint8_t *)dva; 1437168404Spjd uint64_t crc = -1ULL; 1438168404Spjd int i; 1439168404Spjd 1440168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1441168404Spjd 1442168404Spjd for (i = 0; i < sizeof (dva_t); i++) 1443168404Spjd crc 
= (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1444168404Spjd 1445209962Smm crc ^= (spa>>8) ^ birth; 1446168404Spjd 1447168404Spjd return (crc); 1448168404Spjd} 1449168404Spjd 1450307265Smav#define HDR_EMPTY(hdr) \ 1451307265Smav ((hdr)->b_dva.dva_word[0] == 0 && \ 1452307265Smav (hdr)->b_dva.dva_word[1] == 0) 1453168404Spjd 1454307265Smav#define HDR_EQUAL(spa, dva, birth, hdr) \ 1455307265Smav ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1456307265Smav ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1457307265Smav ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) 1458168404Spjd 1459219089Spjdstatic void 1460219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr) 1461219089Spjd{ 1462219089Spjd hdr->b_dva.dva_word[0] = 0; 1463219089Spjd hdr->b_dva.dva_word[1] = 0; 1464219089Spjd hdr->b_birth = 0; 1465219089Spjd} 1466219089Spjd 1467168404Spjdstatic arc_buf_hdr_t * 1468268075Sdelphijbuf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1469168404Spjd{ 1470268075Sdelphij const dva_t *dva = BP_IDENTITY(bp); 1471268075Sdelphij uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1472168404Spjd uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1473168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1474275811Sdelphij arc_buf_hdr_t *hdr; 1475168404Spjd 1476168404Spjd mutex_enter(hash_lock); 1477275811Sdelphij for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1478275811Sdelphij hdr = hdr->b_hash_next) { 1479307265Smav if (HDR_EQUAL(spa, dva, birth, hdr)) { 1480168404Spjd *lockp = hash_lock; 1481275811Sdelphij return (hdr); 1482168404Spjd } 1483168404Spjd } 1484168404Spjd mutex_exit(hash_lock); 1485168404Spjd *lockp = NULL; 1486168404Spjd return (NULL); 1487168404Spjd} 1488168404Spjd 1489168404Spjd/* 1490168404Spjd * Insert an entry into the hash table. If there is already an element 1491168404Spjd * equal to elem in the hash table, then the already existing element 1492168404Spjd * will be returned and the new element will not be inserted. 
1493168404Spjd * Otherwise returns NULL. 1494286570Smav * If lockp == NULL, the caller is assumed to already hold the hash lock. 1495168404Spjd */ 1496168404Spjdstatic arc_buf_hdr_t * 1497275811Sdelphijbuf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1498168404Spjd{ 1499275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1500168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1501275811Sdelphij arc_buf_hdr_t *fhdr; 1502168404Spjd uint32_t i; 1503168404Spjd 1504275811Sdelphij ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1505275811Sdelphij ASSERT(hdr->b_birth != 0); 1506275811Sdelphij ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1507286570Smav 1508286570Smav if (lockp != NULL) { 1509286570Smav *lockp = hash_lock; 1510286570Smav mutex_enter(hash_lock); 1511286570Smav } else { 1512286570Smav ASSERT(MUTEX_HELD(hash_lock)); 1513286570Smav } 1514286570Smav 1515275811Sdelphij for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1516275811Sdelphij fhdr = fhdr->b_hash_next, i++) { 1517307265Smav if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1518275811Sdelphij return (fhdr); 1519168404Spjd } 1520168404Spjd 1521275811Sdelphij hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1522275811Sdelphij buf_hash_table.ht_table[idx] = hdr; 1523307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1524168404Spjd 1525168404Spjd /* collect some hash table performance data */ 1526168404Spjd if (i > 0) { 1527168404Spjd ARCSTAT_BUMP(arcstat_hash_collisions); 1528168404Spjd if (i == 1) 1529168404Spjd ARCSTAT_BUMP(arcstat_hash_chains); 1530168404Spjd 1531168404Spjd ARCSTAT_MAX(arcstat_hash_chain_max, i); 1532168404Spjd } 1533168404Spjd 1534168404Spjd ARCSTAT_BUMP(arcstat_hash_elements); 1535168404Spjd ARCSTAT_MAXSTAT(arcstat_hash_elements); 1536168404Spjd 1537168404Spjd return (NULL); 1538168404Spjd} 1539168404Spjd 1540168404Spjdstatic void 1541275811Sdelphijbuf_hash_remove(arc_buf_hdr_t *hdr) 1542168404Spjd{ 1543275811Sdelphij arc_buf_hdr_t *fhdr, 
**hdrp; 1544275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1545168404Spjd 1546168404Spjd ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1547275811Sdelphij ASSERT(HDR_IN_HASH_TABLE(hdr)); 1548168404Spjd 1549275811Sdelphij hdrp = &buf_hash_table.ht_table[idx]; 1550275811Sdelphij while ((fhdr = *hdrp) != hdr) { 1551307265Smav ASSERT3P(fhdr, !=, NULL); 1552275811Sdelphij hdrp = &fhdr->b_hash_next; 1553168404Spjd } 1554275811Sdelphij *hdrp = hdr->b_hash_next; 1555275811Sdelphij hdr->b_hash_next = NULL; 1556307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1557168404Spjd 1558168404Spjd /* collect some hash table performance data */ 1559168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1560168404Spjd 1561168404Spjd if (buf_hash_table.ht_table[idx] && 1562168404Spjd buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1563168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1564168404Spjd} 1565168404Spjd 1566168404Spjd/* 1567168404Spjd * Global data structures and functions for the buf kmem cache. 1568168404Spjd */ 1569286570Smavstatic kmem_cache_t *hdr_full_cache; 1570286570Smavstatic kmem_cache_t *hdr_l2only_cache; 1571168404Spjdstatic kmem_cache_t *buf_cache; 1572168404Spjd 1573168404Spjdstatic void 1574168404Spjdbuf_fini(void) 1575168404Spjd{ 1576168404Spjd int i; 1577168404Spjd 1578168404Spjd kmem_free(buf_hash_table.ht_table, 1579168404Spjd (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1580168404Spjd for (i = 0; i < BUF_LOCKS; i++) 1581168404Spjd mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1582286570Smav kmem_cache_destroy(hdr_full_cache); 1583286570Smav kmem_cache_destroy(hdr_l2only_cache); 1584168404Spjd kmem_cache_destroy(buf_cache); 1585168404Spjd} 1586168404Spjd 1587168404Spjd/* 1588168404Spjd * Constructor callback - called when the cache is empty 1589168404Spjd * and a new buf is requested. 
1590168404Spjd */ 1591168404Spjd/* ARGSUSED */ 1592168404Spjdstatic int 1593286570Smavhdr_full_cons(void *vbuf, void *unused, int kmflag) 1594168404Spjd{ 1595275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1596168404Spjd 1597286570Smav bzero(hdr, HDR_FULL_SIZE); 1598286570Smav cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1599286570Smav refcount_create(&hdr->b_l1hdr.b_refcnt); 1600286570Smav mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1601286763Smav multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1602286570Smav arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1603185029Spjd 1604168404Spjd return (0); 1605168404Spjd} 1606168404Spjd 1607185029Spjd/* ARGSUSED */ 1608185029Spjdstatic int 1609286570Smavhdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1610286570Smav{ 1611286570Smav arc_buf_hdr_t *hdr = vbuf; 1612286570Smav 1613286570Smav bzero(hdr, HDR_L2ONLY_SIZE); 1614286570Smav arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1615286570Smav 1616286570Smav return (0); 1617286570Smav} 1618286570Smav 1619286570Smav/* ARGSUSED */ 1620286570Smavstatic int 1621185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag) 1622185029Spjd{ 1623185029Spjd arc_buf_t *buf = vbuf; 1624185029Spjd 1625185029Spjd bzero(buf, sizeof (arc_buf_t)); 1626219089Spjd mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1627208373Smm arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1628208373Smm 1629185029Spjd return (0); 1630185029Spjd} 1631185029Spjd 1632168404Spjd/* 1633168404Spjd * Destructor callback - called when a cached buf is 1634168404Spjd * no longer required. 
1635168404Spjd */ 1636168404Spjd/* ARGSUSED */ 1637168404Spjdstatic void 1638286570Smavhdr_full_dest(void *vbuf, void *unused) 1639168404Spjd{ 1640275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1641168404Spjd 1642307265Smav ASSERT(HDR_EMPTY(hdr)); 1643286570Smav cv_destroy(&hdr->b_l1hdr.b_cv); 1644286570Smav refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1645286570Smav mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1646286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1647286570Smav arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1648168404Spjd} 1649168404Spjd 1650185029Spjd/* ARGSUSED */ 1651185029Spjdstatic void 1652286570Smavhdr_l2only_dest(void *vbuf, void *unused) 1653286570Smav{ 1654286570Smav arc_buf_hdr_t *hdr = vbuf; 1655286570Smav 1656307265Smav ASSERT(HDR_EMPTY(hdr)); 1657286570Smav arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1658286570Smav} 1659286570Smav 1660286570Smav/* ARGSUSED */ 1661286570Smavstatic void 1662185029Spjdbuf_dest(void *vbuf, void *unused) 1663185029Spjd{ 1664185029Spjd arc_buf_t *buf = vbuf; 1665185029Spjd 1666219089Spjd mutex_destroy(&buf->b_evict_lock); 1667208373Smm arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1668185029Spjd} 1669185029Spjd 1670168404Spjd/* 1671168404Spjd * Reclaim callback -- invoked when memory is low. 1672168404Spjd */ 1673168404Spjd/* ARGSUSED */ 1674168404Spjdstatic void 1675168404Spjdhdr_recl(void *unused) 1676168404Spjd{ 1677168404Spjd dprintf("hdr_recl called\n"); 1678168404Spjd /* 1679168404Spjd * umem calls the reclaim func when we destroy the buf cache, 1680168404Spjd * which is after we do arc_fini(). 
1681168404Spjd */ 1682168404Spjd if (!arc_dead) 1683286763Smav cv_signal(&arc_reclaim_thread_cv); 1684168404Spjd} 1685168404Spjd 1686168404Spjdstatic void 1687168404Spjdbuf_init(void) 1688168404Spjd{ 1689168404Spjd uint64_t *ct; 1690168404Spjd uint64_t hsize = 1ULL << 12; 1691168404Spjd int i, j; 1692168404Spjd 1693168404Spjd /* 1694168404Spjd * The hash table is big enough to fill all of physical memory 1695269230Sdelphij * with an average block size of zfs_arc_average_blocksize (default 8K). 1696269230Sdelphij * By default, the table will take up 1697269230Sdelphij * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 1698168404Spjd */ 1699269230Sdelphij while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1700168404Spjd hsize <<= 1; 1701168404Spjdretry: 1702168404Spjd buf_hash_table.ht_mask = hsize - 1; 1703168404Spjd buf_hash_table.ht_table = 1704168404Spjd kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1705168404Spjd if (buf_hash_table.ht_table == NULL) { 1706168404Spjd ASSERT(hsize > (1ULL << 8)); 1707168404Spjd hsize >>= 1; 1708168404Spjd goto retry; 1709168404Spjd } 1710168404Spjd 1711286570Smav hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1712286570Smav 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1713286570Smav hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1714286570Smav HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1715286570Smav NULL, NULL, 0); 1716168404Spjd buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1717185029Spjd 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1718168404Spjd 1719168404Spjd for (i = 0; i < 256; i++) 1720168404Spjd for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1721168404Spjd *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1722168404Spjd 1723168404Spjd for (i = 0; i < BUF_LOCKS; i++) { 1724168404Spjd mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1725168404Spjd NULL, MUTEX_DEFAULT, NULL); 
1726168404Spjd } 1727168404Spjd} 1728168404Spjd 1729321535Smav/* 1730321535Smav * This is the size that the buf occupies in memory. If the buf is compressed, 1731321535Smav * it will correspond to the compressed size. You should use this method of 1732321535Smav * getting the buf size unless you explicitly need the logical size. 1733321535Smav */ 1734321535Smavint32_t 1735321535Smavarc_buf_size(arc_buf_t *buf) 1736321535Smav{ 1737321535Smav return (ARC_BUF_COMPRESSED(buf) ? 1738321535Smav HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); 1739321535Smav} 1740321535Smav 1741321535Smavint32_t 1742321535Smavarc_buf_lsize(arc_buf_t *buf) 1743321535Smav{ 1744321535Smav return (HDR_GET_LSIZE(buf->b_hdr)); 1745321535Smav} 1746321535Smav 1747321535Smavenum zio_compress 1748321535Smavarc_get_compression(arc_buf_t *buf) 1749321535Smav{ 1750321535Smav return (ARC_BUF_COMPRESSED(buf) ? 1751321535Smav HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); 1752321535Smav} 1753321535Smav 1754307265Smav#define ARC_MINTIME (hz>>4) /* 62 ms */ 1755307265Smav 1756307265Smavstatic inline boolean_t 1757307265Smavarc_buf_is_shared(arc_buf_t *buf) 1758286570Smav{ 1759307265Smav boolean_t shared = (buf->b_data != NULL && 1760321610Smav buf->b_hdr->b_l1hdr.b_pabd != NULL && 1761321610Smav abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && 1762321610Smav buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); 1763307265Smav IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); 1764321535Smav IMPLY(shared, ARC_BUF_SHARED(buf)); 1765321535Smav IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); 1766321535Smav 1767321535Smav /* 1768321535Smav * It would be nice to assert arc_can_share() too, but the "hdr isn't 1769321535Smav * already being shared" requirement prevents us from doing that. 1770321535Smav */ 1771321535Smav 1772307265Smav return (shared); 1773307265Smav} 1774286570Smav 1775321535Smav/* 1776321535Smav * Free the checksum associated with this header. 
If there is no checksum, this 1777321535Smav * is a no-op. 1778321535Smav */ 1779307265Smavstatic inline void 1780307265Smavarc_cksum_free(arc_buf_hdr_t *hdr) 1781307265Smav{ 1782307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1783307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1784307265Smav if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1785307265Smav kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); 1786307265Smav hdr->b_l1hdr.b_freeze_cksum = NULL; 1787286570Smav } 1788307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1789286570Smav} 1790286570Smav 1791321535Smav/* 1792321535Smav * Return true iff at least one of the bufs on hdr is not compressed. 1793321535Smav */ 1794321535Smavstatic boolean_t 1795321535Smavarc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) 1796321535Smav{ 1797321535Smav for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { 1798321535Smav if (!ARC_BUF_COMPRESSED(b)) { 1799321535Smav return (B_TRUE); 1800321535Smav } 1801321535Smav } 1802321535Smav return (B_FALSE); 1803321535Smav} 1804321535Smav 1805321535Smav/* 1806321535Smav * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data 1807321535Smav * matches the checksum that is stored in the hdr. If there is no checksum, 1808321535Smav * or if the buf is compressed, this is a no-op. 
1809321535Smav */ 1810168404Spjdstatic void 1811168404Spjdarc_cksum_verify(arc_buf_t *buf) 1812168404Spjd{ 1813307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 1814168404Spjd zio_cksum_t zc; 1815168404Spjd 1816168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1817168404Spjd return; 1818168404Spjd 1819321535Smav if (ARC_BUF_COMPRESSED(buf)) { 1820321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 1821321535Smav arc_hdr_has_uncompressed_buf(hdr)); 1822321535Smav return; 1823321535Smav } 1824321535Smav 1825307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1826307265Smav 1827307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1828307265Smav if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { 1829307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1830168404Spjd return; 1831168404Spjd } 1832321535Smav 1833321535Smav fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); 1834307265Smav if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) 1835168404Spjd panic("buffer modified while frozen!"); 1836307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1837168404Spjd} 1838168404Spjd 1839307265Smavstatic boolean_t 1840307265Smavarc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) 1841185029Spjd{ 1842307265Smav enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); 1843307265Smav boolean_t valid_cksum; 1844185029Spjd 1845307265Smav ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); 1846307265Smav VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); 1847185029Spjd 1848307265Smav /* 1849307265Smav * We rely on the blkptr's checksum to determine if the block 1850307265Smav * is valid or not. When compressed arc is enabled, the l2arc 1851307265Smav * writes the block to the l2arc just as it appears in the pool. 1852307265Smav * This allows us to use the blkptr's checksum to validate the 1853307265Smav * data that we just read off of the l2arc without having to store 1854307265Smav * a separate checksum in the arc_buf_hdr_t. 
However, if compressed 1855307265Smav * arc is disabled, then the data written to the l2arc is always 1856307265Smav * uncompressed and won't match the block as it exists in the main 1857307265Smav * pool. When this is the case, we must first compress it if it is 1858307265Smav * compressed on the main pool before we can validate the checksum. 1859307265Smav */ 1860307265Smav if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { 1861307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 1862307265Smav uint64_t lsize = HDR_GET_LSIZE(hdr); 1863307265Smav uint64_t csize; 1864307265Smav 1865329490Smav abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); 1866329490Smav csize = zio_compress_data(compress, zio->io_abd, 1867329490Smav abd_to_buf(cdata), lsize); 1868321610Smav 1869307265Smav ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); 1870307265Smav if (csize < HDR_GET_PSIZE(hdr)) { 1871307265Smav /* 1872307265Smav * Compressed blocks are always a multiple of the 1873307265Smav * smallest ashift in the pool. Ideally, we would 1874307265Smav * like to round up the csize to the next 1875307265Smav * spa_min_ashift but that value may have changed 1876307265Smav * since the block was last written. Instead, 1877307265Smav * we rely on the fact that the hdr's psize 1878307265Smav * was set to the psize of the block when it was 1879307265Smav * last written. We set the csize to that value 1880307265Smav * and zero out any part that should not contain 1881307265Smav * data. 1882307265Smav */ 1883329490Smav abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); 1884307265Smav csize = HDR_GET_PSIZE(hdr); 1885307265Smav } 1886329490Smav zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); 1887307265Smav } 1888307265Smav 1889307265Smav /* 1890307265Smav * Block pointers always store the checksum for the logical data. 
1891307265Smav * If the block pointer has the gang bit set, then the checksum 1892307265Smav * it represents is for the reconstituted data and not for an 1893307265Smav * individual gang member. The zio pipeline, however, must be able to 1894307265Smav * determine the checksum of each of the gang constituents so it 1895307265Smav * treats the checksum comparison differently than what we need 1896307265Smav * for l2arc blocks. This prevents us from using the 1897307265Smav * zio_checksum_error() interface directly. Instead we must call the 1898307265Smav * zio_checksum_error_impl() so that we can ensure the checksum is 1899307265Smav * generated using the correct checksum algorithm and accounts for the 1900307265Smav * logical I/O size and not just a gang fragment. 1901307265Smav */ 1902307265Smav valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, 1903321610Smav BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, 1904307265Smav zio->io_offset, NULL) == 0); 1905307265Smav zio_pop_transforms(zio); 1906307265Smav return (valid_cksum); 1907185029Spjd} 1908185029Spjd 1909321535Smav/* 1910321535Smav * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a 1911321535Smav * checksum and attaches it to the buf's hdr so that we can ensure that the buf 1912321535Smav * isn't modified later on. If buf is compressed or there is already a checksum 1913321535Smav * on the hdr, this is a no-op (we only checksum uncompressed bufs). 
1914321535Smav */ 1915168404Spjdstatic void 1916307265Smavarc_cksum_compute(arc_buf_t *buf) 1917168404Spjd{ 1918307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 1919307265Smav 1920307265Smav if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1921168404Spjd return; 1922168404Spjd 1923307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1924321535Smav 1925286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1926307265Smav if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1927321535Smav ASSERT(arc_hdr_has_uncompressed_buf(hdr)); 1928307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1929168404Spjd return; 1930321535Smav } else if (ARC_BUF_COMPRESSED(buf)) { 1931321535Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1932321535Smav return; 1933168404Spjd } 1934321535Smav 1935321535Smav ASSERT(!ARC_BUF_COMPRESSED(buf)); 1936307265Smav hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), 1937307265Smav KM_SLEEP); 1938321535Smav fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, 1939307265Smav hdr->b_l1hdr.b_freeze_cksum); 1940307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1941240133Smm#ifdef illumos 1942240133Smm arc_buf_watch(buf); 1943277300Ssmh#endif 1944168404Spjd} 1945168404Spjd 1946240133Smm#ifdef illumos 1947240133Smm#ifndef _KERNEL 1948240133Smmtypedef struct procctl { 1949240133Smm long cmd; 1950240133Smm prwatch_t prwatch; 1951240133Smm} procctl_t; 1952240133Smm#endif 1953240133Smm 1954240133Smm/* ARGSUSED */ 1955240133Smmstatic void 1956240133Smmarc_buf_unwatch(arc_buf_t *buf) 1957240133Smm{ 1958240133Smm#ifndef _KERNEL 1959240133Smm if (arc_watch) { 1960240133Smm int result; 1961240133Smm procctl_t ctl; 1962240133Smm ctl.cmd = PCWATCH; 1963240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1964240133Smm ctl.prwatch.pr_size = 0; 1965240133Smm ctl.prwatch.pr_wflags = 0; 1966240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1967240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1968240133Smm } 1969240133Smm#endif 1970240133Smm} 1971240133Smm 1972240133Smm/* ARGSUSED */ 
1973240133Smmstatic void 1974240133Smmarc_buf_watch(arc_buf_t *buf) 1975240133Smm{ 1976240133Smm#ifndef _KERNEL 1977240133Smm if (arc_watch) { 1978240133Smm int result; 1979240133Smm procctl_t ctl; 1980240133Smm ctl.cmd = PCWATCH; 1981240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1982321535Smav ctl.prwatch.pr_size = arc_buf_size(buf); 1983240133Smm ctl.prwatch.pr_wflags = WA_WRITE; 1984240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1985240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1986240133Smm } 1987240133Smm#endif 1988240133Smm} 1989240133Smm#endif /* illumos */ 1990240133Smm 1991286570Smavstatic arc_buf_contents_t 1992286570Smavarc_buf_type(arc_buf_hdr_t *hdr) 1993286570Smav{ 1994307265Smav arc_buf_contents_t type; 1995286570Smav if (HDR_ISTYPE_METADATA(hdr)) { 1996307265Smav type = ARC_BUFC_METADATA; 1997286570Smav } else { 1998307265Smav type = ARC_BUFC_DATA; 1999286570Smav } 2000307265Smav VERIFY3U(hdr->b_type, ==, type); 2001307265Smav return (type); 2002286570Smav} 2003286570Smav 2004321535Smavboolean_t 2005321535Smavarc_is_metadata(arc_buf_t *buf) 2006321535Smav{ 2007321535Smav return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); 2008321535Smav} 2009321535Smav 2010286570Smavstatic uint32_t 2011286570Smavarc_bufc_to_flags(arc_buf_contents_t type) 2012286570Smav{ 2013286570Smav switch (type) { 2014286570Smav case ARC_BUFC_DATA: 2015286570Smav /* metadata field is 0 if buffer contains normal data */ 2016286570Smav return (0); 2017286570Smav case ARC_BUFC_METADATA: 2018286570Smav return (ARC_FLAG_BUFC_METADATA); 2019286570Smav default: 2020286570Smav break; 2021286570Smav } 2022286570Smav panic("undefined ARC buffer type!"); 2023286570Smav return ((uint32_t)-1); 2024286570Smav} 2025286570Smav 2026168404Spjdvoid 2027168404Spjdarc_buf_thaw(arc_buf_t *buf) 2028168404Spjd{ 2029307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2030307265Smav 2031321535Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2032321535Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 
2033321535Smav 2034321535Smav arc_cksum_verify(buf); 2035321535Smav 2036321535Smav /* 2037321535Smav * Compressed buffers do not manipulate the b_freeze_cksum or 2038321535Smav * allocate b_thawed. 2039321535Smav */ 2040321535Smav if (ARC_BUF_COMPRESSED(buf)) { 2041321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2042321535Smav arc_hdr_has_uncompressed_buf(hdr)); 2043321535Smav return; 2044185029Spjd } 2045168404Spjd 2046307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2047307265Smav arc_cksum_free(hdr); 2048219089Spjd 2049307265Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 2050286570Smav#ifdef ZFS_DEBUG 2051219089Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 2052307265Smav if (hdr->b_l1hdr.b_thawed != NULL) 2053307265Smav kmem_free(hdr->b_l1hdr.b_thawed, 1); 2054307265Smav hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 2055219089Spjd } 2056286570Smav#endif 2057219089Spjd 2058307265Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2059240133Smm 2060240133Smm#ifdef illumos 2061240133Smm arc_buf_unwatch(buf); 2062277300Ssmh#endif 2063168404Spjd} 2064168404Spjd 2065168404Spjdvoid 2066168404Spjdarc_buf_freeze(arc_buf_t *buf) 2067168404Spjd{ 2068307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2069219089Spjd kmutex_t *hash_lock; 2070219089Spjd 2071168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 2072168404Spjd return; 2073168404Spjd 2074321535Smav if (ARC_BUF_COMPRESSED(buf)) { 2075321535Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2076321535Smav arc_hdr_has_uncompressed_buf(hdr)); 2077321535Smav return; 2078321535Smav } 2079321535Smav 2080307265Smav hash_lock = HDR_LOCK(hdr); 2081219089Spjd mutex_enter(hash_lock); 2082219089Spjd 2083307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2084307265Smav ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || 2085307265Smav hdr->b_l1hdr.b_state == arc_anon); 2086307265Smav arc_cksum_compute(buf); 2087219089Spjd mutex_exit(hash_lock); 2088168404Spjd} 2089168404Spjd 2090307265Smav/* 2091307265Smav * The arc_buf_hdr_t's b_flags should never be modified 
directly. Instead, 2092307265Smav * the following functions should be used to ensure that the flags are 2093307265Smav * updated in a thread-safe way. When manipulating the flags either 2094307265Smav * the hash_lock must be held or the hdr must be undiscoverable. This 2095307265Smav * ensures that we're not racing with any other threads when updating 2096307265Smav * the flags. 2097307265Smav */ 2098307265Smavstatic inline void 2099307265Smavarc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2100307265Smav{ 2101307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2102307265Smav hdr->b_flags |= flags; 2103307265Smav} 2104307265Smav 2105307265Smavstatic inline void 2106307265Smavarc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2107307265Smav{ 2108307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2109307265Smav hdr->b_flags &= ~flags; 2110307265Smav} 2111307265Smav 2112307265Smav/* 2113307265Smav * Setting the compression bits in the arc_buf_hdr_t's b_flags is 2114307265Smav * done in a special way since we have to clear and set bits 2115307265Smav * at the same time. Consumers that wish to set the compression bits 2116307265Smav * must use this function to ensure that the flags are updated in 2117307265Smav * thread-safe manner. 2118307265Smav */ 2119168404Spjdstatic void 2120307265Smavarc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) 2121168404Spjd{ 2122307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2123307265Smav 2124307265Smav /* 2125307265Smav * Holes and embedded blocks will always have a psize = 0 so 2126307265Smav * we ignore the compression of the blkptr and set the 2127307265Smav * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. 2128307265Smav * Holes and embedded blocks remain anonymous so we don't 2129307265Smav * want to uncompress them. Mark them as uncompressed. 
2130307265Smav */ 2131307265Smav if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { 2132307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2133307265Smav HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); 2134307265Smav ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); 2135307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 2136307265Smav } else { 2137307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2138307265Smav HDR_SET_COMPRESS(hdr, cmp); 2139307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); 2140307265Smav ASSERT(HDR_COMPRESSION_ENABLED(hdr)); 2141307265Smav } 2142307265Smav} 2143307265Smav 2144321535Smav/* 2145321535Smav * Looks for another buf on the same hdr which has the data decompressed, copies 2146321535Smav * from it, and returns true. If no such buf exists, returns false. 2147321535Smav */ 2148321535Smavstatic boolean_t 2149321535Smavarc_buf_try_copy_decompressed_data(arc_buf_t *buf) 2150321535Smav{ 2151321535Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2152321535Smav boolean_t copied = B_FALSE; 2153321535Smav 2154321535Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2155321535Smav ASSERT3P(buf->b_data, !=, NULL); 2156321535Smav ASSERT(!ARC_BUF_COMPRESSED(buf)); 2157321535Smav 2158321535Smav for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; 2159321535Smav from = from->b_next) { 2160321535Smav /* can't use our own data buffer */ 2161321535Smav if (from == buf) { 2162321535Smav continue; 2163321535Smav } 2164321535Smav 2165321535Smav if (!ARC_BUF_COMPRESSED(from)) { 2166321535Smav bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); 2167321535Smav copied = B_TRUE; 2168321535Smav break; 2169321535Smav } 2170321535Smav } 2171321535Smav 2172321535Smav /* 2173321535Smav * There were no decompressed bufs, so there should not be a 2174321535Smav * checksum on the hdr either. 
2175321535Smav */ 2176321535Smav EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); 2177321535Smav 2178321535Smav return (copied); 2179321535Smav} 2180321535Smav 2181321535Smav/* 2182321535Smav * Given a buf that has a data buffer attached to it, this function will 2183321535Smav * efficiently fill the buf with data of the specified compression setting from 2184321535Smav * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr 2185321535Smav * are already sharing a data buf, no copy is performed. 2186321535Smav * 2187321535Smav * If the buf is marked as compressed but uncompressed data was requested, this 2188321535Smav * will allocate a new data buffer for the buf, remove that flag, and fill the 2189321535Smav * buf with uncompressed data. You can't request a compressed buf on a hdr with 2190321535Smav * uncompressed data, and (since we haven't added support for it yet) if you 2191321535Smav * want compressed data your buf must already be marked as compressed and have 2192321535Smav * the correct-sized data buffer. 
2193321535Smav */ 2194307265Smavstatic int 2195321535Smavarc_buf_fill(arc_buf_t *buf, boolean_t compressed) 2196307265Smav{ 2197307265Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2198321535Smav boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 2199307265Smav dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; 2200307265Smav 2201321535Smav ASSERT3P(buf->b_data, !=, NULL); 2202321535Smav IMPLY(compressed, hdr_compressed); 2203321535Smav IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); 2204321535Smav 2205321535Smav if (hdr_compressed == compressed) { 2206321535Smav if (!arc_buf_is_shared(buf)) { 2207321610Smav abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, 2208321535Smav arc_buf_size(buf)); 2209321535Smav } 2210321535Smav } else { 2211321535Smav ASSERT(hdr_compressed); 2212321535Smav ASSERT(!compressed); 2213321535Smav ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); 2214321535Smav 2215307265Smav /* 2216321535Smav * If the buf is sharing its data with the hdr, unlink it and 2217321535Smav * allocate a new data buffer for the buf. 
2218307265Smav */ 2219321535Smav if (arc_buf_is_shared(buf)) { 2220321535Smav ASSERT(ARC_BUF_COMPRESSED(buf)); 2221321535Smav 2222321535Smav /* We need to give the buf it's own b_data */ 2223321535Smav buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 2224321535Smav buf->b_data = 2225321535Smav arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2226321535Smav arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 2227321535Smav 2228321535Smav /* Previously overhead was 0; just add new overhead */ 2229321535Smav ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); 2230321535Smav } else if (ARC_BUF_COMPRESSED(buf)) { 2231321535Smav /* We need to reallocate the buf's b_data */ 2232321535Smav arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), 2233321535Smav buf); 2234321535Smav buf->b_data = 2235321535Smav arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2236321535Smav 2237321535Smav /* We increased the size of b_data; update overhead */ 2238321535Smav ARCSTAT_INCR(arcstat_overhead_size, 2239321535Smav HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); 2240307265Smav } 2241321535Smav 2242321535Smav /* 2243321535Smav * Regardless of the buf's previous compression settings, it 2244321535Smav * should not be compressed at the end of this function. 2245321535Smav */ 2246321535Smav buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 2247321535Smav 2248321535Smav /* 2249321535Smav * Try copying the data from another buf which already has a 2250321535Smav * decompressed version. If that's not possible, it's time to 2251321535Smav * bite the bullet and decompress the data from the hdr. 
2252321535Smav */ 2253321535Smav if (arc_buf_try_copy_decompressed_data(buf)) { 2254321535Smav /* Skip byteswapping and checksumming (already done) */ 2255321535Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); 2256321535Smav return (0); 2257321535Smav } else { 2258321535Smav int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), 2259321610Smav hdr->b_l1hdr.b_pabd, buf->b_data, 2260321535Smav HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2261321535Smav 2262321535Smav /* 2263321535Smav * Absent hardware errors or software bugs, this should 2264321535Smav * be impossible, but log it anyway so we can debug it. 2265321535Smav */ 2266321535Smav if (error != 0) { 2267321535Smav zfs_dbgmsg( 2268321535Smav "hdr %p, compress %d, psize %d, lsize %d", 2269321535Smav hdr, HDR_GET_COMPRESS(hdr), 2270321535Smav HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2271321535Smav return (SET_ERROR(EIO)); 2272321535Smav } 2273321535Smav } 2274307265Smav } 2275321535Smav 2276321535Smav /* Byteswap the buf's data if necessary */ 2277307265Smav if (bswap != DMU_BSWAP_NUMFUNCS) { 2278307265Smav ASSERT(!HDR_SHARED_DATA(hdr)); 2279307265Smav ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); 2280307265Smav dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); 2281307265Smav } 2282321535Smav 2283321535Smav /* Compute the hdr's checksum if necessary */ 2284307265Smav arc_cksum_compute(buf); 2285321535Smav 2286307265Smav return (0); 2287307265Smav} 2288307265Smav 2289321535Smavint 2290321535Smavarc_decompress(arc_buf_t *buf) 2291321535Smav{ 2292321535Smav return (arc_buf_fill(buf, B_FALSE)); 2293321535Smav} 2294321535Smav 2295307265Smav/* 2296321610Smav * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 
2297307265Smav */ 2298307265Smavstatic uint64_t 2299307265Smavarc_hdr_size(arc_buf_hdr_t *hdr) 2300307265Smav{ 2301307265Smav uint64_t size; 2302307265Smav 2303307265Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 2304307265Smav HDR_GET_PSIZE(hdr) > 0) { 2305307265Smav size = HDR_GET_PSIZE(hdr); 2306307265Smav } else { 2307307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); 2308307265Smav size = HDR_GET_LSIZE(hdr); 2309307265Smav } 2310307265Smav return (size); 2311307265Smav} 2312307265Smav 2313307265Smav/* 2314307265Smav * Increment the amount of evictable space in the arc_state_t's refcount. 2315307265Smav * We account for the space used by the hdr and the arc buf individually 2316307265Smav * so that we can add and remove them from the refcount individually. 2317307265Smav */ 2318307265Smavstatic void 2319307265Smavarc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) 2320307265Smav{ 2321307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 2322307265Smav 2323286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2324307265Smav 2325307265Smav if (GHOST_STATE(state)) { 2326307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 2327307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2328321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2329321535Smav (void) refcount_add_many(&state->arcs_esize[type], 2330321535Smav HDR_GET_LSIZE(hdr), hdr); 2331307265Smav return; 2332307265Smav } 2333307265Smav 2334307265Smav ASSERT(!GHOST_STATE(state)); 2335321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 2336307265Smav (void) refcount_add_many(&state->arcs_esize[type], 2337307265Smav arc_hdr_size(hdr), hdr); 2338307265Smav } 2339307265Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2340307265Smav buf = buf->b_next) { 2341321535Smav if (arc_buf_is_shared(buf)) 2342307265Smav continue; 2343321535Smav (void) refcount_add_many(&state->arcs_esize[type], 2344321535Smav arc_buf_size(buf), buf); 2345307265Smav } 2346307265Smav} 2347307265Smav 2348307265Smav/* 2349307265Smav * 
Decrement the amount of evictable space in the arc_state_t's refcount. 2350307265Smav * We account for the space used by the hdr and the arc buf individually 2351307265Smav * so that we can add and remove them from the refcount individually. 2352307265Smav */ 2353307265Smavstatic void 2354321535Smavarc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) 2355307265Smav{ 2356307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 2357307265Smav 2358307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2359307265Smav 2360307265Smav if (GHOST_STATE(state)) { 2361307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 2362307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2363321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2364307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2365321535Smav HDR_GET_LSIZE(hdr), hdr); 2366307265Smav return; 2367307265Smav } 2368307265Smav 2369307265Smav ASSERT(!GHOST_STATE(state)); 2370321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 2371307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2372307265Smav arc_hdr_size(hdr), hdr); 2373307265Smav } 2374307265Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2375307265Smav buf = buf->b_next) { 2376321535Smav if (arc_buf_is_shared(buf)) 2377307265Smav continue; 2378307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2379321535Smav arc_buf_size(buf), buf); 2380307265Smav } 2381307265Smav} 2382307265Smav 2383307265Smav/* 2384307265Smav * Add a reference to this hdr indicating that someone is actively 2385307265Smav * referencing that memory. When the refcount transitions from 0 to 1, 2386307265Smav * we remove it from the respective arc_state_t list to indicate that 2387307265Smav * it is not evictable. 
2388307265Smav */ 2389307265Smavstatic void 2390307265Smavadd_reference(arc_buf_hdr_t *hdr, void *tag) 2391307265Smav{ 2392307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2393307265Smav if (!MUTEX_HELD(HDR_LOCK(hdr))) { 2394307265Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 2395307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2396307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2397307265Smav } 2398307265Smav 2399286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 2400168404Spjd 2401286570Smav if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 2402286570Smav (state != arc_anon)) { 2403286570Smav /* We don't use the L2-only state list. */ 2404286570Smav if (state != arc_l2c_only) { 2405321553Smav multilist_remove(state->arcs_list[arc_buf_type(hdr)], 2406307265Smav hdr); 2407321535Smav arc_evictable_space_decrement(hdr, state); 2408168404Spjd } 2409185029Spjd /* remove the prefetch flag if we get a reference */ 2410307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 2411168404Spjd } 2412168404Spjd} 2413168404Spjd 2414307265Smav/* 2415307265Smav * Remove a reference from this hdr. When the reference transitions from 2416307265Smav * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's 2417307265Smav * list making it eligible for eviction. 2418307265Smav */ 2419168404Spjdstatic int 2420275811Sdelphijremove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 2421168404Spjd{ 2422168404Spjd int cnt; 2423286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 2424168404Spjd 2425286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2426168404Spjd ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 2427168404Spjd ASSERT(!GHOST_STATE(state)); 2428168404Spjd 2429286570Smav /* 2430286570Smav * arc_l2c_only counts as a ghost state so we don't need to explicitly 2431286570Smav * check to prevent usage of the arc_l2c_only list. 
2432286570Smav */ 2433286570Smav if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 2434168404Spjd (state != arc_anon)) { 2435321553Smav multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); 2436307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 2437307265Smav arc_evictable_space_increment(hdr, state); 2438168404Spjd } 2439168404Spjd return (cnt); 2440168404Spjd} 2441168404Spjd 2442168404Spjd/* 2443286763Smav * Move the supplied buffer to the indicated state. The hash lock 2444168404Spjd * for the buffer must be held by the caller. 2445168404Spjd */ 2446168404Spjdstatic void 2447275811Sdelphijarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 2448275811Sdelphij kmutex_t *hash_lock) 2449168404Spjd{ 2450286570Smav arc_state_t *old_state; 2451286570Smav int64_t refcnt; 2452307265Smav uint32_t bufcnt; 2453307265Smav boolean_t update_old, update_new; 2454286570Smav arc_buf_contents_t buftype = arc_buf_type(hdr); 2455168404Spjd 2456286570Smav /* 2457286570Smav * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 2458286570Smav * in arc_read() when bringing a buffer out of the L2ARC. However, the 2459286570Smav * L1 hdr doesn't always exist when we change state to arc_anon before 2460286570Smav * destroying a header, in which case reallocating to add the L1 hdr is 2461286570Smav * pointless. 
2462286570Smav */ 2463286570Smav if (HDR_HAS_L1HDR(hdr)) { 2464286570Smav old_state = hdr->b_l1hdr.b_state; 2465286570Smav refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 2466307265Smav bufcnt = hdr->b_l1hdr.b_bufcnt; 2467321610Smav update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); 2468286570Smav } else { 2469286570Smav old_state = arc_l2c_only; 2470286570Smav refcnt = 0; 2471307265Smav bufcnt = 0; 2472307265Smav update_old = B_FALSE; 2473286570Smav } 2474307265Smav update_new = update_old; 2475286570Smav 2476168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 2477258632Savg ASSERT3P(new_state, !=, old_state); 2478307265Smav ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); 2479307265Smav ASSERT(old_state != arc_anon || bufcnt <= 1); 2480168404Spjd 2481168404Spjd /* 2482168404Spjd * If this buffer is evictable, transfer it from the 2483168404Spjd * old state list to the new state list. 2484168404Spjd */ 2485168404Spjd if (refcnt == 0) { 2486286570Smav if (old_state != arc_anon && old_state != arc_l2c_only) { 2487286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2488321553Smav multilist_remove(old_state->arcs_list[buftype], hdr); 2489168404Spjd 2490307265Smav if (GHOST_STATE(old_state)) { 2491307265Smav ASSERT0(bufcnt); 2492307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2493307265Smav update_old = B_TRUE; 2494168404Spjd } 2495321535Smav arc_evictable_space_decrement(hdr, old_state); 2496168404Spjd } 2497286570Smav if (new_state != arc_anon && new_state != arc_l2c_only) { 2498168404Spjd 2499286570Smav /* 2500286570Smav * An L1 header always exists here, since if we're 2501286570Smav * moving to some L1-cached state (i.e. not l2c_only or 2502286570Smav * anonymous), we realloc the header to add an L1hdr 2503286570Smav * beforehand. 
2504286570Smav */ 2505286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2506321553Smav multilist_insert(new_state->arcs_list[buftype], hdr); 2507168404Spjd 2508168404Spjd if (GHOST_STATE(new_state)) { 2509307265Smav ASSERT0(bufcnt); 2510307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2511307265Smav update_new = B_TRUE; 2512168404Spjd } 2513307265Smav arc_evictable_space_increment(hdr, new_state); 2514168404Spjd } 2515168404Spjd } 2516168404Spjd 2517307265Smav ASSERT(!HDR_EMPTY(hdr)); 2518275811Sdelphij if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 2519275811Sdelphij buf_hash_remove(hdr); 2520168404Spjd 2521286570Smav /* adjust state sizes (ignore arc_l2c_only) */ 2522286766Smav 2523307265Smav if (update_new && new_state != arc_l2c_only) { 2524286766Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2525286766Smav if (GHOST_STATE(new_state)) { 2526307265Smav ASSERT0(bufcnt); 2527286766Smav 2528286766Smav /* 2529307265Smav * When moving a header to a ghost state, we first 2530286766Smav * remove all arc buffers. Thus, we'll have a 2531307265Smav * bufcnt of zero, and no arc buffer to use for 2532286766Smav * the reference. As a result, we use the arc 2533286766Smav * header pointer for the reference. 2534286766Smav */ 2535286766Smav (void) refcount_add_many(&new_state->arcs_size, 2536307265Smav HDR_GET_LSIZE(hdr), hdr); 2537321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2538286766Smav } else { 2539307265Smav uint32_t buffers = 0; 2540286766Smav 2541286766Smav /* 2542286766Smav * Each individual buffer holds a unique reference, 2543286766Smav * thus we must remove each of these references one 2544286766Smav * at a time. 
2545286766Smav */ 2546286766Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2547286766Smav buf = buf->b_next) { 2548307265Smav ASSERT3U(bufcnt, !=, 0); 2549307265Smav buffers++; 2550307265Smav 2551307265Smav /* 2552307265Smav * When the arc_buf_t is sharing the data 2553307265Smav * block with the hdr, the owner of the 2554307265Smav * reference belongs to the hdr. Only 2555307265Smav * add to the refcount if the arc_buf_t is 2556307265Smav * not shared. 2557307265Smav */ 2558321535Smav if (arc_buf_is_shared(buf)) 2559307265Smav continue; 2560307265Smav 2561286766Smav (void) refcount_add_many(&new_state->arcs_size, 2562321535Smav arc_buf_size(buf), buf); 2563286766Smav } 2564307265Smav ASSERT3U(bufcnt, ==, buffers); 2565307265Smav 2566321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 2567307265Smav (void) refcount_add_many(&new_state->arcs_size, 2568307265Smav arc_hdr_size(hdr), hdr); 2569307265Smav } else { 2570307265Smav ASSERT(GHOST_STATE(old_state)); 2571307265Smav } 2572286766Smav } 2573286766Smav } 2574286766Smav 2575307265Smav if (update_old && old_state != arc_l2c_only) { 2576286766Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2577286766Smav if (GHOST_STATE(old_state)) { 2578307265Smav ASSERT0(bufcnt); 2579321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2580307265Smav 2581286766Smav /* 2582286766Smav * When moving a header off of a ghost state, 2583307265Smav * the header will not contain any arc buffers. 2584307265Smav * We use the arc header pointer for the reference 2585307265Smav * which is exactly what we did when we put the 2586307265Smav * header on the ghost state. 
2587286766Smav */ 2588286766Smav 2589286766Smav (void) refcount_remove_many(&old_state->arcs_size, 2590307265Smav HDR_GET_LSIZE(hdr), hdr); 2591286766Smav } else { 2592307265Smav uint32_t buffers = 0; 2593286766Smav 2594286766Smav /* 2595286766Smav * Each individual buffer holds a unique reference, 2596286766Smav * thus we must remove each of these references one 2597286766Smav * at a time. 2598286766Smav */ 2599286766Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2600286766Smav buf = buf->b_next) { 2601321535Smav ASSERT3U(bufcnt, !=, 0); 2602307265Smav buffers++; 2603307265Smav 2604307265Smav /* 2605307265Smav * When the arc_buf_t is sharing the data 2606307265Smav * block with the hdr, the owner of the 2607307265Smav * reference belongs to the hdr. Only 2608307265Smav * add to the refcount if the arc_buf_t is 2609307265Smav * not shared. 2610307265Smav */ 2611321535Smav if (arc_buf_is_shared(buf)) 2612307265Smav continue; 2613307265Smav 2614286766Smav (void) refcount_remove_many( 2615321535Smav &old_state->arcs_size, arc_buf_size(buf), 2616307265Smav buf); 2617286766Smav } 2618307265Smav ASSERT3U(bufcnt, ==, buffers); 2619321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2620307265Smav (void) refcount_remove_many( 2621307265Smav &old_state->arcs_size, arc_hdr_size(hdr), hdr); 2622286766Smav } 2623168404Spjd } 2624286766Smav 2625286570Smav if (HDR_HAS_L1HDR(hdr)) 2626286570Smav hdr->b_l1hdr.b_state = new_state; 2627185029Spjd 2628286570Smav /* 2629286570Smav * L2 headers should never be on the L2 state list since they don't 2630286570Smav * have L1 headers allocated. 
2631286570Smav */ 2632321553Smav ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2633321553Smav multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2634168404Spjd} 2635168404Spjd 2636185029Spjdvoid 2637208373Smmarc_space_consume(uint64_t space, arc_space_type_t type) 2638185029Spjd{ 2639208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2640208373Smm 2641208373Smm switch (type) { 2642208373Smm case ARC_SPACE_DATA: 2643208373Smm ARCSTAT_INCR(arcstat_data_size, space); 2644208373Smm break; 2645286574Smav case ARC_SPACE_META: 2646286574Smav ARCSTAT_INCR(arcstat_metadata_size, space); 2647286574Smav break; 2648208373Smm case ARC_SPACE_OTHER: 2649208373Smm ARCSTAT_INCR(arcstat_other_size, space); 2650208373Smm break; 2651208373Smm case ARC_SPACE_HDRS: 2652208373Smm ARCSTAT_INCR(arcstat_hdr_size, space); 2653208373Smm break; 2654208373Smm case ARC_SPACE_L2HDRS: 2655208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, space); 2656208373Smm break; 2657208373Smm } 2658208373Smm 2659286574Smav if (type != ARC_SPACE_DATA) 2660286574Smav ARCSTAT_INCR(arcstat_meta_used, space); 2661286574Smav 2662185029Spjd atomic_add_64(&arc_size, space); 2663185029Spjd} 2664185029Spjd 2665185029Spjdvoid 2666208373Smmarc_space_return(uint64_t space, arc_space_type_t type) 2667185029Spjd{ 2668208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2669208373Smm 2670208373Smm switch (type) { 2671208373Smm case ARC_SPACE_DATA: 2672208373Smm ARCSTAT_INCR(arcstat_data_size, -space); 2673208373Smm break; 2674286574Smav case ARC_SPACE_META: 2675286574Smav ARCSTAT_INCR(arcstat_metadata_size, -space); 2676286574Smav break; 2677208373Smm case ARC_SPACE_OTHER: 2678208373Smm ARCSTAT_INCR(arcstat_other_size, -space); 2679208373Smm break; 2680208373Smm case ARC_SPACE_HDRS: 2681208373Smm ARCSTAT_INCR(arcstat_hdr_size, -space); 2682208373Smm break; 2683208373Smm case ARC_SPACE_L2HDRS: 2684208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 2685208373Smm break; 2686208373Smm 
} 2687208373Smm 2688286574Smav if (type != ARC_SPACE_DATA) { 2689286574Smav ASSERT(arc_meta_used >= space); 2690286574Smav if (arc_meta_max < arc_meta_used) 2691286574Smav arc_meta_max = arc_meta_used; 2692286574Smav ARCSTAT_INCR(arcstat_meta_used, -space); 2693286574Smav } 2694286574Smav 2695185029Spjd ASSERT(arc_size >= space); 2696185029Spjd atomic_add_64(&arc_size, -space); 2697185029Spjd} 2698185029Spjd 2699307265Smav/* 2700321535Smav * Given a hdr and a buf, returns whether that buf can share its b_data buffer 2701321610Smav * with the hdr's b_pabd. 2702307265Smav */ 2703321535Smavstatic boolean_t 2704321535Smavarc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2705168404Spjd{ 2706321535Smav /* 2707321535Smav * The criteria for sharing a hdr's data are: 2708321535Smav * 1. the hdr's compression matches the buf's compression 2709321535Smav * 2. the hdr doesn't need to be byteswapped 2710321535Smav * 3. the hdr isn't already being shared 2711321535Smav * 4. the buf is either compressed or it is the last buf in the hdr list 2712321535Smav * 2713321535Smav * Criterion #4 maintains the invariant that shared uncompressed 2714321535Smav * bufs must be the final buf in the hdr's b_buf list. Reading this, you 2715321535Smav * might ask, "if a compressed buf is allocated first, won't that be the 2716321535Smav * last thing in the list?", but in that case it's impossible to create 2717321535Smav * a shared uncompressed buf anyway (because the hdr must be compressed 2718321535Smav * to have the compressed buf). You might also think that #3 is 2719321535Smav * sufficient to make this guarantee, however it's possible 2720321535Smav * (specifically in the rare L2ARC write race mentioned in 2721321535Smav * arc_buf_alloc_impl()) there will be an existing uncompressed buf that 2722321535Smav * is sharable, but wasn't at the time of its allocation. 
Rather than 2723321535Smav * allow a new shared uncompressed buf to be created and then shuffle 2724321535Smav * the list around to make it the last element, this simply disallows 2725321535Smav * sharing if the new buf isn't the first to be added. 2726321535Smav */ 2727321535Smav ASSERT3P(buf->b_hdr, ==, hdr); 2728321535Smav boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; 2729321535Smav boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; 2730321535Smav return (buf_compressed == hdr_compressed && 2731321535Smav hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && 2732321535Smav !HDR_SHARED_DATA(hdr) && 2733321535Smav (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); 2734321535Smav} 2735321535Smav 2736321535Smav/* 2737321535Smav * Allocate a buf for this hdr. If you care about the data that's in the hdr, 2738321535Smav * or if you want a compressed buffer, pass those flags in. Returns 0 if the 2739321535Smav * copy was made successfully, or an error code otherwise. 2740321535Smav */ 2741321535Smavstatic int 2742321535Smavarc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, 2743321535Smav boolean_t fill, arc_buf_t **ret) 2744321535Smav{ 2745168404Spjd arc_buf_t *buf; 2746168404Spjd 2747307265Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2748307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 2749307265Smav VERIFY(hdr->b_type == ARC_BUFC_DATA || 2750307265Smav hdr->b_type == ARC_BUFC_METADATA); 2751321535Smav ASSERT3P(ret, !=, NULL); 2752321535Smav ASSERT3P(*ret, ==, NULL); 2753286570Smav 2754321535Smav buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2755168404Spjd buf->b_hdr = hdr; 2756168404Spjd buf->b_data = NULL; 2757321535Smav buf->b_next = hdr->b_l1hdr.b_buf; 2758321535Smav buf->b_flags = 0; 2759286570Smav 2760307265Smav add_reference(hdr, tag); 2761286570Smav 2762307265Smav /* 2763307265Smav * We're about to change the hdr's b_flags. We must either 2764307265Smav * hold the hash_lock or be undiscoverable. 
2765307265Smav */ 2766307265Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2767307265Smav 2768307265Smav /* 2769321535Smav * Only honor requests for compressed bufs if the hdr is actually 2770321535Smav * compressed. 2771307265Smav */ 2772321535Smav if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 2773321535Smav buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; 2774321535Smav 2775321535Smav /* 2776321535Smav * If the hdr's data can be shared then we share the data buffer and 2777321535Smav * set the appropriate bit in the hdr's b_flags to indicate the hdr is 2778321610Smav * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new 2779321535Smav * buffer to store the buf's data. 2780321535Smav * 2781321610Smav * There are two additional restrictions here because we're sharing 2782321610Smav * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be 2783321610Smav * actively involved in an L2ARC write, because if this buf is used by 2784321610Smav * an arc_write() then the hdr's data buffer will be released when the 2785321535Smav * write completes, even though the L2ARC write might still be using it. 2786321610Smav * Second, the hdr's ABD must be linear so that the buf's user doesn't 2787321610Smav * need to be ABD-aware. 
2788321535Smav */ 2789321610Smav boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && 2790321610Smav abd_is_linear(hdr->b_l1hdr.b_pabd); 2791321535Smav 2792321535Smav /* Set up b_data and sharing */ 2793321535Smav if (can_share) { 2794321610Smav buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); 2795321535Smav buf->b_flags |= ARC_BUF_FLAG_SHARED; 2796307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 2797307265Smav } else { 2798321535Smav buf->b_data = 2799321535Smav arc_get_data_buf(hdr, arc_buf_size(buf), buf); 2800321535Smav ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 2801307265Smav } 2802307265Smav VERIFY3P(buf->b_data, !=, NULL); 2803307265Smav 2804286570Smav hdr->b_l1hdr.b_buf = buf; 2805307265Smav hdr->b_l1hdr.b_bufcnt += 1; 2806286570Smav 2807321535Smav /* 2808321535Smav * If the user wants the data from the hdr, we need to either copy or 2809321535Smav * decompress the data. 2810321535Smav */ 2811321535Smav if (fill) { 2812321535Smav return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); 2813321535Smav } 2814321535Smav 2815321535Smav return (0); 2816307265Smav} 2817168404Spjd 2818321535Smavstatic char *arc_onloan_tag = "onloan"; 2819321535Smav 2820321535Smavstatic inline void 2821321535Smavarc_loaned_bytes_update(int64_t delta) 2822307265Smav{ 2823321535Smav atomic_add_64(&arc_loaned_bytes, delta); 2824307265Smav 2825321535Smav /* assert that it did not wrap around */ 2826321535Smav ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 2827168404Spjd} 2828168404Spjd 2829209962Smm/* 2830209962Smm * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2831209962Smm * flight data by arc_tempreserve_space() until they are "returned". Loaned 2832209962Smm * buffers must be returned to the arc before they can be used by the DMU or 2833209962Smm * freed. 
2834209962Smm */ 2835209962Smmarc_buf_t * 2836321535Smavarc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) 2837209962Smm{ 2838321535Smav arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, 2839321535Smav is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); 2840209962Smm 2841321535Smav arc_loaned_bytes_update(size); 2842209962Smm 2843209962Smm return (buf); 2844209962Smm} 2845209962Smm 2846321535Smavarc_buf_t * 2847321535Smavarc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, 2848321535Smav enum zio_compress compression_type) 2849321535Smav{ 2850321535Smav arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, 2851321535Smav psize, lsize, compression_type); 2852321535Smav 2853321535Smav arc_loaned_bytes_update(psize); 2854321535Smav 2855321535Smav return (buf); 2856321535Smav} 2857321535Smav 2858321535Smav 2859209962Smm/* 2860209962Smm * Return a loaned arc buffer to the arc. 2861209962Smm */ 2862209962Smmvoid 2863209962Smmarc_return_buf(arc_buf_t *buf, void *tag) 2864209962Smm{ 2865209962Smm arc_buf_hdr_t *hdr = buf->b_hdr; 2866209962Smm 2867307265Smav ASSERT3P(buf->b_data, !=, NULL); 2868286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2869286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2870286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2871209962Smm 2872321535Smav arc_loaned_bytes_update(-arc_buf_size(buf)); 2873209962Smm} 2874209962Smm 2875219089Spjd/* Detach an arc_buf from a dbuf (tag) */ 2876219089Spjdvoid 2877219089Spjdarc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2878219089Spjd{ 2879286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2880219089Spjd 2881307265Smav ASSERT3P(buf->b_data, !=, NULL); 2882286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2883286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2884286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2885219089Spjd 2886321535Smav arc_loaned_bytes_update(arc_buf_size(buf)); 2887219089Spjd} 2888219089Spjd 
2889274172Savgstatic void 2890321610Smavl2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) 2891274172Savg{ 2892307265Smav l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); 2893274172Savg 2894321610Smav df->l2df_abd = abd; 2895274172Savg df->l2df_size = size; 2896307265Smav df->l2df_type = type; 2897274172Savg mutex_enter(&l2arc_free_on_write_mtx); 2898274172Savg list_insert_head(l2arc_free_on_write, df); 2899274172Savg mutex_exit(&l2arc_free_on_write_mtx); 2900274172Savg} 2901274172Savg 2902168404Spjdstatic void 2903307265Smavarc_hdr_free_on_write(arc_buf_hdr_t *hdr) 2904185029Spjd{ 2905307265Smav arc_state_t *state = hdr->b_l1hdr.b_state; 2906307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 2907307265Smav uint64_t size = arc_hdr_size(hdr); 2908240133Smm 2909307265Smav /* protected by hash lock, if in the hash table */ 2910307265Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 2911307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2912307265Smav ASSERT(state != arc_anon && state != arc_l2c_only); 2913307265Smav 2914307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 2915307265Smav size, hdr); 2916185029Spjd } 2917307265Smav (void) refcount_remove_many(&state->arcs_size, size, hdr); 2918315834Savg if (type == ARC_BUFC_METADATA) { 2919315834Savg arc_space_return(size, ARC_SPACE_META); 2920315834Savg } else { 2921315834Savg ASSERT(type == ARC_BUFC_DATA); 2922315834Savg arc_space_return(size, ARC_SPACE_DATA); 2923315834Savg } 2924307265Smav 2925321610Smav l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); 2926185029Spjd} 2927185029Spjd 2928307265Smav/* 2929307265Smav * Share the arc_buf_t's data with the hdr. Whenever we are sharing the 2930307265Smav * data buffer, we transfer the refcount ownership to the hdr and update 2931307265Smav * the appropriate kstats. 
 */
static void
arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;

	/* The hdr must not already have a b_pabd of its own. */
	ASSERT(arc_can_share(hdr, buf));
	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	/*
	 * Start sharing the data buffer. We transfer the
	 * refcount ownership to the hdr since it always owns
	 * the refcount whenever an arc_buf_t is shared.
	 */
	refcount_transfer_ownership(&state->arcs_size, buf, hdr);
	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
	    HDR_ISTYPE_METADATA(hdr));
	/* Mark both sides of the relationship as shared. */
	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	buf->b_flags |= ARC_BUF_FLAG_SHARED;

	/*
	 * Since we've transferred ownership to the hdr we need
	 * to increment its compressed and uncompressed kstats and
	 * decrement the overhead size.
	 */
	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
}

/*
 * Stop sharing the arc_buf_t's data with the hdr. This is the inverse of
 * arc_share_buf(): the arcs_size refcount ownership moves from the hdr back
 * to the buf, the shared flags are cleared on both, the hdr's b_pabd is
 * released (abd_release_ownership_of_buf() + abd_put()) and reset to NULL,
 * and the kstat adjustments are applied with the opposite sign.
 */
static void
arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;

	ASSERT(arc_buf_is_shared(buf));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	/*
	 * We are no longer sharing this buffer so we need
	 * to transfer its ownership to the rightful owner.
	 */
	refcount_transfer_ownership(&state->arcs_size, hdr, buf);
	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
	abd_put(hdr->b_l1hdr.b_pabd);
	hdr->b_l1hdr.b_pabd = NULL;
	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;

	/*
	 * Since the buffer is no longer shared between
	 * the arc buf and the hdr, count it as overhead.
	 */
	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}

/*
 * Remove an arc_buf_t from the hdr's buf list and return the last
 * arc_buf_t on the list. If no buffers remain on the list then return
 * NULL.
 *
 * The list is walked through a pointer-to-link (bufp), so unlinking the
 * head and unlinking an interior element are handled by the same code.
 * The caller must hold the hdr's hash lock unless the hdr is empty
 * (asserted).
 */
static arc_buf_t *
arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
	arc_buf_t *lastbuf = NULL;

	/*
	 * Remove the buf from the hdr list and locate the last
	 * remaining buffer on the list.
	 */
	while (*bufp != NULL) {
		/* Unlink buf by pointing the current link at its successor. */
		if (*bufp == buf)
			*bufp = buf->b_next;

		/*
		 * If we've removed a buffer in the middle of
		 * the list then update the lastbuf and update
		 * bufp.
		 */
		if (*bufp != NULL) {
			lastbuf = *bufp;
			bufp = &(*bufp)->b_next;
		}
	}
	buf->b_next = NULL;
	ASSERT3P(lastbuf, !=, buf);
	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));

	return (lastbuf);
}

/*
 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
 * list and free it.
 *
 * If the destroyed buf was an uncompressed buf sharing its data with the
 * hdr and other bufs remain, sharing is re-established with the last
 * remaining buf. The arc_buf_t itself is returned to buf_cache.
 */
static void
arc_buf_destroy_impl(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/*
	 * Free up the data associated with the buf but only if we're not
	 * sharing this with the hdr. If we are sharing it with the hdr, the
	 * hdr is responsible for doing the free.
	 */
	if (buf->b_data != NULL) {
		/*
		 * We're about to change the hdr's b_flags. We must either
		 * hold the hash_lock or be undiscoverable.
		 */
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		arc_cksum_verify(buf);
#ifdef illumos
		arc_buf_unwatch(buf);
#endif

		if (arc_buf_is_shared(buf)) {
			/* Shared data: only drop the hdr's shared flag here. */
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
		} else {
			uint64_t size = arc_buf_size(buf);
			arc_free_data_buf(hdr, buf->b_data, size, buf);
			ARCSTAT_INCR(arcstat_overhead_size, -size);
		}
		buf->b_data = NULL;

		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
		hdr->b_l1hdr.b_bufcnt -= 1;
	}

	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);

	/*
	 * Note: this tests the buf's own flags (ARC_BUF_SHARED), which were
	 * not modified above; only the hdr's flag was cleared.
	 */
	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
		/*
		 * If the current arc_buf_t is sharing its data buffer with the
		 * hdr, then reassign the hdr's b_pabd to share it with the new
		 * buffer at the end of the list. The shared buffer is always
		 * the last one on the hdr's buffer list.
		 *
		 * There is an equivalent case for compressed bufs, but since
		 * they aren't guaranteed to be the last buf in the list and
		 * that is an exceedingly rare case, we just allow that space be
		 * wasted temporarily.
		 */
		if (lastbuf != NULL) {
			/* Only one buf can be shared at once */
			VERIFY(!arc_buf_is_shared(lastbuf));
			/* hdr is uncompressed so can't have compressed buf */
			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));

			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			arc_hdr_free_pabd(hdr);

			/*
			 * We must setup a new shared block between the
			 * last buffer and the hdr. The data would have
			 * been allocated by the arc buf so we need to transfer
			 * ownership to the hdr since it's now being shared.
			 */
			arc_share_buf(hdr, lastbuf);
		}
	} else if (HDR_SHARED_DATA(hdr)) {
		/*
		 * Uncompressed shared buffers are always at the end
		 * of the list. Compressed buffers don't have the
		 * same requirements. This makes it hard to
		 * simply assert that the lastbuf is shared so
		 * we rely on the hdr's compression flags to determine
		 * if we have a compressed, shared buffer.
		 */
		ASSERT3P(lastbuf, !=, NULL);
		ASSERT(arc_buf_is_shared(lastbuf) ||
		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
	}

	/*
	 * Free the checksum if we're removing the last uncompressed buf from
	 * this hdr.
	 */
	if (!arc_hdr_has_uncompressed_buf(hdr)) {
		arc_cksum_free(hdr);
	}

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

/*
 * Allocate the hdr's b_pabd (arc_hdr_size(hdr) bytes, obtained from
 * arc_get_data_abd()), reset b_byteswap to DMU_BSWAP_NUMFUNCS, and add the
 * hdr's sizes to the compressed/uncompressed size kstats.
 *
 * The hdr must have an L1 portion, must not be sharing its data, and must
 * not already have a b_pabd (all asserted).
 */
static void
arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
{
	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(!HDR_SHARED_DATA(hdr));

	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
	hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}

/*
 * Release the hdr's b_pabd, either immediately or (while an l2arc write of
 * this hdr is in flight) via the free-on-write list. Afterwards b_pabd is
 * NULL, b_byteswap is reset to DMU_BSWAP_NUMFUNCS, and the hdr's sizes are
 * subtracted from the compressed/uncompressed size kstats.
 */
static void
arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

	/*
	 * If the hdr is currently being written to the l2arc then
	 * we defer freeing the data by adding it to the l2arc_free_on_write
	 * list. The l2arc will free the data once it's finished
	 * writing it to the l2arc device.
	 */
	if (HDR_L2_WRITING(hdr)) {
		arc_hdr_free_on_write(hdr);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else {
		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
		    arc_hdr_size(hdr), hdr);
	}
	hdr->b_l1hdr.b_pabd = NULL;
	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;

	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
}

/*
 * Allocate and initialize a full (L1-capable) header from hdr_full_cache.
 *
 * The header records the given spa identifier, physical/logical sizes,
 * compression type and contents type (which must be ARC_BUFC_DATA or
 * ARC_BUFC_METADATA; VERIFY'd). It starts in the arc_anon state with no
 * bufs attached, and its b_pabd is allocated before returning.
 */
static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
    enum zio_compress compression_type, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;

	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);

	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
	ASSERT(HDR_EMPTY(hdr));
	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
	HDR_SET_PSIZE(hdr, psize);
	HDR_SET_LSIZE(hdr, lsize);
	hdr->b_spa = spa;
	hdr->b_type = type;
	/* Start from a clean flag word, then set type and L1 flags. */
	hdr->b_flags = 0;
	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
	arc_hdr_set_compress(hdr, compression_type);

	hdr->b_l1hdr.b_state = arc_anon;
	hdr->b_l1hdr.b_arc_access = 0;
	hdr->b_l1hdr.b_bufcnt = 0;
	hdr->b_l1hdr.b_buf = NULL;

	/*
	 * Allocate the hdr's buffer. This will contain either
	 * the compressed or uncompressed data depending on the block
	 * it references and compressed arc enablement.
	 */
	arc_hdr_alloc_pabd(hdr);
	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));

	return (hdr);
}

/*
 * Transition between the two allocation states for the arc_buf_hdr struct.
 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 * version is used when a cache buffer is only in the L2ARC in order to reduce
 * memory usage.
 *
 * The old header is removed from the hash table, replaced on the l2arc
 * device's buflist, and freed back to 'old'; the newly allocated header is
 * returned. The caller must hold the hdr's hash lock (asserted) and must
 * use the returned pointer in place of 'hdr' from then on.
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	/* Only the first HDR_L2ONLY_SIZE bytes are carried over. */
	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;

		/* Verify previous threads set to NULL before freeing */
		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
	} else {
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
		ASSERT0(hdr->b_l1hdr.b_bufcnt);
		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

		/*
		 * If we've reached here, we must have been called from
		 * arc_evict_hdr(), as such we should have already been
		 * removed from any ghost list we were previously on
		 * (which protects us from racing with arc_evict_state),
		 * thus no locking is needed during this check.
		 */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		/*
		 * A buffer must not be moved into the arc_l2c_only
		 * state if it's not finished being written out to the
		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
		 * might try to be accessed, even though it was removed.
		 */
		VERIFY(!HDR_L2_WRITING(hdr));
		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);

#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif

		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
	(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);

	buf_discard_identity(hdr);
	kmem_cache_free(old, hdr);

	return (nhdr);
}

/*
 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
 * The buf is returned thawed since we expect the consumer to modify it.
 */
arc_buf_t *
arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
	/* Uncompressed: physical and logical sizes are both 'size'. */
	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
	    ZIO_COMPRESS_OFF, type);
	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));

	arc_buf_t *buf = NULL;
	VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
	arc_buf_thaw(buf);

	return (buf);
}

/*
 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
 * for bufs containing metadata.
 *
 * The header is always allocated as ARC_BUFC_DATA. psize must not exceed
 * lsize, and compression_type must be a real compression function (greater
 * than ZIO_COMPRESS_OFF and below ZIO_COMPRESS_FUNCTIONS); all asserted.
 */
arc_buf_t *
arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
    enum zio_compress compression_type)
{
	ASSERT3U(lsize, >, 0);
	ASSERT3U(lsize, >=, psize);
	ASSERT(compression_type > ZIO_COMPRESS_OFF);
	ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);

	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
	    compression_type, ARC_BUFC_DATA);
	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));

	arc_buf_t *buf = NULL;
	VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
	arc_buf_thaw(buf);
	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

	if (!arc_buf_is_shared(buf)) {
		/*
		 * To ensure that the hdr has the correct data in it if we call
		 * arc_decompress() on this buf before it's been written to
		 * disk, it's easiest if we just set up sharing between the
		 * buf and the hdr.
		 */
		ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
		arc_hdr_free_pabd(hdr);
		arc_share_buf(hdr, buf);
	}

	return (buf);
}

/*
 * Tear down the L2 portion of a header: remove it from its device's
 * buflist, subtract its sizes from the l2 kstats and from the vdev's space
 * accounting, drop its l2ad_alloc refcount, and clear ARC_FLAG_HAS_L2HDR.
 *
 * The caller must hold the device's l2ad_mtx (asserted).
 */
static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
	l2arc_dev_t *dev = l2hdr->b_dev;
	uint64_t psize = arc_hdr_size(hdr);

	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
	ASSERT(HDR_HAS_L2HDR(hdr));

	list_remove(&dev->l2ad_buflist, hdr);

	ARCSTAT_INCR(arcstat_l2_psize, -psize);
	ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));

	vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);

	(void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
}

/*
 * Destroy a header and everything still attached to it: its L2 portion
 * (if any), its checksum, every remaining arc_buf_t, and its b_pabd. The
 * header is then returned to the cache matching its allocation state.
 *
 * The header must not have I/O in progress and must not be in the hash
 * table; if it has an L1 portion it must be unreferenced and in the
 * arc_anon state (all asserted).
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
		    hdr->b_l1hdr.b_bufcnt > 0);
		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
	}
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (!HDR_EMPTY(hdr))
		buf_discard_identity(hdr);

	if (HDR_HAS_L2HDR(hdr)) {
		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
		/* Only take (and later drop) l2ad_mtx if we don't hold it. */
		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);

		if (!buflist_held)
			mutex_enter(&dev->l2ad_mtx);

		/*
		 * Even though we checked this conditional above, we
		 * need to check this again now that we have the
		 * l2ad_mtx. This is because we could be racing with
		 * another thread calling l2arc_evict() which might have
		 * destroyed this header's L2 portion as we were waiting
		 * to acquire the l2ad_mtx. If that happens, we don't
		 * want to re-destroy the header's L2 portion.
		 */
		if (HDR_HAS_L2HDR(hdr)) {
			l2arc_trim(hdr);
			arc_hdr_l2hdr_destroy(hdr);
		}

		if (!buflist_held)
			mutex_exit(&dev->l2ad_mtx);
	}

	if (HDR_HAS_L1HDR(hdr)) {
		arc_cksum_free(hdr);

		/* Each call unlinks the head buf, so this drains the list. */
		while (hdr->b_l1hdr.b_buf != NULL)
			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);

#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif

		if (hdr->b_l1hdr.b_pabd != NULL) {
			arc_hdr_free_pabd(hdr);
		}
	}

	ASSERT3P(hdr->b_hash_next, ==, NULL);
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
		kmem_cache_free(hdr_full_cache, hdr);
	} else {
		kmem_cache_free(hdr_l2only_cache, hdr);
	}
}

/*
 * Drop the reference held by 'tag' and destroy the buf.
 *
 * If the buf's header is in the arc_anon state, the reference is dropped
 * without taking the hash lock and the whole header is destroyed (it is
 * expected to have exactly one buf). Otherwise the reference is dropped
 * and the buf destroyed under the header's hash lock, leaving the header
 * itself in place.
 */
void
arc_buf_destroy(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	if (hdr->b_l1hdr.b_state == arc_anon) {
		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		VERIFY0(remove_reference(hdr, NULL, tag));
		arc_hdr_destroy(hdr);
		return;
	}

	mutex_enter(hash_lock);
	/* Re-check, now under the lock, what was read before taking it. */
	ASSERT3P(hdr, ==, buf->b_hdr);
	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
	ASSERT3P(buf->b_data, !=, NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	arc_buf_destroy_impl(buf);
	mutex_exit(hash_lock);
}

/*
 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
 * state of the header is dependent on its state prior to entering this
 * function. The following transitions are possible:
 *
 *    - arc_mru -> arc_mru_ghost
 *    - arc_mfu -> arc_mfu_ghost
 *    - arc_mru_ghost -> arc_l2c_only
 *    - arc_mru_ghost -> deleted
 *    - arc_mfu_ghost -> arc_l2c_only
 *    - arc_mfu_ghost -> deleted
 *
 * Returns the number of bytes counted as evicted; zero means the header
 * was skipped. The caller must hold hash_lock (asserted). On the ghost
 * paths the header is either reallocated or destroyed, so the caller's
 * 'hdr' pointer must not be used after this function returns.
 */
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
	arc_state_t *evicted_state, *state;
	int64_t bytes_evicted = 0;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(HDR_HAS_L1HDR(hdr));

	state = hdr->b_l1hdr.b_state;
	if (GHOST_STATE(state)) {
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);

		/*
		 * l2arc_write_buffers() relies on a header's L1 portion
		 * (i.e. its b_pabd field) during its write phase.
		 * Thus, we cannot push a header onto the arc_l2c_only
		 * state (removing its L1 piece) until the header is
		 * done being written to the l2arc.
		 */
		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
			ARCSTAT_BUMP(arcstat_evict_l2_skip);
			return (bytes_evicted);
		}

		ARCSTAT_BUMP(arcstat_deleted);
		bytes_evicted += HDR_GET_LSIZE(hdr);

		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);

		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
		if (HDR_HAS_L2HDR(hdr)) {
			/*
			 * This buffer is cached on the 2nd Level ARC;
			 * don't destroy the header.
			 */
			arc_change_state(arc_l2c_only, hdr, hash_lock);
			/*
			 * dropping from L1+L2 cached to L2-only,
			 * realloc to remove the L1 header.
			 */
			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
			    hdr_l2only_cache);
		} else {
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		}
		return (bytes_evicted);
	}

	ASSERT(state == arc_mru || state == arc_mfu);
	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

	/* prefetch buffers have a minimum lifespan */
	if (HDR_IO_IN_PROGRESS(hdr) ||
	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
	    arc_min_prefetch_lifespan)) {
		ARCSTAT_BUMP(arcstat_evict_skip);
		return (bytes_evicted);
	}

	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
	while (hdr->b_l1hdr.b_buf) {
		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
		/* Stop at the first buf whose evict lock is contended. */
		if (!mutex_tryenter(&buf->b_evict_lock)) {
			ARCSTAT_BUMP(arcstat_mutex_miss);
			break;
		}
		if (buf->b_data != NULL)
			bytes_evicted += HDR_GET_LSIZE(hdr);
		mutex_exit(&buf->b_evict_lock);
		arc_buf_destroy_impl(buf);
	}

	if (HDR_HAS_L2HDR(hdr)) {
		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
	} else {
		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
			ARCSTAT_INCR(arcstat_evict_l2_eligible,
			    HDR_GET_LSIZE(hdr));
		} else {
			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
			    HDR_GET_LSIZE(hdr));
		}
	}

	/* Only move to the ghost state once every buf has been destroyed. */
	if (hdr->b_l1hdr.b_bufcnt == 0) {
		arc_cksum_free(hdr);

		bytes_evicted += arc_hdr_size(hdr);

		/*
		 * If this hdr is being evicted and has a compressed
		 * buffer then we discard it here before we change states.
		 * This ensures that the accounting is updated correctly
		 * in arc_free_data_impl().
		 */
		arc_hdr_free_pabd(hdr);

		arc_change_state(evicted_state, hdr, hash_lock);
		ASSERT(HDR_IN_HASH_TABLE(hdr));
		arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
	}

	return (bytes_evicted);
}

/*
 * Evict headers from a single sublist (index 'idx') of the multilist 'ml',
 * walking backwards from 'marker'. The walk stops when 'bytes' have been
 * evicted (unless bytes is ARC_EVICT_ALL), when zfs_arc_evict_batch_limit
 * headers have been evicted, or when the sublist is exhausted.
 *
 * If 'spa' is non-zero, only headers belonging to that spa are considered.
 * Headers whose hash lock cannot be taken without blocking are skipped.
 * Returns the number of bytes evicted.
 */
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
    uint64_t spa, int64_t bytes)
{
	multilist_sublist_t *mls;
	uint64_t bytes_evicted = 0;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	int evict_count = 0;

	ASSERT3P(marker, !=, NULL);
	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);

	mls = multilist_sublist_lock(ml, idx);

	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
	    hdr = multilist_sublist_prev(mls, marker)) {
		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
		    (evict_count >= zfs_arc_evict_batch_limit))
			break;

		/*
		 * To keep our iteration location, move the marker
		 * forward. Since we're not holding hdr's hash lock, we
		 * must be very careful and not remove 'hdr' from the
		 * sublist. Otherwise, other consumers might mistake the
		 * 'hdr' as not being on a sublist when they call the
		 * multilist_link_active() function (they all rely on
		 * the hash lock protecting concurrent insertions and
		 * removals). multilist_sublist_move_forward() was
		 * specifically implemented to ensure this is the case
		 * (only 'marker' will be removed and re-inserted).
		 */
		multilist_sublist_move_forward(mls, marker);

		/*
		 * The only case where the b_spa field should ever be
		 * zero, is the marker headers inserted by
		 * arc_evict_state(). It's possible for multiple threads
		 * to be calling arc_evict_state() concurrently (e.g.
		 * dsl_pool_close() and zio_inject_fault()), so we must
		 * skip any markers we see from these other threads.
		 */
		if (hdr->b_spa == 0)
			continue;

		/* we're only interested in evicting buffers of a certain spa */
		if (spa != 0 && hdr->b_spa != spa) {
			ARCSTAT_BUMP(arcstat_evict_skip);
			continue;
		}

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We aren't calling this function from any code path
		 * that would already be holding a hash lock, so we're
		 * asserting on this assumption to be defensive in case
		 * this ever changes. Without this check, it would be
		 * possible to incorrectly increment arcstat_mutex_miss
		 * below (e.g. if the code changed such that we called
		 * this function with a hash lock held).
		 */
		ASSERT(!MUTEX_HELD(hash_lock));

		if (mutex_tryenter(hash_lock)) {
			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
			mutex_exit(hash_lock);

			bytes_evicted += evicted;

			/*
			 * If evicted is zero, arc_evict_hdr() must have
			 * decided to skip this header, don't increment
			 * evict_count in this case.
			 */
			if (evicted != 0)
				evict_count++;

			/*
			 * If arc_size isn't overflowing, signal any
			 * threads that might happen to be waiting.
			 *
			 * For each header evicted, we wake up a single
			 * thread. If we used cv_broadcast, we could
			 * wake up "too many" threads causing arc_size
			 * to significantly overflow arc_c; since
			 * arc_get_data_impl() doesn't check for overflow
			 * when it's woken up (it doesn't because it's
			 * possible for the ARC to be overflowing while
			 * full of un-evictable buffers, and the
			 * function should proceed in this case).
			 *
			 * If threads are left sleeping, due to not
			 * using cv_broadcast, they will be woken up
			 * just before arc_reclaim_thread() sleeps.
			 */
			mutex_enter(&arc_reclaim_lock);
			if (!arc_is_overflowing())
				cv_signal(&arc_reclaim_waiters_cv);
			mutex_exit(&arc_reclaim_lock);
		} else {
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
	}

	multilist_sublist_unlock(mls);

	return (bytes_evicted);
}

/*
 * Evict buffers from the given arc state, until we've removed the
 * specified number of bytes. Move the removed buffers to the
 * appropriate evict state.
 *
 * This function makes a "best effort". It skips over any buffers
 * it can't get a hash_lock on, and so, may not catch all candidates.
 * It may also return without evicting as much space as requested.
3726286763Smav * 3727286763Smav * If bytes is specified using the special value ARC_EVICT_ALL, this 3728286763Smav * will evict all available (i.e. unlocked and evictable) buffers from 3729286763Smav * the given arc state; which is used by arc_flush(). 3730286763Smav */ 3731286763Smavstatic uint64_t 3732286763Smavarc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 3733286763Smav arc_buf_contents_t type) 3734286763Smav{ 3735286763Smav uint64_t total_evicted = 0; 3736321553Smav multilist_t *ml = state->arcs_list[type]; 3737286763Smav int num_sublists; 3738286763Smav arc_buf_hdr_t **markers; 3739168404Spjd 3740286763Smav IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3741168404Spjd 3742286763Smav num_sublists = multilist_get_num_sublists(ml); 3743286763Smav 3744185029Spjd /* 3745286763Smav * If we've tried to evict from each sublist, made some 3746286763Smav * progress, but still have not hit the target number of bytes 3747286763Smav * to evict, we want to keep trying. The markers allow us to 3748286763Smav * pick up where we left off for each individual sublist, rather 3749286763Smav * than starting from the tail each time. 3750185029Spjd */ 3751286763Smav markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 3752286763Smav for (int i = 0; i < num_sublists; i++) { 3753286763Smav markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 3754185029Spjd 3755286763Smav /* 3756286763Smav * A b_spa of 0 is used to indicate that this header is 3757286763Smav * a marker. This fact is used in arc_adjust_type() and 3758286763Smav * arc_evict_state_impl(). 
3759286763Smav */ 3760286763Smav markers[i]->b_spa = 0; 3761168404Spjd 3762286763Smav multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3763286763Smav multilist_sublist_insert_tail(mls, markers[i]); 3764286763Smav multilist_sublist_unlock(mls); 3765286763Smav } 3766168404Spjd 3767286763Smav /* 3768286763Smav * While we haven't hit our target number of bytes to evict, or 3769286763Smav * we're evicting all available buffers. 3770286763Smav */ 3771286763Smav while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 3772286763Smav /* 3773286763Smav * Start eviction using a randomly selected sublist, 3774286763Smav * this is to try and evenly balance eviction across all 3775286763Smav * sublists. Always starting at the same sublist 3776286763Smav * (e.g. index 0) would cause evictions to favor certain 3777286763Smav * sublists over others. 3778286763Smav */ 3779286763Smav int sublist_idx = multilist_get_random_index(ml); 3780286763Smav uint64_t scan_evicted = 0; 3781219089Spjd 3782286763Smav for (int i = 0; i < num_sublists; i++) { 3783286763Smav uint64_t bytes_remaining; 3784286763Smav uint64_t bytes_evicted; 3785219089Spjd 3786286763Smav if (bytes == ARC_EVICT_ALL) 3787286763Smav bytes_remaining = ARC_EVICT_ALL; 3788286763Smav else if (total_evicted < bytes) 3789286763Smav bytes_remaining = bytes - total_evicted; 3790286763Smav else 3791286763Smav break; 3792258632Savg 3793286763Smav bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 3794286763Smav markers[sublist_idx], spa, bytes_remaining); 3795286763Smav 3796286763Smav scan_evicted += bytes_evicted; 3797286763Smav total_evicted += bytes_evicted; 3798286763Smav 3799286763Smav /* we've reached the end, wrap to the beginning */ 3800286763Smav if (++sublist_idx >= num_sublists) 3801286763Smav sublist_idx = 0; 3802286763Smav } 3803286763Smav 3804258632Savg /* 3805286763Smav * If we didn't evict anything during this scan, we have 3806286763Smav * no reason to believe we'll evict more during another 
3807286763Smav * scan, so break the loop. 3808258632Savg */ 3809286763Smav if (scan_evicted == 0) { 3810286763Smav /* This isn't possible, let's make that obvious */ 3811286763Smav ASSERT3S(bytes, !=, 0); 3812185029Spjd 3813286763Smav /* 3814286763Smav * When bytes is ARC_EVICT_ALL, the only way to 3815286763Smav * break the loop is when scan_evicted is zero. 3816286763Smav * In that case, we actually have evicted enough, 3817286763Smav * so we don't want to increment the kstat. 3818286763Smav */ 3819286763Smav if (bytes != ARC_EVICT_ALL) { 3820286763Smav ASSERT3S(total_evicted, <, bytes); 3821286763Smav ARCSTAT_BUMP(arcstat_evict_not_enough); 3822185029Spjd } 3823185029Spjd 3824286763Smav break; 3825258632Savg } 3826286763Smav } 3827258632Savg 3828286763Smav for (int i = 0; i < num_sublists; i++) { 3829286763Smav multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3830286763Smav multilist_sublist_remove(mls, markers[i]); 3831286763Smav multilist_sublist_unlock(mls); 3832286763Smav 3833286763Smav kmem_cache_free(hdr_full_cache, markers[i]); 3834168404Spjd } 3835286763Smav kmem_free(markers, sizeof (*markers) * num_sublists); 3836206796Spjd 3837286763Smav return (total_evicted); 3838286763Smav} 3839286763Smav 3840286763Smav/* 3841286763Smav * Flush all "evictable" data of the given type from the arc state 3842286763Smav * specified. This will not evict any "active" buffers (i.e. referenced). 3843286763Smav * 3844307265Smav * When 'retry' is set to B_FALSE, the function will make a single pass 3845286763Smav * over the state and evict any buffers that it can. Since it doesn't 3846286763Smav * continually retry the eviction, it might end up leaving some buffers 3847286763Smav * in the ARC due to lock misses. 3848286763Smav * 3849307265Smav * When 'retry' is set to B_TRUE, the function will continually retry the 3850286763Smav * eviction until *all* evictable buffers have been removed from the 3851286763Smav * state. 
As a result, if concurrent insertions into the state are 3852286763Smav * allowed (e.g. if the ARC isn't shutting down), this function might 3853286763Smav * wind up in an infinite loop, continually trying to evict buffers. 3854286763Smav */ 3855286763Smavstatic uint64_t 3856286763Smavarc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 3857286763Smav boolean_t retry) 3858286763Smav{ 3859286763Smav uint64_t evicted = 0; 3860286763Smav 3861307265Smav while (refcount_count(&state->arcs_esize[type]) != 0) { 3862286763Smav evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 3863286763Smav 3864286763Smav if (!retry) 3865286763Smav break; 3866185029Spjd } 3867185029Spjd 3868286763Smav return (evicted); 3869286763Smav} 3870286763Smav 3871286763Smav/* 3872286763Smav * Evict the specified number of bytes from the state specified, 3873286763Smav * restricting eviction to the spa and type given. This function 3874286763Smav * prevents us from trying to evict more from a state's list than 3875286763Smav * is "evictable", and to skip evicting altogether when passed a 3876286763Smav * negative value for "bytes". In contrast, arc_evict_state() will 3877286763Smav * evict everything it can, when passed a negative value for "bytes". 3878286763Smav */ 3879286763Smavstatic uint64_t 3880286763Smavarc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 3881286763Smav arc_buf_contents_t type) 3882286763Smav{ 3883286763Smav int64_t delta; 3884286763Smav 3885307265Smav if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { 3886307265Smav delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); 3887286763Smav return (arc_evict_state(state, spa, delta, type)); 3888168404Spjd } 3889168404Spjd 3890286763Smav return (0); 3891168404Spjd} 3892168404Spjd 3893286763Smav/* 3894286763Smav * Evict metadata buffers from the cache, such that arc_meta_used is 3895286763Smav * capped by the arc_meta_limit tunable. 
3896286763Smav */ 3897286763Smavstatic uint64_t 3898286763Smavarc_adjust_meta(void) 3899286763Smav{ 3900286763Smav uint64_t total_evicted = 0; 3901286763Smav int64_t target; 3902286763Smav 3903286763Smav /* 3904286763Smav * If we're over the meta limit, we want to evict enough 3905286763Smav * metadata to get back under the meta limit. We don't want to 3906286763Smav * evict so much that we drop the MRU below arc_p, though. If 3907286763Smav * we're over the meta limit more than we're over arc_p, we 3908286763Smav * evict some from the MRU here, and some from the MFU below. 3909286763Smav */ 3910286763Smav target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3911286766Smav (int64_t)(refcount_count(&arc_anon->arcs_size) + 3912286766Smav refcount_count(&arc_mru->arcs_size) - arc_p)); 3913286763Smav 3914286763Smav total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3915286763Smav 3916286763Smav /* 3917286763Smav * Similar to the above, we want to evict enough bytes to get us 3918286763Smav * below the meta limit, but not so much as to drop us below the 3919321535Smav * space allotted to the MFU (which is defined as arc_c - arc_p). 3920286763Smav */ 3921286763Smav target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3922286766Smav (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); 3923286763Smav 3924286763Smav total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3925286763Smav 3926286763Smav return (total_evicted); 3927286763Smav} 3928286763Smav 3929286763Smav/* 3930286763Smav * Return the type of the oldest buffer in the given arc state 3931286763Smav * 3932286763Smav * This function will select a random sublist of type ARC_BUFC_DATA and 3933286763Smav * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 3934286763Smav * is compared, and the type which contains the "older" buffer will be 3935286763Smav * returned. 
3936286763Smav */ 3937286763Smavstatic arc_buf_contents_t 3938286763Smavarc_adjust_type(arc_state_t *state) 3939286763Smav{ 3940321553Smav multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; 3941321553Smav multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; 3942286763Smav int data_idx = multilist_get_random_index(data_ml); 3943286763Smav int meta_idx = multilist_get_random_index(meta_ml); 3944286763Smav multilist_sublist_t *data_mls; 3945286763Smav multilist_sublist_t *meta_mls; 3946286763Smav arc_buf_contents_t type; 3947286763Smav arc_buf_hdr_t *data_hdr; 3948286763Smav arc_buf_hdr_t *meta_hdr; 3949286763Smav 3950286763Smav /* 3951286763Smav * We keep the sublist lock until we're finished, to prevent 3952286763Smav * the headers from being destroyed via arc_evict_state(). 3953286763Smav */ 3954286763Smav data_mls = multilist_sublist_lock(data_ml, data_idx); 3955286763Smav meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 3956286763Smav 3957286763Smav /* 3958286763Smav * These two loops are to ensure we skip any markers that 3959286763Smav * might be at the tail of the lists due to arc_evict_state(). 
3960286763Smav */ 3961286763Smav 3962286763Smav for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 3963286763Smav data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 3964286763Smav if (data_hdr->b_spa != 0) 3965286763Smav break; 3966286763Smav } 3967286763Smav 3968286763Smav for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 3969286763Smav meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 3970286763Smav if (meta_hdr->b_spa != 0) 3971286763Smav break; 3972286763Smav } 3973286763Smav 3974286763Smav if (data_hdr == NULL && meta_hdr == NULL) { 3975286763Smav type = ARC_BUFC_DATA; 3976286763Smav } else if (data_hdr == NULL) { 3977286763Smav ASSERT3P(meta_hdr, !=, NULL); 3978286763Smav type = ARC_BUFC_METADATA; 3979286763Smav } else if (meta_hdr == NULL) { 3980286763Smav ASSERT3P(data_hdr, !=, NULL); 3981286763Smav type = ARC_BUFC_DATA; 3982286763Smav } else { 3983286763Smav ASSERT3P(data_hdr, !=, NULL); 3984286763Smav ASSERT3P(meta_hdr, !=, NULL); 3985286763Smav 3986286763Smav /* The headers can't be on the sublist without an L1 header */ 3987286763Smav ASSERT(HDR_HAS_L1HDR(data_hdr)); 3988286763Smav ASSERT(HDR_HAS_L1HDR(meta_hdr)); 3989286763Smav 3990286763Smav if (data_hdr->b_l1hdr.b_arc_access < 3991286763Smav meta_hdr->b_l1hdr.b_arc_access) { 3992286763Smav type = ARC_BUFC_DATA; 3993286763Smav } else { 3994286763Smav type = ARC_BUFC_METADATA; 3995286763Smav } 3996286763Smav } 3997286763Smav 3998286763Smav multilist_sublist_unlock(meta_mls); 3999286763Smav multilist_sublist_unlock(data_mls); 4000286763Smav 4001286763Smav return (type); 4002286763Smav} 4003286763Smav 4004286763Smav/* 4005286763Smav * Evict buffers from the cache, such that arc_size is capped by arc_c. 
4006286763Smav */ 4007286763Smavstatic uint64_t 4008168404Spjdarc_adjust(void) 4009168404Spjd{ 4010286763Smav uint64_t total_evicted = 0; 4011286763Smav uint64_t bytes; 4012286763Smav int64_t target; 4013168404Spjd 4014208373Smm /* 4015286763Smav * If we're over arc_meta_limit, we want to correct that before 4016286763Smav * potentially evicting data buffers below. 4017286763Smav */ 4018286763Smav total_evicted += arc_adjust_meta(); 4019286763Smav 4020286763Smav /* 4021208373Smm * Adjust MRU size 4022286763Smav * 4023286763Smav * If we're over the target cache size, we want to evict enough 4024286763Smav * from the list to get back to our target size. We don't want 4025286763Smav * to evict too much from the MRU, such that it drops below 4026286763Smav * arc_p. So, if we're over our target cache size more than 4027286763Smav * the MRU is over arc_p, we'll evict enough to get back to 4028286763Smav * arc_p here, and then evict more from the MFU below. 4029208373Smm */ 4030286763Smav target = MIN((int64_t)(arc_size - arc_c), 4031286766Smav (int64_t)(refcount_count(&arc_anon->arcs_size) + 4032286766Smav refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); 4033208373Smm 4034286763Smav /* 4035286763Smav * If we're below arc_meta_min, always prefer to evict data. 4036286763Smav * Otherwise, try to satisfy the requested number of bytes to 4037286763Smav * evict from the type which contains older buffers; in an 4038286763Smav * effort to keep newer buffers in the cache regardless of their 4039286763Smav * type. If we cannot satisfy the number of bytes from this 4040286763Smav * type, spill over into the next type. 
4041286763Smav */ 4042286763Smav if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 4043286763Smav arc_meta_used > arc_meta_min) { 4044286763Smav bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4045286763Smav total_evicted += bytes; 4046168404Spjd 4047286763Smav /* 4048286763Smav * If we couldn't evict our target number of bytes from 4049286763Smav * metadata, we try to get the rest from data. 4050286763Smav */ 4051286763Smav target -= bytes; 4052286763Smav 4053286763Smav total_evicted += 4054286763Smav arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4055286763Smav } else { 4056286763Smav bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4057286763Smav total_evicted += bytes; 4058286763Smav 4059286763Smav /* 4060286763Smav * If we couldn't evict our target number of bytes from 4061286763Smav * data, we try to get the rest from metadata. 4062286763Smav */ 4063286763Smav target -= bytes; 4064286763Smav 4065286763Smav total_evicted += 4066286763Smav arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4067185029Spjd } 4068185029Spjd 4069208373Smm /* 4070208373Smm * Adjust MFU size 4071286763Smav * 4072286763Smav * Now that we've tried to evict enough from the MRU to get its 4073286763Smav * size back to arc_p, if we're still above the target cache 4074286763Smav * size, we evict the rest from the MFU. 4075208373Smm */ 4076286763Smav target = arc_size - arc_c; 4077168404Spjd 4078286764Smav if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 4079286763Smav arc_meta_used > arc_meta_min) { 4080286763Smav bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4081286763Smav total_evicted += bytes; 4082208373Smm 4083286763Smav /* 4084286763Smav * If we couldn't evict our target number of bytes from 4085286763Smav * metadata, we try to get the rest from data. 
4086286763Smav */ 4087286763Smav target -= bytes; 4088168404Spjd 4089286763Smav total_evicted += 4090286763Smav arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4091286763Smav } else { 4092286763Smav bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4093286763Smav total_evicted += bytes; 4094286763Smav 4095286763Smav /* 4096286763Smav * If we couldn't evict our target number of bytes from 4097286763Smav * data, we try to get the rest from data. 4098286763Smav */ 4099286763Smav target -= bytes; 4100286763Smav 4101286763Smav total_evicted += 4102286763Smav arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4103208373Smm } 4104168404Spjd 4105208373Smm /* 4106208373Smm * Adjust ghost lists 4107286763Smav * 4108286763Smav * In addition to the above, the ARC also defines target values 4109286763Smav * for the ghost lists. The sum of the mru list and mru ghost 4110286763Smav * list should never exceed the target size of the cache, and 4111286763Smav * the sum of the mru list, mfu list, mru ghost list, and mfu 4112286763Smav * ghost list should never exceed twice the target size of the 4113286763Smav * cache. The following logic enforces these limits on the ghost 4114286763Smav * caches, and evicts from them as needed. 
4115208373Smm */ 4116286766Smav target = refcount_count(&arc_mru->arcs_size) + 4117286766Smav refcount_count(&arc_mru_ghost->arcs_size) - arc_c; 4118168404Spjd 4119286763Smav bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); 4120286763Smav total_evicted += bytes; 4121168404Spjd 4122286763Smav target -= bytes; 4123185029Spjd 4124286763Smav total_evicted += 4125286763Smav arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); 4126208373Smm 4127286763Smav /* 4128286763Smav * We assume the sum of the mru list and mfu list is less than 4129286763Smav * or equal to arc_c (we enforced this above), which means we 4130286763Smav * can use the simpler of the two equations below: 4131286763Smav * 4132286763Smav * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c 4133286763Smav * mru ghost + mfu ghost <= arc_c 4134286763Smav */ 4135286766Smav target = refcount_count(&arc_mru_ghost->arcs_size) + 4136286766Smav refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; 4137286763Smav 4138286763Smav bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); 4139286763Smav total_evicted += bytes; 4140286763Smav 4141286763Smav target -= bytes; 4142286763Smav 4143286763Smav total_evicted += 4144286763Smav arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); 4145286763Smav 4146286763Smav return (total_evicted); 4147168404Spjd} 4148168404Spjd 4149168404Spjdvoid 4150286763Smavarc_flush(spa_t *spa, boolean_t retry) 4151168404Spjd{ 4152209962Smm uint64_t guid = 0; 4153209962Smm 4154286763Smav /* 4155307265Smav * If retry is B_TRUE, a spa must not be specified since we have 4156286763Smav * no good way to determine if all of a spa's buffers have been 4157286763Smav * evicted from an arc state. 
4158286763Smav */ 4159286763Smav ASSERT(!retry || spa == 0); 4160286763Smav 4161286570Smav if (spa != NULL) 4162228103Smm guid = spa_load_guid(spa); 4163209962Smm 4164286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 4165286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 4166168404Spjd 4167286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 4168286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 4169168404Spjd 4170286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 4171286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 4172286763Smav 4173286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 4174286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 4175168404Spjd} 4176168404Spjd 4177168404Spjdvoid 4178286625Smavarc_shrink(int64_t to_free) 4179168404Spjd{ 4180168404Spjd if (arc_c > arc_c_min) { 4181272483Ssmh DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 4182272483Ssmh arc_c_min, uint64_t, arc_p, uint64_t, to_free); 4183168404Spjd if (arc_c > arc_c_min + to_free) 4184168404Spjd atomic_add_64(&arc_c, -to_free); 4185168404Spjd else 4186168404Spjd arc_c = arc_c_min; 4187168404Spjd 4188168404Spjd atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 4189168404Spjd if (arc_c > arc_size) 4190168404Spjd arc_c = MAX(arc_size, arc_c_min); 4191168404Spjd if (arc_p > arc_c) 4192168404Spjd arc_p = (arc_c >> 1); 4193272483Ssmh 4194272483Ssmh DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 4195272483Ssmh arc_p); 4196272483Ssmh 4197168404Spjd ASSERT(arc_c >= arc_c_min); 4198168404Spjd ASSERT((int64_t)arc_p >= 0); 4199168404Spjd } 4200168404Spjd 4201270759Ssmh if (arc_size > arc_c) { 4202270759Ssmh DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 4203270759Ssmh uint64_t, arc_c); 4204286763Smav (void) arc_adjust(); 4205270759Ssmh } 4206168404Spjd} 4207168404Spjd 
4208286625Smavtypedef enum free_memory_reason_t { 4209286625Smav FMR_UNKNOWN, 4210286625Smav FMR_NEEDFREE, 4211286625Smav FMR_LOTSFREE, 4212286625Smav FMR_SWAPFS_MINFREE, 4213286625Smav FMR_PAGES_PP_MAXIMUM, 4214286625Smav FMR_HEAP_ARENA, 4215286625Smav FMR_ZIO_ARENA, 4216286625Smav FMR_ZIO_FRAG, 4217286625Smav} free_memory_reason_t; 4218286625Smav 4219286625Smavint64_t last_free_memory; 4220286625Smavfree_memory_reason_t last_free_reason; 4221286625Smav 4222286625Smav/* 4223286625Smav * Additional reserve of pages for pp_reserve. 4224286625Smav */ 4225286625Smavint64_t arc_pages_pp_reserve = 64; 4226286625Smav 4227286625Smav/* 4228286625Smav * Additional reserve of pages for swapfs. 4229286625Smav */ 4230286625Smavint64_t arc_swapfs_reserve = 64; 4231286625Smav 4232286625Smav/* 4233286625Smav * Return the amount of memory that can be consumed before reclaim will be 4234286625Smav * needed. Positive if there is sufficient free memory, negative indicates 4235286625Smav * the amount of memory that needs to be freed up. 4236286625Smav */ 4237286625Smavstatic int64_t 4238286625Smavarc_available_memory(void) 4239168404Spjd{ 4240286625Smav int64_t lowest = INT64_MAX; 4241286625Smav int64_t n; 4242286625Smav free_memory_reason_t r = FMR_UNKNOWN; 4243168404Spjd 4244168404Spjd#ifdef _KERNEL 4245330061Savg#ifdef __FreeBSD__ 4246191902Skmacy /* 4247212780Savg * Cooperate with pagedaemon when it's time for it to scan 4248212780Savg * and reclaim some pages. 
4249191902Skmacy */ 4250286655Smav n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 4251286625Smav if (n < lowest) { 4252286625Smav lowest = n; 4253286625Smav r = FMR_LOTSFREE; 4254270759Ssmh } 4255191902Skmacy 4256330061Savg#else 4257330061Savg if (needfree > 0) { 4258330061Savg n = PAGESIZE * (-needfree); 4259330061Savg if (n < lowest) { 4260330061Savg lowest = n; 4261330061Savg r = FMR_NEEDFREE; 4262330061Savg } 4263330061Savg } 4264330061Savg 4265168404Spjd /* 4266185029Spjd * check that we're out of range of the pageout scanner. It starts to 4267185029Spjd * schedule paging if freemem is less than lotsfree and needfree. 4268185029Spjd * lotsfree is the high-water mark for pageout, and needfree is the 4269185029Spjd * number of needed free pages. We add extra pages here to make sure 4270185029Spjd * the scanner doesn't start up while we're freeing memory. 4271185029Spjd */ 4272286625Smav n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 4273286625Smav if (n < lowest) { 4274286625Smav lowest = n; 4275286625Smav r = FMR_LOTSFREE; 4276286625Smav } 4277185029Spjd 4278185029Spjd /* 4279168404Spjd * check to make sure that swapfs has enough space so that anon 4280185029Spjd * reservations can still succeed. anon_resvmem() checks that the 4281168404Spjd * availrmem is greater than swapfs_minfree, and the number of reserved 4282168404Spjd * swap pages. We also add a bit of extra here just to prevent 4283168404Spjd * circumstances from getting really dire. 4284168404Spjd */ 4285286625Smav n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 4286286625Smav desfree - arc_swapfs_reserve); 4287286625Smav if (n < lowest) { 4288286625Smav lowest = n; 4289286625Smav r = FMR_SWAPFS_MINFREE; 4290286625Smav } 4291168404Spjd 4292286625Smav 4293168404Spjd /* 4294272483Ssmh * Check that we have enough availrmem that memory locking (e.g., via 4295272483Ssmh * mlock(3C) or memcntl(2)) can still succeed. 
(pages_pp_maximum 4296272483Ssmh * stores the number of pages that cannot be locked; when availrmem 4297272483Ssmh * drops below pages_pp_maximum, page locking mechanisms such as 4298272483Ssmh * page_pp_lock() will fail.) 4299272483Ssmh */ 4300286625Smav n = PAGESIZE * (availrmem - pages_pp_maximum - 4301286625Smav arc_pages_pp_reserve); 4302286625Smav if (n < lowest) { 4303286625Smav lowest = n; 4304286625Smav r = FMR_PAGES_PP_MAXIMUM; 4305286625Smav } 4306272483Ssmh 4307330061Savg#endif /* __FreeBSD__ */ 4308272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4309272483Ssmh /* 4310168404Spjd * If we're on an i386 platform, it's possible that we'll exhaust the 4311168404Spjd * kernel heap space before we ever run out of available physical 4312168404Spjd * memory. Most checks of the size of the heap_area compare against 4313168404Spjd * tune.t_minarmem, which is the minimum available real memory that we 4314168404Spjd * can have in the system. However, this is generally fixed at 25 pages 4315168404Spjd * which is so low that it's useless. In this comparison, we seek to 4316168404Spjd * calculate the total heap-size, and reclaim if more than 3/4ths of the 4317185029Spjd * heap is allocated. (Or, in the calculation, if less than 1/4th is 4318168404Spjd * free) 4319168404Spjd */ 4320286655Smav n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 4321286628Smav (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 4322286625Smav if (n < lowest) { 4323286625Smav lowest = n; 4324286625Smav r = FMR_HEAP_ARENA; 4325270861Ssmh } 4326281026Smav#define zio_arena NULL 4327281026Smav#else 4328281026Smav#define zio_arena heap_arena 4329270861Ssmh#endif 4330281026Smav 4331272483Ssmh /* 4332272483Ssmh * If zio data pages are being allocated out of a separate heap segment, 4333272483Ssmh * then enforce that the size of available vmem for this arena remains 4334331383Smav * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. 
4335272483Ssmh * 4336331383Smav * Note that reducing the arc_zio_arena_free_shift keeps more virtual 4337331383Smav * memory (in the zio_arena) free, which can avoid memory 4338331383Smav * fragmentation issues. 4339272483Ssmh */ 4340286625Smav if (zio_arena != NULL) { 4341286655Smav n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 4342331383Smav (vmem_size(zio_arena, VMEM_ALLOC) >> 4343331383Smav arc_zio_arena_free_shift); 4344286625Smav if (n < lowest) { 4345286625Smav lowest = n; 4346286625Smav r = FMR_ZIO_ARENA; 4347286625Smav } 4348286625Smav } 4349281026Smav 4350281026Smav /* 4351281026Smav * Above limits know nothing about real level of KVA fragmentation. 4352281026Smav * Start aggressive reclamation if too little sequential KVA left. 4353281026Smav */ 4354286625Smav if (lowest > 0) { 4355317470Ssmh n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ? 4356286655Smav -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 4357286655Smav INT64_MAX; 4358286625Smav if (n < lowest) { 4359286625Smav lowest = n; 4360286625Smav r = FMR_ZIO_FRAG; 4361286625Smav } 4362281109Smav } 4363281026Smav 4364272483Ssmh#else /* _KERNEL */ 4365286625Smav /* Every 100 calls, free a small amount */ 4366168404Spjd if (spa_get_random(100) == 0) 4367286625Smav lowest = -1024; 4368272483Ssmh#endif /* _KERNEL */ 4369270759Ssmh 4370286625Smav last_free_memory = lowest; 4371286625Smav last_free_reason = r; 4372286625Smav DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 4373286625Smav return (lowest); 4374168404Spjd} 4375168404Spjd 4376286625Smav 4377286625Smav/* 4378286625Smav * Determine if the system is under memory pressure and is asking 4379307265Smav * to reclaim memory. A return value of B_TRUE indicates that the system 4380286625Smav * is under memory pressure and that the arc should adjust accordingly. 
4381286625Smav */ 4382286625Smavstatic boolean_t 4383286625Smavarc_reclaim_needed(void) 4384286625Smav{ 4385286625Smav return (arc_available_memory() < 0); 4386286625Smav} 4387286625Smav 4388208454Spjdextern kmem_cache_t *zio_buf_cache[]; 4389208454Spjdextern kmem_cache_t *zio_data_buf_cache[]; 4390272527Sdelphijextern kmem_cache_t *range_seg_cache; 4391321610Smavextern kmem_cache_t *abd_chunk_cache; 4392208454Spjd 4393278040Ssmhstatic __noinline void 4394286625Smavarc_kmem_reap_now(void) 4395168404Spjd{ 4396168404Spjd size_t i; 4397168404Spjd kmem_cache_t *prev_cache = NULL; 4398168404Spjd kmem_cache_t *prev_data_cache = NULL; 4399168404Spjd 4400272483Ssmh DTRACE_PROBE(arc__kmem_reap_start); 4401168404Spjd#ifdef _KERNEL 4402185029Spjd if (arc_meta_used >= arc_meta_limit) { 4403185029Spjd /* 4404185029Spjd * We are exceeding our meta-data cache limit. 4405185029Spjd * Purge some DNLC entries to release holds on meta-data. 4406185029Spjd */ 4407185029Spjd dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 4408185029Spjd } 4409168404Spjd#if defined(__i386) 4410168404Spjd /* 4411168404Spjd * Reclaim unused memory from all kmem caches. 
4412168404Spjd */ 4413168404Spjd kmem_reap(); 4414168404Spjd#endif 4415168404Spjd#endif 4416168404Spjd 4417168404Spjd for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 4418168404Spjd if (zio_buf_cache[i] != prev_cache) { 4419168404Spjd prev_cache = zio_buf_cache[i]; 4420168404Spjd kmem_cache_reap_now(zio_buf_cache[i]); 4421168404Spjd } 4422168404Spjd if (zio_data_buf_cache[i] != prev_data_cache) { 4423168404Spjd prev_data_cache = zio_data_buf_cache[i]; 4424168404Spjd kmem_cache_reap_now(zio_data_buf_cache[i]); 4425168404Spjd } 4426168404Spjd } 4427321610Smav kmem_cache_reap_now(abd_chunk_cache); 4428168404Spjd kmem_cache_reap_now(buf_cache); 4429286570Smav kmem_cache_reap_now(hdr_full_cache); 4430286570Smav kmem_cache_reap_now(hdr_l2only_cache); 4431272506Sdelphij kmem_cache_reap_now(range_seg_cache); 4432272483Ssmh 4433277300Ssmh#ifdef illumos 4434286625Smav if (zio_arena != NULL) { 4435286625Smav /* 4436286625Smav * Ask the vmem arena to reclaim unused memory from its 4437286625Smav * quantum caches. 4438286625Smav */ 4439272483Ssmh vmem_qcache_reap(zio_arena); 4440286625Smav } 4441272483Ssmh#endif 4442272483Ssmh DTRACE_PROBE(arc__kmem_reap_end); 4443168404Spjd} 4444168404Spjd 4445286763Smav/* 4446321610Smav * Threads can block in arc_get_data_impl() waiting for this thread to evict 4447286763Smav * enough data and signal them to proceed. When this happens, the threads in 4448321610Smav * arc_get_data_impl() are sleeping while holding the hash lock for their 4449286763Smav * particular arc header. Thus, we must be careful to never sleep on a 4450286763Smav * hash lock in this thread. This is to prevent the following deadlock: 4451286763Smav * 4452321610Smav * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", 4453286763Smav * waiting for the reclaim thread to signal it. 4454286763Smav * 4455286763Smav * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 4456286763Smav * fails, and goes to sleep forever. 
4457286763Smav * 4458286763Smav * This possible deadlock is avoided by always acquiring a hash lock 4459286763Smav * using mutex_tryenter() from arc_reclaim_thread(). 4460286763Smav */ 4461331399Smav/* ARGSUSED */ 4462168404Spjdstatic void 4463331399Smavarc_reclaim_thread(void *unused __unused) 4464168404Spjd{ 4465296530Smav hrtime_t growtime = 0; 4466168404Spjd callb_cpr_t cpr; 4467168404Spjd 4468286763Smav CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 4469168404Spjd 4470286763Smav mutex_enter(&arc_reclaim_lock); 4471286763Smav while (!arc_reclaim_thread_exit) { 4472286763Smav uint64_t evicted = 0; 4473286763Smav 4474307265Smav /* 4475307265Smav * This is necessary in order for the mdb ::arc dcmd to 4476307265Smav * show up to date information. Since the ::arc command 4477307265Smav * does not call the kstat's update function, without 4478307265Smav * this call, the command may show stale stats for the 4479307265Smav * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 4480307265Smav * with this change, the data might be up to 1 second 4481307265Smav * out of date; but that should suffice. The arc_state_t 4482307265Smav * structures can be queried directly if more accurate 4483307265Smav * information is needed. 4484307265Smav */ 4485307265Smav if (arc_ksp != NULL) 4486307265Smav arc_ksp->ks_update(arc_ksp, KSTAT_READ); 4487307265Smav 4488286763Smav mutex_exit(&arc_reclaim_lock); 4489286763Smav 4490314873Sjpaetzel /* 4491314873Sjpaetzel * We call arc_adjust() before (possibly) calling 4492314873Sjpaetzel * arc_kmem_reap_now(), so that we can wake up 4493321610Smav * arc_get_data_impl() sooner. 
4494314873Sjpaetzel */ 4495314873Sjpaetzel evicted = arc_adjust(); 4496314873Sjpaetzel 4497314873Sjpaetzel int64_t free_memory = arc_available_memory(); 4498286625Smav if (free_memory < 0) { 4499168404Spjd 4500286625Smav arc_no_grow = B_TRUE; 4501286625Smav arc_warm = B_TRUE; 4502168404Spjd 4503286625Smav /* 4504286625Smav * Wait at least zfs_grow_retry (default 60) seconds 4505286625Smav * before considering growing. 4506286625Smav */ 4507296530Smav growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 4508168404Spjd 4509286625Smav arc_kmem_reap_now(); 4510286625Smav 4511286625Smav /* 4512286625Smav * If we are still low on memory, shrink the ARC 4513286625Smav * so that we have arc_shrink_min free space. 4514286625Smav */ 4515286625Smav free_memory = arc_available_memory(); 4516286625Smav 4517286625Smav int64_t to_free = 4518286625Smav (arc_c >> arc_shrink_shift) - free_memory; 4519286625Smav if (to_free > 0) { 4520330061Savg#ifdef _KERNEL 4521330061Savg#ifdef illumos 4522330061Savg to_free = MAX(to_free, ptob(needfree)); 4523330061Savg#endif 4524330061Savg#endif 4525286625Smav arc_shrink(to_free); 4526168404Spjd } 4527286625Smav } else if (free_memory < arc_c >> arc_no_grow_shift) { 4528286625Smav arc_no_grow = B_TRUE; 4529296530Smav } else if (gethrtime() >= growtime) { 4530286625Smav arc_no_grow = B_FALSE; 4531168404Spjd } 4532168404Spjd 4533286763Smav mutex_enter(&arc_reclaim_lock); 4534168404Spjd 4535286763Smav /* 4536286763Smav * If evicted is zero, we couldn't evict anything via 4537286763Smav * arc_adjust(). This could be due to hash lock 4538286763Smav * collisions, but more likely due to the majority of 4539286763Smav * arc buffers being unevictable. Therefore, even if 4540286763Smav * arc_size is above arc_c, another pass is unlikely to 4541286763Smav * be helpful and could potentially cause us to enter an 4542286763Smav * infinite loop. 
4543286763Smav */ 4544286763Smav if (arc_size <= arc_c || evicted == 0) { 4545286763Smav /* 4546286763Smav * We're either no longer overflowing, or we 4547286763Smav * can't evict anything more, so we should wake 4548286763Smav * up any threads before we go to sleep. 4549286763Smav */ 4550286763Smav cv_broadcast(&arc_reclaim_waiters_cv); 4551168404Spjd 4552286763Smav /* 4553286763Smav * Block until signaled, or after one second (we 4554286763Smav * might need to perform arc_kmem_reap_now() 4555286763Smav * even if we aren't being signalled) 4556286763Smav */ 4557286763Smav CALLB_CPR_SAFE_BEGIN(&cpr); 4558296530Smav (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 4559296530Smav &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 4560286763Smav CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 4561286763Smav } 4562286763Smav } 4563286763Smav 4564307265Smav arc_reclaim_thread_exit = B_FALSE; 4565286763Smav cv_broadcast(&arc_reclaim_thread_cv); 4566286763Smav CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 4567286763Smav thread_exit(); 4568286763Smav} 4569286763Smav 4570301997Skibstatic u_int arc_dnlc_evicts_arg; 4571301997Skibextern struct vfsops zfs_vfsops; 4572301997Skib 4573301997Skibstatic void 4574301997Skibarc_dnlc_evicts_thread(void *dummy __unused) 4575301997Skib{ 4576301997Skib callb_cpr_t cpr; 4577301997Skib u_int percent; 4578301997Skib 4579301997Skib CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); 4580301997Skib 4581301997Skib mutex_enter(&arc_dnlc_evicts_lock); 4582301997Skib while (!arc_dnlc_evicts_thread_exit) { 4583301997Skib CALLB_CPR_SAFE_BEGIN(&cpr); 4584301997Skib (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 4585301997Skib CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); 4586301997Skib if (arc_dnlc_evicts_arg != 0) { 4587301997Skib percent = arc_dnlc_evicts_arg; 4588301997Skib mutex_exit(&arc_dnlc_evicts_lock); 4589301997Skib#ifdef _KERNEL 4590301997Skib vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); 
4591301997Skib#endif 4592301997Skib mutex_enter(&arc_dnlc_evicts_lock); 4593301997Skib /* 4594301997Skib * Clear our token only after vnlru_free() 4595301997Skib * pass is done, to avoid false queueing of 4596301997Skib * the requests. 4597301997Skib */ 4598301997Skib arc_dnlc_evicts_arg = 0; 4599301997Skib } 4600301997Skib } 4601301997Skib arc_dnlc_evicts_thread_exit = FALSE; 4602301997Skib cv_broadcast(&arc_dnlc_evicts_cv); 4603301997Skib CALLB_CPR_EXIT(&cpr); 4604301997Skib thread_exit(); 4605301997Skib} 4606301997Skib 4607301997Skibvoid 4608301997Skibdnlc_reduce_cache(void *arg) 4609301997Skib{ 4610301997Skib u_int percent; 4611301997Skib 4612302012Skib percent = (u_int)(uintptr_t)arg; 4613301997Skib mutex_enter(&arc_dnlc_evicts_lock); 4614301997Skib if (arc_dnlc_evicts_arg == 0) { 4615301997Skib arc_dnlc_evicts_arg = percent; 4616301997Skib cv_broadcast(&arc_dnlc_evicts_cv); 4617301997Skib } 4618301997Skib mutex_exit(&arc_dnlc_evicts_lock); 4619301997Skib} 4620301997Skib 4621168404Spjd/* 4622168404Spjd * Adapt arc info given the number of bytes we are trying to add and 4623168404Spjd * the state that we are comming from. This function is only called 4624168404Spjd * when we are adding new content to the cache. 4625168404Spjd */ 4626168404Spjdstatic void 4627168404Spjdarc_adapt(int bytes, arc_state_t *state) 4628168404Spjd{ 4629168404Spjd int mult; 4630208373Smm uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 4631286766Smav int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 4632286766Smav int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 4633168404Spjd 4634185029Spjd if (state == arc_l2c_only) 4635185029Spjd return; 4636185029Spjd 4637168404Spjd ASSERT(bytes > 0); 4638168404Spjd /* 4639168404Spjd * Adapt the target size of the MRU list: 4640168404Spjd * - if we just hit in the MRU ghost list, then increase 4641168404Spjd * the target size of the MRU list. 
4642168404Spjd * - if we just hit in the MFU ghost list, then increase 4643168404Spjd * the target size of the MFU list by decreasing the 4644168404Spjd * target size of the MRU list. 4645168404Spjd */ 4646168404Spjd if (state == arc_mru_ghost) { 4647286766Smav mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 4648209275Smm mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 4649168404Spjd 4650208373Smm arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 4651168404Spjd } else if (state == arc_mfu_ghost) { 4652208373Smm uint64_t delta; 4653208373Smm 4654286766Smav mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); 4655209275Smm mult = MIN(mult, 10); 4656168404Spjd 4657208373Smm delta = MIN(bytes * mult, arc_p); 4658208373Smm arc_p = MAX(arc_p_min, arc_p - delta); 4659168404Spjd } 4660168404Spjd ASSERT((int64_t)arc_p >= 0); 4661168404Spjd 4662168404Spjd if (arc_reclaim_needed()) { 4663286763Smav cv_signal(&arc_reclaim_thread_cv); 4664168404Spjd return; 4665168404Spjd } 4666168404Spjd 4667168404Spjd if (arc_no_grow) 4668168404Spjd return; 4669168404Spjd 4670168404Spjd if (arc_c >= arc_c_max) 4671168404Spjd return; 4672168404Spjd 4673168404Spjd /* 4674168404Spjd * If we're within (2 * maxblocksize) bytes of the target 4675168404Spjd * cache size, increment the target cache size 4676168404Spjd */ 4677168404Spjd if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 4678272483Ssmh DTRACE_PROBE1(arc__inc_adapt, int, bytes); 4679168404Spjd atomic_add_64(&arc_c, (int64_t)bytes); 4680168404Spjd if (arc_c > arc_c_max) 4681168404Spjd arc_c = arc_c_max; 4682168404Spjd else if (state == arc_anon) 4683168404Spjd atomic_add_64(&arc_p, (int64_t)bytes); 4684168404Spjd if (arc_p > arc_c) 4685168404Spjd arc_p = arc_c; 4686168404Spjd } 4687168404Spjd ASSERT((int64_t)arc_p >= 0); 4688168404Spjd} 4689168404Spjd 4690168404Spjd/* 4691286763Smav * Check if arc_size has grown past our upper threshold, determined by 4692286763Smav * zfs_arc_overflow_shift. 
4693168404Spjd */ 4694286763Smavstatic boolean_t 4695286763Smavarc_is_overflowing(void) 4696168404Spjd{ 4697286763Smav /* Always allow at least one block of overflow */ 4698286763Smav uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 4699286763Smav arc_c >> zfs_arc_overflow_shift); 4700185029Spjd 4701286763Smav return (arc_size >= arc_c + overflow); 4702168404Spjd} 4703168404Spjd 4704321610Smavstatic abd_t * 4705321610Smavarc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4706321610Smav{ 4707321610Smav arc_buf_contents_t type = arc_buf_type(hdr); 4708321610Smav 4709321610Smav arc_get_data_impl(hdr, size, tag); 4710321610Smav if (type == ARC_BUFC_METADATA) { 4711321610Smav return (abd_alloc(size, B_TRUE)); 4712321610Smav } else { 4713321610Smav ASSERT(type == ARC_BUFC_DATA); 4714321610Smav return (abd_alloc(size, B_FALSE)); 4715321610Smav } 4716321610Smav} 4717321610Smav 4718321610Smavstatic void * 4719321610Smavarc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4720321610Smav{ 4721321610Smav arc_buf_contents_t type = arc_buf_type(hdr); 4722321610Smav 4723321610Smav arc_get_data_impl(hdr, size, tag); 4724321610Smav if (type == ARC_BUFC_METADATA) { 4725321610Smav return (zio_buf_alloc(size)); 4726321610Smav } else { 4727321610Smav ASSERT(type == ARC_BUFC_DATA); 4728321610Smav return (zio_data_buf_alloc(size)); 4729321610Smav } 4730321610Smav} 4731321610Smav 4732168404Spjd/* 4733307265Smav * Allocate a block and return it to the caller. If we are hitting the 4734307265Smav * hard limit for the cache size, we must sleep, waiting for the eviction 4735307265Smav * thread to catch up. If we're past the target size but below the hard 4736307265Smav * limit, we'll only signal the reclaim thread and continue on. 
4737168404Spjd */ 4738321610Smavstatic void 4739321610Smavarc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4740168404Spjd{ 4741321610Smav arc_state_t *state = hdr->b_l1hdr.b_state; 4742321610Smav arc_buf_contents_t type = arc_buf_type(hdr); 4743168404Spjd 4744168404Spjd arc_adapt(size, state); 4745168404Spjd 4746168404Spjd /* 4747286763Smav * If arc_size is currently overflowing, and has grown past our 4748286763Smav * upper limit, we must be adding data faster than the evict 4749286763Smav * thread can evict. Thus, to ensure we don't compound the 4750286763Smav * problem by adding more data and forcing arc_size to grow even 4751286763Smav * further past it's target size, we halt and wait for the 4752286763Smav * eviction thread to catch up. 4753286763Smav * 4754286763Smav * It's also possible that the reclaim thread is unable to evict 4755286763Smav * enough buffers to get arc_size below the overflow limit (e.g. 4756286763Smav * due to buffers being un-evictable, or hash lock collisions). 4757286763Smav * In this case, we want to proceed regardless if we're 4758286763Smav * overflowing; thus we don't use a while loop here. 4759168404Spjd */ 4760286763Smav if (arc_is_overflowing()) { 4761286763Smav mutex_enter(&arc_reclaim_lock); 4762286763Smav 4763286763Smav /* 4764286763Smav * Now that we've acquired the lock, we may no longer be 4765286763Smav * over the overflow limit, lets check. 4766286763Smav * 4767286763Smav * We're ignoring the case of spurious wake ups. If that 4768286763Smav * were to happen, it'd let this thread consume an ARC 4769286763Smav * buffer before it should have (i.e. before we're under 4770286763Smav * the overflow limit and were signalled by the reclaim 4771286763Smav * thread). As long as that is a rare occurrence, it 4772286763Smav * shouldn't cause any harm. 
4773286763Smav */ 4774286763Smav if (arc_is_overflowing()) { 4775286763Smav cv_signal(&arc_reclaim_thread_cv); 4776286763Smav cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 4777168404Spjd } 4778286763Smav 4779286763Smav mutex_exit(&arc_reclaim_lock); 4780168404Spjd } 4781168404Spjd 4782307265Smav VERIFY3U(hdr->b_type, ==, type); 4783286763Smav if (type == ARC_BUFC_METADATA) { 4784286763Smav arc_space_consume(size, ARC_SPACE_META); 4785168404Spjd } else { 4786286763Smav arc_space_consume(size, ARC_SPACE_DATA); 4787168404Spjd } 4788286763Smav 4789168404Spjd /* 4790168404Spjd * Update the state size. Note that ghost states have a 4791168404Spjd * "ghost size" and so don't need to be updated. 4792168404Spjd */ 4793307265Smav if (!GHOST_STATE(state)) { 4794168404Spjd 4795307265Smav (void) refcount_add_many(&state->arcs_size, size, tag); 4796286763Smav 4797286763Smav /* 4798286763Smav * If this is reached via arc_read, the link is 4799286763Smav * protected by the hash lock. If reached via 4800286763Smav * arc_buf_alloc, the header should not be accessed by 4801286763Smav * any other thread. And, if reached via arc_read_done, 4802286763Smav * the hash lock will protect it if it's found in the 4803286763Smav * hash table; otherwise no other thread should be 4804286763Smav * trying to [add|remove]_reference it. 
4805286763Smav */ 4806286763Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4807286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4808307265Smav (void) refcount_add_many(&state->arcs_esize[type], 4809307265Smav size, tag); 4810168404Spjd } 4811307265Smav 4812168404Spjd /* 4813168404Spjd * If we are growing the cache, and we are adding anonymous 4814168404Spjd * data, and we have outgrown arc_p, update arc_p 4815168404Spjd */ 4816286570Smav if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 4817286766Smav (refcount_count(&arc_anon->arcs_size) + 4818286766Smav refcount_count(&arc_mru->arcs_size) > arc_p)) 4819168404Spjd arc_p = MIN(arc_c, arc_p + size); 4820168404Spjd } 4821205231Skmacy ARCSTAT_BUMP(arcstat_allocated); 4822168404Spjd} 4823168404Spjd 4824321610Smavstatic void 4825321610Smavarc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) 4826321610Smav{ 4827321610Smav arc_free_data_impl(hdr, size, tag); 4828321610Smav abd_free(abd); 4829321610Smav} 4830321610Smav 4831321610Smavstatic void 4832321610Smavarc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) 4833321610Smav{ 4834321610Smav arc_buf_contents_t type = arc_buf_type(hdr); 4835321610Smav 4836321610Smav arc_free_data_impl(hdr, size, tag); 4837321610Smav if (type == ARC_BUFC_METADATA) { 4838321610Smav zio_buf_free(buf, size); 4839321610Smav } else { 4840321610Smav ASSERT(type == ARC_BUFC_DATA); 4841321610Smav zio_data_buf_free(buf, size); 4842321610Smav } 4843321610Smav} 4844321610Smav 4845168404Spjd/* 4846307265Smav * Free the arc data buffer. 
4847307265Smav */ 4848307265Smavstatic void 4849321610Smavarc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4850307265Smav{ 4851307265Smav arc_state_t *state = hdr->b_l1hdr.b_state; 4852307265Smav arc_buf_contents_t type = arc_buf_type(hdr); 4853307265Smav 4854307265Smav /* protected by hash lock, if in the hash table */ 4855307265Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4856307265Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4857307265Smav ASSERT(state != arc_anon && state != arc_l2c_only); 4858307265Smav 4859307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 4860307265Smav size, tag); 4861307265Smav } 4862307265Smav (void) refcount_remove_many(&state->arcs_size, size, tag); 4863307265Smav 4864307265Smav VERIFY3U(hdr->b_type, ==, type); 4865307265Smav if (type == ARC_BUFC_METADATA) { 4866307265Smav arc_space_return(size, ARC_SPACE_META); 4867307265Smav } else { 4868307265Smav ASSERT(type == ARC_BUFC_DATA); 4869307265Smav arc_space_return(size, ARC_SPACE_DATA); 4870307265Smav } 4871307265Smav} 4872307265Smav 4873307265Smav/* 4874168404Spjd * This routine is called whenever a buffer is accessed. 4875168404Spjd * NOTE: the hash lock is dropped in this function. 4876168404Spjd */ 4877168404Spjdstatic void 4878275811Sdelphijarc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4879168404Spjd{ 4880219089Spjd clock_t now; 4881219089Spjd 4882168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 4883286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 4884168404Spjd 4885286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 4886168404Spjd /* 4887168404Spjd * This buffer is not in the cache, and does not 4888168404Spjd * appear in our "ghost" list. Add the new buffer 4889168404Spjd * to the MRU state. 
4890168404Spjd */ 4891168404Spjd 4892286570Smav ASSERT0(hdr->b_l1hdr.b_arc_access); 4893286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4894275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4895275811Sdelphij arc_change_state(arc_mru, hdr, hash_lock); 4896168404Spjd 4897286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru) { 4898219089Spjd now = ddi_get_lbolt(); 4899219089Spjd 4900168404Spjd /* 4901168404Spjd * If this buffer is here because of a prefetch, then either: 4902168404Spjd * - clear the flag if this is a "referencing" read 4903168404Spjd * (any subsequent access will bump this into the MFU state). 4904168404Spjd * or 4905168404Spjd * - move the buffer to the head of the list if this is 4906168404Spjd * another prefetch (to make it less likely to be evicted). 4907168404Spjd */ 4908286570Smav if (HDR_PREFETCH(hdr)) { 4909286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4910286763Smav /* link protected by hash lock */ 4911286763Smav ASSERT(multilist_link_active( 4912286570Smav &hdr->b_l1hdr.b_arc_node)); 4913168404Spjd } else { 4914307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 4915168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 4916168404Spjd } 4917286570Smav hdr->b_l1hdr.b_arc_access = now; 4918168404Spjd return; 4919168404Spjd } 4920168404Spjd 4921168404Spjd /* 4922168404Spjd * This buffer has been "accessed" only once so far, 4923168404Spjd * but it is still in the cache. Move it to the MFU 4924168404Spjd * state. 4925168404Spjd */ 4926286570Smav if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 4927168404Spjd /* 4928168404Spjd * More than 125ms have passed since we 4929168404Spjd * instantiated this buffer. Move it to the 4930168404Spjd * most frequently used state. 
4931168404Spjd */ 4932286570Smav hdr->b_l1hdr.b_arc_access = now; 4933275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4934275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 4935168404Spjd } 4936168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 4937286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 4938168404Spjd arc_state_t *new_state; 4939168404Spjd /* 4940168404Spjd * This buffer has been "accessed" recently, but 4941168404Spjd * was evicted from the cache. Move it to the 4942168404Spjd * MFU state. 4943168404Spjd */ 4944168404Spjd 4945286570Smav if (HDR_PREFETCH(hdr)) { 4946168404Spjd new_state = arc_mru; 4947286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 4948307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 4949275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4950168404Spjd } else { 4951168404Spjd new_state = arc_mfu; 4952275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4953168404Spjd } 4954168404Spjd 4955286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4956275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 4957168404Spjd 4958168404Spjd ARCSTAT_BUMP(arcstat_mru_ghost_hits); 4959286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu) { 4960168404Spjd /* 4961168404Spjd * This buffer has been accessed more than once and is 4962168404Spjd * still in the cache. Keep it in the MFU state. 4963168404Spjd * 4964168404Spjd * NOTE: an add_reference() that occurred when we did 4965168404Spjd * the arc_read() will have kicked this off the list. 4966168404Spjd * If it was a prefetch, we will explicitly move it to 4967168404Spjd * the head of the list now. 
4968168404Spjd */ 4969286570Smav if ((HDR_PREFETCH(hdr)) != 0) { 4970286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4971286763Smav /* link protected by hash_lock */ 4972286763Smav ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4973168404Spjd } 4974168404Spjd ARCSTAT_BUMP(arcstat_mfu_hits); 4975286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4976286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 4977168404Spjd arc_state_t *new_state = arc_mfu; 4978168404Spjd /* 4979168404Spjd * This buffer has been accessed more than once but has 4980168404Spjd * been evicted from the cache. Move it back to the 4981168404Spjd * MFU state. 4982168404Spjd */ 4983168404Spjd 4984286570Smav if (HDR_PREFETCH(hdr)) { 4985168404Spjd /* 4986168404Spjd * This is a prefetch access... 4987168404Spjd * move this block back to the MRU state. 4988168404Spjd */ 4989286570Smav ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 4990168404Spjd new_state = arc_mru; 4991168404Spjd } 4992168404Spjd 4993286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4994275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4995275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 4996168404Spjd 4997168404Spjd ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 4998286570Smav } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 4999185029Spjd /* 5000185029Spjd * This buffer is on the 2nd Level ARC. 
5001185029Spjd */ 5002185029Spjd 5003286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5004275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5005275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 5006168404Spjd } else { 5007168404Spjd ASSERT(!"invalid arc state"); 5008168404Spjd } 5009168404Spjd} 5010168404Spjd 5011168404Spjd/* a generic arc_done_func_t which you can use */ 5012168404Spjd/* ARGSUSED */ 5013168404Spjdvoid 5014168404Spjdarc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 5015168404Spjd{ 5016219089Spjd if (zio == NULL || zio->io_error == 0) 5017321535Smav bcopy(buf->b_data, arg, arc_buf_size(buf)); 5018307265Smav arc_buf_destroy(buf, arg); 5019168404Spjd} 5020168404Spjd 5021185029Spjd/* a generic arc_done_func_t */ 5022168404Spjdvoid 5023168404Spjdarc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 5024168404Spjd{ 5025168404Spjd arc_buf_t **bufp = arg; 5026168404Spjd if (zio && zio->io_error) { 5027307265Smav arc_buf_destroy(buf, arg); 5028168404Spjd *bufp = NULL; 5029168404Spjd } else { 5030168404Spjd *bufp = buf; 5031219089Spjd ASSERT(buf->b_data); 5032168404Spjd } 5033168404Spjd} 5034168404Spjd 5035168404Spjdstatic void 5036307265Smavarc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) 5037307265Smav{ 5038307265Smav if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { 5039307265Smav ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); 5040307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 5041307265Smav } else { 5042307265Smav if (HDR_COMPRESSION_ENABLED(hdr)) { 5043307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), ==, 5044307265Smav BP_GET_COMPRESS(bp)); 5045307265Smav } 5046307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); 5047307265Smav ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); 5048307265Smav } 5049307265Smav} 5050307265Smav 5051307265Smavstatic void 5052168404Spjdarc_read_done(zio_t *zio) 5053168404Spjd{ 5054307265Smav arc_buf_hdr_t *hdr = zio->io_private; 5055268075Sdelphij kmutex_t *hash_lock = NULL; 
5056321535Smav arc_callback_t *callback_list; 5057321535Smav arc_callback_t *acb; 5058321535Smav boolean_t freeable = B_FALSE; 5059321535Smav boolean_t no_zio_error = (zio->io_error == 0); 5060168404Spjd 5061168404Spjd /* 5062168404Spjd * The hdr was inserted into hash-table and removed from lists 5063168404Spjd * prior to starting I/O. We should find this header, since 5064168404Spjd * it's in the hash table, and it should be legit since it's 5065168404Spjd * not possible to evict it during the I/O. The only possible 5066168404Spjd * reason for it not to be found is if we were freed during the 5067168404Spjd * read. 5068168404Spjd */ 5069268075Sdelphij if (HDR_IN_HASH_TABLE(hdr)) { 5070268075Sdelphij ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 5071268075Sdelphij ASSERT3U(hdr->b_dva.dva_word[0], ==, 5072268075Sdelphij BP_IDENTITY(zio->io_bp)->dva_word[0]); 5073268075Sdelphij ASSERT3U(hdr->b_dva.dva_word[1], ==, 5074268075Sdelphij BP_IDENTITY(zio->io_bp)->dva_word[1]); 5075168404Spjd 5076268075Sdelphij arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 5077268075Sdelphij &hash_lock); 5078168404Spjd 5079307265Smav ASSERT((found == hdr && 5080268075Sdelphij DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 5081268075Sdelphij (found == hdr && HDR_L2_READING(hdr))); 5082307265Smav ASSERT3P(hash_lock, !=, NULL); 5083268075Sdelphij } 5084268075Sdelphij 5085321535Smav if (no_zio_error) { 5086307265Smav /* byteswap if necessary */ 5087307265Smav if (BP_SHOULD_BYTESWAP(zio->io_bp)) { 5088307265Smav if (BP_GET_LEVEL(zio->io_bp) > 0) { 5089307265Smav hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; 5090307265Smav } else { 5091307265Smav hdr->b_l1hdr.b_byteswap = 5092307265Smav DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 5093307265Smav } 5094307265Smav } else { 5095307265Smav hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 5096307265Smav } 5097307265Smav } 5098307265Smav 5099307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); 5100286570Smav if 
(l2arc_noprefetch && HDR_PREFETCH(hdr)) 5101307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); 5102206796Spjd 5103286570Smav callback_list = hdr->b_l1hdr.b_acb; 5104307265Smav ASSERT3P(callback_list, !=, NULL); 5105168404Spjd 5106321535Smav if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { 5107219089Spjd /* 5108219089Spjd * Only call arc_access on anonymous buffers. This is because 5109219089Spjd * if we've issued an I/O for an evicted buffer, we've already 5110219089Spjd * called arc_access (to prevent any simultaneous readers from 5111219089Spjd * getting confused). 5112219089Spjd */ 5113219089Spjd arc_access(hdr, hash_lock); 5114219089Spjd } 5115219089Spjd 5116321535Smav /* 5117321535Smav * If a read request has a callback (i.e. acb_done is not NULL), then we 5118321535Smav * make a buf containing the data according to the parameters which were 5119321535Smav * passed in. The implementation of arc_buf_alloc_impl() ensures that we 5120321535Smav * aren't needlessly decompressing the data multiple times. 
5121321535Smav */ 5122321535Smav int callback_cnt = 0; 5123321535Smav for (acb = callback_list; acb != NULL; acb = acb->acb_next) { 5124321535Smav if (!acb->acb_done) 5125321535Smav continue; 5126321535Smav 5127321535Smav /* This is a demand read since prefetches don't use callbacks */ 5128321535Smav callback_cnt++; 5129321535Smav 5130321535Smav int error = arc_buf_alloc_impl(hdr, acb->acb_private, 5131321535Smav acb->acb_compressed, no_zio_error, &acb->acb_buf); 5132321535Smav if (no_zio_error) { 5133321535Smav zio->io_error = error; 5134168404Spjd } 5135168404Spjd } 5136286570Smav hdr->b_l1hdr.b_acb = NULL; 5137307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5138321535Smav if (callback_cnt == 0) { 5139307265Smav ASSERT(HDR_PREFETCH(hdr)); 5140307265Smav ASSERT0(hdr->b_l1hdr.b_bufcnt); 5141321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5142219089Spjd } 5143168404Spjd 5144286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 5145286570Smav callback_list != NULL); 5146168404Spjd 5147321535Smav if (no_zio_error) { 5148307265Smav arc_hdr_verify(hdr, zio->io_bp); 5149307265Smav } else { 5150307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 5151286570Smav if (hdr->b_l1hdr.b_state != arc_anon) 5152168404Spjd arc_change_state(arc_anon, hdr, hash_lock); 5153168404Spjd if (HDR_IN_HASH_TABLE(hdr)) 5154168404Spjd buf_hash_remove(hdr); 5155286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5156168404Spjd } 5157168404Spjd 5158168404Spjd /* 5159168404Spjd * Broadcast before we drop the hash_lock to avoid the possibility 5160168404Spjd * that the hdr (and hence the cv) might be freed before we get to 5161168404Spjd * the cv_broadcast(). 5162168404Spjd */ 5163286570Smav cv_broadcast(&hdr->b_l1hdr.b_cv); 5164168404Spjd 5165286570Smav if (hash_lock != NULL) { 5166168404Spjd mutex_exit(hash_lock); 5167168404Spjd } else { 5168168404Spjd /* 5169168404Spjd * This block was freed while we waited for the read to 5170168404Spjd * complete. 
It has been removed from the hash table and 5171168404Spjd * moved to the anonymous state (so that it won't show up 5172168404Spjd * in the cache). 5173168404Spjd */ 5174286570Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 5175286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5176168404Spjd } 5177168404Spjd 5178168404Spjd /* execute each callback and free its structure */ 5179168404Spjd while ((acb = callback_list) != NULL) { 5180168404Spjd if (acb->acb_done) 5181168404Spjd acb->acb_done(zio, acb->acb_buf, acb->acb_private); 5182168404Spjd 5183168404Spjd if (acb->acb_zio_dummy != NULL) { 5184168404Spjd acb->acb_zio_dummy->io_error = zio->io_error; 5185168404Spjd zio_nowait(acb->acb_zio_dummy); 5186168404Spjd } 5187168404Spjd 5188168404Spjd callback_list = acb->acb_next; 5189168404Spjd kmem_free(acb, sizeof (arc_callback_t)); 5190168404Spjd } 5191168404Spjd 5192168404Spjd if (freeable) 5193168404Spjd arc_hdr_destroy(hdr); 5194168404Spjd} 5195168404Spjd 5196168404Spjd/* 5197286762Smav * "Read" the block at the specified DVA (in bp) via the 5198168404Spjd * cache. If the block is found in the cache, invoke the provided 5199168404Spjd * callback immediately and return. Note that the `zio' parameter 5200168404Spjd * in the callback will be NULL in this case, since no IO was 5201168404Spjd * required. If the block is not in the cache pass the read request 5202168404Spjd * on to the spa with a substitute callback function, so that the 5203168404Spjd * requested block will be added to the cache. 5204168404Spjd * 5205168404Spjd * If a read request arrives for a block that has a read in-progress, 5206168404Spjd * either wait for the in-progress read to complete (and return the 5207168404Spjd * results); or, if this is a read with a "done" func, add a record 5208168404Spjd * to the read to invoke the "done" func when the read completes, 5209168404Spjd * and return; or just return. 
5210168404Spjd * 5211168404Spjd * arc_read_done() will invoke all the requested "done" functions 5212168404Spjd * for readers of this block. 5213168404Spjd */ 5214168404Spjdint 5215246666Smmarc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 5216275811Sdelphij void *private, zio_priority_t priority, int zio_flags, 5217275811Sdelphij arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 5218168404Spjd{ 5219268075Sdelphij arc_buf_hdr_t *hdr = NULL; 5220268075Sdelphij kmutex_t *hash_lock = NULL; 5221185029Spjd zio_t *rzio; 5222228103Smm uint64_t guid = spa_load_guid(spa); 5223321535Smav boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; 5224168404Spjd 5225268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp) || 5226268075Sdelphij BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 5227268075Sdelphij 5228168404Spjdtop: 5229268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 5230268075Sdelphij /* 5231268075Sdelphij * Embedded BP's have no DVA and require no I/O to "read". 5232268075Sdelphij * Create an anonymous arc buf to back it. 5233268075Sdelphij */ 5234268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 5235268075Sdelphij } 5236168404Spjd 5237321610Smav if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { 5238307265Smav arc_buf_t *buf = NULL; 5239275811Sdelphij *arc_flags |= ARC_FLAG_CACHED; 5240168404Spjd 5241168404Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 5242168404Spjd 5243287702Sdelphij if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 5244287702Sdelphij priority == ZIO_PRIORITY_SYNC_READ) { 5245287702Sdelphij /* 5246287702Sdelphij * This sync read must wait for an 5247287702Sdelphij * in-progress async read (e.g. a predictive 5248287702Sdelphij * prefetch). Async reads are queued 5249287702Sdelphij * separately at the vdev_queue layer, so 5250287702Sdelphij * this is a form of priority inversion. 
5251287702Sdelphij * Ideally, we would "inherit" the demand 5252287702Sdelphij * i/o's priority by moving the i/o from 5253287702Sdelphij * the async queue to the synchronous queue, 5254287702Sdelphij * but there is currently no mechanism to do 5255287702Sdelphij * so. Track this so that we can evaluate 5256287702Sdelphij * the magnitude of this potential performance 5257287702Sdelphij * problem. 5258287702Sdelphij * 5259287702Sdelphij * Note that if the prefetch i/o is already 5260287702Sdelphij * active (has been issued to the device), 5261287702Sdelphij * the prefetch improved performance, because 5262287702Sdelphij * we issued it sooner than we would have 5263287702Sdelphij * without the prefetch. 5264287702Sdelphij */ 5265287702Sdelphij DTRACE_PROBE1(arc__sync__wait__for__async, 5266287702Sdelphij arc_buf_hdr_t *, hdr); 5267287702Sdelphij ARCSTAT_BUMP(arcstat_sync_wait_for_async); 5268287702Sdelphij } 5269287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5270307265Smav arc_hdr_clear_flags(hdr, 5271307265Smav ARC_FLAG_PREDICTIVE_PREFETCH); 5272287702Sdelphij } 5273287702Sdelphij 5274275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) { 5275286570Smav cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 5276168404Spjd mutex_exit(hash_lock); 5277168404Spjd goto top; 5278168404Spjd } 5279275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5280168404Spjd 5281168404Spjd if (done) { 5282287702Sdelphij arc_callback_t *acb = NULL; 5283168404Spjd 5284168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), 5285168404Spjd KM_SLEEP); 5286168404Spjd acb->acb_done = done; 5287168404Spjd acb->acb_private = private; 5288321535Smav acb->acb_compressed = compressed_read; 5289168404Spjd if (pio != NULL) 5290168404Spjd acb->acb_zio_dummy = zio_null(pio, 5291209962Smm spa, NULL, NULL, NULL, zio_flags); 5292168404Spjd 5293307265Smav ASSERT3P(acb->acb_done, !=, NULL); 5294286570Smav acb->acb_next = hdr->b_l1hdr.b_acb; 5295286570Smav hdr->b_l1hdr.b_acb = acb; 5296168404Spjd 
mutex_exit(hash_lock); 5297168404Spjd return (0); 5298168404Spjd } 5299168404Spjd mutex_exit(hash_lock); 5300168404Spjd return (0); 5301168404Spjd } 5302168404Spjd 5303286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5304286570Smav hdr->b_l1hdr.b_state == arc_mfu); 5305168404Spjd 5306168404Spjd if (done) { 5307287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5308287702Sdelphij /* 5309287702Sdelphij * This is a demand read which does not have to 5310287702Sdelphij * wait for i/o because we did a predictive 5311287702Sdelphij * prefetch i/o for it, which has completed. 5312287702Sdelphij */ 5313287702Sdelphij DTRACE_PROBE1( 5314287702Sdelphij arc__demand__hit__predictive__prefetch, 5315287702Sdelphij arc_buf_hdr_t *, hdr); 5316287702Sdelphij ARCSTAT_BUMP( 5317287702Sdelphij arcstat_demand_hit_predictive_prefetch); 5318307265Smav arc_hdr_clear_flags(hdr, 5319307265Smav ARC_FLAG_PREDICTIVE_PREFETCH); 5320287702Sdelphij } 5321307265Smav ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); 5322307265Smav 5323321535Smav /* Get a buf with the desired data in it. 
*/ 5324321535Smav VERIFY0(arc_buf_alloc_impl(hdr, private, 5325321535Smav compressed_read, B_TRUE, &buf)); 5326275811Sdelphij } else if (*arc_flags & ARC_FLAG_PREFETCH && 5327286570Smav refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5328307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5329168404Spjd } 5330168404Spjd DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5331168404Spjd arc_access(hdr, hash_lock); 5332275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 5333307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5334168404Spjd mutex_exit(hash_lock); 5335168404Spjd ARCSTAT_BUMP(arcstat_hits); 5336286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5337286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5338168404Spjd data, metadata, hits); 5339168404Spjd 5340168404Spjd if (done) 5341168404Spjd done(NULL, buf, private); 5342168404Spjd } else { 5343307265Smav uint64_t lsize = BP_GET_LSIZE(bp); 5344307265Smav uint64_t psize = BP_GET_PSIZE(bp); 5345268075Sdelphij arc_callback_t *acb; 5346185029Spjd vdev_t *vd = NULL; 5347247187Smm uint64_t addr = 0; 5348208373Smm boolean_t devw = B_FALSE; 5349307265Smav uint64_t size; 5350168404Spjd 5351168404Spjd if (hdr == NULL) { 5352168404Spjd /* this block is not in the cache */ 5353268075Sdelphij arc_buf_hdr_t *exists = NULL; 5354168404Spjd arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 5355307265Smav hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 5356307265Smav BP_GET_COMPRESS(bp), type); 5357307265Smav 5358268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 5359268075Sdelphij hdr->b_dva = *BP_IDENTITY(bp); 5360268075Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 5361268075Sdelphij exists = buf_hash_insert(hdr, &hash_lock); 5362268075Sdelphij } 5363268075Sdelphij if (exists != NULL) { 5364168404Spjd /* somebody beat us to the hash insert */ 5365168404Spjd mutex_exit(hash_lock); 5366219089Spjd buf_discard_identity(hdr); 5367307265Smav arc_hdr_destroy(hdr); 5368168404Spjd goto top; /* restart the IO request */ 5369168404Spjd } 
5370168404Spjd } else { 5371286570Smav /* 5372286570Smav * This block is in the ghost cache. If it was L2-only 5373286570Smav * (and thus didn't have an L1 hdr), we realloc the 5374286570Smav * header to add an L1 hdr. 5375286570Smav */ 5376286570Smav if (!HDR_HAS_L1HDR(hdr)) { 5377286570Smav hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 5378286570Smav hdr_full_cache); 5379286570Smav } 5380321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 5381286570Smav ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 5382168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5383286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5384286763Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 5385321535Smav ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 5386168404Spjd 5387287702Sdelphij /* 5388307265Smav * This is a delicate dance that we play here. 5389307265Smav * This hdr is in the ghost list so we access it 5390307265Smav * to move it out of the ghost list before we 5391307265Smav * initiate the read. If it's a prefetch then 5392307265Smav * it won't have a callback so we'll remove the 5393307265Smav * reference that arc_buf_alloc_impl() created. We 5394307265Smav * do this after we've called arc_access() to 5395307265Smav * avoid hitting an assert in remove_reference(). 5396287702Sdelphij */ 5397219089Spjd arc_access(hdr, hash_lock); 5398321610Smav arc_hdr_alloc_pabd(hdr); 5399168404Spjd } 5400321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5401307265Smav size = arc_hdr_size(hdr); 5402168404Spjd 5403307265Smav /* 5404307265Smav * If compression is enabled on the hdr, then will do 5405307265Smav * RAW I/O and will store the compressed data in the hdr's 5406307265Smav * data block. Otherwise, the hdr's data block will contain 5407307265Smav * the uncompressed data. 
5408307265Smav */ 5409307265Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 5410307265Smav zio_flags |= ZIO_FLAG_RAW; 5411307265Smav } 5412307265Smav 5413307265Smav if (*arc_flags & ARC_FLAG_PREFETCH) 5414307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5415307265Smav if (*arc_flags & ARC_FLAG_L2CACHE) 5416307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5417307265Smav if (BP_GET_LEVEL(bp) > 0) 5418307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); 5419287702Sdelphij if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 5420307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); 5421286570Smav ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 5422219089Spjd 5423168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 5424168404Spjd acb->acb_done = done; 5425168404Spjd acb->acb_private = private; 5426321535Smav acb->acb_compressed = compressed_read; 5427168404Spjd 5428307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 5429286570Smav hdr->b_l1hdr.b_acb = acb; 5430307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5431168404Spjd 5432286570Smav if (HDR_HAS_L2HDR(hdr) && 5433286570Smav (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 5434286570Smav devw = hdr->b_l2hdr.b_dev->l2ad_writing; 5435286570Smav addr = hdr->b_l2hdr.b_daddr; 5436185029Spjd /* 5437332525Smav * Lock out L2ARC device removal. 5438185029Spjd */ 5439185029Spjd if (vdev_is_dead(vd) || 5440185029Spjd !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 5441185029Spjd vd = NULL; 5442185029Spjd } 5443185029Spjd 5444307265Smav if (priority == ZIO_PRIORITY_ASYNC_READ) 5445307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 5446307265Smav else 5447307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 5448307265Smav 5449268075Sdelphij if (hash_lock != NULL) 5450268075Sdelphij mutex_exit(hash_lock); 5451168404Spjd 5452251629Sdelphij /* 5453251629Sdelphij * At this point, we have a level 1 cache miss. Try again in 5454251629Sdelphij * L2ARC if possible. 
5455251629Sdelphij */ 5456307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); 5457307265Smav 5458219089Spjd DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 5459307265Smav uint64_t, lsize, zbookmark_phys_t *, zb); 5460168404Spjd ARCSTAT_BUMP(arcstat_misses); 5461286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5462286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5463168404Spjd data, metadata, misses); 5464228392Spjd#ifdef _KERNEL 5465297633Strasz#ifdef RACCT 5466297633Strasz if (racct_enable) { 5467297633Strasz PROC_LOCK(curproc); 5468297633Strasz racct_add_force(curproc, RACCT_READBPS, size); 5469297633Strasz racct_add_force(curproc, RACCT_READIOPS, 1); 5470297633Strasz PROC_UNLOCK(curproc); 5471297633Strasz } 5472297633Strasz#endif /* RACCT */ 5473228392Spjd curthread->td_ru.ru_inblock++; 5474228392Spjd#endif 5475168404Spjd 5476208373Smm if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 5477185029Spjd /* 5478185029Spjd * Read from the L2ARC if the following are true: 5479185029Spjd * 1. The L2ARC vdev was previously cached. 5480185029Spjd * 2. This buffer still has L2ARC metadata. 5481185029Spjd * 3. This buffer isn't currently writing to the L2ARC. 5482185029Spjd * 4. The L2ARC entry wasn't evicted, which may 5483185029Spjd * also have invalidated the vdev. 5484208373Smm * 5. This isn't prefetch and l2arc_noprefetch is set. 
5485185029Spjd */ 5486286570Smav if (HDR_HAS_L2HDR(hdr) && 5487208373Smm !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 5488208373Smm !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 5489185029Spjd l2arc_read_callback_t *cb; 5490321610Smav abd_t *abd; 5491321610Smav uint64_t asize; 5492185029Spjd 5493185029Spjd DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 5494185029Spjd ARCSTAT_BUMP(arcstat_l2_hits); 5495185029Spjd 5496185029Spjd cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 5497185029Spjd KM_SLEEP); 5498307265Smav cb->l2rcb_hdr = hdr; 5499185029Spjd cb->l2rcb_bp = *bp; 5500185029Spjd cb->l2rcb_zb = *zb; 5501185029Spjd cb->l2rcb_flags = zio_flags; 5502321610Smav 5503321610Smav asize = vdev_psize_to_asize(vd, size); 5504307265Smav if (asize != size) { 5505321610Smav abd = abd_alloc_for_io(asize, 5506321610Smav HDR_ISTYPE_METADATA(hdr)); 5507321610Smav cb->l2rcb_abd = abd; 5508297848Savg } else { 5509321610Smav abd = hdr->b_l1hdr.b_pabd; 5510297848Savg } 5511185029Spjd 5512247187Smm ASSERT(addr >= VDEV_LABEL_START_SIZE && 5513321610Smav addr + asize <= vd->vdev_psize - 5514247187Smm VDEV_LABEL_END_SIZE); 5515247187Smm 5516185029Spjd /* 5517185029Spjd * l2arc read. The SCL_L2ARC lock will be 5518185029Spjd * released by l2arc_read_done(). 5519251478Sdelphij * Issue a null zio if the underlying buffer 5520251478Sdelphij * was squashed to zero size by compression. 
5521185029Spjd */ 5522307265Smav ASSERT3U(HDR_GET_COMPRESS(hdr), !=, 5523307265Smav ZIO_COMPRESS_EMPTY); 5524307265Smav rzio = zio_read_phys(pio, vd, addr, 5525321610Smav asize, abd, 5526307265Smav ZIO_CHECKSUM_OFF, 5527307265Smav l2arc_read_done, cb, priority, 5528307265Smav zio_flags | ZIO_FLAG_DONT_CACHE | 5529307265Smav ZIO_FLAG_CANFAIL | 5530307265Smav ZIO_FLAG_DONT_PROPAGATE | 5531307265Smav ZIO_FLAG_DONT_RETRY, B_FALSE); 5532185029Spjd DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 5533185029Spjd zio_t *, rzio); 5534307265Smav ARCSTAT_INCR(arcstat_l2_read_bytes, size); 5535185029Spjd 5536275811Sdelphij if (*arc_flags & ARC_FLAG_NOWAIT) { 5537185029Spjd zio_nowait(rzio); 5538185029Spjd return (0); 5539185029Spjd } 5540185029Spjd 5541275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_WAIT); 5542185029Spjd if (zio_wait(rzio) == 0) 5543185029Spjd return (0); 5544185029Spjd 5545185029Spjd /* l2arc read error; goto zio_read() */ 5546185029Spjd } else { 5547185029Spjd DTRACE_PROBE1(l2arc__miss, 5548185029Spjd arc_buf_hdr_t *, hdr); 5549185029Spjd ARCSTAT_BUMP(arcstat_l2_misses); 5550185029Spjd if (HDR_L2_WRITING(hdr)) 5551185029Spjd ARCSTAT_BUMP(arcstat_l2_rw_clash); 5552185029Spjd spa_config_exit(spa, SCL_L2ARC, vd); 5553185029Spjd } 5554208373Smm } else { 5555208373Smm if (vd != NULL) 5556208373Smm spa_config_exit(spa, SCL_L2ARC, vd); 5557208373Smm if (l2arc_ndev != 0) { 5558208373Smm DTRACE_PROBE1(l2arc__miss, 5559208373Smm arc_buf_hdr_t *, hdr); 5560208373Smm ARCSTAT_BUMP(arcstat_l2_misses); 5561208373Smm } 5562185029Spjd } 5563185029Spjd 5564321610Smav rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, 5565307265Smav arc_read_done, hdr, priority, zio_flags, zb); 5566168404Spjd 5567275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) 5568168404Spjd return (zio_wait(rzio)); 5569168404Spjd 5570275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5571168404Spjd zio_nowait(rzio); 5572168404Spjd } 5573168404Spjd return (0); 5574168404Spjd} 5575168404Spjd 5576168404Spjd/* 
5577251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again. 5578251520Sdelphij */ 5579251520Sdelphijvoid 5580251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp) 5581251520Sdelphij{ 5582251520Sdelphij arc_buf_hdr_t *hdr; 5583251520Sdelphij kmutex_t *hash_lock; 5584251520Sdelphij uint64_t guid = spa_load_guid(spa); 5585251520Sdelphij 5586268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp)); 5587268075Sdelphij 5588268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 5589251520Sdelphij if (hdr == NULL) 5590251520Sdelphij return; 5591307265Smav 5592307265Smav /* 5593307265Smav * We might be trying to free a block that is still doing I/O 5594307265Smav * (i.e. prefetch) or has a reference (i.e. a dedup-ed, 5595307265Smav * dmu_sync-ed block). If this block is being prefetched, then it 5596307265Smav * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr 5597307265Smav * until the I/O completes. A block may also have a reference if it is 5598307265Smav * part of a dedup-ed, dmu_synced write. The dmu_sync() function would 5599307265Smav * have written the new block to its final resting place on disk but 5600307265Smav * without the dedup flag set. This would have left the hdr in the MRU 5601307265Smav * state and discoverable. When the txg finally syncs it detects that 5602307265Smav * the block was overridden in open context and issues an override I/O. 5603307265Smav * Since this is a dedup block, the override I/O will determine if the 5604307265Smav * block is already in the DDT. If so, then it will replace the io_bp 5605307265Smav * with the bp from the DDT and allow the I/O to finish. When the I/O 5606307265Smav * reaches the done callback, dbuf_write_override_done, it will 5607307265Smav * check to see if the io_bp and io_bp_override are identical. 5608307265Smav * If they are not, then it indicates that the bp was replaced with 5609307265Smav * the bp in the DDT and the override bp is freed. 
This allows 5610307265Smav * us to arrive here with a reference on a block that is being 5611307265Smav * freed. So if we have an I/O in progress, or a reference to 5612307265Smav * this hdr, then we don't destroy the hdr. 5613307265Smav */ 5614307265Smav if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && 5615307265Smav refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { 5616307265Smav arc_change_state(arc_anon, hdr, hash_lock); 5617307265Smav arc_hdr_destroy(hdr); 5618251520Sdelphij mutex_exit(hash_lock); 5619251520Sdelphij } else { 5620251520Sdelphij mutex_exit(hash_lock); 5621251520Sdelphij } 5622251520Sdelphij 5623251520Sdelphij} 5624251520Sdelphij 5625251520Sdelphij/* 5626251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer. This 5627251629Sdelphij * must be done after a read and prior to modifying the buffer contents. 5628168404Spjd * If the buffer has more than one reference, we must make 5629185029Spjd * a new hdr for the buffer. 5630168404Spjd */ 5631168404Spjdvoid 5632168404Spjdarc_release(arc_buf_t *buf, void *tag) 5633168404Spjd{ 5634286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 5635168404Spjd 5636219089Spjd /* 5637219089Spjd * It would be nice to assert that if it's DMU metadata (level > 5638219089Spjd * 0 || it's the dnode file), then it must be syncing context. 5639219089Spjd * But we don't know that information at this level. 5640219089Spjd */ 5641219089Spjd 5642219089Spjd mutex_enter(&buf->b_evict_lock); 5643286776Smav 5644286776Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5645286776Smav 5646286570Smav /* 5647286570Smav * We don't grab the hash lock prior to this check, because if 5648286570Smav * the buffer's header is in the arc_anon state, it won't be 5649286570Smav * linked into the hash table. 
5650286570Smav */ 5651286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 5652286570Smav mutex_exit(&buf->b_evict_lock); 5653286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5654286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 5655286570Smav ASSERT(!HDR_HAS_L2HDR(hdr)); 5656307265Smav ASSERT(HDR_EMPTY(hdr)); 5657307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5658286570Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 5659286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 5660185029Spjd 5661307265Smav hdr->b_l1hdr.b_arc_access = 0; 5662168404Spjd 5663307265Smav /* 5664307265Smav * If the buf is being overridden then it may already 5665307265Smav * have a hdr that is not empty. 5666307265Smav */ 5667307265Smav buf_discard_identity(hdr); 5668286570Smav arc_buf_thaw(buf); 5669286570Smav 5670286570Smav return; 5671168404Spjd } 5672168404Spjd 5673286570Smav kmutex_t *hash_lock = HDR_LOCK(hdr); 5674286570Smav mutex_enter(hash_lock); 5675286570Smav 5676286570Smav /* 5677286570Smav * This assignment is only valid as long as the hash_lock is 5678286570Smav * held, we must be careful not to reference state or the 5679286570Smav * b_state field after dropping the lock. 5680286570Smav */ 5681286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 5682286570Smav ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5683286570Smav ASSERT3P(state, !=, arc_anon); 5684286570Smav 5685286570Smav /* this buffer is not on any list */ 5686321535Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); 5687286570Smav 5688286570Smav if (HDR_HAS_L2HDR(hdr)) { 5689286570Smav mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5690286570Smav 5691286570Smav /* 5692286598Smav * We have to recheck this conditional again now that 5693286598Smav * we're holding the l2ad_mtx to prevent a race with 5694286598Smav * another thread which might be concurrently calling 5695286598Smav * l2arc_evict(). 
In that case, l2arc_evict() might have 5696286598Smav * destroyed the header's L2 portion as we were waiting 5697286598Smav * to acquire the l2ad_mtx. 5698286570Smav */ 5699286598Smav if (HDR_HAS_L2HDR(hdr)) { 5700290191Savg l2arc_trim(hdr); 5701286598Smav arc_hdr_l2hdr_destroy(hdr); 5702286598Smav } 5703286570Smav 5704286570Smav mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5705185029Spjd } 5706185029Spjd 5707168404Spjd /* 5708168404Spjd * Do we have more than one buf? 5709168404Spjd */ 5710307265Smav if (hdr->b_l1hdr.b_bufcnt > 1) { 5711168404Spjd arc_buf_hdr_t *nhdr; 5712209962Smm uint64_t spa = hdr->b_spa; 5713307265Smav uint64_t psize = HDR_GET_PSIZE(hdr); 5714307265Smav uint64_t lsize = HDR_GET_LSIZE(hdr); 5715307265Smav enum zio_compress compress = HDR_GET_COMPRESS(hdr); 5716286570Smav arc_buf_contents_t type = arc_buf_type(hdr); 5717307265Smav VERIFY3U(hdr->b_type, ==, type); 5718168404Spjd 5719286570Smav ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 5720307265Smav (void) remove_reference(hdr, hash_lock, tag); 5721307265Smav 5722321535Smav if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { 5723307265Smav ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); 5724307265Smav ASSERT(ARC_BUF_LAST(buf)); 5725307265Smav } 5726307265Smav 5727168404Spjd /* 5728219089Spjd * Pull the data off of this hdr and attach it to 5729307265Smav * a new anonymous hdr. Also find the last buffer 5730307265Smav * in the hdr's buffer list. 5731168404Spjd */ 5732321535Smav arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 5733307265Smav ASSERT3P(lastbuf, !=, NULL); 5734168404Spjd 5735307265Smav /* 5736307265Smav * If the current arc_buf_t and the hdr are sharing their data 5737321535Smav * buffer, then we must stop sharing that block. 
5738307265Smav */ 5739307265Smav if (arc_buf_is_shared(buf)) { 5740307265Smav VERIFY(!arc_buf_is_shared(lastbuf)); 5741307265Smav 5742307265Smav /* 5743307265Smav * First, sever the block sharing relationship between 5744321535Smav * buf and the arc_buf_hdr_t. 5745307265Smav */ 5746307265Smav arc_unshare_buf(hdr, buf); 5747321535Smav 5748321535Smav /* 5749321610Smav * Now we need to recreate the hdr's b_pabd. Since we 5750321535Smav * have lastbuf handy, we try to share with it, but if 5751321610Smav * we can't then we allocate a new b_pabd and copy the 5752321535Smav * data from buf into it. 5753321535Smav */ 5754321535Smav if (arc_can_share(hdr, lastbuf)) { 5755321535Smav arc_share_buf(hdr, lastbuf); 5756321535Smav } else { 5757321610Smav arc_hdr_alloc_pabd(hdr); 5758321610Smav abd_copy_from_buf(hdr->b_l1hdr.b_pabd, 5759321610Smav buf->b_data, psize); 5760321535Smav } 5761307265Smav VERIFY3P(lastbuf->b_data, !=, NULL); 5762307265Smav } else if (HDR_SHARED_DATA(hdr)) { 5763321535Smav /* 5764321535Smav * Uncompressed shared buffers are always at the end 5765321535Smav * of the list. Compressed buffers don't have the 5766321535Smav * same requirements. This makes it hard to 5767321535Smav * simply assert that the lastbuf is shared so 5768321535Smav * we rely on the hdr's compression flags to determine 5769321535Smav * if we have a compressed, shared buffer. 
5770321535Smav */ 5771321535Smav ASSERT(arc_buf_is_shared(lastbuf) || 5772321535Smav HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 5773321535Smav ASSERT(!ARC_BUF_SHARED(buf)); 5774307265Smav } 5775321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5776286570Smav ASSERT3P(state, !=, arc_l2c_only); 5777286766Smav 5778307265Smav (void) refcount_remove_many(&state->arcs_size, 5779321535Smav arc_buf_size(buf), buf); 5780286766Smav 5781286570Smav if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 5782286570Smav ASSERT3P(state, !=, arc_l2c_only); 5783307265Smav (void) refcount_remove_many(&state->arcs_esize[type], 5784321535Smav arc_buf_size(buf), buf); 5785168404Spjd } 5786242845Sdelphij 5787307265Smav hdr->b_l1hdr.b_bufcnt -= 1; 5788168404Spjd arc_cksum_verify(buf); 5789240133Smm#ifdef illumos 5790240133Smm arc_buf_unwatch(buf); 5791277300Ssmh#endif 5792168404Spjd 5793168404Spjd mutex_exit(hash_lock); 5794168404Spjd 5795307265Smav /* 5796321610Smav * Allocate a new hdr. The new hdr will contain a b_pabd 5797307265Smav * buffer which will be freed in arc_write(). 
5798307265Smav */ 5799307265Smav nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); 5800307265Smav ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); 5801307265Smav ASSERT0(nhdr->b_l1hdr.b_bufcnt); 5802307265Smav ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); 5803307265Smav VERIFY3U(nhdr->b_type, ==, type); 5804307265Smav ASSERT(!HDR_SHARED_DATA(nhdr)); 5805286570Smav 5806286570Smav nhdr->b_l1hdr.b_buf = buf; 5807307265Smav nhdr->b_l1hdr.b_bufcnt = 1; 5808286570Smav (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 5809168404Spjd buf->b_hdr = nhdr; 5810307265Smav 5811219089Spjd mutex_exit(&buf->b_evict_lock); 5812307265Smav (void) refcount_add_many(&arc_anon->arcs_size, 5813321535Smav arc_buf_size(buf), buf); 5814168404Spjd } else { 5815219089Spjd mutex_exit(&buf->b_evict_lock); 5816286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 5817286763Smav /* protected by hash lock, or hdr is on arc_anon */ 5818286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 5819168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5820286570Smav arc_change_state(arc_anon, hdr, hash_lock); 5821286570Smav hdr->b_l1hdr.b_arc_access = 0; 5822286570Smav mutex_exit(hash_lock); 5823185029Spjd 5824219089Spjd buf_discard_identity(hdr); 5825168404Spjd arc_buf_thaw(buf); 5826168404Spjd } 5827168404Spjd} 5828168404Spjd 5829168404Spjdint 5830168404Spjdarc_released(arc_buf_t *buf) 5831168404Spjd{ 5832185029Spjd int released; 5833185029Spjd 5834219089Spjd mutex_enter(&buf->b_evict_lock); 5835286570Smav released = (buf->b_data != NULL && 5836286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 5837219089Spjd mutex_exit(&buf->b_evict_lock); 5838185029Spjd return (released); 5839168404Spjd} 5840168404Spjd 5841168404Spjd#ifdef ZFS_DEBUG 5842168404Spjdint 5843168404Spjdarc_referenced(arc_buf_t *buf) 5844168404Spjd{ 5845185029Spjd int referenced; 5846185029Spjd 5847219089Spjd mutex_enter(&buf->b_evict_lock); 5848286570Smav referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 
5849219089Spjd mutex_exit(&buf->b_evict_lock); 5850185029Spjd return (referenced); 5851168404Spjd} 5852168404Spjd#endif 5853168404Spjd 5854168404Spjdstatic void 5855168404Spjdarc_write_ready(zio_t *zio) 5856168404Spjd{ 5857168404Spjd arc_write_callback_t *callback = zio->io_private; 5858168404Spjd arc_buf_t *buf = callback->awcb_buf; 5859185029Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5860307265Smav uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); 5861168404Spjd 5862286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5863286570Smav ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 5864307265Smav ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 5865185029Spjd 5866185029Spjd /* 5867307265Smav * If we're reexecuting this zio because the pool suspended, then 5868307265Smav * cleanup any state that was previously set the first time the 5869321535Smav * callback was invoked. 5870185029Spjd */ 5871307265Smav if (zio->io_flags & ZIO_FLAG_REEXECUTED) { 5872307265Smav arc_cksum_free(hdr); 5873307265Smav#ifdef illumos 5874307265Smav arc_buf_unwatch(buf); 5875307265Smav#endif 5876321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 5877307265Smav if (arc_buf_is_shared(buf)) { 5878307265Smav arc_unshare_buf(hdr, buf); 5879307265Smav } else { 5880321610Smav arc_hdr_free_pabd(hdr); 5881307265Smav } 5882185029Spjd } 5883168404Spjd } 5884321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 5885307265Smav ASSERT(!HDR_SHARED_DATA(hdr)); 5886307265Smav ASSERT(!arc_buf_is_shared(buf)); 5887307265Smav 5888307265Smav callback->awcb_ready(zio, buf, callback->awcb_private); 5889307265Smav 5890307265Smav if (HDR_IO_IN_PROGRESS(hdr)) 5891307265Smav ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); 5892307265Smav 5893307265Smav arc_cksum_compute(buf); 5894307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5895307265Smav 5896307265Smav enum zio_compress compress; 5897307265Smav if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5898307265Smav compress = ZIO_COMPRESS_OFF; 5899307265Smav } 
else { 5900307265Smav ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); 5901307265Smav compress = BP_GET_COMPRESS(zio->io_bp); 5902307265Smav } 5903307265Smav HDR_SET_PSIZE(hdr, psize); 5904307265Smav arc_hdr_set_compress(hdr, compress); 5905307265Smav 5906321610Smav 5907307265Smav /* 5908321610Smav * Fill the hdr with data. If the hdr is compressed, the data we want 5909321610Smav * is available from the zio, otherwise we can take it from the buf. 5910321610Smav * 5911321610Smav * We might be able to share the buf's data with the hdr here. However, 5912321610Smav * doing so would cause the ARC to be full of linear ABDs if we write a 5913321610Smav * lot of shareable data. As a compromise, we check whether scattered 5914321610Smav * ABDs are allowed, and assume that if they are then the user wants 5915321610Smav * the ARC to be primarily filled with them regardless of the data being 5916321610Smav * written. Therefore, if they're allowed then we allocate one and copy 5917321610Smav * the data into it; otherwise, we share the data directly if we can. 5918307265Smav */ 5919321610Smav if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { 5920321610Smav arc_hdr_alloc_pabd(hdr); 5921321610Smav 5922321610Smav /* 5923321610Smav * Ideally, we would always copy the io_abd into b_pabd, but the 5924321610Smav * user may have disabled compressed ARC, thus we must check the 5925321610Smav * hdr's compression setting rather than the io_bp's. 
5926321610Smav */ 5927321610Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 5928321610Smav ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, 5929321610Smav ZIO_COMPRESS_OFF); 5930321610Smav ASSERT3U(psize, >, 0); 5931321610Smav 5932321610Smav abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); 5933321610Smav } else { 5934321610Smav ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); 5935321610Smav 5936321610Smav abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, 5937321610Smav arc_buf_size(buf)); 5938321610Smav } 5939307265Smav } else { 5940321610Smav ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); 5941321535Smav ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); 5942307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5943307265Smav 5944307265Smav arc_share_buf(hdr, buf); 5945307265Smav } 5946321610Smav 5947307265Smav arc_hdr_verify(hdr, zio->io_bp); 5948168404Spjd} 5949168404Spjd 5950304138Savgstatic void 5951304138Savgarc_write_children_ready(zio_t *zio) 5952304138Savg{ 5953304138Savg arc_write_callback_t *callback = zio->io_private; 5954304138Savg arc_buf_t *buf = callback->awcb_buf; 5955304138Savg 5956304138Savg callback->awcb_children_ready(zio, buf, callback->awcb_private); 5957304138Savg} 5958304138Savg 5959258632Savg/* 5960258632Savg * The SPA calls this callback for each physical write that happens on behalf 5961258632Savg * of a logical write. See the comment in dbuf_write_physdone() for details. 
5962258632Savg */ 5963168404Spjdstatic void 5964258632Savgarc_write_physdone(zio_t *zio) 5965258632Savg{ 5966258632Savg arc_write_callback_t *cb = zio->io_private; 5967258632Savg if (cb->awcb_physdone != NULL) 5968258632Savg cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 5969258632Savg} 5970258632Savg 5971258632Savgstatic void 5972168404Spjdarc_write_done(zio_t *zio) 5973168404Spjd{ 5974168404Spjd arc_write_callback_t *callback = zio->io_private; 5975168404Spjd arc_buf_t *buf = callback->awcb_buf; 5976168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5977168404Spjd 5978307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 5979168404Spjd 5980219089Spjd if (zio->io_error == 0) { 5981307265Smav arc_hdr_verify(hdr, zio->io_bp); 5982307265Smav 5983268075Sdelphij if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5984260150Sdelphij buf_discard_identity(hdr); 5985260150Sdelphij } else { 5986260150Sdelphij hdr->b_dva = *BP_IDENTITY(zio->io_bp); 5987260150Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 5988260150Sdelphij } 5989219089Spjd } else { 5990307265Smav ASSERT(HDR_EMPTY(hdr)); 5991219089Spjd } 5992219089Spjd 5993168404Spjd /* 5994268075Sdelphij * If the block to be written was all-zero or compressed enough to be 5995268075Sdelphij * embedded in the BP, no write was performed so there will be no 5996268075Sdelphij * dva/birth/checksum. The buffer must therefore remain anonymous 5997268075Sdelphij * (and uncached). 
5998168404Spjd */ 5999307265Smav if (!HDR_EMPTY(hdr)) { 6000168404Spjd arc_buf_hdr_t *exists; 6001168404Spjd kmutex_t *hash_lock; 6002168404Spjd 6003321535Smav ASSERT3U(zio->io_error, ==, 0); 6004219089Spjd 6005168404Spjd arc_cksum_verify(buf); 6006168404Spjd 6007168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 6008286570Smav if (exists != NULL) { 6009168404Spjd /* 6010168404Spjd * This can only happen if we overwrite for 6011168404Spjd * sync-to-convergence, because we remove 6012168404Spjd * buffers from the hash table when we arc_free(). 6013168404Spjd */ 6014219089Spjd if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 6015219089Spjd if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6016219089Spjd panic("bad overwrite, hdr=%p exists=%p", 6017219089Spjd (void *)hdr, (void *)exists); 6018286570Smav ASSERT(refcount_is_zero( 6019286570Smav &exists->b_l1hdr.b_refcnt)); 6020219089Spjd arc_change_state(arc_anon, exists, hash_lock); 6021219089Spjd mutex_exit(hash_lock); 6022219089Spjd arc_hdr_destroy(exists); 6023219089Spjd exists = buf_hash_insert(hdr, &hash_lock); 6024219089Spjd ASSERT3P(exists, ==, NULL); 6025243524Smm } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 6026243524Smm /* nopwrite */ 6027243524Smm ASSERT(zio->io_prop.zp_nopwrite); 6028243524Smm if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6029243524Smm panic("bad nopwrite, hdr=%p exists=%p", 6030243524Smm (void *)hdr, (void *)exists); 6031219089Spjd } else { 6032219089Spjd /* Dedup */ 6033307265Smav ASSERT(hdr->b_l1hdr.b_bufcnt == 1); 6034286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 6035219089Spjd ASSERT(BP_GET_DEDUP(zio->io_bp)); 6036219089Spjd ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 6037219089Spjd } 6038168404Spjd } 6039307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6040185029Spjd /* if it's not anon, we are doing a scrub */ 6041286570Smav if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 6042185029Spjd arc_access(hdr, hash_lock); 6043168404Spjd mutex_exit(hash_lock); 6044168404Spjd 
} else { 6045307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6046168404Spjd } 6047168404Spjd 6048286570Smav ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 6049219089Spjd callback->awcb_done(zio, buf, callback->awcb_private); 6050168404Spjd 6051321610Smav abd_put(zio->io_abd); 6052168404Spjd kmem_free(callback, sizeof (arc_write_callback_t)); 6053168404Spjd} 6054168404Spjd 6055168404Spjdzio_t * 6056307265Smavarc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 6057307265Smav boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, 6058304138Savg arc_done_func_t *children_ready, arc_done_func_t *physdone, 6059258632Savg arc_done_func_t *done, void *private, zio_priority_t priority, 6060268123Sdelphij int zio_flags, const zbookmark_phys_t *zb) 6061168404Spjd{ 6062168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 6063168404Spjd arc_write_callback_t *callback; 6064185029Spjd zio_t *zio; 6065321573Smav zio_prop_t localprop = *zp; 6066168404Spjd 6067307265Smav ASSERT3P(ready, !=, NULL); 6068307265Smav ASSERT3P(done, !=, NULL); 6069168404Spjd ASSERT(!HDR_IO_ERROR(hdr)); 6070286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6071307265Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6072307265Smav ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 6073185029Spjd if (l2arc) 6074307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6075321535Smav if (ARC_BUF_COMPRESSED(buf)) { 6076321573Smav /* 6077321573Smav * We're writing a pre-compressed buffer. Make the 6078321573Smav * compression algorithm requested by the zio_prop_t match 6079321573Smav * the pre-compressed buffer's compression algorithm. 
6080321573Smav */ 6081321573Smav localprop.zp_compress = HDR_GET_COMPRESS(hdr); 6082321573Smav 6083321535Smav ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); 6084321535Smav zio_flags |= ZIO_FLAG_RAW; 6085321535Smav } 6086168404Spjd callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 6087168404Spjd callback->awcb_ready = ready; 6088304138Savg callback->awcb_children_ready = children_ready; 6089258632Savg callback->awcb_physdone = physdone; 6090168404Spjd callback->awcb_done = done; 6091168404Spjd callback->awcb_private = private; 6092168404Spjd callback->awcb_buf = buf; 6093168404Spjd 6094307265Smav /* 6095321610Smav * The hdr's b_pabd is now stale, free it now. A new data block 6096307265Smav * will be allocated when the zio pipeline calls arc_write_ready(). 6097307265Smav */ 6098321610Smav if (hdr->b_l1hdr.b_pabd != NULL) { 6099307265Smav /* 6100307265Smav * If the buf is currently sharing the data block with 6101307265Smav * the hdr then we need to break that relationship here. 6102307265Smav * The hdr will remain with a NULL data pointer and the 6103307265Smav * buf will take sole ownership of the block. 6104307265Smav */ 6105307265Smav if (arc_buf_is_shared(buf)) { 6106307265Smav arc_unshare_buf(hdr, buf); 6107307265Smav } else { 6108321610Smav arc_hdr_free_pabd(hdr); 6109307265Smav } 6110307265Smav VERIFY3P(buf->b_data, !=, NULL); 6111307265Smav arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); 6112307265Smav } 6113307265Smav ASSERT(!arc_buf_is_shared(buf)); 6114321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6115307265Smav 6116321610Smav zio = zio_write(pio, spa, txg, bp, 6117321610Smav abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), 6118321573Smav HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, 6119304138Savg (children_ready != NULL) ? 
arc_write_children_ready : NULL, 6120304138Savg arc_write_physdone, arc_write_done, callback, 6121258632Savg priority, zio_flags, zb); 6122185029Spjd 6123168404Spjd return (zio); 6124168404Spjd} 6125168404Spjd 6126185029Spjdstatic int 6127258632Savgarc_memory_throttle(uint64_t reserve, uint64_t txg) 6128185029Spjd{ 6129185029Spjd#ifdef _KERNEL 6130272483Ssmh uint64_t available_memory = ptob(freemem); 6131185029Spjd static uint64_t page_load = 0; 6132185029Spjd static uint64_t last_txg = 0; 6133185029Spjd 6134272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 6135185029Spjd available_memory = 6136272483Ssmh MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 6137185029Spjd#endif 6138258632Savg 6139272483Ssmh if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 6140185029Spjd return (0); 6141185029Spjd 6142185029Spjd if (txg > last_txg) { 6143185029Spjd last_txg = txg; 6144185029Spjd page_load = 0; 6145185029Spjd } 6146185029Spjd /* 6147185029Spjd * If we are in pageout, we know that memory is already tight, 6148185029Spjd * the arc is already going to be evicting, so we just want to 6149185029Spjd * continue to let page writes occur as quickly as possible. 
6150185029Spjd */ 6151185029Spjd if (curproc == pageproc) { 6152272483Ssmh if (page_load > MAX(ptob(minfree), available_memory) / 4) 6153249195Smm return (SET_ERROR(ERESTART)); 6154185029Spjd /* Note: reserve is inflated, so we deflate */ 6155185029Spjd page_load += reserve / 8; 6156185029Spjd return (0); 6157185029Spjd } else if (page_load > 0 && arc_reclaim_needed()) { 6158185029Spjd /* memory is low, delay before restarting */ 6159185029Spjd ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 6160249195Smm return (SET_ERROR(EAGAIN)); 6161185029Spjd } 6162185029Spjd page_load = 0; 6163185029Spjd#endif 6164185029Spjd return (0); 6165185029Spjd} 6166185029Spjd 6167168404Spjdvoid 6168185029Spjdarc_tempreserve_clear(uint64_t reserve) 6169168404Spjd{ 6170185029Spjd atomic_add_64(&arc_tempreserve, -reserve); 6171168404Spjd ASSERT((int64_t)arc_tempreserve >= 0); 6172168404Spjd} 6173168404Spjd 6174168404Spjdint 6175185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg) 6176168404Spjd{ 6177185029Spjd int error; 6178209962Smm uint64_t anon_size; 6179185029Spjd 6180272483Ssmh if (reserve > arc_c/4 && !arc_no_grow) { 6181185029Spjd arc_c = MIN(arc_c_max, reserve * 4); 6182272483Ssmh DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 6183272483Ssmh } 6184185029Spjd if (reserve > arc_c) 6185249195Smm return (SET_ERROR(ENOMEM)); 6186168404Spjd 6187168404Spjd /* 6188209962Smm * Don't count loaned bufs as in flight dirty data to prevent long 6189209962Smm * network delays from blocking transactions that are ready to be 6190209962Smm * assigned to a txg. 
6191209962Smm */ 6192321535Smav 6193321535Smav /* assert that it has not wrapped around */ 6194321535Smav ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 6195321535Smav 6196286766Smav anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 6197286766Smav arc_loaned_bytes), 0); 6198209962Smm 6199209962Smm /* 6200185029Spjd * Writes will, almost always, require additional memory allocations 6201251631Sdelphij * in order to compress/encrypt/etc the data. We therefore need to 6202185029Spjd * make sure that there is sufficient available memory for this. 6203185029Spjd */ 6204258632Savg error = arc_memory_throttle(reserve, txg); 6205258632Savg if (error != 0) 6206185029Spjd return (error); 6207185029Spjd 6208185029Spjd /* 6209168404Spjd * Throttle writes when the amount of dirty data in the cache 6210168404Spjd * gets too large. We try to keep the cache less than half full 6211168404Spjd * of dirty blocks so that our sync times don't grow too large. 6212168404Spjd * Note: if two requests come in concurrently, we might let them 6213168404Spjd * both succeed, when one of them should fail. Not a huge deal. 
6214168404Spjd */ 6215209962Smm 6216209962Smm if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 6217209962Smm anon_size > arc_c / 4) { 6218307265Smav uint64_t meta_esize = 6219307265Smav refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6220307265Smav uint64_t data_esize = 6221307265Smav refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6222185029Spjd dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 6223185029Spjd "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 6224307265Smav arc_tempreserve >> 10, meta_esize >> 10, 6225307265Smav data_esize >> 10, reserve >> 10, arc_c >> 10); 6226249195Smm return (SET_ERROR(ERESTART)); 6227168404Spjd } 6228185029Spjd atomic_add_64(&arc_tempreserve, reserve); 6229168404Spjd return (0); 6230168404Spjd} 6231168404Spjd 6232286626Smavstatic void 6233286626Smavarc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 6234286626Smav kstat_named_t *evict_data, kstat_named_t *evict_metadata) 6235286626Smav{ 6236286766Smav size->value.ui64 = refcount_count(&state->arcs_size); 6237307265Smav evict_data->value.ui64 = 6238307265Smav refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); 6239307265Smav evict_metadata->value.ui64 = 6240307265Smav refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); 6241286626Smav} 6242286626Smav 6243286626Smavstatic int 6244286626Smavarc_kstat_update(kstat_t *ksp, int rw) 6245286626Smav{ 6246286626Smav arc_stats_t *as = ksp->ks_data; 6247286626Smav 6248286626Smav if (rw == KSTAT_WRITE) { 6249286626Smav return (EACCES); 6250286626Smav } else { 6251286626Smav arc_kstat_update_state(arc_anon, 6252286626Smav &as->arcstat_anon_size, 6253286626Smav &as->arcstat_anon_evictable_data, 6254286626Smav &as->arcstat_anon_evictable_metadata); 6255286626Smav arc_kstat_update_state(arc_mru, 6256286626Smav &as->arcstat_mru_size, 6257286626Smav &as->arcstat_mru_evictable_data, 6258286626Smav &as->arcstat_mru_evictable_metadata); 6259286626Smav arc_kstat_update_state(arc_mru_ghost, 6260286626Smav 
&as->arcstat_mru_ghost_size, 6261286626Smav &as->arcstat_mru_ghost_evictable_data, 6262286626Smav &as->arcstat_mru_ghost_evictable_metadata); 6263286626Smav arc_kstat_update_state(arc_mfu, 6264286626Smav &as->arcstat_mfu_size, 6265286626Smav &as->arcstat_mfu_evictable_data, 6266286626Smav &as->arcstat_mfu_evictable_metadata); 6267286626Smav arc_kstat_update_state(arc_mfu_ghost, 6268286626Smav &as->arcstat_mfu_ghost_size, 6269286626Smav &as->arcstat_mfu_ghost_evictable_data, 6270286626Smav &as->arcstat_mfu_ghost_evictable_metadata); 6271286626Smav } 6272286626Smav 6273286626Smav return (0); 6274286626Smav} 6275286626Smav 6276286763Smav/* 6277286763Smav * This function *must* return indices evenly distributed between all 6278286763Smav * sublists of the multilist. This is needed due to how the ARC eviction 6279286763Smav * code is laid out; arc_evict_state() assumes ARC buffers are evenly 6280286763Smav * distributed between all sublists and uses this assumption when 6281286763Smav * deciding which sublist to evict from and how much to evict from it. 6282286763Smav */ 6283286763Smavunsigned int 6284286763Smavarc_state_multilist_index_func(multilist_t *ml, void *obj) 6285286763Smav{ 6286286763Smav arc_buf_hdr_t *hdr = obj; 6287286763Smav 6288286763Smav /* 6289286763Smav * We rely on b_dva to generate evenly distributed index 6290286763Smav * numbers using buf_hash below. So, as an added precaution, 6291286763Smav * let's make sure we never add empty buffers to the arc lists. 6292286763Smav */ 6293307265Smav ASSERT(!HDR_EMPTY(hdr)); 6294286763Smav 6295286763Smav /* 6296286763Smav * The assumption here, is the hash value for a given 6297286763Smav * arc_buf_hdr_t will remain constant throughout it's lifetime 6298286763Smav * (i.e. it's b_spa, b_dva, and b_birth fields don't change). 6299286763Smav * Thus, we don't need to store the header's sublist index 6300286763Smav * on insertion, as this index can be recalculated on removal. 
6301286763Smav * 6302286763Smav * Also, the low order bits of the hash value are thought to be 6303286763Smav * distributed evenly. Otherwise, in the case that the multilist 6304286763Smav * has a power of two number of sublists, each sublists' usage 6305286763Smav * would not be evenly distributed. 6306286763Smav */ 6307286763Smav return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 6308286763Smav multilist_get_num_sublists(ml)); 6309286763Smav} 6310286763Smav 6311168404Spjd#ifdef _KERNEL 6312168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL; 6313168404Spjd 6314168404Spjdstatic void 6315168566Spjdarc_lowmem(void *arg __unused, int howto __unused) 6316168404Spjd{ 6317168404Spjd 6318286763Smav mutex_enter(&arc_reclaim_lock); 6319326619Sbapt DTRACE_PROBE1(arc__needfree, int64_t, ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE); 6320286763Smav cv_signal(&arc_reclaim_thread_cv); 6321241773Savg 6322241773Savg /* 6323241773Savg * It is unsafe to block here in arbitrary threads, because we can come 6324241773Savg * here from ARC itself and may hold ARC locks and thus risk a deadlock 6325241773Savg * with ARC reclaim thread. 
6326241773Savg */ 6327286623Smav if (curproc == pageproc) 6328286763Smav (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 6329286763Smav mutex_exit(&arc_reclaim_lock); 6330168404Spjd} 6331168404Spjd#endif 6332168404Spjd 6333307265Smavstatic void 6334307265Smavarc_state_init(void) 6335307265Smav{ 6336307265Smav arc_anon = &ARC_anon; 6337307265Smav arc_mru = &ARC_mru; 6338307265Smav arc_mru_ghost = &ARC_mru_ghost; 6339307265Smav arc_mfu = &ARC_mfu; 6340307265Smav arc_mfu_ghost = &ARC_mfu_ghost; 6341307265Smav arc_l2c_only = &ARC_l2c_only; 6342307265Smav 6343321553Smav arc_mru->arcs_list[ARC_BUFC_METADATA] = 6344321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6345307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6346321552Smav arc_state_multilist_index_func); 6347321553Smav arc_mru->arcs_list[ARC_BUFC_DATA] = 6348321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6349307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6350321552Smav arc_state_multilist_index_func); 6351321553Smav arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = 6352321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6353307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6354321552Smav arc_state_multilist_index_func); 6355321553Smav arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = 6356321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6357307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6358321552Smav arc_state_multilist_index_func); 6359321553Smav arc_mfu->arcs_list[ARC_BUFC_METADATA] = 6360321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6361307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6362321552Smav arc_state_multilist_index_func); 6363321553Smav arc_mfu->arcs_list[ARC_BUFC_DATA] = 6364321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6365307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6366321552Smav arc_state_multilist_index_func); 6367321553Smav arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = 6368321553Smav multilist_create(sizeof (arc_buf_hdr_t), 
6369307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6370321552Smav arc_state_multilist_index_func); 6371321553Smav arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = 6372321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6373307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6374321552Smav arc_state_multilist_index_func); 6375321553Smav arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = 6376321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6377307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6378321552Smav arc_state_multilist_index_func); 6379321553Smav arc_l2c_only->arcs_list[ARC_BUFC_DATA] = 6380321553Smav multilist_create(sizeof (arc_buf_hdr_t), 6381307265Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6382321552Smav arc_state_multilist_index_func); 6383307265Smav 6384307265Smav refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6385307265Smav refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6386307265Smav refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6387307265Smav refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6388307265Smav refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6389307265Smav refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6390307265Smav refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6391307265Smav refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6392307265Smav refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6393307265Smav refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6394307265Smav refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6395307265Smav refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6396307265Smav 6397307265Smav refcount_create(&arc_anon->arcs_size); 6398307265Smav refcount_create(&arc_mru->arcs_size); 6399307265Smav refcount_create(&arc_mru_ghost->arcs_size); 6400307265Smav refcount_create(&arc_mfu->arcs_size); 6401307265Smav refcount_create(&arc_mfu_ghost->arcs_size); 6402307265Smav 
refcount_create(&arc_l2c_only->arcs_size); 6403307265Smav} 6404307265Smav 6405307265Smavstatic void 6406307265Smavarc_state_fini(void) 6407307265Smav{ 6408307265Smav refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6409307265Smav refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6410307265Smav refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6411307265Smav refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6412307265Smav refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6413307265Smav refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6414307265Smav refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6415307265Smav refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6416307265Smav refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6417307265Smav refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6418307265Smav refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6419307265Smav refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6420307265Smav 6421307265Smav refcount_destroy(&arc_anon->arcs_size); 6422307265Smav refcount_destroy(&arc_mru->arcs_size); 6423307265Smav refcount_destroy(&arc_mru_ghost->arcs_size); 6424307265Smav refcount_destroy(&arc_mfu->arcs_size); 6425307265Smav refcount_destroy(&arc_mfu_ghost->arcs_size); 6426307265Smav refcount_destroy(&arc_l2c_only->arcs_size); 6427307265Smav 6428321553Smav multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); 6429321553Smav multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 6430321553Smav multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); 6431321553Smav multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 6432321553Smav multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); 6433321553Smav multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 6434321553Smav multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); 6435321553Smav 
multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 6436307265Smav} 6437307265Smav 6438307265Smavuint64_t 6439307265Smavarc_max_bytes(void) 6440307265Smav{ 6441307265Smav return (arc_c_max); 6442307265Smav} 6443307265Smav 6444168404Spjdvoid 6445168404Spjdarc_init(void) 6446168404Spjd{ 6447219089Spjd int i, prefetch_tunable_set = 0; 6448205231Skmacy 6449321562Smav /* 6450321562Smav * allmem is "all memory that we could possibly use". 6451321562Smav */ 6452321562Smav#ifdef illumos 6453321562Smav#ifdef _KERNEL 6454321562Smav uint64_t allmem = ptob(physmem - swapfs_minfree); 6455321562Smav#else 6456321562Smav uint64_t allmem = (physmem * PAGESIZE) / 2; 6457321562Smav#endif 6458321562Smav#else 6459321562Smav uint64_t allmem = kmem_size(); 6460321562Smav#endif 6461321562Smav 6462321562Smav 6463286763Smav mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 6464286763Smav cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); 6465286763Smav cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); 6466168404Spjd 6467301997Skib mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); 6468301997Skib cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); 6469301997Skib 6470168404Spjd /* Convert seconds to clock ticks */ 6471168404Spjd arc_min_prefetch_lifespan = 1 * hz; 6472168404Spjd 6473302265Ssmh /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ 6474321562Smav arc_c_min = MAX(allmem / 32, arc_abs_min); 6475321562Smav /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ 6476321562Smav if (allmem >= 1 << 30) 6477321562Smav arc_c_max = allmem - (1 << 30); 6478168404Spjd else 6479168404Spjd arc_c_max = arc_c_min; 6480321562Smav arc_c_max = MAX(allmem * 5 / 8, arc_c_max); 6481219089Spjd 6482289305Smav /* 6483289305Smav * In userland, there's only the memory pressure that we artificially 6484289305Smav * create (see arc_available_memory()). 
Don't let arc_c get too 6485289305Smav * small, because it can cause transactions to be larger than 6486289305Smav * arc_c, causing arc_tempreserve_space() to fail. 6487289305Smav */ 6488289305Smav#ifndef _KERNEL 6489289305Smav arc_c_min = arc_c_max / 2; 6490289305Smav#endif 6491289305Smav 6492168481Spjd#ifdef _KERNEL 6493168404Spjd /* 6494168404Spjd * Allow the tunables to override our calculations if they are 6495302265Ssmh * reasonable. 6496168404Spjd */ 6497321562Smav if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { 6498168404Spjd arc_c_max = zfs_arc_max; 6499307297Smav arc_c_min = MIN(arc_c_min, arc_c_max); 6500307297Smav } 6501302265Ssmh if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) 6502168404Spjd arc_c_min = zfs_arc_min; 6503168481Spjd#endif 6504219089Spjd 6505168404Spjd arc_c = arc_c_max; 6506168404Spjd arc_p = (arc_c >> 1); 6507307265Smav arc_size = 0; 6508168404Spjd 6509185029Spjd /* limit meta-data to 1/4 of the arc capacity */ 6510185029Spjd arc_meta_limit = arc_c_max / 4; 6511185029Spjd 6512321563Smav#ifdef _KERNEL 6513321563Smav /* 6514321563Smav * Metadata is stored in the kernel's heap. Don't let us 6515321563Smav * use more than half the heap for the ARC. 
6516321563Smav */ 6517321563Smav arc_meta_limit = MIN(arc_meta_limit, 6518321563Smav vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); 6519321563Smav#endif 6520321563Smav 6521185029Spjd /* Allow the tunable to override if it is reasonable */ 6522185029Spjd if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 6523185029Spjd arc_meta_limit = zfs_arc_meta_limit; 6524185029Spjd 6525185029Spjd if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 6526185029Spjd arc_c_min = arc_meta_limit / 2; 6527185029Spjd 6528275780Sdelphij if (zfs_arc_meta_min > 0) { 6529275780Sdelphij arc_meta_min = zfs_arc_meta_min; 6530275780Sdelphij } else { 6531275780Sdelphij arc_meta_min = arc_c_min / 2; 6532275780Sdelphij } 6533275780Sdelphij 6534208373Smm if (zfs_arc_grow_retry > 0) 6535208373Smm arc_grow_retry = zfs_arc_grow_retry; 6536208373Smm 6537208373Smm if (zfs_arc_shrink_shift > 0) 6538208373Smm arc_shrink_shift = zfs_arc_shrink_shift; 6539208373Smm 6540323667Sbapt if (zfs_arc_no_grow_shift > 0) 6541323667Sbapt arc_no_grow_shift = zfs_arc_no_grow_shift; 6542286625Smav /* 6543286625Smav * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
6544286625Smav */ 6545286625Smav if (arc_no_grow_shift >= arc_shrink_shift) 6546286625Smav arc_no_grow_shift = arc_shrink_shift - 1; 6547286625Smav 6548208373Smm if (zfs_arc_p_min_shift > 0) 6549208373Smm arc_p_min_shift = zfs_arc_p_min_shift; 6550208373Smm 6551168404Spjd /* if kmem_flags are set, lets try to use less memory */ 6552168404Spjd if (kmem_debugging()) 6553168404Spjd arc_c = arc_c / 2; 6554168404Spjd if (arc_c < arc_c_min) 6555168404Spjd arc_c = arc_c_min; 6556168404Spjd 6557168473Spjd zfs_arc_min = arc_c_min; 6558168473Spjd zfs_arc_max = arc_c_max; 6559168473Spjd 6560307265Smav arc_state_init(); 6561168404Spjd buf_init(); 6562168404Spjd 6563307265Smav arc_reclaim_thread_exit = B_FALSE; 6564301997Skib arc_dnlc_evicts_thread_exit = FALSE; 6565168404Spjd 6566168404Spjd arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 6567168404Spjd sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 6568168404Spjd 6569168404Spjd if (arc_ksp != NULL) { 6570168404Spjd arc_ksp->ks_data = &arc_stats; 6571286574Smav arc_ksp->ks_update = arc_kstat_update; 6572168404Spjd kstat_install(arc_ksp); 6573168404Spjd } 6574168404Spjd 6575168404Spjd (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 6576168404Spjd TS_RUN, minclsyspri); 6577168404Spjd 6578168404Spjd#ifdef _KERNEL 6579168566Spjd arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 6580168404Spjd EVENTHANDLER_PRI_FIRST); 6581168404Spjd#endif 6582168404Spjd 6583301997Skib (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, 6584301997Skib TS_RUN, minclsyspri); 6585301997Skib 6586307265Smav arc_dead = B_FALSE; 6587185029Spjd arc_warm = B_FALSE; 6588168566Spjd 6589258632Savg /* 6590258632Savg * Calculate maximum amount of dirty data per pool. 6591258632Savg * 6592258632Savg * If it has been set by /etc/system, take that. 
6593258632Savg * Otherwise, use a percentage of physical memory defined by 6594258632Savg * zfs_dirty_data_max_percent (default 10%) with a cap at 6595258632Savg * zfs_dirty_data_max_max (default 4GB). 6596258632Savg */ 6597258632Savg if (zfs_dirty_data_max == 0) { 6598258632Savg zfs_dirty_data_max = ptob(physmem) * 6599258632Savg zfs_dirty_data_max_percent / 100; 6600258632Savg zfs_dirty_data_max = MIN(zfs_dirty_data_max, 6601258632Savg zfs_dirty_data_max_max); 6602258632Savg } 6603185029Spjd 6604168566Spjd#ifdef _KERNEL 6605194043Skmacy if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 6606193953Skmacy prefetch_tunable_set = 1; 6607206796Spjd 6608193878Skmacy#ifdef __i386__ 6609193953Skmacy if (prefetch_tunable_set == 0) { 6610196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 6611196863Strasz "-- to enable,\n"); 6612196863Strasz printf(" add \"vfs.zfs.prefetch_disable=0\" " 6613196863Strasz "to /boot/loader.conf.\n"); 6614219089Spjd zfs_prefetch_disable = 1; 6615193878Skmacy } 6616206796Spjd#else 6617193878Skmacy if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 6618193953Skmacy prefetch_tunable_set == 0) { 6619196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default if less " 6620196941Strasz "than 4GB of RAM is present;\n" 6621196863Strasz " to enable, add \"vfs.zfs.prefetch_disable=0\" " 6622196863Strasz "to /boot/loader.conf.\n"); 6623219089Spjd zfs_prefetch_disable = 1; 6624193878Skmacy } 6625206796Spjd#endif 6626175633Spjd /* Warn about ZFS memory and address space requirements. 
*/ 6627168696Spjd if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 6628168987Sbmah printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 6629168987Sbmah "expect unstable behavior.\n"); 6630175633Spjd } 6631321562Smav if (allmem < 512 * (1 << 20)) { 6632173419Spjd printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 6633168987Sbmah "expect unstable behavior.\n"); 6634185029Spjd printf(" Consider tuning vm.kmem_size and " 6635173419Spjd "vm.kmem_size_max\n"); 6636185029Spjd printf(" in /boot/loader.conf.\n"); 6637168566Spjd } 6638168566Spjd#endif 6639168404Spjd} 6640168404Spjd 6641168404Spjdvoid 6642168404Spjdarc_fini(void) 6643168404Spjd{ 6644327491Smarkj#ifdef _KERNEL 6645327491Smarkj if (arc_event_lowmem != NULL) 6646327491Smarkj EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 6647327491Smarkj#endif 6648327491Smarkj 6649286763Smav mutex_enter(&arc_reclaim_lock); 6650307265Smav arc_reclaim_thread_exit = B_TRUE; 6651286763Smav /* 6652286763Smav * The reclaim thread will set arc_reclaim_thread_exit back to 6653307265Smav * B_FALSE when it is finished exiting; we're waiting for that. 6654286763Smav */ 6655286763Smav while (arc_reclaim_thread_exit) { 6656286763Smav cv_signal(&arc_reclaim_thread_cv); 6657286763Smav cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); 6658286763Smav } 6659286763Smav mutex_exit(&arc_reclaim_lock); 6660168404Spjd 6661307265Smav /* Use B_TRUE to ensure *all* buffers are evicted */ 6662307265Smav arc_flush(NULL, B_TRUE); 6663168404Spjd 6664301997Skib mutex_enter(&arc_dnlc_evicts_lock); 6665301997Skib arc_dnlc_evicts_thread_exit = TRUE; 6666301997Skib /* 6667301997Skib * The user evicts thread will set arc_user_evicts_thread_exit 6668301997Skib * to FALSE when it is finished exiting; we're waiting for that. 
6669301997Skib */ 6670301997Skib while (arc_dnlc_evicts_thread_exit) { 6671301997Skib cv_signal(&arc_dnlc_evicts_cv); 6672301997Skib cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 6673301997Skib } 6674301997Skib mutex_exit(&arc_dnlc_evicts_lock); 6675301997Skib 6676307265Smav arc_dead = B_TRUE; 6677286763Smav 6678168404Spjd if (arc_ksp != NULL) { 6679168404Spjd kstat_delete(arc_ksp); 6680168404Spjd arc_ksp = NULL; 6681168404Spjd } 6682168404Spjd 6683286763Smav mutex_destroy(&arc_reclaim_lock); 6684286763Smav cv_destroy(&arc_reclaim_thread_cv); 6685286763Smav cv_destroy(&arc_reclaim_waiters_cv); 6686168404Spjd 6687301997Skib mutex_destroy(&arc_dnlc_evicts_lock); 6688301997Skib cv_destroy(&arc_dnlc_evicts_cv); 6689301997Skib 6690307265Smav arc_state_fini(); 6691168404Spjd buf_fini(); 6692168404Spjd 6693286570Smav ASSERT0(arc_loaned_bytes); 6694168404Spjd} 6695185029Spjd 6696185029Spjd/* 6697185029Spjd * Level 2 ARC 6698185029Spjd * 6699185029Spjd * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 6700185029Spjd * It uses dedicated storage devices to hold cached data, which are populated 6701185029Spjd * using large infrequent writes. The main role of this cache is to boost 6702185029Spjd * the performance of random read workloads. The intended L2ARC devices 6703185029Spjd * include short-stroked disks, solid state disks, and other media with 6704185029Spjd * substantially faster read latency than disk. 
6705185029Spjd * 6706185029Spjd * +-----------------------+ 6707185029Spjd * | ARC | 6708185029Spjd * +-----------------------+ 6709185029Spjd * | ^ ^ 6710185029Spjd * | | | 6711185029Spjd * l2arc_feed_thread() arc_read() 6712185029Spjd * | | | 6713185029Spjd * | l2arc read | 6714185029Spjd * V | | 6715185029Spjd * +---------------+ | 6716185029Spjd * | L2ARC | | 6717185029Spjd * +---------------+ | 6718185029Spjd * | ^ | 6719185029Spjd * l2arc_write() | | 6720185029Spjd * | | | 6721185029Spjd * V | | 6722185029Spjd * +-------+ +-------+ 6723185029Spjd * | vdev | | vdev | 6724185029Spjd * | cache | | cache | 6725185029Spjd * +-------+ +-------+ 6726185029Spjd * +=========+ .-----. 6727185029Spjd * : L2ARC : |-_____-| 6728185029Spjd * : devices : | Disks | 6729185029Spjd * +=========+ `-_____-' 6730185029Spjd * 6731185029Spjd * Read requests are satisfied from the following sources, in order: 6732185029Spjd * 6733185029Spjd * 1) ARC 6734185029Spjd * 2) vdev cache of L2ARC devices 6735185029Spjd * 3) L2ARC devices 6736185029Spjd * 4) vdev cache of disks 6737185029Spjd * 5) disks 6738185029Spjd * 6739185029Spjd * Some L2ARC device types exhibit extremely slow write performance. 6740185029Spjd * To accommodate for this there are some significant differences between 6741185029Spjd * the L2ARC and traditional cache design: 6742185029Spjd * 6743185029Spjd * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 6744185029Spjd * the ARC behave as usual, freeing buffers and placing headers on ghost 6745185029Spjd * lists. The ARC does not send buffers to the L2ARC during eviction as 6746185029Spjd * this would add inflated write latencies for all ARC memory pressure. 6747185029Spjd * 6748185029Spjd * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 
6749185029Spjd * It does this by periodically scanning buffers from the eviction-end of 6750185029Spjd * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 6751251478Sdelphij * not already there. It scans until a headroom of buffers is satisfied, 6752251478Sdelphij * which itself is a buffer for ARC eviction. If a compressible buffer is 6753251478Sdelphij * found during scanning and selected for writing to an L2ARC device, we 6754251478Sdelphij * temporarily boost scanning headroom during the next scan cycle to make 6755251478Sdelphij * sure we adapt to compression effects (which might significantly reduce 6756251478Sdelphij * the data volume we write to L2ARC). The thread that does this is 6757185029Spjd * l2arc_feed_thread(), illustrated below; example sizes are included to 6758185029Spjd * provide a better sense of ratio than this diagram: 6759185029Spjd * 6760185029Spjd * head --> tail 6761185029Spjd * +---------------------+----------+ 6762185029Spjd * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 6763185029Spjd * +---------------------+----------+ | o L2ARC eligible 6764185029Spjd * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 6765185029Spjd * +---------------------+----------+ | 6766185029Spjd * 15.9 Gbytes ^ 32 Mbytes | 6767185029Spjd * headroom | 6768185029Spjd * l2arc_feed_thread() 6769185029Spjd * | 6770185029Spjd * l2arc write hand <--[oooo]--' 6771185029Spjd * | 8 Mbyte 6772185029Spjd * | write max 6773185029Spjd * V 6774185029Spjd * +==============================+ 6775185029Spjd * L2ARC dev |####|#|###|###| |####| ... | 6776185029Spjd * +==============================+ 6777185029Spjd * 32 Gbytes 6778185029Spjd * 6779185029Spjd * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 6780185029Spjd * evicted, then the L2ARC has cached a buffer much sooner than it probably 6781185029Spjd * needed to, potentially wasting L2ARC device bandwidth and storage. 
It is 6782185029Spjd * safe to say that this is an uncommon case, since buffers at the end of 6783185029Spjd * the ARC lists have moved there due to inactivity. 6784185029Spjd * 6785185029Spjd * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 6786185029Spjd * then the L2ARC simply misses copying some buffers. This serves as a 6787185029Spjd * pressure valve to prevent heavy read workloads from both stalling the ARC 6788185029Spjd * with waits and clogging the L2ARC with writes. This also helps prevent 6789185029Spjd * the potential for the L2ARC to churn if it attempts to cache content too 6790185029Spjd * quickly, such as during backups of the entire pool. 6791185029Spjd * 6792185029Spjd * 5. After system boot and before the ARC has filled main memory, there are 6793185029Spjd * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 6794185029Spjd * lists can remain mostly static. Instead of searching from tail of these 6795185029Spjd * lists as pictured, the l2arc_feed_thread() will search from the list heads 6796185029Spjd * for eligible buffers, greatly increasing its chance of finding them. 6797185029Spjd * 6798185029Spjd * The L2ARC device write speed is also boosted during this time so that 6799185029Spjd * the L2ARC warms up faster. Since there have been no ARC evictions yet, 6800185029Spjd * there are no L2ARC reads, and no fear of degrading read performance 6801185029Spjd * through increased writes. 6802185029Spjd * 6803185029Spjd * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 6804185029Spjd * the vdev queue can aggregate them into larger and fewer writes. Each 6805185029Spjd * device is written to in a rotor fashion, sweeping writes through 6806185029Spjd * available space then repeating. 6807185029Spjd * 6808185029Spjd * 7. The L2ARC does not store dirty content. It never needs to flush 6809185029Spjd * write buffers back to disk based storage. 6810185029Spjd * 6811185029Spjd * 8. 
If an ARC buffer is written (and dirtied) which also exists in the 6812185029Spjd * L2ARC, the now stale L2ARC buffer is immediately dropped. 6813185029Spjd * 6814185029Spjd * The performance of the L2ARC can be tweaked by a number of tunables, which 6815185029Spjd * may be necessary for different workloads: 6816185029Spjd * 6817185029Spjd * l2arc_write_max max write bytes per interval 6818185029Spjd * l2arc_write_boost extra write bytes during device warmup 6819185029Spjd * l2arc_noprefetch skip caching prefetched buffers 6820185029Spjd * l2arc_headroom number of max device writes to precache 6821251478Sdelphij * l2arc_headroom_boost when we find compressed buffers during ARC 6822251478Sdelphij * scanning, we multiply headroom by this 6823251478Sdelphij * percentage factor for the next scan cycle, 6824251478Sdelphij * since more compressed buffers are likely to 6825251478Sdelphij * be present 6826185029Spjd * l2arc_feed_secs seconds between L2ARC writing 6827185029Spjd * 6828185029Spjd * Tunables may be removed or added as future performance improvements are 6829185029Spjd * integrated, and also may become zpool properties. 6830208373Smm * 6831208373Smm * There are three key functions that control how the L2ARC warms up: 6832208373Smm * 6833208373Smm * l2arc_write_eligible() check if a buffer is eligible to cache 6834208373Smm * l2arc_write_size() calculate how much to write 6835208373Smm * l2arc_write_interval() calculate sleep delay between writes 6836208373Smm * 6837208373Smm * These three functions determine what to write, how much, and how quickly 6838208373Smm * to send writes. 6839185029Spjd */ 6840185029Spjd 6841208373Smmstatic boolean_t 6842275811Sdelphijl2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 6843208373Smm{ 6844208373Smm /* 6845208373Smm * A buffer is *not* eligible for the L2ARC if it: 6846208373Smm * 1. belongs to a different spa. 6847208373Smm * 2. is already cached on the L2ARC. 6848208373Smm * 3. 
has an I/O in progress (it may be an incomplete read). 6849208373Smm * 4. is flagged not eligible (zfs property). 6850208373Smm */ 6851275811Sdelphij if (hdr->b_spa != spa_guid) { 6852208373Smm ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 6853208373Smm return (B_FALSE); 6854208373Smm } 6855286570Smav if (HDR_HAS_L2HDR(hdr)) { 6856208373Smm ARCSTAT_BUMP(arcstat_l2_write_in_l2); 6857208373Smm return (B_FALSE); 6858208373Smm } 6859275811Sdelphij if (HDR_IO_IN_PROGRESS(hdr)) { 6860208373Smm ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 6861208373Smm return (B_FALSE); 6862208373Smm } 6863275811Sdelphij if (!HDR_L2CACHE(hdr)) { 6864208373Smm ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 6865208373Smm return (B_FALSE); 6866208373Smm } 6867208373Smm 6868208373Smm return (B_TRUE); 6869208373Smm} 6870208373Smm 6871208373Smmstatic uint64_t 6872251478Sdelphijl2arc_write_size(void) 6873208373Smm{ 6874208373Smm uint64_t size; 6875208373Smm 6876251478Sdelphij /* 6877251478Sdelphij * Make sure our globals have meaningful values in case the user 6878251478Sdelphij * altered them. 6879251478Sdelphij */ 6880251478Sdelphij size = l2arc_write_max; 6881251478Sdelphij if (size == 0) { 6882251478Sdelphij cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 6883251478Sdelphij "be greater than zero, resetting it to the default (%d)", 6884251478Sdelphij L2ARC_WRITE_SIZE); 6885251478Sdelphij size = l2arc_write_max = L2ARC_WRITE_SIZE; 6886251478Sdelphij } 6887208373Smm 6888208373Smm if (arc_warm == B_FALSE) 6889251478Sdelphij size += l2arc_write_boost; 6890208373Smm 6891208373Smm return (size); 6892208373Smm 6893208373Smm} 6894208373Smm 6895208373Smmstatic clock_t 6896208373Smml2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 6897208373Smm{ 6898219089Spjd clock_t interval, next, now; 6899208373Smm 6900208373Smm /* 6901208373Smm * If the ARC lists are busy, increase our write rate; if the 6902208373Smm * lists are stale, idle back. 
This is achieved by checking 6903208373Smm * how much we previously wrote - if it was more than half of 6904208373Smm * what we wanted, schedule the next write much sooner. 6905208373Smm */ 6906208373Smm if (l2arc_feed_again && wrote > (wanted / 2)) 6907208373Smm interval = (hz * l2arc_feed_min_ms) / 1000; 6908208373Smm else 6909208373Smm interval = hz * l2arc_feed_secs; 6910208373Smm 6911219089Spjd now = ddi_get_lbolt(); 6912219089Spjd next = MAX(now, MIN(now + interval, began + interval)); 6913208373Smm 6914208373Smm return (next); 6915208373Smm} 6916208373Smm 6917185029Spjd/* 6918185029Spjd * Cycle through L2ARC devices. This is how L2ARC load balances. 6919185029Spjd * If a device is returned, this also returns holding the spa config lock. 6920185029Spjd */ 6921185029Spjdstatic l2arc_dev_t * 6922185029Spjdl2arc_dev_get_next(void) 6923185029Spjd{ 6924185029Spjd l2arc_dev_t *first, *next = NULL; 6925185029Spjd 6926185029Spjd /* 6927185029Spjd * Lock out the removal of spas (spa_namespace_lock), then removal 6928185029Spjd * of cache devices (l2arc_dev_mtx). Once a device has been selected, 6929185029Spjd * both locks will be dropped and a spa config lock held instead. 
6930185029Spjd */ 6931185029Spjd mutex_enter(&spa_namespace_lock); 6932185029Spjd mutex_enter(&l2arc_dev_mtx); 6933185029Spjd 6934185029Spjd /* if there are no vdevs, there is nothing to do */ 6935185029Spjd if (l2arc_ndev == 0) 6936185029Spjd goto out; 6937185029Spjd 6938185029Spjd first = NULL; 6939185029Spjd next = l2arc_dev_last; 6940185029Spjd do { 6941185029Spjd /* loop around the list looking for a non-faulted vdev */ 6942185029Spjd if (next == NULL) { 6943185029Spjd next = list_head(l2arc_dev_list); 6944185029Spjd } else { 6945185029Spjd next = list_next(l2arc_dev_list, next); 6946185029Spjd if (next == NULL) 6947185029Spjd next = list_head(l2arc_dev_list); 6948185029Spjd } 6949185029Spjd 6950185029Spjd /* if we have come back to the start, bail out */ 6951185029Spjd if (first == NULL) 6952185029Spjd first = next; 6953185029Spjd else if (next == first) 6954185029Spjd break; 6955185029Spjd 6956185029Spjd } while (vdev_is_dead(next->l2ad_vdev)); 6957185029Spjd 6958185029Spjd /* if we were unable to find any usable vdevs, return NULL */ 6959185029Spjd if (vdev_is_dead(next->l2ad_vdev)) 6960185029Spjd next = NULL; 6961185029Spjd 6962185029Spjd l2arc_dev_last = next; 6963185029Spjd 6964185029Spjdout: 6965185029Spjd mutex_exit(&l2arc_dev_mtx); 6966185029Spjd 6967185029Spjd /* 6968185029Spjd * Grab the config lock to prevent the 'next' device from being 6969185029Spjd * removed while we are writing to it. 6970185029Spjd */ 6971185029Spjd if (next != NULL) 6972185029Spjd spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 6973185029Spjd mutex_exit(&spa_namespace_lock); 6974185029Spjd 6975185029Spjd return (next); 6976185029Spjd} 6977185029Spjd 6978185029Spjd/* 6979185029Spjd * Free buffers that were tagged for destruction. 
6980185029Spjd */ 6981185029Spjdstatic void 6982185029Spjdl2arc_do_free_on_write() 6983185029Spjd{ 6984185029Spjd list_t *buflist; 6985185029Spjd l2arc_data_free_t *df, *df_prev; 6986185029Spjd 6987185029Spjd mutex_enter(&l2arc_free_on_write_mtx); 6988185029Spjd buflist = l2arc_free_on_write; 6989185029Spjd 6990185029Spjd for (df = list_tail(buflist); df; df = df_prev) { 6991185029Spjd df_prev = list_prev(buflist, df); 6992321610Smav ASSERT3P(df->l2df_abd, !=, NULL); 6993321610Smav abd_free(df->l2df_abd); 6994185029Spjd list_remove(buflist, df); 6995185029Spjd kmem_free(df, sizeof (l2arc_data_free_t)); 6996185029Spjd } 6997185029Spjd 6998185029Spjd mutex_exit(&l2arc_free_on_write_mtx); 6999185029Spjd} 7000185029Spjd 7001185029Spjd/* 7002185029Spjd * A write to a cache device has completed. Update all headers to allow 7003185029Spjd * reads from these buffers to begin. 7004185029Spjd */ 7005185029Spjdstatic void 7006185029Spjdl2arc_write_done(zio_t *zio) 7007185029Spjd{ 7008185029Spjd l2arc_write_callback_t *cb; 7009185029Spjd l2arc_dev_t *dev; 7010185029Spjd list_t *buflist; 7011275811Sdelphij arc_buf_hdr_t *head, *hdr, *hdr_prev; 7012185029Spjd kmutex_t *hash_lock; 7013268085Sdelphij int64_t bytes_dropped = 0; 7014185029Spjd 7015185029Spjd cb = zio->io_private; 7016307265Smav ASSERT3P(cb, !=, NULL); 7017185029Spjd dev = cb->l2wcb_dev; 7018307265Smav ASSERT3P(dev, !=, NULL); 7019185029Spjd head = cb->l2wcb_head; 7020307265Smav ASSERT3P(head, !=, NULL); 7021286570Smav buflist = &dev->l2ad_buflist; 7022307265Smav ASSERT3P(buflist, !=, NULL); 7023185029Spjd DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 7024185029Spjd l2arc_write_callback_t *, cb); 7025185029Spjd 7026185029Spjd if (zio->io_error != 0) 7027185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_error); 7028185029Spjd 7029185029Spjd /* 7030185029Spjd * All writes completed, or an error was hit. 
7031185029Spjd */ 7032286763Smavtop: 7033286763Smav mutex_enter(&dev->l2ad_mtx); 7034275811Sdelphij for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 7035275811Sdelphij hdr_prev = list_prev(buflist, hdr); 7036185029Spjd 7037275811Sdelphij hash_lock = HDR_LOCK(hdr); 7038286763Smav 7039286763Smav /* 7040286763Smav * We cannot use mutex_enter or else we can deadlock 7041286763Smav * with l2arc_write_buffers (due to swapping the order 7042286763Smav * the hash lock and l2ad_mtx are taken). 7043286763Smav */ 7044185029Spjd if (!mutex_tryenter(hash_lock)) { 7045185029Spjd /* 7046286763Smav * Missed the hash lock. We must retry so we 7047286763Smav * don't leave the ARC_FLAG_L2_WRITING bit set. 7048185029Spjd */ 7049286763Smav ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 7050286763Smav 7051286763Smav /* 7052286763Smav * We don't want to rescan the headers we've 7053286763Smav * already marked as having been written out, so 7054286763Smav * we reinsert the head node so we can pick up 7055286763Smav * where we left off. 7056286763Smav */ 7057286763Smav list_remove(buflist, head); 7058286763Smav list_insert_after(buflist, hdr, head); 7059286763Smav 7060286763Smav mutex_exit(&dev->l2ad_mtx); 7061286763Smav 7062286763Smav /* 7063286763Smav * We wait for the hash lock to become available 7064286763Smav * to try and prevent busy waiting, and increase 7065286763Smav * the chance we'll be able to acquire the lock 7066286763Smav * the next time around. 7067286763Smav */ 7068286763Smav mutex_enter(hash_lock); 7069286763Smav mutex_exit(hash_lock); 7070286763Smav goto top; 7071185029Spjd } 7072185029Spjd 7073286570Smav /* 7074286763Smav * We could not have been moved into the arc_l2c_only 7075286763Smav * state while in-flight due to our ARC_FLAG_L2_WRITING 7076286763Smav * bit being set. Let's just ensure that's being enforced. 
7077286570Smav */ 7078286763Smav ASSERT(HDR_HAS_L1HDR(hdr)); 7079286570Smav 7080185029Spjd if (zio->io_error != 0) { 7081185029Spjd /* 7082185029Spjd * Error - drop L2ARC entry. 7083185029Spjd */ 7084286776Smav list_remove(buflist, hdr); 7085290191Savg l2arc_trim(hdr); 7086307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 7087286570Smav 7088323754Savg ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr)); 7089323754Savg ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); 7090286598Smav 7091307265Smav bytes_dropped += arc_hdr_size(hdr); 7092286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 7093307265Smav arc_hdr_size(hdr), hdr); 7094185029Spjd } 7095185029Spjd 7096185029Spjd /* 7097286763Smav * Allow ARC to begin reads and ghost list evictions to 7098286763Smav * this L2ARC entry. 7099185029Spjd */ 7100307265Smav arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); 7101185029Spjd 7102185029Spjd mutex_exit(hash_lock); 7103185029Spjd } 7104185029Spjd 7105185029Spjd atomic_inc_64(&l2arc_writes_done); 7106185029Spjd list_remove(buflist, head); 7107286570Smav ASSERT(!HDR_HAS_L1HDR(head)); 7108286570Smav kmem_cache_free(hdr_l2only_cache, head); 7109286570Smav mutex_exit(&dev->l2ad_mtx); 7110185029Spjd 7111268085Sdelphij vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 7112268085Sdelphij 7113185029Spjd l2arc_do_free_on_write(); 7114185029Spjd 7115185029Spjd kmem_free(cb, sizeof (l2arc_write_callback_t)); 7116185029Spjd} 7117185029Spjd 7118185029Spjd/* 7119185029Spjd * A read to a cache device completed. Validate buffer contents before 7120185029Spjd * handing over to the regular ARC routines. 
7121185029Spjd */ 7122185029Spjdstatic void 7123185029Spjdl2arc_read_done(zio_t *zio) 7124185029Spjd{ 7125185029Spjd l2arc_read_callback_t *cb; 7126185029Spjd arc_buf_hdr_t *hdr; 7127185029Spjd kmutex_t *hash_lock; 7128307265Smav boolean_t valid_cksum; 7129185029Spjd 7130307265Smav ASSERT3P(zio->io_vd, !=, NULL); 7131185029Spjd ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 7132185029Spjd 7133185029Spjd spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 7134185029Spjd 7135185029Spjd cb = zio->io_private; 7136307265Smav ASSERT3P(cb, !=, NULL); 7137307265Smav hdr = cb->l2rcb_hdr; 7138307265Smav ASSERT3P(hdr, !=, NULL); 7139185029Spjd 7140307265Smav hash_lock = HDR_LOCK(hdr); 7141185029Spjd mutex_enter(hash_lock); 7142219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 7143185029Spjd 7144185029Spjd /* 7145297848Savg * If the data was read into a temporary buffer, 7146297848Savg * move it and free the buffer. 7147297848Savg */ 7148321610Smav if (cb->l2rcb_abd != NULL) { 7149307265Smav ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); 7150307265Smav if (zio->io_error == 0) { 7151321610Smav abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, 7152307265Smav arc_hdr_size(hdr)); 7153307265Smav } 7154297848Savg 7155297848Savg /* 7156297848Savg * The following must be done regardless of whether 7157297848Savg * there was an error: 7158297848Savg * - free the temporary buffer 7159297848Savg * - point zio to the real ARC buffer 7160297848Savg * - set zio size accordingly 7161297848Savg * These are required because zio is either re-used for 7162297848Savg * an I/O of the block in the case of the error 7163297848Savg * or the zio is passed to arc_read_done() and it 7164297848Savg * needs real data. 
7165297848Savg */ 7166321610Smav abd_free(cb->l2rcb_abd); 7167307265Smav zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); 7168321610Smav zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; 7169297848Savg } 7170297848Savg 7171321610Smav ASSERT3P(zio->io_abd, !=, NULL); 7172251478Sdelphij 7173251478Sdelphij /* 7174185029Spjd * Check this survived the L2ARC journey. 7175185029Spjd */ 7176321610Smav ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); 7177307265Smav zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 7178307265Smav zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 7179307265Smav 7180307265Smav valid_cksum = arc_cksum_is_equal(hdr, zio); 7181307265Smav if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 7182185029Spjd mutex_exit(hash_lock); 7183307265Smav zio->io_private = hdr; 7184185029Spjd arc_read_done(zio); 7185185029Spjd } else { 7186185029Spjd mutex_exit(hash_lock); 7187185029Spjd /* 7188185029Spjd * Buffer didn't survive caching. Increment stats and 7189185029Spjd * reissue to the original storage device. 7190185029Spjd */ 7191185029Spjd if (zio->io_error != 0) { 7192185029Spjd ARCSTAT_BUMP(arcstat_l2_io_error); 7193185029Spjd } else { 7194249195Smm zio->io_error = SET_ERROR(EIO); 7195185029Spjd } 7196307265Smav if (!valid_cksum) 7197185029Spjd ARCSTAT_BUMP(arcstat_l2_cksum_bad); 7198185029Spjd 7199185029Spjd /* 7200185029Spjd * If there's no waiter, issue an async i/o to the primary 7201185029Spjd * storage now. If there *is* a waiter, the caller must 7202185029Spjd * issue the i/o in a context where it's OK to block. 
7203185029Spjd */ 7204209962Smm if (zio->io_waiter == NULL) { 7205209962Smm zio_t *pio = zio_unique_parent(zio); 7206209962Smm 7207209962Smm ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 7208209962Smm 7209307265Smav zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, 7210321610Smav hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, 7211307265Smav hdr, zio->io_priority, cb->l2rcb_flags, 7212307265Smav &cb->l2rcb_zb)); 7213209962Smm } 7214185029Spjd } 7215185029Spjd 7216185029Spjd kmem_free(cb, sizeof (l2arc_read_callback_t)); 7217185029Spjd} 7218185029Spjd 7219185029Spjd/* 7220185029Spjd * This is the list priority from which the L2ARC will search for pages to 7221185029Spjd * cache. This is used within loops (0..3) to cycle through lists in the 7222185029Spjd * desired order. This order can have a significant effect on cache 7223185029Spjd * performance. 7224185029Spjd * 7225185029Spjd * Currently the metadata lists are hit first, MFU then MRU, followed by 7226185029Spjd * the data lists. This function returns a locked list, and also returns 7227185029Spjd * the lock pointer. 7228185029Spjd */ 7229286763Smavstatic multilist_sublist_t * 7230286763Smavl2arc_sublist_lock(int list_num) 7231185029Spjd{ 7232286763Smav multilist_t *ml = NULL; 7233286763Smav unsigned int idx; 7234185029Spjd 7235286762Smav ASSERT(list_num >= 0 && list_num <= 3); 7236206796Spjd 7237286762Smav switch (list_num) { 7238286762Smav case 0: 7239321553Smav ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; 7240286762Smav break; 7241286762Smav case 1: 7242321553Smav ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; 7243286762Smav break; 7244286762Smav case 2: 7245321553Smav ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; 7246286762Smav break; 7247286762Smav case 3: 7248321553Smav ml = arc_mru->arcs_list[ARC_BUFC_DATA]; 7249286762Smav break; 7250185029Spjd } 7251185029Spjd 7252286763Smav /* 7253286763Smav * Return a randomly-selected sublist. 
This is acceptable 7254286763Smav * because the caller feeds only a little bit of data for each 7255286763Smav * call (8MB). Subsequent calls will result in different 7256286763Smav * sublists being selected. 7257286763Smav */ 7258286763Smav idx = multilist_get_random_index(ml); 7259286763Smav return (multilist_sublist_lock(ml, idx)); 7260185029Spjd} 7261185029Spjd 7262185029Spjd/* 7263185029Spjd * Evict buffers from the device write hand to the distance specified in 7264185029Spjd * bytes. This distance may span populated buffers, it may span nothing. 7265185029Spjd * This is clearing a region on the L2ARC device ready for writing. 7266185029Spjd * If the 'all' boolean is set, every buffer is evicted. 7267185029Spjd */ 7268185029Spjdstatic void 7269185029Spjdl2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 7270185029Spjd{ 7271185029Spjd list_t *buflist; 7272275811Sdelphij arc_buf_hdr_t *hdr, *hdr_prev; 7273185029Spjd kmutex_t *hash_lock; 7274185029Spjd uint64_t taddr; 7275185029Spjd 7276286570Smav buflist = &dev->l2ad_buflist; 7277185029Spjd 7278185029Spjd if (!all && dev->l2ad_first) { 7279185029Spjd /* 7280185029Spjd * This is the first sweep through the device. There is 7281185029Spjd * nothing to evict. 7282185029Spjd */ 7283185029Spjd return; 7284185029Spjd } 7285185029Spjd 7286185029Spjd if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 7287185029Spjd /* 7288185029Spjd * When nearing the end of the device, evict to the end 7289185029Spjd * before the device write hand jumps to the start. 
7290185029Spjd */ 7291185029Spjd taddr = dev->l2ad_end; 7292185029Spjd } else { 7293185029Spjd taddr = dev->l2ad_hand + distance; 7294185029Spjd } 7295185029Spjd DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 7296185029Spjd uint64_t, taddr, boolean_t, all); 7297185029Spjd 7298185029Spjdtop: 7299286570Smav mutex_enter(&dev->l2ad_mtx); 7300275811Sdelphij for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 7301275811Sdelphij hdr_prev = list_prev(buflist, hdr); 7302185029Spjd 7303275811Sdelphij hash_lock = HDR_LOCK(hdr); 7304286763Smav 7305286763Smav /* 7306286763Smav * We cannot use mutex_enter or else we can deadlock 7307286763Smav * with l2arc_write_buffers (due to swapping the order 7308286763Smav * the hash lock and l2ad_mtx are taken). 7309286763Smav */ 7310185029Spjd if (!mutex_tryenter(hash_lock)) { 7311185029Spjd /* 7312185029Spjd * Missed the hash lock. Retry. 7313185029Spjd */ 7314185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 7315286570Smav mutex_exit(&dev->l2ad_mtx); 7316185029Spjd mutex_enter(hash_lock); 7317185029Spjd mutex_exit(hash_lock); 7318185029Spjd goto top; 7319185029Spjd } 7320185029Spjd 7321323752Savg /* 7322323752Savg * A header can't be on this list if it doesn't have L2 header. 7323323752Savg */ 7324323752Savg ASSERT(HDR_HAS_L2HDR(hdr)); 7325185029Spjd 7326323752Savg /* Ensure this header has finished being written. */ 7327323752Savg ASSERT(!HDR_L2_WRITING(hdr)); 7328323752Savg ASSERT(!HDR_L2_WRITE_HEAD(hdr)); 7329323752Savg 7330323752Savg if (!all && (hdr->b_l2hdr.b_daddr >= taddr || 7331286570Smav hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 7332185029Spjd /* 7333185029Spjd * We've evicted to the target address, 7334185029Spjd * or the end of the device. 
7335185029Spjd */ 7336185029Spjd mutex_exit(hash_lock); 7337185029Spjd break; 7338185029Spjd } 7339185029Spjd 7340286570Smav if (!HDR_HAS_L1HDR(hdr)) { 7341275811Sdelphij ASSERT(!HDR_L2_READING(hdr)); 7342185029Spjd /* 7343185029Spjd * This doesn't exist in the ARC. Destroy. 7344185029Spjd * arc_hdr_destroy() will call list_remove() 7345323754Savg * and decrement arcstat_l2_lsize. 7346185029Spjd */ 7347275811Sdelphij arc_change_state(arc_anon, hdr, hash_lock); 7348275811Sdelphij arc_hdr_destroy(hdr); 7349185029Spjd } else { 7350286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 7351286570Smav ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 7352185029Spjd /* 7353185029Spjd * Invalidate issued or about to be issued 7354185029Spjd * reads, since we may be about to write 7355185029Spjd * over this location. 7356185029Spjd */ 7357275811Sdelphij if (HDR_L2_READING(hdr)) { 7358185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_reading); 7359307265Smav arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); 7360185029Spjd } 7361185029Spjd 7362286598Smav arc_hdr_l2hdr_destroy(hdr); 7363185029Spjd } 7364185029Spjd mutex_exit(hash_lock); 7365185029Spjd } 7366286570Smav mutex_exit(&dev->l2ad_mtx); 7367185029Spjd} 7368185029Spjd 7369185029Spjd/* 7370185029Spjd * Find and write ARC buffers to the L2ARC device. 7371185029Spjd * 7372275811Sdelphij * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 7373185029Spjd * for reading until they have completed writing. 7374251478Sdelphij * The headroom_boost is an in-out parameter used to maintain headroom boost 7375251478Sdelphij * state between calls to this function. 7376251478Sdelphij * 7377251478Sdelphij * Returns the number of bytes actually written (which may be smaller than 7378251478Sdelphij * the delta by which the device hand has changed due to alignment). 
7379185029Spjd */ 7380208373Smmstatic uint64_t 7381307265Smavl2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 7382185029Spjd{ 7383275811Sdelphij arc_buf_hdr_t *hdr, *hdr_prev, *head; 7384323754Savg uint64_t write_asize, write_psize, write_lsize, headroom; 7385251478Sdelphij boolean_t full; 7386185029Spjd l2arc_write_callback_t *cb; 7387185029Spjd zio_t *pio, *wzio; 7388228103Smm uint64_t guid = spa_load_guid(spa); 7389185029Spjd int try; 7390185029Spjd 7391307265Smav ASSERT3P(dev->l2ad_vdev, !=, NULL); 7392185029Spjd 7393185029Spjd pio = NULL; 7394323754Savg write_lsize = write_asize = write_psize = 0; 7395185029Spjd full = B_FALSE; 7396286570Smav head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 7397307265Smav arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); 7398185029Spjd 7399205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 7400185029Spjd /* 7401185029Spjd * Copy buffers for L2ARC writing. 7402185029Spjd */ 7403286762Smav for (try = 0; try <= 3; try++) { 7404286763Smav multilist_sublist_t *mls = l2arc_sublist_lock(try); 7405251478Sdelphij uint64_t passed_sz = 0; 7406251478Sdelphij 7407205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 7408185029Spjd 7409185029Spjd /* 7410185029Spjd * L2ARC fast warmup. 7411185029Spjd * 7412185029Spjd * Until the ARC is warm and starts to evict, read from the 7413185029Spjd * head of the ARC lists rather than the tail. 
7414185029Spjd */ 7415185029Spjd if (arc_warm == B_FALSE) 7416286763Smav hdr = multilist_sublist_head(mls); 7417185029Spjd else 7418286763Smav hdr = multilist_sublist_tail(mls); 7419275811Sdelphij if (hdr == NULL) 7420205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 7421185029Spjd 7422286762Smav headroom = target_sz * l2arc_headroom; 7423307265Smav if (zfs_compressed_arc_enabled) 7424251478Sdelphij headroom = (headroom * l2arc_headroom_boost) / 100; 7425251478Sdelphij 7426275811Sdelphij for (; hdr; hdr = hdr_prev) { 7427251478Sdelphij kmutex_t *hash_lock; 7428251478Sdelphij 7429185029Spjd if (arc_warm == B_FALSE) 7430286763Smav hdr_prev = multilist_sublist_next(mls, hdr); 7431185029Spjd else 7432286763Smav hdr_prev = multilist_sublist_prev(mls, hdr); 7433307265Smav ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, 7434307265Smav HDR_GET_LSIZE(hdr)); 7435206796Spjd 7436275811Sdelphij hash_lock = HDR_LOCK(hdr); 7437251478Sdelphij if (!mutex_tryenter(hash_lock)) { 7438205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 7439185029Spjd /* 7440185029Spjd * Skip this buffer rather than waiting. 7441185029Spjd */ 7442185029Spjd continue; 7443185029Spjd } 7444185029Spjd 7445307265Smav passed_sz += HDR_GET_LSIZE(hdr); 7446185029Spjd if (passed_sz > headroom) { 7447185029Spjd /* 7448185029Spjd * Searched too far. 7449185029Spjd */ 7450185029Spjd mutex_exit(hash_lock); 7451205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 7452185029Spjd break; 7453185029Spjd } 7454185029Spjd 7455275811Sdelphij if (!l2arc_write_eligible(guid, hdr)) { 7456185029Spjd mutex_exit(hash_lock); 7457185029Spjd continue; 7458185029Spjd } 7459185029Spjd 7460315072Savg /* 7461315072Savg * We rely on the L1 portion of the header below, so 7462315072Savg * it's invalid for this header to have been evicted out 7463315072Savg * of the ghost cache, prior to being written out. The 7464315072Savg * ARC_FLAG_L2_WRITING bit ensures this won't happen. 
7465315072Savg */ 7466315072Savg ASSERT(HDR_HAS_L1HDR(hdr)); 7467315072Savg 7468315072Savg ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); 7469321610Smav ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 7470315072Savg ASSERT3U(arc_hdr_size(hdr), >, 0); 7471323754Savg uint64_t psize = arc_hdr_size(hdr); 7472315072Savg uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, 7473323754Savg psize); 7474315072Savg 7475323754Savg if ((write_asize + asize) > target_sz) { 7476185029Spjd full = B_TRUE; 7477185029Spjd mutex_exit(hash_lock); 7478205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_full); 7479185029Spjd break; 7480185029Spjd } 7481185029Spjd 7482185029Spjd if (pio == NULL) { 7483185029Spjd /* 7484185029Spjd * Insert a dummy header on the buflist so 7485185029Spjd * l2arc_write_done() can find where the 7486185029Spjd * write buffers begin without searching. 7487185029Spjd */ 7488286763Smav mutex_enter(&dev->l2ad_mtx); 7489286570Smav list_insert_head(&dev->l2ad_buflist, head); 7490286763Smav mutex_exit(&dev->l2ad_mtx); 7491185029Spjd 7492185029Spjd cb = kmem_alloc( 7493185029Spjd sizeof (l2arc_write_callback_t), KM_SLEEP); 7494185029Spjd cb->l2wcb_dev = dev; 7495185029Spjd cb->l2wcb_head = head; 7496185029Spjd pio = zio_root(spa, l2arc_write_done, cb, 7497185029Spjd ZIO_FLAG_CANFAIL); 7498205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_pios); 7499185029Spjd } 7500185029Spjd 7501286570Smav hdr->b_l2hdr.b_dev = dev; 7502307265Smav hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 7503307265Smav arc_hdr_set_flags(hdr, 7504307265Smav ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); 7505251478Sdelphij 7506307265Smav mutex_enter(&dev->l2ad_mtx); 7507307265Smav list_insert_head(&dev->l2ad_buflist, hdr); 7508307265Smav mutex_exit(&dev->l2ad_mtx); 7509307265Smav 7510323754Savg (void) refcount_add_many(&dev->l2ad_alloc, psize, hdr); 7511251478Sdelphij 7512185029Spjd /* 7513307265Smav * Normally the L2ARC can use the hdr's data, but if 7514307265Smav * we're sharing data between the hdr and one of its 7515307265Smav * bufs, 
L2ARC needs its own copy of the data so that 7516321613Smav * the ZIO below can't race with the buf consumer. 7517321613Smav * Another case where we need to create a copy of the 7518321613Smav * data is when the buffer size is not device-aligned 7519321613Smav * and we need to pad the block to make it such. 7520321613Smav * That also keeps the clock hand suitably aligned. 7521321613Smav * 7522321613Smav * To ensure that the copy will be available for the 7523307265Smav * lifetime of the ZIO and be cleaned up afterwards, we 7524307265Smav * add it to the l2arc_free_on_write queue. 7525185029Spjd */ 7526321610Smav abd_t *to_write; 7527323754Savg if (!HDR_SHARED_DATA(hdr) && psize == asize) { 7528321610Smav to_write = hdr->b_l1hdr.b_pabd; 7529307265Smav } else { 7530321610Smav to_write = abd_alloc_for_io(asize, 7531321610Smav HDR_ISTYPE_METADATA(hdr)); 7532323754Savg abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); 7533323754Savg if (asize != psize) { 7534323754Savg abd_zero_off(to_write, psize, 7535323754Savg asize - psize); 7536307265Smav } 7537321610Smav l2arc_free_abd_on_write(to_write, asize, 7538321610Smav arc_buf_type(hdr)); 7539307265Smav } 7540307265Smav wzio = zio_write_phys(pio, dev->l2ad_vdev, 7541307265Smav hdr->b_l2hdr.b_daddr, asize, to_write, 7542307265Smav ZIO_CHECKSUM_OFF, NULL, hdr, 7543307265Smav ZIO_PRIORITY_ASYNC_WRITE, 7544307265Smav ZIO_FLAG_CANFAIL, B_FALSE); 7545307265Smav 7546323754Savg write_lsize += HDR_GET_LSIZE(hdr); 7547307265Smav DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 7548307265Smav zio_t *, wzio); 7549307265Smav 7550323754Savg write_psize += psize; 7551323754Savg write_asize += asize; 7552307265Smav dev->l2ad_hand += asize; 7553307265Smav 7554185029Spjd mutex_exit(hash_lock); 7555185029Spjd 7556307265Smav (void) zio_nowait(wzio); 7557251478Sdelphij } 7558251478Sdelphij 7559286763Smav multilist_sublist_unlock(mls); 7560251478Sdelphij 7561251478Sdelphij if (full == B_TRUE) 7562251478Sdelphij break; 7563251478Sdelphij } 
7564251478Sdelphij 7565251478Sdelphij /* No buffers selected for writing? */ 7566251478Sdelphij if (pio == NULL) { 7567323754Savg ASSERT0(write_lsize); 7568286570Smav ASSERT(!HDR_HAS_L1HDR(head)); 7569286570Smav kmem_cache_free(hdr_l2only_cache, head); 7570251478Sdelphij return (0); 7571251478Sdelphij } 7572251478Sdelphij 7573315072Savg ASSERT3U(write_psize, <=, target_sz); 7574185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_sent); 7575323754Savg ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); 7576323754Savg ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); 7577323754Savg ARCSTAT_INCR(arcstat_l2_psize, write_psize); 7578323754Savg vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); 7579185029Spjd 7580185029Spjd /* 7581185029Spjd * Bump device hand to the device start if it is approaching the end. 7582185029Spjd * l2arc_evict() will already have evicted ahead for this case. 7583185029Spjd */ 7584185029Spjd if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 7585185029Spjd dev->l2ad_hand = dev->l2ad_start; 7586185029Spjd dev->l2ad_first = B_FALSE; 7587185029Spjd } 7588185029Spjd 7589208373Smm dev->l2ad_writing = B_TRUE; 7590185029Spjd (void) zio_wait(pio); 7591208373Smm dev->l2ad_writing = B_FALSE; 7592208373Smm 7593251478Sdelphij return (write_asize); 7594185029Spjd} 7595185029Spjd 7596185029Spjd/* 7597185029Spjd * This thread feeds the L2ARC at regular intervals. This is the beating 7598185029Spjd * heart of the L2ARC. 
7599185029Spjd */ 7600331399Smav/* ARGSUSED */ 7601185029Spjdstatic void 7602331399Smavl2arc_feed_thread(void *unused __unused) 7603185029Spjd{ 7604185029Spjd callb_cpr_t cpr; 7605185029Spjd l2arc_dev_t *dev; 7606185029Spjd spa_t *spa; 7607208373Smm uint64_t size, wrote; 7608219089Spjd clock_t begin, next = ddi_get_lbolt(); 7609185029Spjd 7610185029Spjd CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 7611185029Spjd 7612185029Spjd mutex_enter(&l2arc_feed_thr_lock); 7613185029Spjd 7614185029Spjd while (l2arc_thread_exit == 0) { 7615185029Spjd CALLB_CPR_SAFE_BEGIN(&cpr); 7616185029Spjd (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 7617219089Spjd next - ddi_get_lbolt()); 7618185029Spjd CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 7619219089Spjd next = ddi_get_lbolt() + hz; 7620185029Spjd 7621185029Spjd /* 7622185029Spjd * Quick check for L2ARC devices. 7623185029Spjd */ 7624185029Spjd mutex_enter(&l2arc_dev_mtx); 7625185029Spjd if (l2arc_ndev == 0) { 7626185029Spjd mutex_exit(&l2arc_dev_mtx); 7627185029Spjd continue; 7628185029Spjd } 7629185029Spjd mutex_exit(&l2arc_dev_mtx); 7630219089Spjd begin = ddi_get_lbolt(); 7631185029Spjd 7632185029Spjd /* 7633185029Spjd * This selects the next l2arc device to write to, and in 7634185029Spjd * doing so the next spa to feed from: dev->l2ad_spa. This 7635185029Spjd * will return NULL if there are now no l2arc devices or if 7636185029Spjd * they are all faulted. 7637185029Spjd * 7638185029Spjd * If a device is returned, its spa's config lock is also 7639185029Spjd * held to prevent device removal. l2arc_dev_get_next() 7640185029Spjd * will grab and release l2arc_dev_mtx. 
7641185029Spjd */ 7642185029Spjd if ((dev = l2arc_dev_get_next()) == NULL) 7643185029Spjd continue; 7644185029Spjd 7645185029Spjd spa = dev->l2ad_spa; 7646307265Smav ASSERT3P(spa, !=, NULL); 7647185029Spjd 7648185029Spjd /* 7649219089Spjd * If the pool is read-only then force the feed thread to 7650219089Spjd * sleep a little longer. 7651219089Spjd */ 7652219089Spjd if (!spa_writeable(spa)) { 7653219089Spjd next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 7654219089Spjd spa_config_exit(spa, SCL_L2ARC, dev); 7655219089Spjd continue; 7656219089Spjd } 7657219089Spjd 7658219089Spjd /* 7659185029Spjd * Avoid contributing to memory pressure. 7660185029Spjd */ 7661185029Spjd if (arc_reclaim_needed()) { 7662185029Spjd ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 7663185029Spjd spa_config_exit(spa, SCL_L2ARC, dev); 7664185029Spjd continue; 7665185029Spjd } 7666185029Spjd 7667185029Spjd ARCSTAT_BUMP(arcstat_l2_feeds); 7668185029Spjd 7669251478Sdelphij size = l2arc_write_size(); 7670185029Spjd 7671185029Spjd /* 7672185029Spjd * Evict L2ARC buffers that will be overwritten. 7673185029Spjd */ 7674185029Spjd l2arc_evict(dev, size, B_FALSE); 7675185029Spjd 7676185029Spjd /* 7677185029Spjd * Write ARC buffers. 7678185029Spjd */ 7679307265Smav wrote = l2arc_write_buffers(spa, dev, size); 7680208373Smm 7681208373Smm /* 7682208373Smm * Calculate interval between writes. 
7683208373Smm */ 7684208373Smm next = l2arc_write_interval(begin, size, wrote); 7685185029Spjd spa_config_exit(spa, SCL_L2ARC, dev); 7686185029Spjd } 7687185029Spjd 7688185029Spjd l2arc_thread_exit = 0; 7689185029Spjd cv_broadcast(&l2arc_feed_thr_cv); 7690185029Spjd CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 7691185029Spjd thread_exit(); 7692185029Spjd} 7693185029Spjd 7694185029Spjdboolean_t 7695185029Spjdl2arc_vdev_present(vdev_t *vd) 7696185029Spjd{ 7697185029Spjd l2arc_dev_t *dev; 7698185029Spjd 7699185029Spjd mutex_enter(&l2arc_dev_mtx); 7700185029Spjd for (dev = list_head(l2arc_dev_list); dev != NULL; 7701185029Spjd dev = list_next(l2arc_dev_list, dev)) { 7702185029Spjd if (dev->l2ad_vdev == vd) 7703185029Spjd break; 7704185029Spjd } 7705185029Spjd mutex_exit(&l2arc_dev_mtx); 7706185029Spjd 7707185029Spjd return (dev != NULL); 7708185029Spjd} 7709185029Spjd 7710185029Spjd/* 7711185029Spjd * Add a vdev for use by the L2ARC. By this point the spa has already 7712185029Spjd * validated the vdev and opened it. 7713185029Spjd */ 7714185029Spjdvoid 7715219089Spjdl2arc_add_vdev(spa_t *spa, vdev_t *vd) 7716185029Spjd{ 7717185029Spjd l2arc_dev_t *adddev; 7718185029Spjd 7719185029Spjd ASSERT(!l2arc_vdev_present(vd)); 7720185029Spjd 7721255753Sgibbs vdev_ashift_optimize(vd); 7722255753Sgibbs 7723185029Spjd /* 7724185029Spjd * Create a new l2arc device entry. 
7725185029Spjd */ 7726185029Spjd adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 7727185029Spjd adddev->l2ad_spa = spa; 7728185029Spjd adddev->l2ad_vdev = vd; 7729219089Spjd adddev->l2ad_start = VDEV_LABEL_START_SIZE; 7730219089Spjd adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 7731185029Spjd adddev->l2ad_hand = adddev->l2ad_start; 7732185029Spjd adddev->l2ad_first = B_TRUE; 7733208373Smm adddev->l2ad_writing = B_FALSE; 7734185029Spjd 7735286570Smav mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 7736185029Spjd /* 7737185029Spjd * This is a list of all ARC buffers that are still valid on the 7738185029Spjd * device. 7739185029Spjd */ 7740286570Smav list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 7741286570Smav offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 7742185029Spjd 7743219089Spjd vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 7744286598Smav refcount_create(&adddev->l2ad_alloc); 7745185029Spjd 7746185029Spjd /* 7747185029Spjd * Add device to global list 7748185029Spjd */ 7749185029Spjd mutex_enter(&l2arc_dev_mtx); 7750185029Spjd list_insert_head(l2arc_dev_list, adddev); 7751185029Spjd atomic_inc_64(&l2arc_ndev); 7752185029Spjd mutex_exit(&l2arc_dev_mtx); 7753185029Spjd} 7754185029Spjd 7755185029Spjd/* 7756185029Spjd * Remove a vdev from the L2ARC. 
7757185029Spjd */ 7758185029Spjdvoid 7759185029Spjdl2arc_remove_vdev(vdev_t *vd) 7760185029Spjd{ 7761185029Spjd l2arc_dev_t *dev, *nextdev, *remdev = NULL; 7762185029Spjd 7763185029Spjd /* 7764185029Spjd * Find the device by vdev 7765185029Spjd */ 7766185029Spjd mutex_enter(&l2arc_dev_mtx); 7767185029Spjd for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 7768185029Spjd nextdev = list_next(l2arc_dev_list, dev); 7769185029Spjd if (vd == dev->l2ad_vdev) { 7770185029Spjd remdev = dev; 7771185029Spjd break; 7772185029Spjd } 7773185029Spjd } 7774307265Smav ASSERT3P(remdev, !=, NULL); 7775185029Spjd 7776185029Spjd /* 7777185029Spjd * Remove device from global list 7778185029Spjd */ 7779185029Spjd list_remove(l2arc_dev_list, remdev); 7780185029Spjd l2arc_dev_last = NULL; /* may have been invalidated */ 7781185029Spjd atomic_dec_64(&l2arc_ndev); 7782185029Spjd mutex_exit(&l2arc_dev_mtx); 7783185029Spjd 7784185029Spjd /* 7785185029Spjd * Clear all buflists and ARC references. L2ARC device flush. 
7786185029Spjd */ 7787185029Spjd l2arc_evict(remdev, 0, B_TRUE); 7788286570Smav list_destroy(&remdev->l2ad_buflist); 7789286570Smav mutex_destroy(&remdev->l2ad_mtx); 7790286598Smav refcount_destroy(&remdev->l2ad_alloc); 7791185029Spjd kmem_free(remdev, sizeof (l2arc_dev_t)); 7792185029Spjd} 7793185029Spjd 7794185029Spjdvoid 7795185029Spjdl2arc_init(void) 7796185029Spjd{ 7797185029Spjd l2arc_thread_exit = 0; 7798185029Spjd l2arc_ndev = 0; 7799185029Spjd l2arc_writes_sent = 0; 7800185029Spjd l2arc_writes_done = 0; 7801185029Spjd 7802185029Spjd mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 7803185029Spjd cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 7804185029Spjd mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 7805185029Spjd mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 7806185029Spjd 7807185029Spjd l2arc_dev_list = &L2ARC_dev_list; 7808185029Spjd l2arc_free_on_write = &L2ARC_free_on_write; 7809185029Spjd list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 7810185029Spjd offsetof(l2arc_dev_t, l2ad_node)); 7811185029Spjd list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 7812185029Spjd offsetof(l2arc_data_free_t, l2df_list_node)); 7813185029Spjd} 7814185029Spjd 7815185029Spjdvoid 7816185029Spjdl2arc_fini(void) 7817185029Spjd{ 7818185029Spjd /* 7819185029Spjd * This is called from dmu_fini(), which is called from spa_fini(); 7820185029Spjd * Because of this, we can assume that all l2arc devices have 7821185029Spjd * already been removed when the pools themselves were removed. 
7822185029Spjd */ 7823185029Spjd 7824185029Spjd l2arc_do_free_on_write(); 7825185029Spjd 7826185029Spjd mutex_destroy(&l2arc_feed_thr_lock); 7827185029Spjd cv_destroy(&l2arc_feed_thr_cv); 7828185029Spjd mutex_destroy(&l2arc_dev_mtx); 7829185029Spjd mutex_destroy(&l2arc_free_on_write_mtx); 7830185029Spjd 7831185029Spjd list_destroy(l2arc_dev_list); 7832185029Spjd list_destroy(l2arc_free_on_write); 7833185029Spjd} 7834185029Spjd 7835185029Spjdvoid 7836185029Spjdl2arc_start(void) 7837185029Spjd{ 7838209962Smm if (!(spa_mode_global & FWRITE)) 7839185029Spjd return; 7840185029Spjd 7841185029Spjd (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 7842185029Spjd TS_RUN, minclsyspri); 7843185029Spjd} 7844185029Spjd 7845185029Spjdvoid 7846185029Spjdl2arc_stop(void) 7847185029Spjd{ 7848209962Smm if (!(spa_mode_global & FWRITE)) 7849185029Spjd return; 7850185029Spjd 7851185029Spjd mutex_enter(&l2arc_feed_thr_lock); 7852185029Spjd cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 7853185029Spjd l2arc_thread_exit = 1; 7854185029Spjd while (l2arc_thread_exit != 0) 7855185029Spjd cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 7856185029Spjd mutex_exit(&l2arc_feed_thr_lock); 7857185029Spjd} 7858